├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── cli_vamb.yml │ ├── lint.yml │ ├── snakemake_avamb.yml │ └── unittest.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── Project.toml ├── benchmark.png ├── benchmarks.csv └── make_plots.jl ├── doc ├── README.md ├── conf.py ├── front_page.md ├── histogram.npz ├── how_to_run.md ├── index.rst ├── inputs_outputs.md ├── installation.md └── tips.md ├── pyproject.toml ├── setup.cfg ├── src ├── concatenate.py ├── create_fasta.py ├── create_kernel.py └── merge_aemb.py ├── test ├── data │ ├── aemb │ │ ├── 6.aemb.tsv │ │ ├── 7.aemb.tsv │ │ └── 8.aemb.tsv │ ├── bam │ │ ├── 10.bam │ │ ├── 11.bam │ │ └── 12.bam │ ├── fasta.fna │ └── marker.fna ├── test_aamb_encode.py ├── test_cluster.py ├── test_encode.py ├── test_parsebam.py ├── test_parsecontigs.py ├── test_parsemarkers.py ├── test_reclustering.py ├── test_results.py ├── test_semisupervised_encode.py ├── test_vambtools.py └── testtools.py ├── vamb ├── __init__.py ├── __main__.py ├── aamb_encode.py ├── cluster.py ├── encode.py ├── hloss_misc.py ├── kernel.npz ├── marker.hmm ├── parsebam.py ├── parsecontigs.py ├── parsemarkers.py ├── reclustering.py ├── semisupervised_encode.py ├── taxonomy.py ├── taxvamb_encode.py └── vambtools.py └── workflow_avamb ├── README.md ├── avamb.snake.conda.smk ├── config.json ├── envs ├── avamb.yaml ├── checkm2.yml ├── minimap2.yaml └── samtools.yaml └── src ├── abundances_mask.py ├── create_abundances.py ├── create_cluster_scores_bin_path_dict.py ├── manual_drep_JN.py ├── mv_bins_from_mdrep_clusters.py ├── rip_bins.py ├── transfer_contigs_and_aggregate_all_nc_bins.py ├── update_cluster_scores_dict_after_ripping.py ├── workflow_tools.py ├── write_abundances.py └── write_clusters_from_dereplicated_and_ripped_bins.sh /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | e6480cefc77abcbd06d86e438504e8db9e8276eb 2 | 910f64c9fc294fa5c92aa19015a02a376c1d5ecc 3 | 5a0cc3a4d9dd8ddcb74bdebd9119cffacbe25942 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Thank you for making an issue. 2 | If you are submitting a bug report, it will help us if you include the following information: 3 | 4 | - Your version of Python and Vamb. 
5 | - The log file (called `log.txt`) from the output directory 6 | - The full error message produced by Vamb, if any 7 | -------------------------------------------------------------------------------- /.github/workflows/cli_vamb.yml: -------------------------------------------------------------------------------- 1 | name: Command line interface tests 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | workflow_dispatch: 9 | inputs: 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ["3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: 'pip' # caching pip dependencies 28 | cache-dependency-path: '**/pyproject.toml' 29 | - name: Download fixtures 30 | run: | 31 | wget https://www.dropbox.com/scl/fi/10tdf0w0kf70pf46hy8ks/ci_data.zip\?rlkey\=smlcinkesuwiw557zulgbb59l\&st\=hhokiqma\&dl\=0 -O ci_data.zip 32 | unzip -o ci_data.zip 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install flake8 pytest 37 | pip install -e . 38 | - name: Run VAMB 39 | run: | 40 | vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10 41 | ls -la outdir_vamb 42 | cat outdir_vamb/log.txt 43 | - name: Run TaxVAMB 44 | run: | 45 | vamb bin taxvamb --outdir outdir_taxvamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 -e 10 -q 2 3 -t 10 -o C --minfasta 200000 46 | ls -la outdir_taxvamb 47 | cat outdir_taxvamb/log.txt 48 | vamb bin taxvamb --outdir outdir_taxvamb_no_predict --no_predictor --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -e 10 -q 2 3 -t 10 -o C --minfasta 200000 49 | ls -la outdir_taxvamb_no_predict 50 | cat outdir_taxvamb_no_predict/log.txt 51 | vamb bin taxvamb --outdir outdir_taxvamb_preds --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --no_predictor --taxonomy outdir_taxvamb/results_taxometer.tsv -e 10 -q 2 -t 10 -o C --minfasta 200000 52 | ls -la outdir_taxvamb_preds 53 | cat outdir_taxvamb_preds/log.txt 54 | - name: Run Taxometer 55 | run: | 56 | vamb taxometer --outdir outdir_taxometer --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 57 | ls -la outdir_taxometer 58 | cat outdir_taxometer/log.txt 59 | - name: Run k-means reclustering 60 | run: | 61 | vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --markers markers_mock.npz --algorithm kmeans --minfasta 200000 62 | ls -la outdir_recluster 63 | cat outdir_recluster/log.txt 64 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | branches: [ "master" ] 6 | 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: astral-sh/ruff-action@v3 13 | with: 14 | version: "0.11.x" 15 | 16 | 17 | format: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: 
actions/checkout@v4 21 | - uses: astral-sh/ruff-action@v3 22 | with: 23 | version: "0.11.x" 24 | args: 'format --check' 25 | -------------------------------------------------------------------------------- /.github/workflows/snakemake_avamb.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install snakemake and AVAMB dependencies and run the AVAMB snakemake pipeline 2 | 3 | name: AVAMB snakemake - runs daily 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | schedule: 9 | - cron: "0 1 * * *" 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: 'pip' 28 | cache-dependency-path: '**/pyproject.toml' 29 | - uses: mamba-org/setup-micromamba@v1 30 | with: 31 | micromamba-version: '1.3.1-0' 32 | environment-file: workflow_avamb/envs/avamb.yaml 33 | environment-name: avamb 34 | create-args: >- 35 | python=3.9.16 36 | init-shell: >- 37 | bash 38 | cache-environment: true 39 | post-cleanup: 'all' 40 | - name: Install dependencies to avamb environment 41 | run: | 42 | which pip 43 | pip install -e . 44 | pip freeze 45 | git clone https://github.com/chklovski/CheckM2.git 46 | shell: micromamba-shell {0} 47 | - name: Install CheckM2 environment 48 | run: | 49 | micromamba create -n checkm2 python=3.8.15 pandas=2.1.1 50 | micromamba env update -n checkm2 --file workflow_avamb/envs/checkm2.yml 51 | eval "$(micromamba shell hook --shell=bash)" 52 | micromamba activate checkm2 53 | cd CheckM2 && git checkout e563159 && python setup.py install && cd .. 54 | checkm2 database --download 55 | shell: micromamba-shell {0} 56 | - name: Download fixtures 57 | run: | 58 | wget https://www.dropbox.com/scl/fi/q54wfho3ultb0otq5z3rh/testset_snakemake.zip\?rlkey\=7tbsc2giff0s42ppdmeb706fa\&dl\=0 -O testset_snakemake.zip 59 | unzip testset_snakemake.zip 60 | ls -la 61 | ls -la testset_snakemake 62 | pwd 63 | - name: Snakemake 64 | uses: snakemake/snakemake-github-action@v1.25.1 65 | with: 66 | directory: '.test' 67 | snakefile: 'workflow_avamb/avamb.snake.conda.smk' 68 | args: '--cores 4 --configfile workflow_avamb/config.json --use-conda' 69 | -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | # There is a bug in Python 3.13.0 which breaks Vamb's tests, fixed in 3.13.1 18 | python-version: ["3.10", "3.11", "3.12", "3.13.1"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | cache: 'pip' # caching pip dependencies 27 | cache-dependency-path: '**/pyproject.toml' 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install pytest 32 | pip install -e . 
33 | - name: Run tests 34 | run: python -m pytest test 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | __pycache__ 3 | .coverage 4 | *.o 5 | src/_vambtools.cpp 6 | **.c 7 | .eggs 8 | *~ 9 | vamb.egg-info 10 | changelog 11 | .DS_Store 12 | .ipynb_checkpoints 13 | TODO.md 14 | build/ 15 | dist/ 16 | **.vscode 17 | # doc 18 | doc/_build 19 | doc/reference 20 | target 21 | Manifest.toml 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: doc/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | python: 31 | install: 32 | - method: pip 33 | path: . 34 | extra_requirements: 35 | - docs 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v5.0.0 [UNRELEASED] 4 | Version 5 is a major release that includes several breaking changes to the API, 5 | as well as new types of models, improved binning accuracy, and more user 6 | friendliness. 7 | 8 | ### Added 9 | * Added the TaxVamb binner - a semi-supervised model that can augment binning 10 | using taxonomic assignment from e.g. mmseqs2 of some of the input contigs. 11 | TaxVamb is state-of-the-art, and significantly outperforms all other Vamb 12 | models when the taxonomic assignment is reasonably good. 13 | TaxVamb is available from command-line using `vamb bin taxvamb` 14 | * Added the Taxometer annotation refiner. This program enhances taxonomic 15 | assignment of metagenomic contigs using composition and abundance. 16 | TaxVamb will automatically run Taxometer to increase accuracy. 17 | Taxometer is available from command-line using `vamb taxometer` 18 | * [EXPERIMENTAL] Added reclustering functionality, which reclusters an existing 19 | binning using single-copy genes, using a technique inspired by the SemiBin2 20 | binner. This improves bacterial bins. 21 | We may remove this feature in future versions of Vamb. 22 | 23 | ### Breaking changes 24 | * The command-line interface of Vamb has been changed, such that the different 25 | functionality should be used through subcommands. For example, the binners in 26 | Vamb are accesible through `vamb bin`. 27 | Also, a few command-line flags have been removed. 28 | * All output files ending in `.tsv` is now actually in TSV format. Previously, 29 | Vamb did not include a header in the file, as the TSV format requires. 30 | In version 5, the header is included. 
31 | * The file `mask.npz` is no longer output, because the encoder no longer masks 32 | any sequences. 33 | * The names of the output cluster files have been changed. When binsplitting is 34 | used, Vamb now outputs both the split and the unsplit clusters. 35 | The names of the output files are now: 36 | - `vae_clusters_split.tsv` 37 | - `vae_clusters_unsplit.tsv` 38 | And similarly for e.g. `vaevae_clusters_split.tsv`. 39 | When binsplitting is not used, only the unsplit clusters are output. 40 | * The `benchmark` module of Vamb has been removed, as it is superseded by our 41 | new benchmarking tool https://github.com/jakobnissen/BinBencher.jl 42 | 43 | ### Other changes 44 | * Several details of the clustering algorithm have been overhauled. 45 | It now returns more accurate clusters and may be faster in some circumstances. 46 | However, GPU clustering may be significantly slower. (#198) 47 | * Vamb now uses both relative and absolute abundances in the encoder, compared 48 | to only the relative ones before. This improves binning, especially when using 49 | a low number of samples (#210) 50 | * Vamb now binsplits with `-o C` by default. 51 | - To disable binsplitting, pass `-o` without an argument 52 | * Vamb now supports passing abundances in TSV format. This TSV can be created very 53 | efficiently using the `strobealign` aligner with the `--aemb` flag. 54 | * If passing abundances in BAM format, it is now recommended to pass in a 55 | directory with all the BAM files using the --bamdir flag, instead of using 56 | the old --bamfiles flag. 57 | * Vamb no longer errors when the batch size is too large. 58 | * Several errors and warnings have been improved: 59 | - The user is warned if any sequences are filtered away for falling below 60 | the contig size cutoff (flag `-m`). 61 | - Improved the error message when the FASTA and BAM headers do not match. 62 | - Vamb now errors early if the binsplit separator (flag `-o`) is not found 63 | in the parsed contig identifiers. 64 | If the binsplit separator is not set explicitly and defaults to `-o C`, 65 | Vamb will instead warn the user and disable binsplitting. 66 | * Vamb now writes its log to both stderr and to the logfile. Every line in the 67 | log is now timestamped and formatted better. 68 | * Vamb now outputs metadata about the unsplit clusters in the output TSV file 69 | `vae_clusters_metadata.tsv`. 70 | * Vamb now correctly uses a random seed on each invocation (#213) 71 | * Fixed various bugs and undoubtedly introduced some fresh ones. 72 | 73 | ## v4.1.3 74 | * Fix a bug that resulted in poor clustering results (#179) 75 | 76 | ## v4.1.2 77 | * Fix a bug in src/create_fasta.py 78 | * Bugfix: Make seeding the RNG work from command line 79 | * Bump compatible Cython version 80 | 81 | ## v4.1.1 82 | * Create tmp directory in parsebam if needed for pycoverm (issue #167) 83 | 84 | ## v4.1.0 85 | * Fix typo in output AAE_Z cluster names. They are now called e.g. "aae_z_1" 86 | instead of "aae_z1" 87 | * Clean up the directory structure of the Avamb workflow. 88 | * Fix the CheckM2 dependencies to allow CheckM2 to be installed 89 | * Allow the Avamb workflow to be run on Slurm clusters 90 | * Fix issue #161: Mismatched refhash when spaces in FASTA headers 91 | * Allow setting the RNG seed from command line 92 | 93 | ## v4.0.1 94 | * Fix Random.choice for Tensor on Python 3.11. See issue #148 95 | 96 | ## v4.0.0 97 | Version 4 is a thorough rewrite of major parts of Vamb that has taken more than a year. 
98 | Vamb now ships with an upgraded dual variational autoencoder (VAE) and 99 | adversarial autoencoder (AAE) model, usable in a CheckM-based workflow. 100 | The code quality and test suite have gotten significant upgrades, making Vamb 101 | more stable and robust to bugs. 102 | This version of Vamb is slightly faster and produces better bins than v3. 103 | The user interface has only seen limited changes. 104 | 105 | ### Breaking changes 106 | * The official API of Vamb is now defined only in terms of its command-line 107 | interface. This means that from now on, Vamb can freely change and modify its 108 | internal functions, even in minor releases or patch releases. 109 | If you are using Vamb as a Python package, it means you should precisely 110 | specify the full version of Vamb used in order to ensure reproducibility. 111 | * Benchmark procedure has been changed, so benchmark results are incompatible 112 | with results from v3. Benchmarking is now considered an implementation detail, 113 | and is not stable across releases. 114 | * Vamb no longer outputs TNF, sequence names and sequence lengths as .npz files. 115 | Instead, it produces a `composition.npz` that contains all this information 116 | and more. 117 | As a consequence, the command-line options `--tnfs`, `--names` and `--lengths` 118 | have been removed, and replaced with the single `--composition` option. 119 | * The output .npz array `rpkm.npz` has been changed in a backwards incompatible 120 | way. From version 4, the content of the output .npz files is considered an 121 | implementation detail. 122 | * The depths input option `--jgi` has been removed. To use depths computed by 123 | an external program, construct an instance of the `Abundance` class from your 124 | depths and save it using its `.save` method to an `rpkm.npz` file 125 | (though read the Notable changes section below). 126 | 127 | ### New features 128 | * Vamb now includes an optional AAE model alongside the VAE model. 129 | Users may run the VAE model, where it behaves similarly to v3, or run the mixed 130 | VAE/AAE model, in which both models will be run on the same dataset. 131 | * The Snakemake workflow has been overhauled, and now defaults to using 132 | the VAE/AAE combined model, using CheckM to dereplicate. 133 | * Vamb is now more easily installed via pip: `pip install vamb`. We have fixed 134 | a bunch of issues that caused installation problems. 135 | * By default, Vamb gzip compresses FASTA files written using the `--minfasta` 136 | flag. 137 | 138 | ### Notable other changes 139 | * Using the combined VAE-AAE workflow, the user can get significantly better bins. 140 | * Vamb now uses `CoverM` internally to calculate abundances. This means it is 141 | significantly faster and more accurate than before. 142 | Thus, we no longer recommend that users compute depths with MetaBAT2's JGI tool. 143 | * Lots of bugfixes may have changed Vamb's behaviour in a backwards incompatible 144 | way for certain edge cases. For example, FASTA identifiers are now required to 145 | match the name specification in the SAM format to ensure the identifiers are 146 | the same in FASTA and BAM files. 
147 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Vamb 2 | The Git repository is currently hosted at https://github.com/RasmussenLab/vamb 3 | 4 | ## Git workflow 5 | In order for your contribution to be easily integrated into a package that is concurrently worked on by multiple people, it's important that you adhere to the Git workflow that we use for this repo. 6 | 7 | #### Feature branches 8 | We never push directly to master. Instead, create a new feature branch on your own fork, and make a PR from your fork to master. 9 | A feature branch is any branch that contains new code that will eventually be merged to master, whether that is an actual feature, a bugfix, or something else. 10 | We recommend creating your feature branch from an updated version of master, to make it easier to merge into master again. 11 | 12 | For large features, feature branches can contain huge changes, and be in development over months. Rebase on master as often as possible. 13 | However, where feasible, keep your feature branches' diff relative to master small. If your feature branch contains multiple independent changes, instead make multiple different PRs on different feature branches. This is easier to review, and to bisect if necessary. 14 | 15 | Make sure to squash your commits on your feature branches as necessary to keep the history clean. 16 | A good rule of thumb is that 1 commit = 1 PR, but there may be exceptions. 17 | Also, please delete your feature branches after they've been merged to master so they don't accumulate. 18 | 19 | #### Release branches 20 | Releases are only cut from release branches. 21 | The purpose of release branches is to keep a version of Vamb that is more stable than the development version found on master. 22 | This stability is achieved by only adding bugfixes to release branches, not new features. Over time, the bugfixes will accumulate, while the new features (which are mostly where new bugs come from) are added to master only. 23 | Release branches are named "release", plus the major and minor version, like so: "release-4.1". They are always cut from master. 24 | We only backport bugfixes to one or a few release branches at a time, so old release branches quickly get outdated. However, we will not remove them. 25 | 26 | Release branches are never merged back to master. If commits from master are needed in a release branch, you may cherry-pick them from master (see the sketch at the end of this section). 27 | This is the only case where commits may be duplicated on two different branches. 28 | 29 | #### Tags 30 | Each release of Vamb (from a release branch) is tagged with a lowercase "v", then a SemVer 2.0 version, e.g. "v4.1.3". 31 | A tag unambiguously refers to a commit, and is never removed. 32 | Ideally, the tagged commit should be the one that updates the version in `vamb/__init__.py`. 33 | 34 | #### Testing 35 | Our CI pipeline uses a formatter and a linter to check for issues (currently, the Ruff formatter and linter). 36 | To speed up development, you can install these locally and catch issues before they are caught in CI. 37 | 38 | #### Dependencies 39 | Please avoid adding new dependencies if at all practical. 40 | We already have lots of issues with our dependencies, and don't want any more. 
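As a concrete sketch of the release-branch workflow described above, backporting a bugfix from master could look like the following. The release branch name, the commit hash and the `upstream` remote are placeholders; substitute your own.

```shell
$ git fetch upstream                  # get the latest upstream branches
$ git switch release-4.1              # check out the release branch that should receive the fix
$ git cherry-pick 0123abc             # copy the bugfix commit from master onto the release branch
$ git push upstream release-4.1       # publish the backported fix
```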
41 | 42 | ## Example commands 43 | We assume: 44 | - The https://github.com/RasmussenLab/vamb repo is added as a remote with the name `upstream` 45 | - Your own fork of Vamb is added as a remote called `origin` 46 | ### Making a new PR 47 | Synchronize the master branches between your repo and upstream. 48 | Do this before making any new branches from master. 49 | ```shell 50 | $ git switch master 51 | $ git pull upstream master 52 | $ git push origin master 53 | ``` 54 | 55 | Make a new branch, with a feature, here for example "kmer-compression". 56 | Name your branch accordingly. 57 | ```shell 58 | $ git switch -c kmer-compression 59 | ``` 60 | 61 | Write your code, then test it. 62 | This requires you to have installed Vamb (preferably with `pip install -e .`), 63 | and installed `pytest` and `ruff`: 64 | ```shell 65 | $ python -m pytest # test the code 66 | $ ruff check . # run the linter 67 | $ ruff format . # run the formatter 68 | ``` 69 | 70 | Commit it, then push to `origin`: 71 | ```shell 72 | $ git add * # add your files 73 | $ git status # check you didn't add spurious unneeded files 74 | $ git commit # then write a commit message 75 | $ git push -u origin kmer-compression 76 | ``` 77 | 78 | Navigate to Vamb's GitHub repo (the `upstream` one), then on the Pull Requests tab make a new PR from `kmer-compression` 79 | to `upstream`'s master. 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 University of Copenhagen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include vamb/kernel.npz 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vamb 2 | [![Read the Doc](https://readthedocs.org/projects/vamb/badge/?version=latest)](https://vamb.readthedocs.io/en/latest/) 3 | 4 | Read the documentation on how to use Vamb here: https://vamb.readthedocs.io/en/latest/ 5 | 6 | Vamb is a family of metagenomic binners which feeds kmer composition and abundance into a variational autoencoder and clusters the embedding to form bins. 
7 | Its binners perform excellently with multiple samples, and pretty well on single-sample data. 8 | 9 | ## Programs in Vamb 10 | The Vamb package contains several programs, including three binners: 11 | 12 | * __TaxVamb__: A semi-supervised binner that uses taxonomy information from e.g. `mmseqs taxonomy`. 13 | TaxVamb produces the best results, but requires that you have run a taxonomic annotation workflow. 14 | [Link to article](https://doi.org/10.1101/2024.10.25.620172). 15 | * __Vamb__: The original binner based on variational autoencoders. 16 | This has been upgraded significantly since its original release. 17 | Vamb strikes a good balance between speed and accuracy. 18 | [Link to article](https://doi.org/10.1038/s41587-020-00777-4). 19 | * __Avamb__: An obsolete ensemble model based on Vamb and adversarial autoencoders. 20 | Avamb has an accuracy in between Vamb and TaxVamb, but is more computationally demanding than either. 21 | We don't recommend running Avamb: If you have the compute to run it, you should instead run TaxVamb. 22 | See the [Avamb README page](https://github.com/RasmussenLab/avamb/tree/avamb_new/workflow_avamb) for more information. 23 | [Link to article](https://doi.org/10.1038/s42003-023-05452-3). 24 | 25 | And a taxonomy predictor: 26 | * __Taxometer__: This tool refines arbitrary taxonomy predictions (e.g. from `mmseqs taxonomy`) using kmer composition and co-abundance. 27 | [Link to article](https://www.nature.com/articles/s41467-024-52771-y) 28 | 29 | See also [our tool BinBencher.jl](https://github.com/jakobnissen/BinBencher.jl) for evaluating metagenomic bins when a ground truth is available, 30 | e.g. for simulated data or a mock microbiome. 31 | 32 | ## Quickstart 33 | For more details, and for how to run on an example dataset, [see the documentation](https://vamb.readthedocs.io/en/latest/). 34 | 35 | ```shell 36 | # Assemble your reads, one assembly per sample, e.g. with SPAdes 37 | for sample in 1 2 3; do 38 | spades.py --meta ${sample}.{fw,rv}.fq.gz -t 24 -m 100gb -o asm_${sample}; 39 | done 40 | 41 | # Concatenate your assemblies, and rename the contigs to the naming scheme 42 | # S{sample}C{original contig name}. This can be done with a script provided by Vamb 43 | # in the vamb/src directory 44 | python src/concatenate.py contigs.fna.gz asm_{1,2,3}/contigs.fasta 45 | 46 | # Estimate sample-wise abundance by mapping reads to the contigs. 
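# (The loop below assumes paired read files named {sample}.fw.fq.gz and {sample}.rv.fq.gz, matching the assembly step above; adjust the paths if your files are named differently.)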
47 | # Any mapper will do, but we recommend strobealign with the --aemb flag 48 | mkdir aemb 49 | for sample in 1 2 3; do 50 | strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv; 51 | done 52 | 53 | # Create an abundance TSV file from --aemb outputs using the script in vamb/src dir 54 | python src/merge_aemb.py aemb abundance.tsv 55 | 56 | # Run Vamb using the contigs and the directory with abundance files 57 | vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv 58 | ``` 59 | -------------------------------------------------------------------------------- /benchmark/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" 3 | CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" 4 | CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" 5 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 6 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 7 | 8 | [compat] 9 | CSV = "=0.10.15" 10 | CairoMakie = "=0.13.2" 11 | CategoricalArrays = "=0.10.8" 12 | DataFrames = "=1.7.0" 13 | Statistics = "=1.11.1" 14 | julia = "1.11" -------------------------------------------------------------------------------- /benchmark/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/benchmark/benchmark.png -------------------------------------------------------------------------------- /benchmark/benchmarks.csv: -------------------------------------------------------------------------------- 1 | version,dataset,run,nc,mq,seconds 2 | v3.0.6,Airways,3,55,100,9877.15 3 | v3.0.6,Airways,1,55,100,10829.63 4 | v3.0.6,Airways,2,55,100,10230.52 5 | v3.0.6,Gastrointestinal,1,83,103,4872.34 6 | v3.0.6,Gastrointestinal,3,83,103,4449.65 7 | v3.0.6,Gastrointestinal,2,83,103,4776.16 8 | v3.0.6,Oral,2,119,159,11790.15 9 | v3.0.6,Oral,1,119,159,11546.48 10 | v3.0.6,Oral,3,119,159,11500.1 11 | v3.0.6,Skin,2,79,128,10384.21 12 | v3.0.6,Skin,3,79,128,10302.17 13 | v3.0.6,Skin,1,79,128,10241.69 14 | v3.0.6,Urogenital,2,78,98,3606.18 15 | v3.0.6,Urogenital,3,78,98,3585.2 16 | v3.0.6,Urogenital,1,78,98,3614.02 17 | v4.1.3,Airways,2,58,103,2017.4 18 | v4.1.3,Airways,3,58,103,2045.9 19 | v4.1.3,Airways,1,58,103,2020.46 20 | v4.1.3,Gastrointestinal,2,88,109,1040.9 21 | v4.1.3,Gastrointestinal,1,88,109,992.93 22 | v4.1.3,Gastrointestinal,3,88,109,985.28 23 | v4.1.3,Oral,1,125,160,2498.71 24 | v4.1.3,Oral,2,125,160,2230.58 25 | v4.1.3,Oral,3,125,160,2163.1 26 | v4.1.3,Skin,2,77,131,2101.47 27 | v4.1.3,Skin,3,77,131,2180.22 28 | v4.1.3,Skin,1,77,131,2154.07 29 | v4.1.3,Urogenital,3,76,101,1691.71 30 | v4.1.3,Urogenital,1,76,101,1848.71 31 | v4.1.3,Urogenital,2,76,101,1892.25 32 | v5.0.2,Airways,3,74,111,2178.08 33 | v5.0.2,Airways,1,78,117,2805.76 34 | v5.0.2,Airways,2,73,114,2203.5 35 | v5.0.2,Gastrointestinal,1,123,145,1176.34 36 | v5.0.2,Gastrointestinal,3,123,145,998.33 37 | v5.0.2,Gastrointestinal,2,121,143,962.34 38 | v5.0.2,Oral,2,145,190,2623.77 39 | v5.0.2,Oral,1,144,179,3067.2 40 | v5.0.2,Oral,3,146,182,1990.03 41 | v5.0.2,Skin,2,102,152,2076.27 42 | v5.0.2,Skin,3,106,157,1511.35 43 | v5.0.2,Skin,1,98,149,2003.8 44 | v5.0.2,Urogenital,2,102,127,684.11 45 | v5.0.2,Urogenital,1,106,129,856.93 46 | v5.0.2,Urogenital,3,104,129,717.86 47 | -------------------------------------------------------------------------------- 
/benchmark/make_plots.jl: -------------------------------------------------------------------------------- 1 | using CairoMakie 2 | using DataFrames 3 | using CSV 4 | using Statistics 5 | using CategoricalArrays 6 | 7 | df = CSV.read("benchmarks.csv", DataFrame) 8 | 9 | sort!(df, [:version, :dataset]) 10 | 11 | df.version = categorical(df.version) 12 | df.dataset = categorical(df.dataset) 13 | 14 | n_versions = length(levels(df.version)) 15 | n_datasets = length(levels(df.dataset)) 16 | 17 | summary = combine( 18 | groupby(df, [:version, :dataset]), 19 | :nc => mean => :nc_mean, 20 | :mq => mean => :mq_mean, 21 | :seconds => (i -> minimum(i) / 3600) => :hours_min, 22 | ) 23 | 24 | combined = combine( 25 | groupby(summary, [:version]), 26 | :hours_min => sum => :hours_total 27 | ) 28 | 29 | colors = Makie.wong_colors(); 30 | fig = Figure(); 31 | 32 | grid = fig[1, 1:2] = GridLayout() 33 | 34 | # Plot accuracy 35 | let 36 | global ax_accuracy = Axis( 37 | grid[1, 1], 38 | title = "Accuracy", 39 | xticks = ( 40 | (((n_versions - 1) / 2) + 1):(n_versions + 1):(n_datasets * (n_versions + 1)), 41 | levels(summary.dataset), 42 | ), 43 | limits = (nothing, nothing, 0, nothing), 44 | ylabel = "Genomes recovered as NC / MQ bins", 45 | ylabelpadding = 0, 46 | xlabelpadding = -15, 47 | xticklabelrotation = 0.2, 48 | xticklabelpad = 10, 49 | xticklabelalign = (:center, :center), 50 | xlabel = "Dataset", 51 | ) 52 | 53 | xs = Float64[] 54 | ncs = Float64[] 55 | mqs = Float64[] 56 | for row in eachrow(summary) 57 | dataset_offset = (levelcode(row.dataset) - 1) * (n_versions + 1) 58 | version_offset = levelcode(row.version) - 1 59 | push!(xs, dataset_offset + version_offset + 1) 60 | push!(ncs, row.nc_mean) 61 | push!(mqs, row.mq_mean) 62 | end 63 | for (index, label) in enumerate(levels(summary.version)) 64 | indices = ((index - 1) * n_datasets + 1):(index * n_datasets) 65 | for (ys, alpha) in [(mqs, 0.6), (ncs, 1.0)] 66 | barplot!( 67 | ax_accuracy, 68 | xs[indices], 69 | ys[indices]; 70 | color = colors[index], 71 | alpha = alpha, 72 | width = 1, 73 | label = alpha == 1 ? label : nothing, 74 | ) 75 | end 76 | end 77 | end 78 | 79 | # Plot timing 80 | let 81 | global ax_runtime = Axis( 82 | grid[1, 2], 83 | title = "Runtime", 84 | limits = (nothing, nothing, 0, nothing), 85 | ylabel = "Runtime (all datasets, hours)", 86 | yticks = 1:11, 87 | xticksvisible = false, 88 | xticklabelsvisible = false, 89 | ) 90 | 91 | sort!(combined, [:version]; by = levelcode) 92 | for i in 1:n_versions 93 | barplot!( 94 | ax_runtime, 95 | [i], 96 | [combined.hours_total[i]], 97 | color = colors[i], 98 | width = 1, 99 | ) 100 | end 101 | end 102 | 103 | colgap!(grid, 10) 104 | colsize!(grid, 2, Relative(0.2)) 105 | 106 | axislegend(ax_accuracy, "Vamb version", position = :cb) 107 | 108 | save("benchmark.png", fig) 109 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Docs creation 2 | 3 | In order to build the docs you need to 4 | 5 | 1. install sphinx and additional support packages 6 | 2. build the package reference files 7 | 3. run sphinx to create a local html version 8 | 9 | The documentation is build using readthedocs automatically. 
10 | 11 | Install the docs dependencies of the package (as speciefied in toml): 12 | 13 | ```bash 14 | # in main folder 15 | pip install '.[docs]' 16 | ``` 17 | 18 | ## Build docs using Sphinx command line tools 19 | 20 | Command to be run from `path/to/doc`, i.e. from within the `doc` folder: 21 | 22 | Options: 23 | - `--separate` to build separate pages for each (sub-)module 24 | 25 | ```bash 26 | # pwd: doc 27 | # apidoc 28 | # sphinx-apidoc --force --implicit-namespaces --module-first -o reference ../vamb 29 | # build docs 30 | sphinx-build -n -W --keep-going -b html ./ ./_build/ 31 | ``` 32 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | from importlib import metadata 15 | 16 | 17 | # -- Project information ----------------------------------------------------- 18 | 19 | project = "vamb" 20 | copyright = "2024, Jakob Nybo Nissen, Simon Rasmussen" # ! please update 21 | author = "Jakob Nybo Nissen, Simon Rasmussen" 22 | PACKAGE_VERSION = metadata.version("vamb") 23 | version = PACKAGE_VERSION 24 | release = PACKAGE_VERSION 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | "sphinx.ext.autodoc", 34 | "sphinx.ext.autodoc.typehints", 35 | "sphinx.ext.viewcode", 36 | "sphinx.ext.napoleon", 37 | "sphinx.ext.intersphinx", 38 | "sphinx_new_tab_link", 39 | "myst_nb", 40 | ] 41 | 42 | # https://myst-nb.readthedocs.io/en/latest/computation/execute.html 43 | nb_execution_mode = "auto" 44 | 45 | myst_enable_extensions = ["dollarmath", "amsmath"] 46 | 47 | # Plolty support through require javascript library 48 | # https://myst-nb.readthedocs.io/en/latest/render/interactive.html#plotly 49 | html_js_files = [ 50 | "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" 51 | ] 52 | 53 | # https://myst-nb.readthedocs.io/en/latest/configuration.html 54 | # Execution 55 | nb_execution_raise_on_error = True 56 | # Rendering 57 | nb_merge_streams = True 58 | 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ["_templates"] 61 | 62 | # List of patterns, relative to source directory, that match files and 63 | # directories to ignore when looking for source files. 64 | # This pattern also affects html_static_path and html_extra_path. 
65 | exclude_patterns = [ 66 | "_build", 67 | "Thumbs.db", 68 | ".DS_Store", 69 | ".npz", 70 | ] 71 | 72 | 73 | # Intersphinx options 74 | intersphinx_mapping = { 75 | "python": ("https://docs.python.org/3", None), 76 | "torch": ("https://pytorch.org/docs/stable/index.html", None), 77 | "numpy": ("https://numpy.org/doc/stable/", None), 78 | # "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 79 | # "scikit-learn": ("https://scikit-learn.org/stable/", None), 80 | # "matplotlib": ("https://matplotlib.org/stable/", None), 81 | } 82 | 83 | # -- Options for HTML output ------------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # See: 88 | # https://github.com/executablebooks/MyST-NB/blob/master/docs/conf.py 89 | # html_title = "" 90 | html_theme = "sphinx_book_theme" 91 | # html_logo = "_static/logo-wide.svg" 92 | # html_favicon = "_static/logo-square.svg" 93 | html_theme_options = { 94 | "github_url": "https://github.com/RasmussenLab/vamb", 95 | "repository_url": "https://github.com/RasmussenLab/vamb", 96 | "repository_branch": "main", 97 | "home_page_in_toc": True, 98 | "path_to_docs": "docs", 99 | "show_navbar_depth": 2, 100 | "use_edit_page_button": True, 101 | "use_repository_button": True, 102 | "use_download_button": True, 103 | "launch_buttons": { 104 | "colab_url": "https://colab.research.google.com" 105 | # "binderhub_url": "https://mybinder.org", 106 | # "notebook_interface": "jupyterlab", 107 | }, 108 | "navigation_with_keys": False, 109 | } 110 | 111 | # Add any paths that contain custom static files (such as style sheets) here, 112 | # relative to this directory. They are copied after the builtin static files, 113 | # so a file named "default.css" will overwrite the builtin "default.css". 114 | # html_static_path = ["_static"] 115 | 116 | 117 | # -- Setup for sphinx-apidoc ------------------------------------------------- 118 | 119 | # Read the Docs doesn't support running arbitrary commands like tox. 120 | # sphinx-apidoc needs to be called manually if Sphinx is running there. 121 | # https://github.com/readthedocs/readthedocs.org/issues/1139 122 | 123 | if os.environ.get("READTHEDOCS") == "True": 124 | from pathlib import Path 125 | 126 | PROJECT_ROOT = Path(__file__).parent.parent 127 | PACKAGE_ROOT = PROJECT_ROOT / "vamb" 128 | 129 | # def run_apidoc(_): 130 | # from sphinx.ext import apidoc 131 | # 132 | # apidoc.main( 133 | # [ 134 | # "--force", 135 | # "--implicit-namespaces", 136 | # "--module-first", 137 | # "--separate", 138 | # "-o", 139 | # str(PROJECT_ROOT / "doc" / "reference"), 140 | # str(PACKAGE_ROOT), 141 | # str(PACKAGE_ROOT / "*.c"), 142 | # str(PACKAGE_ROOT / "*.so"), 143 | # ] 144 | # ) 145 | # 146 | # def setup(app): 147 | # app.connect("builder-inited", run_apidoc) 148 | -------------------------------------------------------------------------------- /doc/front_page.md: -------------------------------------------------------------------------------- 1 | # Variational Autoencoders for Metagenomic Binning (Vamb) 2 | 3 | Vamb is a family of metagenomic binners which feeds kmer composition and abundance into a variational autoencoder and clusters the embedding to form bins. 4 | Its binners perform excellently with multiple samples, and pretty good on single-sample data. 
5 | 6 | ## Programs in Vamb 7 | The Vamb package contains several programs, including three binners: 8 | 9 | * __TaxVamb__: A semi-supervised binner that uses taxonomy information from e.g. `mmseqs taxonomy`. 10 | TaxVamb produces the best results, but requires that you have run a taxonomic annotation workflow. 11 | [Link to article](https://doi.org/10.1101/2024.10.25.620172). 12 | * __Vamb__: The original binner based on variational autoencoders. 13 | This has been upgraded significantly since its original release. 14 | Vamb strikes a good balance between speed and accuracy. 15 | [Link to article](https://doi.org/10.1038/s41587-020-00777-4). 16 | * __Avamb__: An obsolete ensemble model based on Vamb and adversarial autoencoders. 17 | Avamb has an accuracy in between Vamb and TaxVamb, but is more computationally demanding than either. 18 | We don't recommend running Avamb: If you have the compute to run it, you should instead run TaxVamb. 19 | See the [Avamb README page](https://github.com/RasmussenLab/vamb/tree/master/workflow_avamb) for more information. 20 | [Link to article](https://doi.org/10.1038/s42003-023-05452-3). 21 | 22 | And a taxonomy predictor: 23 | * __Taxometer__: This tool refines arbitrary taxonomy predictions (e.g. from `mmseqs taxonomy`) using kmer composition and co-abundance. 24 | [Link to article](https://www.nature.com/articles/s41467-024-52771-y) 25 | 26 | See also [our tool BinBencher.jl](https://github.com/jakobnissen/BinBencher.jl) for evaluating metagenomic bins when a ground truth is available, 27 | e.g. for simulated data or a mock microbiome. 28 | -------------------------------------------------------------------------------- /doc/histogram.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/doc/histogram.npz -------------------------------------------------------------------------------- /doc/how_to_run.md: -------------------------------------------------------------------------------- 1 | # Running Vamb 2 | Most users will want to copy and change the commands from the quickstart section below. 3 | Users with more advanced data, or who really want to dig into Vamb to get the most out of it, should read the in-depth sections below. 4 | 5 | First figure out what you want to run: 6 | * Do you have contigs plus reads plus a taxonomic annotation of the contigs? Use __TaxVamb__ 7 | * Do you only have contigs plus reads and want a decent, fast binner? Use __Vamb__ 8 | 9 | We also support the now-obsolete __AVAMB__ binner. Its performance is in between TaxVamb and Vamb, 10 | but it requires more compute than either. 11 | I recommend new users run either TaxVamb or Vamb. 12 | 13 | ## Quickstart 14 | The general workflow looks like this. 15 | For more detailed information, see the documentation page on Vamb's inputs and outputs, as well as the page with tips on how to run Vamb. 16 | 17 | ```shell 18 | # Assemble your reads, one assembly per sample, e.g. with SPAdes 19 | for sample in 1 2 3; do 20 | spades.py --meta ${sample}.{fw,rv}.fq.gz -t 24 -m 100gb -o asm_${sample}; 21 | done 22 | 23 | # Concatenate your assemblies, and rename the contigs to the naming scheme 24 | # S{sample}C{original contig name}. This can be done with a script provided by Vamb 25 | # in the vamb/src directory 26 | python src/concatenate.py contigs.fna.gz asm_{1,2,3}/contigs.fasta 27 | 28 | # Estimate sample-wise abundance by mapping reads to the contigs. 
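# (Each aemb/<sample>.tsv written below should contain one row per contig with that contig's estimated abundance in the sample.)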
29 | # Any mapper will do, but we recommend strobealign with the --aemb flag 30 | mkdir aemb 31 | for sample in 1 2 3; do 32 | strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv; 33 | done 34 | 35 | # Create an abundance TSV file from --aemb outputs using the script in vamb/src dir 36 | python src/merge_aemb.py aemb abundance.tsv 37 | 38 | # Run Vamb using the contigs and the directory with abundance files 39 | vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv 40 | ``` 41 | 42 | ## Running with test data 43 | We provide example data under the "releases" section on the Vamb Github repository: https://github.com/RasmussenLab/vamb/releases/download/input_data/inputs.tar.gz 44 | 45 | After downloading, extract its content: 46 | ```shell 47 | $ tar -xzf inputs.tar.gz 48 | ``` 49 | 50 | This data is only for demonstrating the Vamb commands, and test running Vamb, and does not reflect a realistic metagenome. It is not suitable for benchmarking the accuracy of any binner. 51 | 52 | The following commands makes use of these example files. You can substitute those files with your own in the commands. 53 | 54 | 55 | ### Vamb 56 | Default command: 57 | 58 | ```shell 59 | $ vamb bin default --outdir out1 --fasta contigs.fna.gz --abundance_tsv abundances.tsv 60 | ``` 61 | 62 | ### TaxVamb 63 | For TaxVamb, it's almost the same, but we also provide the taxonomy file: 64 | 65 | ```shell 66 | $ vamb bin taxvamb --outdir out2 --fasta contigs.fna.gz --abundance_tsv abundances.tsv --taxonomy taxonomy.tsv 67 | ``` 68 | 69 | ### Taxometer 70 | Same default arguments as TaxVamb: 71 | 72 | ```shell 73 | $ vamb taxometer --outdir out3 --fasta contigs.fna.gz --abundance_tsv abundances.tsv --taxonomy taxonomy.tsv 74 | ``` 75 | 76 | ### AVAMB 77 | See the README.md file in the `workflow_avamb` directory. 78 | 79 | ### Reducing the number of epochs for testing 80 | For testing purposes, e.g. when running on the test data, it may be useful to reduce the number of training epochs, so Vamb finishes faster. 81 | This will cause Vamb's models to be severely underfitted and perform terribly, so doing it is only recommended for testing. 82 | 83 | * For Vamb: Add flags `-e 5 -q 2 3` 84 | * For TaxVamb: Add flags `-e 5 -q 2 3 -pe 5` 85 | * For Taxometer: Add flags `-pe 5` 86 | 87 | ## Explanation of command-line options 88 | Each program in Vamb only has a subset of the following options. 89 | 90 | * `-h, --help`: Print help and exit 91 | * `--version`: Print version to stdout and exit 92 | * `--outdir`: Output directory to create. Must not exist. Parent directory must exist. 93 | * `-m`: Ignore contigs shorter than this value. Too short contigs have an unstable kmer composition 94 | and abundance signal, and therefore adds too much noise to the binning process. 95 | * `-p` Number of threads to use. Note that Vamb has limited control over the number of threads used by 96 | its underlying libraries such as PyTorch, NumPy and BLAS. Although Vamb tries its best to limit the 97 | number of threads to the number specified, that might not always work. 98 | * `--norefcheck`: Disable reference hash checking between composition, abundance and taxonomic inputs. 99 | See the section on reference hash checking in the input section. 100 | * `--cuda`: Use a graphical processing unit for model training and clustering. 101 | Must have a CUDA-compatible version of PyTorch installed, and an NVIDIA GPU which supports CUDA. 
102 | * `--seed`: Pass an integer seed for the random number generation. Vamb will use this seed to attempt reproducibility. Note that PyTorch does not support reproducible training of models, so passing this seed does not guarantee that Vamb will produce the same results from the same data. 103 | * `--minfasta`: Output all bins with a total size (sum of contig lengths) greater than or equal to this 104 | number. The bins will be output in a directory called `bins` under the output directory, and each bin 105 | will be a FASTA file with the same name as the bin, suffixed by ".fna". 106 | * `-o` Set binsplit separator. See the section on binsplitting in "tips for running Vamb" section for its meaning. 107 | If not passed, defaults to `C` if 'C' is present in all identifiers. 108 | To disable binsplitting, pass `-o` without an argument. 109 | * `--no_predictor`: When running TaxVamb, if this flag is not set, TaxVamb will automatically run 110 | Taxometer when given an unrefined input taxonomy to refine it. 111 | Using a refined taxonomy usually improves the accuracy of TaxVamb. 112 | * `--fasta`: FASTA input file. See section on Vamb inputs and outputs. 113 | * `--composition`: NPZ composition input file. See section on Vamb inputs and outputs. 114 | * `--bamdir`: Directory with BAM files to use for abundance. See section on Vamb inputs and outputs. 115 | * `--abundance_tsv`: TSV file with precomputed abundances. See section on Vamb inputs and outputs. 116 | * `--abundance`: NPZ abundance input file. See section on Vamb inputs and outputs. 117 | * `--taxonomy`: TSV file with refined or unrefined taxonomy. See section on Vamb inputs and outputs. 118 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: front_page.md 2 | :parser: myst_parser.sphinx_ 3 | 4 | Table of contents 5 | ================== 6 | 7 | .. toctree:: 8 | :maxdepth: 3 9 | 10 | installation.md 11 | 12 | .. toctree:: 13 | :maxdepth: 3 14 | 15 | how_to_run.md 16 | 17 | .. toctree:: 18 | :maxdepth: 3 19 | 20 | inputs_outputs.md 21 | 22 | .. toctree:: 23 | :maxdepth: 3 24 | 25 | tips.md 26 | 27 | Indices and tables 28 | ================== 29 | 30 | * :ref:`genindex` 31 | * :ref:`modindex` 32 | * :ref:`search` 33 | -------------------------------------------------------------------------------- /doc/inputs_outputs.md: -------------------------------------------------------------------------------- 1 | # Vamb inputs and outputs 2 | All modes of Vamb takes various _inputs_ and produces various _outputs_. 3 | Currently, all modes take the following two central inputs: 4 | 5 | * The kmer-composition of the sequence (the _composition_). 6 | * The abundance of the contigs in each sample (the _abundance_). 7 | 8 | For inputs that take significant time to produce, Vamb will serialize the parsed input to a file, such that future runs of Vamb can use that instead of re-computing it. 9 | 10 | ## Composition 11 | The composition is computed from the input contig file in FASTA format (the 'catalogue'). 12 | From command line, this looks like: 13 | 14 | ```shell 15 | --fasta contigs.fna.gz 16 | ``` 17 | 18 | Where the catalogue may be either gzipped or a plain FASTA file. 19 | 20 | Vamb produces the best results when run with the "multi-split" workflow, as demonstrated in the quickstart section in "how to run Vamb". 
21 | In this workflow, samples are assembled independently, and the resulting contigs are concatenated to a single FASTA file before binning. 22 | After binning, the bins can be split into sample-wise pure bins. 23 | 24 | To do this splitting (termed "binsplitting"), Vamb needs to know which contig came from which sample. 25 | Therefore, it's recommended that the FASTA headers are formatted in the following pattern: 26 | `{samplename}C{contigname}` 27 | 28 | Where `{samplename}` is some text that uniquely identifies each sample, and `{contigname}` uniquely identifies each contig within a sample. 29 | For example, if the samples are named S1, S2, S3, etc., and the contigs are named 1, 2, 3, etc, a FASTA header may be `S3C119`. 30 | 31 | After the `composition.npz` has been created, Vamb will write the composition in the output file `composition.npz`. 32 | Future runs of Vamb can then instead use the following option to load the composition directly: 33 | 34 | ```shell 35 | --composition composition.npz 36 | ``` 37 | 38 | ## Abundance 39 | The abundance may be computed from either: 40 | * A TSV file with the header being "contigname" followed by one sample name per sample, 41 | and the values in the TSV file being precomputed abundances. 42 | These may be derived from `paste`ing together outputs from the tool `strobealign --aemb`, and concatenating the result to a TSV header. 43 | * A directory of sorted BAM files generated by mapping the reads of each sample to the contig catalogue. 44 | 45 | On the command line, abundance input can be specified as: 46 | ```shell 47 | --abundance_tsv abundance.tsv 48 | ``` 49 | or 50 | ```shell 51 | --bamdir dir_with_bam_files 52 | ``` 53 | 54 | Once the abundance has been parsed, Vamb will produce the file `abundance.npz`, which can be used for future 55 | Vamb runs instead: 56 | ```shell 57 | --abundance abundance.npz 58 | ``` 59 | 60 | ### Abundance TSV format 61 | This follows the ordinary `.tsv` format, with a mandatory header, and disallowing `\t` in contig names. 62 | The header of the first column name must be exactly `contigname`. 63 | 64 | The abundance TSV file in the correct format can be created from the output of `strobealign --aemb` using the script in `src/merge_aemb.py`: 65 | 66 | ```shell 67 | python src/merge_aemb.py input_dir abundance.tsv 68 | ``` 69 | 70 | Example file: 71 | ``` 72 | contigname S1 S2 S3 73 | S1C1 1.53 1.11 4.1e2 74 | S1C2 0.94 9.2 5.1e2 75 | S2C1 1.2e-3 0 9.2 76 | S3C1 88.21 51.2 12.1e3 77 | S3C2 14.1 90.1 13.01 78 | ``` 79 | 80 | ### Abundance as BAM files 81 | If you don't want to compute abundance using `strobealign --aemb` and create a TSV file from its output (recommended), 82 | Vamb can compute abundances from BAM files. 83 | 84 | To do this: 85 | 86 | * Create the FASTA contig catalogue as described in the section of "composition". 87 | * Map the reads for each sample to the catalogue, to obtain on BAM file per sample. 88 | 89 | Using the aligner [minimap2](https://github.com/lh3/minimap2) as well as [samstrip](https://github.com/jakobnissen/samstrip) and [samtools](https://github.com/samtools/samtools), the commands may be: 90 | 91 | ```shell 92 | # Index the FASTA file so it only has to be done once instead of on every mapping 93 | minimap2 -I 32G -d catalogue.mmi catalogue.fasta; 94 | 95 | # Map each sample. Here, using 8 threads, using paired short reads. 
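# (samstrip removes fields not needed downstream to shrink the files; samtools view -F 3584 drops alignments flagged as QC-fail (512), duplicate (1024) or supplementary (2048).)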
96 | minimap2 -t 8 -ax sr catalogue.mmi s1.fw.fq.gz s1.rv.fq.gz | samstrip | samtools view -F 3584 -b - > s1.bam
97 | ```
98 | 
99 | _Note that if you use minimap2 specifically, be aware of [this bug in minimap2](https://github.com/lh3/minimap2/issues/37), where, if the index
100 | is not large enough, the output will be an invalid SAM file. To get around this, use enough RAM when indexing (e.g. set option `-I` appropriately)_
101 | 
102 | ### Reference hash checking
103 | To ensure the integrity of the data, Vamb will compare the identifiers in the composition (ultimately: headers in the FASTA file) with the contig names from the abundance input (TSV file contig names, or BAM sequence names) and, if provided, those of the taxonomic input.
104 | 
105 | To do this efficiently, the identifiers are _hashed_ to produce a _reference hash_ (refhash), which is compared, and an error is thrown if the hashes differ.
106 | 
107 | If you, for some reason, can't create input files with matching identifiers, and you are 100% sure the order of the sequences is identical in the composition input and abundance input, you can disable this reference hashing with the `--norefcheck` option.
108 | 
109 | ## Taxonomy
110 | Vamb operates with two kinds of taxonomies:
111 | * _Unrefined_ taxonomies give the taxonomic annotation for each contig
112 | * _Refined_ taxonomies give the taxonomic annotation _plus a probability estimate_ for each contig
113 | 
114 | Vamb's __Taxometer__ tool can be used to refine a taxonomy.
115 | It takes an unrefined taxonomy as input, and outputs a refined taxonomy.
116 | TaxVamb usually achieves better results if its taxonomy input is refined with Taxometer.
117 | 
118 | Both refined and unrefined taxonomies can be used for TaxVamb.
119 | By default, if TaxVamb gets an unrefined taxonomy, it will automatically refine it with Taxometer, unless `--no_predictor` is passed.
120 | 
121 | Taxonomy files are TSV files with the following format:
122 | * Header: `contigs\tpredictions` for unrefined taxonomies and `contigs\tpredictions\tscores` for refined ones.
123 | * In the `contigs` column: The FASTA identifier for every contig in the catalogue.
124 | * In the `predictions` column: A semicolon-separated string with taxonomic levels, for each of the following seven ranks, in order:
125 | domain, phylum, class, order, family, genus, species. Lower ranks may be omitted.
126 | There is no requirement that the labels are actually meaningful, i.e. that they correspond to any real taxonomic clade.
127 | * In the `scores` column: A semicolon-separated list of floats, one per element in the `predictions` column.
128 | 
129 | The following are examples of VALID rows in the prediction column:
130 | ```
131 | Bacteria;Bacillota;Clostridia
132 | Bacteria;Bacillota;Bacilli;Bacillales
133 | Bacteria;Pseudomonadota;Gammaproteobacteria;Moraxellales;Moraxellaceae;Acinetobacter;Acinetobacter sp. TTH0-4
134 | ```
135 | 
136 | The following are examples of INVALID rows in the prediction column:
137 | * Invalid: Begins with class instead of domain: `Clostridia;Eubacteriales;Lachnospiraceae;Roseburia;Roseburia hominis`
138 | * Invalid: Skips the phylum: `Bacteria;Gammaproteobacteria;Moraxellales;Moraxellaceae;Acinetobacter;Acinetobacter sp. TTH0-4`
139 | 
140 | The following is an example of a valid, unrefined taxonomy file:
141 | ```
142 | contigs predictions
143 | S18C13 Bacteria;Bacillota;Clostridia;Eubacteriales
144 | S18C25 Bacteria;Pseudomonadota
145 | S18C67 Bacteria;Bacillota;Bacilli;Bacillales;Staphylococcaceae;Staphylococcus
146 | ```
147 | 
148 | Our tool [__Taxconverter__](https://github.com/RasmussenLab/taxconverter) can be used to create unrefined taxonomy files from MMSeqs2, Centrifuge, Kraken2, Metabuli or MetaMaps output files.
149 | 
150 | # Outputs
151 | 
152 | ## Vamb
153 | - `log.txt` - A text file with information about the Vamb run. Look here (and at stderr) if you experience errors.
154 | - `composition.npz`: A Numpy .npz file that contains all kmer composition information computed by Vamb from the FASTA file.
155 | This can be provided to another run of Vamb to skip the composition calculation step.
156 | This is not produced if an existing `composition.npz` was used to run Vamb.
157 | - `abundance.npz`: Similar to `composition.npz`, but this file contains information calculated from the abundance TSV file (or BAM files).
158 | Using this as input instead of BAM files will skip re-parsing the BAM files, which takes a significant amount of time.
159 | This file is not produced if an existing `abundance.npz` was used to run Vamb.
160 | - `model.pt`: A file containing the trained VAE model. When running Vamb from a Python interpreter, the VAE can be loaded from this file to skip training.
161 | - `latent.npz`: This contains the output of the VAE model, the embedding of each of the contigs.
162 | - `bins`: If `--minfasta` is set, this is a directory with one FASTA file per bin, after binsplitting.
163 | - `vae_clusters_unsplit.tsv` - A two-column TSV file with the header `clustername\tcontigname`, then one row per sequence:
164 | Left column for the cluster (i.e. bin) name, right column for the sequence name.
165 | You can create the FASTA-file bins themselves using the script in `src/create_fasta.py`.
166 | - (if binsplitting is enabled:) `vae_clusters_split.tsv`, similar to the unsplit version, but after binsplitting.
167 | See the section on binsplitting on the page "tips for running Vamb".
168 | - `vae_clusters_metadata.tsv`: A file with some metadata about clusters.
169 |   - Name: The name of the cluster
170 |   - Radius: Cosine radius in embedding space. Clusters with a small radius are usually more likely to be pure.
171 |   - Peak/valley ratio: A small PVR means the cluster's edges are more well-defined, and hence the cluster is more likely to be pure
172 |   - Kind: Currently, Vamb produces three kinds of clusters:
173 |     - Normal: Defined by a local density in latent space. Most good clusters are of this type
174 |     - Loner: A contig far away from everything else in latent space.
175 |     - Fallback: After failing to produce good clusters for some time, these (usually poor) clusters are created
176 |     to avoid getting stuck in an infinite loop when clustering
177 |   - Bp: Sum of the lengths of all sequences in the cluster
178 |   - Ncontigs: Number of sequences in the cluster
179 |   - Medoid: Name of the contig used as the cluster's medoid, i.e. the center of the cluster
180 | 
181 | ## TaxVamb
182 | * `log.txt`, `composition.npz` and `abundance.npz`: Same as when running `Vamb`
183 | * `predictor_model.pt` and `results_taxometer.tsv`: If Taxometer was used to automatically refine TaxVamb. See the Taxometer output section.
184 | * `vaevae_clusters_{split,unsplit,metadata}.tsv`: Same as when running `Vamb`, but from TaxVamb's VAEVAE model
185 | * `vaevae_model.pt`: A PyTorch model with the trained VAEVAE model.
186 | 
187 | ## Taxometer
188 | * `log.txt`, `composition.npz` and `abundance.npz`: Same as when running `Vamb`
189 | * `predictor_model.pt`: A PyTorch model file containing the trained predictor.
190 | * `results_taxometer.tsv`: A refined taxonomy file (see the section on files on the "how to run" page)
191 | 
192 | ## AVAMB
193 | Same as Vamb, but also:
194 | - `aae_y_clusters_{split,unsplit}.tsv`: The clusters obtained from the categorical latent space
195 | - `aae_z_latent.npz`: Like `latent.npz`, but of the adversarial Z latent space
196 | - `aae_z_clusters_{metadata,split,unsplit}.tsv`: Like the corresponding `vae_clusters*` files, but from the adversarial Z latent space
197 | 
198 | 
--------------------------------------------------------------------------------
/doc/installation.md:
--------------------------------------------------------------------------------
1 | # How to install Vamb
2 | Vamb is in continuous development, and the latest versions are significantly better than older versions.
3 | For the best results, make sure to install the latest released version.
4 | 
5 | ## Recommended: Install with `pip`
6 | Recommended: Vamb can be installed with `pip` (thanks to a contribution from C. Titus Brown):
7 | ```shell
8 | pip install vamb
9 | ```
10 | 
11 | Note: Check that you've installed the latest version by comparing the installed version with [the latest version on PyPI](https://pypi.org/project/vamb/#history).
12 | 
13 | Note: An active Conda environment can hijack your system's linker, causing an error during installation.
14 | If you see issues, either deactivate `conda`, or delete the `~/miniconda/compiler_compats` directory before installing with pip.
15 | 
16 | ## Install a specific version of Vamb
17 | If you want to install the latest version from GitHub, or you want to change Vamb's source code, you should install it like this:
18 | 
19 | ```shell
20 | # Clone the desired branch from the repository, here master
21 | git clone https://github.com/RasmussenLab/vamb -b master
22 | cd vamb
23 | # The `-e` flag makes the installed Vamb reflect later changes to the source code
24 | pip install -e .
25 | ```
26 | 
27 | __Note that the master branch is work-in-progress, has not been thoroughly tested, and is expected to have more bugs.__
28 | 
29 | ## Avoid using Conda to install Vamb
30 | The version of Vamb currently on BioConda is out of date, and significantly less accurate than the latest version.
31 | We have also experienced that our users have more issues with installations from Conda.
32 | We will only be releasing new versions to be installable with `pip`.
33 | 
--------------------------------------------------------------------------------
/doc/tips.md:
--------------------------------------------------------------------------------
1 | # Tips for running Vamb
2 | 
3 | ## Use the latest released version
4 | Vamb generally gets faster and more accurate over time, so it's worth it to get the latest version.
5 | Note that Conda releases are typically (far) behind pip releases, so I recommend installation using pip.
6 | 
7 | ```{image} ../benchmark/benchmark.png
8 | :alt: Vamb gets better over time
9 | :width: 600px
10 | ```
11 | _Figure 1: Newer Vamb releases are faster and more accurate.
12 | Results are from binning the CAMI2 toy human microbiome short read gold standard assembly datasets, using the recommended multi-split workflow, and binned with Vamb using default settings._
13 | 
14 | ## Garbage in, garbage out
15 | For the best results when running Vamb, make sure the inputs to Vamb are as good as they can be.
16 | In particular, the assembly process is a major bottleneck in the total binning workflow, so improving assembly
17 | by e.g. preprocessing reads, using a better assembler, or switching to long read technology can make a big difference.
18 | 
19 | ## Postprocess your bins
20 | By design, Vamb will bin every single input contig.
21 | Currently, Vamb's bins are also _disjoint_, meaning each contig is present in only one bin.
22 | 
23 | Having to place every contig into a bin, even those with a weak binning signal,
24 | means that a large number of contigs will be binned poorly.
25 | Often, these poor-quality contigs are put in a bin of their own, or with just one or two smaller contigs.
26 | Practically speaking, this means _most bins produced by Vamb will be of poor quality_.
27 | 
28 | Hence, to use bins you can rely on, you will need to postprocess your bins:
29 | * You may filter the bins by size, if you are only looking for organisms
30 | and not e.g. plasmids.
31 | For example, removing all bins < 250,000 bp in size will remove most poor quality bins,
32 | while keeping all bacterial genomes with a reasonable level of completeness.
33 | * Using tools such as CheckM2 to score your bins, you can keep only the bins
34 | that pass some scoring criteria.
35 | * You may use the information in the `vae_clusters_metadata.tsv` file (see Output),
36 | and e.g. remove all clusters marked as "Fallback", below a certain size, or with a too
37 | high peak-valley ratio. However, this is only recommended for advanced users.
38 | 
39 | ## How binsplitting works
40 | In the recommended workflow, each sample is assembled independently, then the contigs are pooled
41 | and binned together.
42 | After Vamb has encoded the input features into the embedding (latent space), the embedding is clustered
43 | into clusters.
44 | The clusters thus may contain contigs from multiple samples, and may represent the same genome assembled
45 | in different samples.
46 | To obtain mono-sample bins from the clusters, the clusters are then split by their sample of origin in a process we call binsplitting.
47 | This reduces duplication in the output bins, and better preserves inter-sample diversity.
48 | 
49 | Binsplitting is done by looking at the identifiers (headers) of the contigs in the FASTA file:
50 | They are assumed to be named according to the scheme `{sample identifier}{separator}{contig identifier}`,
51 | where:
52 | * The sample identifier uniquely identifies the sample that the contig came from,
53 | * The separator separates the sample- and contig identifier, and is guaranteed to not be contained in the sample identifier
54 | * The contig identifier uniquely identifies the contig within the sample.
55 | When using the provided `src/concatenate.py` script, the names conform to this scheme, being named e.g.
56 | `S5C1042`, for sample 5, contig 1042. In this case, the binsplit separator is 'C'.
57 | 
58 | The separator can be set on the command line with the flag `-o`.
59 | It defaults to 'C' if all headers contain a 'C'.
60 | To disable binsplitting, pass `-o` without an argument.
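To make the mechanics concrete, here is a minimal Python sketch of the idea. It is not Vamb's actual implementation, and the `{sample}_{cluster}` naming of the output bins is invented for this example; it only illustrates how contig names are partitioned on the separator so each cluster is divided into one bin per sample:

```python
from collections import defaultdict


def binsplit(clusters: dict[str, set[str]], separator: str = "C") -> dict[str, set[str]]:
    """Split each cluster into per-sample bins, using the text before the first
    occurrence of `separator` in each contig name as the sample identifier."""
    bins: dict[str, set[str]] = defaultdict(set)
    for cluster, contigs in clusters.items():
        for contig in contigs:
            # "S5C1042".partition("C") -> ("S5", "C", "1042")
            sample, _, _ = contig.partition(separator)
            bins[f"{sample}_{cluster}"].add(contig)
    return dict(bins)


# One cluster with contigs from two samples becomes two sample-pure bins
split = binsplit({"1": {"S1C10", "S1C11", "S2C7"}})
assert split["S1_1"] == {"S1C10", "S1C11"}
assert split["S2_1"] == {"S2C7"}
```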
61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html 2 | [project] 3 | dynamic = ["version"] 4 | name = "vamb" 5 | dependencies = [ 6 | "vambcore == 0.1.2", 7 | "numpy == 1.26.4", 8 | "torch == 2.6.0", 9 | "pycoverm == 0.6.2", 10 | "networkx == 3.4.2", 11 | "scikit-learn == 1.6.1", 12 | "dadaptation == 3.2", 13 | "loguru == 0.7.3", 14 | "pyhmmer == 0.10.15", 15 | "pyrodigal == 3.6.3", 16 | ] 17 | # Currently pycoverm does not have binaries for Python > 3.13. 18 | # The dependency resolver, will not error on Python 3.14, but attempt 19 | # to build pycoverm from source, but will not get the deps required for that. 20 | requires-python = "<3.14,>=3.10.0" 21 | scripts = {vamb = "vamb.__main__:main"} 22 | 23 | [project.optional-dependencies] 24 | docs = [ 25 | "sphinx", 26 | "sphinx-book-theme", 27 | "myst-nb", 28 | "ipywidgets", 29 | "sphinx-new-tab-link!=0.2.2", 30 | ] 31 | 32 | [metadata] 33 | authors = [ 34 | {name = "Jakob Nybo Nissen", email = "jakobnybonissen@gmail.com"}, 35 | {name = "Pau Piera", email = "pau.piera@cpr.ku.dk"}, 36 | {name = "Simon Rasmussen", email = "simon.rasmussen@cpr.ku.dk"}, 37 | ] 38 | description = "Variational and Adversarial autoencoders for Metagenomic Binning" 39 | license = "MIT" 40 | readme = {file = "README.md"} 41 | url = "https://github.com/RasmussenLab/vamb" 42 | 43 | [build-system] 44 | requires = [ 45 | "setuptools ~= 70.1", 46 | "setuptools-scm >= 8.0", 47 | ] 48 | build-backend = "setuptools.build_meta" 49 | 50 | [tool.ruff] 51 | lint.ignore = [ 52 | "E722", # Use bare except. 53 | "E402", # import not at top - needed for the hack in __init__.py 54 | ] 55 | 56 | # pyproject.toml 57 | [tool.pytest.ini_options] 58 | filterwarnings = [ 59 | "error", 60 | "ignore::DeprecationWarning", 61 | "ignore::UserWarning", 62 | ] 63 | 64 | [tool.setuptools_scm] 65 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options] 2 | packages = find: 3 | include_package_data = True 4 | -------------------------------------------------------------------------------- /src/concatenate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import argparse 5 | import gzip 6 | import vamb 7 | 8 | parser = argparse.ArgumentParser( 9 | description="""Creates the input FASTA file for Vamb. 10 | Input should be one or more FASTA files, each from a sample-specific assembly. 
11 | If keepnames is False, resulting FASTA can be binsplit with separator 'C'.""", 12 | formatter_class=argparse.RawDescriptionHelpFormatter, 13 | add_help=True, 14 | ) 15 | 16 | parser.add_argument("outpath", help="Path to output FASTA file") 17 | parser.add_argument("inpaths", help="Paths to input FASTA file(s)", nargs="+") 18 | parser.add_argument( 19 | "-m", 20 | dest="minlength", 21 | metavar="", 22 | type=int, 23 | default=2000, 24 | help="Discard sequences below this length [2000]", 25 | ) 26 | parser.add_argument( 27 | "--keepnames", action="store_true", help="Do not rename sequences [False]" 28 | ) 29 | parser.add_argument("--nozip", action="store_true", help="Do not gzip output [False]") 30 | 31 | args = parser.parse_args() 32 | 33 | # Check inputs 34 | for path in args.inpaths: 35 | if not os.path.isfile(path): 36 | raise FileNotFoundError(path) 37 | 38 | if os.path.exists(args.outpath): 39 | raise FileExistsError(args.outpath) 40 | 41 | outpath = os.path.normpath(args.outpath) 42 | parent = os.path.dirname(outpath) 43 | if parent != "" and not os.path.isdir(parent): 44 | raise NotADirectoryError( 45 | f'Output file cannot be created: Parent directory "{parent}" is not an existing directory' 46 | ) 47 | 48 | # Run the code. Compressing DNA is easy, this is not much bigger than level 9, but 49 | # many times faster 50 | filehandle = ( 51 | open(outpath, "w") if args.nozip else gzip.open(outpath, "wt", compresslevel=1) 52 | ) 53 | try: 54 | vamb.vambtools.concatenate_fasta( 55 | filehandle, args.inpaths, minlength=args.minlength, rename=(not args.keepnames) 56 | ) 57 | except: 58 | filehandle.close() 59 | raise 60 | -------------------------------------------------------------------------------- /src/create_fasta.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import vamb 4 | import pathlib 5 | 6 | parser = argparse.ArgumentParser( 7 | description="""Command-line bin creator. 8 | Will read the entire content of the FASTA file into memory - beware.""", 9 | formatter_class=argparse.RawDescriptionHelpFormatter, 10 | add_help=False, 11 | ) 12 | 13 | parser.add_argument("fastapath", help="Path to FASTA file") 14 | parser.add_argument("clusterspath", help="Path to clusters.tsv") 15 | parser.add_argument("minsize", help="Minimum size of bin in bp", type=int, default=0) 16 | parser.add_argument("outdir", help="Directory to create") 17 | 18 | if len(sys.argv) == 1: 19 | parser.print_help() 20 | sys.exit() 21 | 22 | args = parser.parse_args() 23 | 24 | # Read in FASTA files only to get its length. 
This way, we can avoid storing 25 | # in memory contigs for sequences that will never get output anyway 26 | lens: dict[str, int] = dict() 27 | with vamb.vambtools.Reader(args.fastapath) as file: 28 | for record in vamb.vambtools.byte_iterfasta(file, args.fastapath): 29 | lens[record.identifier] = len(record) 30 | 31 | with open(args.clusterspath) as file: 32 | clusters = vamb.vambtools.read_clusters(file) 33 | 34 | clusters = { 35 | cluster: contigs 36 | for (cluster, contigs) in clusters.items() 37 | if sum(lens[c] for c in contigs) >= args.minsize 38 | } 39 | 40 | with vamb.vambtools.Reader(args.fastapath) as file: 41 | vamb.vambtools.write_bins(pathlib.Path(args.outdir), clusters, file, maxbins=None) 42 | -------------------------------------------------------------------------------- /src/create_kernel.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Create kernel for use in kmer frequencies. 2 | Method copied from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2765972/ 3 | 4 | Principle: 5 | There are 256 tetranucleotides, so a frequency distribution (tetranucleotide frequency, TNF) 6 | is a length 256 vector. But the individual TNFs are not independent. For example, AAAT 7 | must correlate highly with AATA. The TNFs are subject to at least 3 linear constrains: 8 | 9 | 1) The vector must sum to one. We simply shift the TNF down by 1/256 to make it sum to zero 10 | for simplicity instead. 11 | 2) We cannot distinguish between a kmer and its reverse complement because the sequencede 12 | strand is arbitrary. So we must count e.g. AGAT as one half of AGAT and one half ATCT. 13 | So each kmer's frequency is the same as its reverse-complement. 14 | 3) Every time a kmer is observed, the next kmer must have three overlapping nucleotides. 15 | E.g. every observation of AGAT is followed by GATA, GATC, GATG or GATT. Same for previous 16 | kmer. in other words, sum(xABC) = sum(ABCx). 17 | This is not true right at the ends of the sequences because the kmers stop eventually, but 18 | that can be considered a measurement error, and we don't care about it. 19 | 20 | We list these linear constrains and produce kernel L that works on tnf matrix T such that 21 | TL = P, a smaller projected TNF space. 22 | 23 | Notably, for constraint 2 to be true, we need to average the frequency between a kmer 24 | and its reverse complement. We can do this with a matrix multiply with an averaging kernel 25 | R. So: 26 | 27 | P = (TR)L = T(RL) = TK 28 | 29 | We thus calculate K = RL and save this for use in Vamb for projection. 
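(Added note, not part of the original docstring: as a dimension check, if T is an
(n x 256) matrix of TNFs for n sequences, R is the (256 x 256) reverse-complement
averaging kernel and L is the (256 x 103) projection kernel from the null space
below, then K = RL is (256 x 103) and P = TK is (n x 103); the (256, 103) shape is
what create_projection_kernel asserts for L.)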
30 | """ 31 | 32 | from os.path import abspath, dirname, join 33 | import numpy as np 34 | import itertools 35 | from scipy.linalg import null_space 36 | 37 | 38 | def reverse_complement(nuc): 39 | table = str.maketrans("ACGT", "TGCA") 40 | return nuc[::-1].translate(table) 41 | 42 | 43 | def all_kmers(k): 44 | for i in itertools.product("ACGT", repeat=k): 45 | yield ("".join(i)) 46 | 47 | 48 | def create_projection_kernel(): 49 | indexof = {kmer: i for i, kmer in enumerate(all_kmers(4))} 50 | linear_equations = list() 51 | 52 | # Constraint one: Frequencies sum to one (or in this scaled case, zero) 53 | linear_equations.append([1] * 256) 54 | 55 | # Constaint two: Frequencies are same as that of reverse complement 56 | for kmer in all_kmers(4): 57 | revcomp = reverse_complement(kmer) 58 | 59 | # Only look at canonical kmers - this makes no difference 60 | if kmer >= revcomp: 61 | continue 62 | 63 | line = [0] * 256 64 | line[indexof[kmer]] = 1 65 | line[indexof[revcomp]] = -1 66 | linear_equations.append(line) 67 | 68 | # Constraint three: sum(ABCx) = sum(xABC) 69 | for trimer in all_kmers(3): 70 | line = [0] * 256 71 | for suffix in "ACGT": 72 | line[indexof[trimer + suffix]] += 1 73 | for prefix in "ACGT": 74 | line[indexof[prefix + trimer]] += -1 75 | linear_equations.append(line) 76 | 77 | linear_equations = np.array(linear_equations) 78 | kernel = null_space(linear_equations).astype(np.float32) 79 | assert kernel.shape == (256, 103) 80 | return kernel 81 | 82 | 83 | def create_rc_kernel(): 84 | indexof = {kmer: i for i, kmer in enumerate(all_kmers(4))} 85 | rc_matrix = np.zeros((256, 256), dtype=np.float32) 86 | for col, kmer in enumerate(all_kmers(4)): 87 | revcomp = reverse_complement(kmer) 88 | rc_matrix[indexof[kmer], col] += 0.5 89 | rc_matrix[indexof[revcomp], col] += 0.5 90 | 91 | return rc_matrix 92 | 93 | 94 | def create_dual_kernel(): 95 | return np.dot(create_rc_kernel(), create_projection_kernel()) 96 | 97 | 98 | dual_kernel = create_dual_kernel() 99 | 100 | # Prevent overwriting kernel when running tests 101 | if __name__ == "__main__": 102 | path = join(dirname(dirname(abspath(__file__))), "vamb", "kernel.npz") 103 | np.savez_compressed(path, dual_kernel) 104 | -------------------------------------------------------------------------------- /src/merge_aemb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | from math import isinf, isnan 5 | from pathlib import Path 6 | 7 | parser = argparse.ArgumentParser( 8 | description="""Merge output files of `strobealign --aemb` to a single abundance TSV file. 9 | The sample names will be the basenames of the paths in the input directory.""", 10 | formatter_class=argparse.RawDescriptionHelpFormatter, 11 | add_help=True, 12 | ) 13 | 14 | parser.add_argument("input_dir", help="Path to directory of --aemb output files") 15 | parser.add_argument( 16 | "output_file", help="Path to write output TSV file (must not exist)" 17 | ) 18 | 19 | args = parser.parse_args() 20 | 21 | 22 | def exit_with(message: str): 23 | print(message, file=sys.stderr) 24 | exit(1) 25 | 26 | 27 | # Check input directory exists 28 | input = Path(args.input_dir) 29 | output = Path(args.output_file) 30 | 31 | if not input.is_dir(): 32 | exit_with(f"Error: Input is not an existing directory: '{input}'") 33 | 34 | # Check output file's parent is an existing directory. 
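# (Added note, not in the original: this check, and the exists-check just below, run
# before any aemb file is parsed, so a bad output path fails fast rather than after
# all input files have been read.)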
35 | if not output.parent.is_dir(): 36 | exit_with( 37 | f"Error: Output file cannot be created: Parent directory '{output.parent}' is not an existing directory" 38 | ) 39 | 40 | if output.exists(): 41 | exit_with(f"Error: Output file already exists: '{output}'") 42 | 43 | files = sorted(input.iterdir()) 44 | 45 | for file in files: 46 | for char in ("\n", "\r", "\t", "\v"): 47 | if char in file.name: 48 | exit_with( 49 | f"Error: File name '{file.name}' contains a char {repr(char)}, which is not permitted in Vamb" 50 | ) 51 | 52 | 53 | def exit_on_line(path: Path, line: int, message: str): 54 | exit_with(f"Error: {message}, in file '{path}' on line {line}") 55 | 56 | 57 | # We allow an empty directory, but let's warn the user since it's an easy mistake 58 | # to make. 59 | if len(files) == 0: 60 | # N.B: We don't use exitwith here, because we want the exit code to be 0 61 | # indicating this is not an error. 62 | print("Warning: No files in input directory", sys.stderr) 63 | exit(0) 64 | 65 | 66 | # Parses an --aemb file, yielding (identifier, depth), where depth is a non-negative, 67 | # non-inf, non-nan float. 68 | def parse_lines(path: Path): 69 | with open(path) as file: 70 | for lineno_minus_one, line in enumerate(file): 71 | line = line.rstrip() 72 | 73 | # If line is empty or whitespace-only, it must be the last line. 74 | # --aemb does not produce trailing whitespace 75 | if not line: 76 | for next_line in file: 77 | if next_line.rstrip(): 78 | exit_on_line( 79 | path, lineno_minus_one + 1, "Found non-trailing empty line" 80 | ) 81 | return 82 | 83 | fields = line.split("\t") 84 | 85 | # Currently --aemb only outputs two columns, but they document explicitly 86 | # that they may add other columns in the future 87 | if len(fields) < 2: 88 | exit_on_line( 89 | path, lineno_minus_one + 1, "Not at least two tab-separated columns" 90 | ) 91 | 92 | (identifier, depth_str) = (fields[0], fields[1]) 93 | try: 94 | depth = float(depth_str) 95 | except ValueError: 96 | exit_on_line( 97 | path, lineno_minus_one + 1, "Depth cannot be parsed as float" 98 | ) 99 | except: 100 | raise 101 | 102 | if isnan(depth) or isinf(depth) or depth < 0.0: 103 | exit_on_line( 104 | path, lineno_minus_one + 1, "Depth is negative, NaN or infinite" 105 | ) 106 | 107 | yield (identifier, depth) 108 | 109 | 110 | # We allow the order of rows to differ between the files, so we need to be able 111 | # to convert an identifier into a row index for subsequent files 112 | identifier_to_index: dict[str, int] = dict() 113 | 114 | # We store depths in a matrix, but we need to have parsed the first file to know 115 | # how big to make the matrix 116 | first_depths: list[float] = [] 117 | identifiers: list[str] = [] 118 | for identifier, depth in parse_lines(files[0]): 119 | length = len(identifier_to_index) 120 | identifier_to_index[identifier] = length 121 | # If the identifier has previously been seen, the dict entry will be overwritten 122 | if len(identifier_to_index) == length: 123 | exit_with( 124 | f"Duplicate sequence name found in file '{files[0]}': '{identifier}'", 125 | ) 126 | first_depths.append(depth) 127 | identifiers.append(identifier) 128 | 129 | # Initialize with -1, so we can search for it at the end and make sure no entries 130 | # are uninitialized 131 | matrix = np.full((len(identifiers), len(files)), -1.0, dtype=np.float32) 132 | matrix[:, 0] = first_depths 133 | 134 | del first_depths 135 | 136 | # Fill in the rest of the files 137 | for col_minus_one, file in enumerate(files[1:]): 138 | 
n_seen_identifiers = 0 139 | for identifier, depth in parse_lines(file): 140 | n_seen_identifiers += 1 141 | index = identifier_to_index.get(identifier) 142 | 143 | # Ensure all entries in this file have a known index (i.e. are also 144 | # in the first file) 145 | if index is None: 146 | exit_with( 147 | f"Error: Identifier '{identifier}' found in file '{file}' " 148 | "but not present in all files.", 149 | ) 150 | 151 | if matrix[index, col_minus_one + 1] != -1.0: 152 | exit_with( 153 | f"Error: Identifier '{identifier}' present multiple times in file '{file}'" 154 | ) 155 | 156 | matrix[index, col_minus_one + 1] = depth 157 | 158 | # Check that this file does not have a strict subset of identifiers from the first 159 | # file. After that, we know the set of identifiers is exactly the same 160 | if n_seen_identifiers != len(identifiers): 161 | exit_with( 162 | f"Error: File '{file}' does not have all identifiers of file '{files[0]}'." 163 | ) 164 | 165 | assert -1.0 not in matrix, ( 166 | "Matrix not full; this is a bug in the script and should never happen" 167 | ) 168 | 169 | with open(output, "w") as file: 170 | # We already checked this, but let's check it again 171 | assert len(matrix) == len(identifiers) 172 | print("contigname", "\t".join([p.name for p in files]), sep="\t", file=file) 173 | for identifier, row in zip(identifiers, matrix): 174 | print(identifier, "\t".join([str(i) for i in row]), sep="\t", file=file) 175 | -------------------------------------------------------------------------------- /test/data/aemb/6.aemb.tsv: -------------------------------------------------------------------------------- 1 | S27C95602 5.988746 2 | S27C25358 25.066412 3 | S27C181335 1.159981 4 | S4C222286 33.167842 5 | S11C13125 6.825609 6 | S4C480978 6.578677 7 | S12C228927 6.716019 8 | S27C93037 13.361650 9 | S9C124493 16.475576 10 | S27C214882 6.249275 11 | S7C273086 3.115955 12 | S12C85159 3.793851 13 | -------------------------------------------------------------------------------- /test/data/aemb/7.aemb.tsv: -------------------------------------------------------------------------------- 1 | S27C95602 0.000000 2 | S27C25358 40.494834 3 | S27C181335 1.123731 4 | S4C222286 41.094294 5 | S11C13125 0.000000 6 | S4C480978 0.000617 7 | S12C228927 0.000000 8 | S27C93037 0.084345 9 | S9C124493 0.000000 10 | S27C214882 1.003976 11 | S7C273086 2.705933 12 | S12C85159 4.306267 13 | -------------------------------------------------------------------------------- /test/data/aemb/8.aemb.tsv: -------------------------------------------------------------------------------- 1 | S27C95602 0.000000 2 | S27C25358 2.157007 3 | S27C181335 5.691155 4 | S4C222286 35.064668 5 | S11C13125 0.000000 6 | S4C480978 0.000000 7 | S12C228927 0.000000 8 | S27C93037 0.099672 9 | S9C124493 0.578131 10 | S27C214882 0.000000 11 | S7C273086 1.028035 12 | S12C85159 2.950335 13 | -------------------------------------------------------------------------------- /test/data/bam/10.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/test/data/bam/10.bam -------------------------------------------------------------------------------- /test/data/bam/11.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/test/data/bam/11.bam 
-------------------------------------------------------------------------------- /test/data/bam/12.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/test/data/bam/12.bam -------------------------------------------------------------------------------- /test/data/fasta.fna: -------------------------------------------------------------------------------- 1 | >Sequence1_100nt_no_special 2 | GGAAGGTAGCCGTACCGAGTTCTTTGGAGAGAATCGTCCACTGAGCAGTACGGATCTCAGAATTAGTCGCGATACATGTATGGGCTTAGCGTACCTAGGC 3 | >Sequence2 100nt whitespace in header 4 | # This is a random comment 5 | CTTCCGCATGAGCCCGGGAGC 6 | CAGATGAGCATGCAGCAATTG 7 | CACCTGGTTGACCACGCTGGC 8 | CCGGAGCGGGAGGTGTCTGCA 9 | CCGTGTTGTGTCCCAA 10 | >Sequence3 150 nt, all ambiguous bases 11 | TTCTCTAGAGTTAGTTATATWGGTACCTSCTATGTTMAGTCACCCRATGAACTCKGCTCCCCTAYACCTGCGTTNTAGAAABAACGCGDTAGGGAGVCCCGAHACGATCTCAATTTCCTAACGCCTGAGCGCGCTAGCCAGTGGGCTTCA 12 | >Sequence4 99 nt, no tetranucleotides 13 | GCCNGGCNTCCNTGGNTGTNATTNTTANCCCNGCTNCGANTTGNTCTNATTNGACNCCTNCAANGGCNAATNTTTNCTGNTCTNGATNTGTNGGGNTAC 14 | >Sequence5 Empty seq 15 | >Sequence6 150 nt, same as seq4 but mixed case 16 | tTctCTaGAgTtagTtATAtwGGtaCctsCtAtGTTmAgtcACcCrAtGaAcTCkGCTCCcctAYacctGcgttNtaGAaABaacGCgDtAggGAgVcCCgAHACgaTCTCaaTTTCCTAAcgccTGagCgCGcTAgccAGtgggCTtCa 17 | -------------------------------------------------------------------------------- /test/data/marker.fna: -------------------------------------------------------------------------------- 1 | >abc 2 | GTGGTGAGTGCTTGGACGTGTGTGGTGATGCTCATGGTGTCTTCCCCCCTTCCCCGTGTTGTTACCACCCACTCTACCCGCTCACACTTTCGAATACAAGGGTATTTTTCGAAATGGAAAAGCTGTCGAGCACTCACATGTTCGACAGCTTCATATGTCCTTGAACTACGTAACTTTTTCCTTACAATCAGCGCAAAGTCCAAAGATTTCGGCCTCGTGACTACTGAGAGCAAAGCCGTTTTTAGTGGCAATTTCCTGTGCCCATGTCTCTACTGGACCGCCATCGATTTCGACTGTGTGACCGCAATTGGTGCAGACCAAGTGATGGTGGTGTCCCTCGTCGTGGCATTGGCGGTACAGAGTTTCTCCACCTGTGACGGTAAGTACGTCGACTGCCCTGATGTCCGCGAGGGATTGGAGGGTTCGGTAGACGGTTGTGAGGCCGACGTTGTGTTCCCTGGTGGATAGCTCGTGATGGATTTCTTTGGCGGAAGCGAAGTTATCGATTTCCTCAAGAACGTCAATTACGGCTTTTCGCTGTCTGGTGCTTCGCACTCCCAGCTTCGGGGCAGAGCCTTGGCTGATGCGATTGATACCCACCGTTGATCCTCCTCAATGACACAAAATGTACTTCGATAGTCTACCCAGATGTGTCAACCCCTGCGTTTAGTGCCAGGAGAAGTATGTCGAGGACGAGTGGTTCGGCAAGTGAATAAGTCATTTGCCGGCCTTGACGTTCTGCGTCGACGATACCTGCAGTTTTAAGGACTTTGAGGTGTTGGCTCACTAGTGGTTGCGAACTTTTTACTAGCTTGACCAATTCGTGGACGTAGTGGGGCCTTTCGTTGAGGGCGAGGATGATTTCGATTCTTAAGGGGGAATCTAGTGCCCTAATCAGCAGGCTGATCGCTTTGATGTTTTTTGCTGTTGCAAGTTTCTGAAGCTCAGCTGATGCTGTGGATTCGGACTCTTCTGCAGGGGTGACGAAATTCCGATTTGAGTGTTGAGCCACGGGGAAGTCCTTCCGTCCTTAGGCTAGGTCTGGAATGGATCTAGCACGCTTGCTATTTTACCTTCTATATAAACCTTTTATGAGGGAAATGAAAAAATAGTTATTAGAACTAGTTTACATCGCGAAGGCCGCAAAATGACGGGGTCAGCGGAAGCAACATCGTTAGTTGGGCTAGGATTGGTTGGGTATGTCCTAAAAGGGACGGTTATTTTTTCATTCGACGTGGAGGAGAGCATCCGACGTGGCTCAGCAATCGATCATCGACACCGTGGTTAACCTGTGTAAACGACGTGGACTGGTGTACCCCTGTGGTGAGATCTACGGCGGTACCCGCTCTGCGTGGGACTACGGCCCGCTGGGTGTGGAGCTGAAGGAAAACATCAAGCGCCAGTGGTGGCGTTCTATGGTTACTTCCCGCCCAGATGTTGTGGGTGTTGATACTTCTGTCATCCTTCCTCGCCAGGTGTGGGTAACTTCCGGCCACGTTGAGGTCTTCACTGACCCACTGGTTGAGTCTTTGAACACCCACAAGCGTTACCGTGCGGACCACCTGCTGGAGCAGTACGAAGAGAAGCATGGTCACCCACCTGTAAACGGCTTGGCTGACATCAACGATCCAGAGACCGGCCAGCCAGGTAACTGGACTGAGCCTAAGGCGTTCTCTGGTCTTCTGAAGACTTTCTTGGGACCTGTGGACGACGAAGAGGGTCTGCACTACCTGCGCCCTGAAACTGCTCAGGGTATCTTCGTGAACTTCAAGAACGTGATGAACACTTCACGCATGAAGCCACCTTTCGGTATCGCGAACATCGGTAAGTCTTTCCGTAACGAGATCACCCCAGGTAACTTCATTTTCCGTACTCGTGAGTTCGAGCAGATGGAGATGGAGTTCTTCGTCAAGCCTGGTGAGGACGAAGAGTGGCACCAGCACTGGATTGAT
ACTCGCCTGCAGTGGTACATCAACCTGGGCATTAAGCCTGAGAACCTGCGTCTGTACGAGCACCCTCAGGAGAAGCTGTCTCACTACTCCAAGCGCACTGTTGATATTGAGTACGCATTCAACTTTGCTAACACCAAGTGGGGCGAGTTAGAGGGTATCGCGAACCGTACTGATTACGATCTTCGCGTGCACTCTGAGGGCTCTGGTGAGGACCTGTCATTCTTCGATCAGGAGACCAATGAGCGTTGGATTCCTTTCGTAATCGAGCCTGCTGCAGGTCTTGGTCGCGCAATGATGATGTTCCTGATGGATGCTTATCACGAGGACGAGGCACCAAACTCAAAGGGTGGCGTCGATAAGCGTGTTGTTCTGAAGCTTGACCGTCGCCTTGCGCCGGTTAAGGTTGCGGTCTTGCCGCTGTCAAAGAAGGACACTTTGACGCCTTTGGCGGAAAAGCTCGCAGCAGAGCTGCGTGAATTCTGGAACGTTGATTACGACACTTCAGGTGCGATTGGTCGCCGTTACCGTCGTCAGGACGAGATCGGTACTCCATTCTGCGTCACCGTTGACTTTGATTCTCTCGAGGACAACGCTGTGACCGTGCGTGAGCGCGACACCATGGAGCAGGTTCGTGTTCCACTTGATGAGCTGCAGGGTTACTTGGCTCAGCGCCTCATCGGCTGCTAAACGGCAACCAATAGAGCGATAATTCGCTAAGACGAATGTAATCGCAGCAACATATAGCACCGGCTTAACAGGCCGGTGCTATTCTGTTCGCATGACTTCGAAGGATCTGATTGTGACCTCCTATACGTCTTGGGGCAAGCGTTTCAAGAATGACGGGAAGCTTTTTATTAACCTACTTCGCAGCACCACTGATAGTGCTGATGAAAAGGTTTTAGCCACTTTCGGTGAAGTTCCCAGCAAATCATTTGAAACCACCGCAACGGTTGATGAGCAGCAGTGGGAACTGTCCTTCAATATTGATGGAACGGCAACTGCCAAGCTTCCTGATGGTCGTGTGTTCAGCGCGAATGCAGGTGAGAAGACCTTTACCAAGTCCAAGCGGATTGAAATCGACATGGACGGCACCGCGATGGCTGCTGTTAATGAAGATAAAAACAATTGGATTATCGACGATTCTGAAGAGAATAAAGTCGCTCAGTTTACCGGTATGAACAACGGTGTGCGTCGCGCGATTGTGGAGTTTGAGCCTGACGTAGAAGTCACCCAGGAGCAGGAAATTTTCTTGTCGTGGGTTGCTCGGAAAACTCTGGAATCCCGCATGTTGGGCTCCAGTTGGGGACTGACTCTGTTTTTGATCATTTTGACGCCAATCATTATTTTTCTCACTTTCAGCTAAAAGGACCATGCAATGGTAGACGCTCAGCGCCCCAAAGCAGGCATCTTCGGTAGCCACACAGAAGAAACATGGGTGTGGCTCGGTAATGAACTTTTCGACGAGTCCGGCGAGGTCATCGCCGACGTTCGCTCCGACGTCCTCTACGTGGATCGCGAACGACTACTCATCGAATCCACCCCCGGCACCATGCGTTTTCGTTGCCGCGCAACACTGTCCGGGGGTGAGGTCTATACAATGACTCAGAATTCTTTCACTGTGGGGGATCTCACTGCGGTGTGCGGGCGCCGGACGTATTCGCTAAAAAGGGTGTCGCCGTGGCGTAAAGAACGCCTGATCACCAACAATGGGGTGGAAGTGGCGCGACTTCGCCCGATGACCAGCGGTAAAGTCGAATTCATTGTGGGCACCGCGGACAGCGAGGCGTTGCCGTTCGTCGACGCAGTATTTTTGAGCTGGGCGTGCGTCCTGGTGGATTCGGCCGTGCGCCGGCCGAAAATTTAAAAGCTTTTTGCTTATCGACGCACCCCTCCACCTGTTTTTTGTAGCCGGGGGATCATTTCCTTTGAAGGATCCAATCTCCGCACTTAGTTTCCTTCGGTGTGAAGGAAAGAGTTCCGTAAAGACCTCTATCTCATTTAAAGAAGTGGAGGATTAGGGTCGTTGACTCGCCTTCGGCACTAATTTGAGCCAAGTTCAAGTTTGCTGCCATCCCAGGTGACCGAAAATGTCCTATGCGAGGTCTCTTCGGTCACTTGGTTTTGCTCGTTTCAGGCTAGAAGCGGCCTCCGCGGAACCCTCCTCCGCCACCGCCACCACCGCTGAAGCCGCCACCGCCTCCACCGAAGCCTCCACCGAAACCGCCACCGCGGCCGCTGTTGAGAATCGAGTTGATCACCATGCCGGTGACAATCGCACCGGTGGTTCCGCCACCGGAATTGTGGCGATTGTTGTAGGTGGTGATGTCGTTTTGTGCTGACTTGCTGGCGCGTTGGGCTGCGACTGCTGCTTGACGTCCGTAATCAATTCCTGCACGGGTGTCGCGGGTGCGGTTTTGTTGTGCCATGGCGTACAGTTTTTGTGCGTTGGCCAGGTGGGTGCGGGCTTCGGATTTTACGATGCGACCGCGGGTGGAGATGAGGTCTTCGGCCTTTTGGATTTGGCTTCTTGCAGATTGCAGCTGTTGGTCGAATACGCGTAGCTGGCGGGCTTGATCAGCTGCGGTGGCGCGAAGTGTGTCAAGTTGAGTGTCGAGGGCGGAGTCGACATCGACAAGTTCTGTGTAGGTTCCGAGCGGATCCTTTTCGGCGTCTGCTGATGCGGTGGTTAGTGCTGCGCTGGCTGCGCGGACAGCATCGTCGAGGGAGGCCCAGTCGGCACGGGCACCGTCGGCTCCTGCGCTTTGTTTGAGTTGGCCGGCTTCGTTGATTTCGTCTGAGATTTCTTGAATCAGATCGGCAACGTTTGCTTTGGCTGTGGAGATGTTTTCATCGGCATGCTCGACGCCCTCGAGGAGTTTGTCTGCGGTAGTGATGGCGTGCTCGATGTGACGGATCGCGTCGATAAGCCCGCCCTGCTCGCCTGCGGGCATGGACTCTATCTTGTACGCCTGTGGCAGGACTTCTTCTGCTTCGTCGAGCGAAGCGCTGGCGAGGTCGACGTTGTCGTCGATGCTTTCAAGGACCTCTGCTGAGTAGCGAGCGCGCAGGCCAGCGAGTGTTTCTTGAGCCTTGGGGAGGCGGGTGCGCAGGTCGACGGATTTTTGGGTGAGAGCATCCAATTTGCTGCCCGCGTTGATCAGCAGGTTGCGCATATCGGCAAAGTTTTGGGCCTCGGCGTCGAGGGCATCGTCGGCTTGGCCACAGGATGAAATGATTTCTACCAGCATGGATCGACGTTCGGCTTCGGATTCTGGGATAGAATCGTTGAGGCGCTGCTGAATCTCAAAGGCTTTTTGCAGGGTGCCGGTGGAGTGGTTCATGGCGCGGTTGAAGCTGCGGGTGCGCTCTGGTCCGAACTCGGAGGTAGCGATAGCGAGCTCTTCTTTTCCGCGACGGATGGAGTCATCAGTGGAGGTGAGCTCTTCTTGGGCAAGGTGTTCGAGAGTTTCCATGGGAAGCTGCATGAGGCGGTTGGTATCGCGAGGGTCGATCTCACGTGCATCTTCCAAGGTTGCAGCACTTGTTTTCTTCTTG
CGGCTGCGGGAATAGGCCCAAATTCCGCCACCAGCGGCCACTGTGCCAACGCCCGCAGCAGCCAACCAAGCGCCGGAAGATCCAGAAGAGCCTGAGGTTCCTGAGGCACCAGAACTAGAACCAACTGATTCTGCCAGCGCTAGTGCGGAGCCTGCCCAATCTTCTTGGGAAAGCGCCTGGAAAGCAGCGTTGTTGGCGGCGTCGAGTTCAGCGTCGGTCCATTGAGTACCACCTTGGATGCCGTACTGCCGTTCCTCGGGAGCGAGTGCATAAACCAAGACGTTTCCGCCGCCGTTGGCTTGGAGTGCTTGCTGCGTCCACGTTTCAGGGTCAACTCCGTCGAAAGAGCTTAGGAAAACAACGAAAATAACCTTTTGTTCAGATGCCTTTACATCATCGATGGCAGCCTGAATGTTGGTGATATCGGACGAGGAAATCTGGCCGGTGTAGTCAGTGACATTGTCTTGGTAAAATTCTGGTGATTCAGCCAAGACATATGTTTCTGTGGCTTCTGCAGTGTGAGCAGTAAAAAATGGTCCACTGATAAGGAGCGCGCCAGCTCCAATTGCCACAGTGACCGATACACGGCGGACGTTTTCCCGAAGATGCACCAAACTAAAGTTCATGGTCCCCACCTTAGACGAGTCCAGCTGGCACACTAGTTAACGTGAGAAGATTTTTAGCCAAGAGTTTACTCTTAACCGCAGTAGCGCAACCAGCCCTGAGGGTGGTCGCGTATTCGATGCTCAGAACGCCTAATAATCGGCACAAAATTGATTCAATTTTGGTGTTGGGCACAGCTCAATATGATGGGGTTCCATCGAGGCAGTTTGCTGCTCGTTTGAGGCATGCCGCGAAGCTGTGGCGTCTTCATGAAATCCAGCATGTATATACTGTCGGCGGAAAACTTCCTGGTGATCGTTTCACCGAAGCAGAAGTCGCGCGGGAGTATTTGATCAAAGAGGGCGTGGATCCGGATCTGATTTTTGTCTCTGCAGTTGGCAATGACACTGTCTCCTCCTATGAGGCGCTTGATCCGGAAAAGCTTGGTCGGGTGCTGATTGTTACTGATCCGAACCATTCGTATCGGGCGGTGCGCATCGCGCGACGCATGGGCTTTGACGCGAAACCTTCCCCGACAACCTATAGTCCCGCGAAGTTTCCGTCGATAGTTTATTTTCTGACCTTGTCCCATGAGTGGGGCGGGGTAGTGGTACAGGACGTGTCGTGGCTCTTGGGCGAACGGGTGGCCGATAAGGTGGAAGCATCTTTGCGAACTATCCAAGGCCTGCTGCGCCCTTCGAGGCGTGCGCGCCATGAGCAACTTCGGAGGCTGAAAAAGTAGATGTACCCCTATTCCGACGCAGACGCTTTTCGACGCCACCCTGAGCGCGCCAAGTCCAGCCAACTGCGCACCAGCGCCGTAGACACCCGCAGCGCGTTCGCCCGCGACCGGGCTCGCGTGCTGCATTCTGCTGCTCTTCGACGCCTCGCGGATAAAACCCAAGTGGTTGGCCCCAATGATGGTGATACTCCGCGCACCCGGCTGACGCACTCTTTGGAAGTAGCTCAAATTGCACGGGGAATCGGAGCTGGACTGGATTTGGATCCTGATCTGTGCGATCTGGCAGGGCTGTGCCATGACATTGGGCATCCGCCGTATGGACACAACGGTGAAAACGCGTTGAATGAAGTTGCTGCGGCCTGTGGAGGATTTGAGGGCAACGCCCAAACCTTGCGTATTCTCACGCGGCTGGAGCCAAAAATTGTCTCTGATGAGGGGGAGAGCTTTGGGCTGAACTTGTCGCGGGCTGCTCTTGATGCTGCATGTAAGTATCCGTGGGCTAAAACAAATGCGGATGGCAGTGTCAATAAGAAATACAGTGCTTATGATGAGGACGCAGAAATTCTTGCTTGGATCAGGCAAGGCCATGAAGACCTCAGACCACCAATCGAAGCTCAGGTCATGGACTTTTCCGATGATATTGCCTACTCAGTACACGATGTAGAAGACGGCATTGTTTCCGGTCGCATCGATTTGAAAGTGCTGTGGGACCTGGTGGAATTAGCAGCACTGGCGGACAAAGGCGCAGCAGCTTTCGGAGGCTCGCCTGCAGAACTCATCGAGGGCGCAGCCTCGTTGCGGGAGCTTCCTGTGGTAGCGGCCGCTGCAGATTTTGATTTCTCACTGCGTTCCTACGCTGCGCTGAAGGCCATGACCTCAGAACTAGTGGGAAGATACGTTGGCTCTACCATCGAGTCAACAAAGAAAACACACGCCGGCATTGATGTGGGACGCATGCACGGCGATTTGATCATTCCAGAAACAGCGGCCAGTGAAGTAAAACTGCTCAAAACGTTAGCGGTTCTCTATGTGATGGATGACCCAGGGCACCTTGCGCGCCAAAACAGGCAACGGGATCGTATCTTCCGGGTTTTTGACTACCTGGTGCTGGGGGCTCCGGGATCGTTGGATCCGATGTATCGCCAGTGGTTTATTGAAGCGGATTCAGAATCGGAACAGATCCGTGTGATTGTTGATCAGATTGCGTCGATGACGGAGTCTCGTCTGGAACGCCTTGCCCGGAATGCTGCTGACATCTCAGGATTCTTGGGATAGTTGGTTAGAGCAGCAGCGATTTTTAGTAAGGCCAATAACATGTTTTGGCTTAAACCTGTGTCGTGTCAGATGGTGGCGAAGTAGAGTTCGCAAAGCTAGCGAACATGAATTCGTGTTCAGGAACTTAACAGGGATCAAACAGAGAACAGAGAACAGATCACGCTGCCCAAAAATCGCACTTTTAAGGTTTGTGGGCGTCTGTGTGTGGTTTGCCGCTGTAAAGTATCACCACGTTATGCGCCCTGGTGTGATCAAGCGTTCGTTCTGGGTCGAAACCCCAAAAGTCACAATTCCCCAGAAGCGGGTCAAACCCATTTAGCTTATTGCTTACATATCGAGGGTTTAGAAAAGTGATTTGTCGGATCAGTCGGTTTCTGCCAAGTAAATAGAACTTTATAAATTTTGTGGCTCTCAAATCTTAGGCCACGGCTTCCGATTTGAACCGGAGGTTCAAAAGGCTTATATAGACAAGATTCTGCATCGTCTCACGAGCCCCTCATTGCCTGACACGGTCAATCGTGTGGGAGGTACCAATCCGTGAGATTTCTGCCAACGAGCGATTCATTGGCCCCGCTGCAGAGCTGGCAGAACACGGACATAACCCAAATAATCTGAGGTCTGCCGTTTGCAGCAGCATTAGCGTTTGATGTGGAAGGTGATGCAGAGGCTGTTGATCTGCAAGCGCGTCTTTCCCAAGCACGGGGGAACCCTGAAGCATCGGATGCTCTAGTTGCTGAGCTGACTGGTGTTACTGCTAATCATCCGTTGGTCAGTGCTTGTCTGAAGTTTCCGCTCAATCCTAAGCTTCTCAAGATTTCGTAAAAAAGCTGCCAACTACCGTAAAACCGCACTACTAGAGGAGTGCGTTTTTCGTTCCTGAACACATTGCGTGCTGCAACTTAATTATGGTCCTCCCAGCTCAGTGTGCTGTGTGG
ATTGTTTATTCTCGTCCATTAAGTGATCGAGAAAAAGTTGTTGTAAAGTCATGCGCATGTGTGGAATTGTTGGATATATTGGCCAGGCGGGCGACTCCCGTGATTACTTTGCGCTTGACGTCGTTTTAGAAGGACTGCGCCGACTTGAATACCGCGGTTATGATTCCGCAGGTGTAGCTGTTCATGCGAACGGTGAAATCAGCTACCGAAAGAAGGCTGGAAAGGTAGCTGCGCTGGACGCTGAGATCGCTCGCGCTCCTTTGGCGGATTCCATTTTGGCTATTGGTCACACCCGGTGGGCAACTCACGGTGGACCAACCGATGCAAATGCACACCCCCATGTTGTTGATGGCGGCAAGTTAGCTGTCGTACACAACGGTATTATTGAAAACTTTGCAGAGCTGCGCGCAGAGCTTTCAGCTAAGGGCTACAACTTTGTTTCCGTTACTGACACTGAAGTTGCCGCCACATTGCTGGCAGAAATCTACAACACCCAGGCTAATGGCGATCTGACCAAGGCTATGCAGCTTACTGGTCAGCGTCTTGAGGGTGCGTTCACCCTGCTGGCTATCCATGCTGATCATGATGATCGTATTGTTGCAGCGCGCCGTAACTCTCCTTTGGTTATTGGCTTGGGTGAAGGCGAAAACTTCCTCGGCTCTGACGTTTCTGGCTTCATCGATTACACCCGCAAGGCTGTTGAGATGGGCAACGATCAGATTGTGACCATCACTGCGAACGACTACCAGATCACCAACTTCGATGGTTCTGAGGCAACCGGAAAACCTTTCGACGTGGAGTGGGATGCGGCTGCTGCTGAAAAGGGTGGCTTTGATTCCTTCATGGATAAGGAAATCCACGACCAGCCAGCTGCAGTGCGTGACACCCTCCTCGGACGTTTAGATGAGGATGGCAAGCTGGTCCTTGATGAGCTTCGT 3 | -------------------------------------------------------------------------------- /test/test_aamb_encode.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import random 4 | import vamb 5 | 6 | 7 | class TestAAE(unittest.TestCase): 8 | tnfs = np.random.random((111, 103)).astype(np.float32) 9 | rpkm = np.random.random((111, 14)).astype(np.float32) 10 | lens = np.random.randint(2000, 5000, size=111) 11 | contignames = ["".join(random.choices("abcdefghijklmnopqrstu", k=10)) for _ in lens] 12 | nlatent_l = 32 13 | default_args = (14, 256, nlatent_l, 25, 0.5, 0.5, 0.15, False, 0) 14 | default_temperature = 0.16 15 | 16 | # Construction 17 | def test_bad_args(self): 18 | default_args = self.default_args 19 | 20 | # Test the default args work 21 | aae = vamb.aamb_encode.AAE(*default_args) 22 | self.assertIsInstance(aae, vamb.aamb_encode.AAE) 23 | 24 | with self.assertRaises(ValueError): 25 | vamb.aamb_encode.AAE(0, *default_args[1:]) 26 | 27 | with self.assertRaises(ValueError): 28 | vamb.aamb_encode.AAE(*default_args[:1], 0, *default_args[2:]) 29 | 30 | with self.assertRaises(ValueError): 31 | vamb.aamb_encode.AAE(*default_args[:2], 0, *default_args[3:]) 32 | 33 | with self.assertRaises(ValueError): 34 | vamb.aamb_encode.AAE(*default_args[:3], 0, *default_args[4:]) 35 | 36 | with self.assertRaises(ValueError): 37 | vamb.aamb_encode.AAE(*default_args[:5], float("nan"), *default_args[6:]) 38 | 39 | with self.assertRaises(ValueError): 40 | vamb.aamb_encode.AAE(*default_args[:5], -0.0001, *default_args[6:]) 41 | 42 | with self.assertRaises(ValueError): 43 | vamb.aamb_encode.AAE(*default_args[:6], float("nan"), *default_args[7:]) 44 | 45 | def test_loss_falls(self): 46 | aae = vamb.aamb_encode.AAE(*self.default_args) 47 | rpkm_copy = self.rpkm.copy() 48 | tnfs_copy = self.tnfs.copy() 49 | dl = vamb.encode.make_dataloader( 50 | rpkm_copy, tnfs_copy, self.lens, batchsize=16, destroy=True 51 | ) 52 | (di, ti, ai, we) = next(iter(dl)) 53 | mu, do, to, _, _, _, _ = aae(di, ti) 54 | start_loss = aae.calc_loss(di, do, ti, to)[0].data.item() 55 | 56 | # Loss drops with training 57 | aae.trainmodel( 58 | dl, 59 | nepochs=3, 60 | batchsteps=[1, 2], 61 | T=self.default_temperature, 62 | modelfile=None, 63 | ) 64 | mu, do, to, _, _, _, _ = aae(di, ti) 65 | end_loss = aae.calc_loss(di, do, ti, to)[0].data.item() 66 | self.assertLess(end_loss, start_loss) 67 | 68 | def test_encode(self): 69 | aae = vamb.aamb_encode.AAE(*self.default_args) 70 | dl = vamb.encode.make_dataloader( 71 | self.rpkm.copy(), 
self.tnfs.copy(), self.lens, batchsize=16, destroy=True 72 | ) 73 | (_, encoding) = aae.get_latents(self.contignames, dl) 74 | self.assertIsInstance(encoding, np.ndarray) 75 | self.assertEqual(encoding.dtype, np.float32) 76 | self.assertEqual(encoding.shape, (len(self.rpkm), self.nlatent_l)) 77 | -------------------------------------------------------------------------------- /test/test_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from hashlib import md5 4 | 5 | import vamb 6 | 7 | 8 | class TestClusterer(unittest.TestCase): 9 | # This seed has been set just so the unit tests runs faster. 10 | # How many iterations of the clustering depends on the input data 11 | rng = np.random.RandomState(5) 12 | data = rng.random((1024, 40)).astype(np.float32) 13 | lens = rng.randint(500, 1000, size=1024) 14 | 15 | def test_bad_params(self): 16 | with self.assertRaises(ValueError): 17 | vamb.cluster.ClusterGenerator(self.data.astype(np.float64), self.lens) 18 | 19 | with self.assertRaises(ValueError): 20 | vamb.cluster.ClusterGenerator(self.data, self.lens, maxsteps=0) 21 | 22 | with self.assertRaises(ValueError): 23 | vamb.cluster.ClusterGenerator(self.data, self.lens, windowsize=0) 24 | 25 | with self.assertRaises(ValueError): 26 | vamb.cluster.ClusterGenerator(self.data, self.lens, minsuccesses=0) 27 | 28 | with self.assertRaises(ValueError): 29 | vamb.cluster.ClusterGenerator( 30 | self.data, self.lens, minsuccesses=5, windowsize=4 31 | ) 32 | 33 | with self.assertRaises(ValueError): 34 | vamb.cluster.ClusterGenerator( 35 | np.random.random((0, 40)), np.array([], dtype=int) 36 | ) 37 | 38 | def test_basics(self): 39 | clstr = vamb.cluster.ClusterGenerator(self.data, self.lens) 40 | self.assertIs(clstr, iter(clstr)) 41 | 42 | x = next(clstr) 43 | self.assertIsInstance(x, vamb.cluster.Cluster) 44 | 45 | clusters = list(clstr) 46 | clusters.append(x) 47 | 48 | # All members are clustered 49 | self.assertEqual(sum(map(lambda x: len(x.members), clusters)), len(self.data)) 50 | 51 | # Elements of members are exactly the matrix row indices 52 | mems = set() 53 | for i in clusters: 54 | mems.update(i.members) 55 | self.assertEqual(mems, set(range(len(self.data)))) 56 | 57 | def test_detruction(self): 58 | copy = self.data.copy() 59 | clstr = vamb.cluster.ClusterGenerator(self.data, self.lens) 60 | self.assertTrue(np.any(np.abs(self.data - clstr.matrix.numpy()) > 0.001)) 61 | clstr = vamb.cluster.ClusterGenerator(copy, self.lens, destroy=True) 62 | self.assertTrue(np.all(np.abs(copy - clstr.matrix.numpy()) < 1e-6)) 63 | self.assertTrue(np.any(np.abs(self.data - clstr.matrix.numpy()) > 0.001)) 64 | 65 | @staticmethod 66 | def xor_rows_hash(matrix): 67 | m = np.frombuffer(matrix.copy().data, dtype=np.uint32) 68 | m.shape = matrix.shape 69 | v = m[0] 70 | for i in range(1, len(m)): 71 | v ^= m[i] 72 | return md5(v).digest().hex() 73 | 74 | def test_normalization(self): 75 | hash_before = md5(self.data.data.tobytes()).digest().hex() 76 | vamb.cluster.ClusterGenerator(self.data, self.lens) 77 | self.assertEqual(hash_before, md5(self.data.data.tobytes()).digest().hex()) 78 | cp = self.data.copy() 79 | vamb.cluster.ClusterGenerator(cp, self.lens, destroy=True) 80 | hash_after = md5(cp.data.tobytes()).digest().hex() 81 | self.assertNotEqual(hash_before, hash_after) 82 | 83 | # Rows are permuted by the clusterer. We use xor to check the rows 84 | # are still essentially the same. 
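# (Added note, not in the original: xor is commutative and associative, so xoring the
# rows' raw uint32 bit patterns gives an order-independent fingerprint; the check
# ignores the row permutation but catches any change to row contents.)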
85 | before_xor = self.xor_rows_hash(cp) 86 | vamb.cluster.ClusterGenerator(cp, self.lens, destroy=True, normalized=True) 87 | self.assertEqual(before_xor, self.xor_rows_hash(cp)) 88 | 89 | def test_cluster(self): 90 | x = next(vamb.cluster.ClusterGenerator(self.data, self.lens)) 91 | self.assertIsInstance(x.members, np.ndarray) 92 | -------------------------------------------------------------------------------- /test/test_encode.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import torch 4 | import tempfile 5 | import vamb 6 | 7 | 8 | class TestDataLoader(unittest.TestCase): 9 | tnfs = np.random.random((111, 103)).astype(np.float32) 10 | rpkm = np.random.random((111, 14)).astype(np.float32) 11 | lens = np.random.randint(2000, 5000, size=111) 12 | 13 | def nearly_same(self, A, B): 14 | self.assertTrue(np.all(np.abs(A - B) < 1e-5)) 15 | 16 | def not_nearly_same(self, A, B): 17 | self.assertTrue(np.any(np.abs(A - B) > 1e-4)) 18 | 19 | def test_bad_args(self): 20 | # Bad rpkm 21 | with self.assertRaises(ValueError): 22 | vamb.encode.make_dataloader([[1, 2, 3]], self.tnfs, self.lens, batchsize=32) 23 | 24 | # bad tnfs 25 | with self.assertRaises(ValueError): 26 | vamb.encode.make_dataloader(self.rpkm, [[1, 2, 3]], self.lens, batchsize=32) 27 | 28 | # Bad batchsize 29 | with self.assertRaises(ValueError): 30 | vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=0) 31 | 32 | # Differing lengths 33 | with self.assertRaises(ValueError): 34 | vamb.encode.make_dataloader( 35 | np.random.random((len(self.rpkm) - 1)).astype(np.float32), 36 | self.tnfs, 37 | self.lens, 38 | batchsize=32, 39 | ) 40 | 41 | # Bad dtype 42 | with self.assertRaises(ValueError): 43 | vamb.encode.make_dataloader( 44 | self.rpkm.astype(np.float64), self.tnfs, self.lens, batchsize=32 45 | ) 46 | 47 | def test_destroy(self): 48 | copy_rpkm = self.rpkm.copy() 49 | copy_tnfs = self.tnfs.copy() 50 | 51 | _ = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=32) 52 | self.nearly_same(self.rpkm, copy_rpkm) 53 | self.nearly_same(self.tnfs, copy_tnfs) 54 | 55 | _ = vamb.encode.make_dataloader( 56 | copy_rpkm, copy_tnfs, self.lens, batchsize=32, destroy=True 57 | ) 58 | self.not_nearly_same(self.rpkm, copy_rpkm) 59 | self.not_nearly_same(self.tnfs, copy_tnfs) 60 | 61 | def test_normalized(self): 62 | copy_rpkm = self.rpkm.copy() 63 | copy_tnfs = self.tnfs.copy() 64 | 65 | _ = vamb.encode.make_dataloader( 66 | copy_rpkm, copy_tnfs, self.lens, batchsize=32, destroy=True 67 | ) 68 | 69 | # TNFS: Mean of zero, std of one 70 | self.nearly_same(np.mean(copy_tnfs, axis=0), np.zeros(copy_tnfs.shape[1])) 71 | self.nearly_same(np.std(copy_tnfs, axis=0), np.ones(copy_tnfs.shape[1])) 72 | 73 | # RPKM: Sum to 1, all zero or above 74 | # print(copy_rpkm) 75 | self.nearly_same(np.sum(copy_rpkm, axis=1), np.ones(copy_rpkm.shape[0])) 76 | self.assertTrue(np.all(copy_rpkm >= 0.0)) 77 | 78 | def test_single_sample(self): 79 | single_rpkm = self.rpkm[:, [0]] 80 | copy_single = single_rpkm.copy() 81 | dl = vamb.encode.make_dataloader( 82 | single_rpkm, self.tnfs.copy(), self.lens, batchsize=32, destroy=True 83 | ) 84 | # When destroying a single sample, RPKM is set to 1.0 85 | self.assertAlmostEqual(np.abs(np.mean(single_rpkm)), 1.0) 86 | self.assertLess(abs(np.std(single_rpkm)), 1e-6) 87 | 88 | # ... and the abundance are the same abundances as before, 89 | # except normalized and scaled. 
We test that they are ordered 90 | # in the same order 91 | self.assertTrue( 92 | ( 93 | torch.argsort(dl.dataset.tensors[2], dim=0) 94 | == torch.argsort(torch.from_numpy(copy_single), dim=0) 95 | ) 96 | .all() 97 | .item() 98 | ) 99 | 100 | def test_iter(self): 101 | bs = 32 102 | dl = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=bs) 103 | 104 | # Check right element type 105 | for M in next(iter(dl)): 106 | self.assertEqual(M.dtype, torch.float32) 107 | self.assertEqual(M.shape[0], bs) 108 | 109 | # Check it iterates the right order (rpkm, tnfs) 110 | rpkm, tnfs, abundance, weights = next(iter(dl)) 111 | self.nearly_same(np.sum(rpkm.numpy(), axis=1), np.ones(bs)) 112 | 113 | def test_randomized(self): 114 | dl = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=64) 115 | rpkm, tnfs, abundances, weights = next(iter(dl)) 116 | 117 | # Test that first batch is not just the first 64 elements. 118 | # Could happen, but vanishingly unlikely. 119 | self.assertTrue(np.any(np.abs(tnfs.numpy() - self.tnfs[:64]) > 1e-3)) 120 | 121 | 122 | class TestVAE(unittest.TestCase): 123 | tnfs = np.random.random((111, 103)).astype(np.float32) 124 | rpkm = np.random.random((111, 14)).astype(np.float32) 125 | lens = np.random.randint(2000, 5000, size=111) 126 | 127 | def test_bad_args(self): 128 | with self.assertRaises(ValueError): 129 | vamb.encode.VAE(-1) 130 | 131 | with self.assertRaises(ValueError): 132 | vamb.encode.VAE(5, nlatent=0) 133 | 134 | with self.assertRaises(ValueError): 135 | vamb.encode.VAE(5, nhiddens=[128, 0]) 136 | 137 | with self.assertRaises(ValueError): 138 | vamb.encode.VAE(5, alpha=0.0) 139 | 140 | with self.assertRaises(ValueError): 141 | vamb.encode.VAE(5, alpha=1.0) 142 | 143 | with self.assertRaises(ValueError): 144 | vamb.encode.VAE(5, beta=0.0) 145 | 146 | with self.assertRaises(ValueError): 147 | vamb.encode.VAE(5, dropout=1.0) 148 | 149 | with self.assertRaises(ValueError): 150 | vamb.encode.VAE(5, dropout=-0.001) 151 | 152 | def test_loss_falls(self): 153 | vae = vamb.encode.VAE(self.rpkm.shape[1]) 154 | rpkm_copy = self.rpkm.copy() 155 | tnfs_copy = self.tnfs.copy() 156 | dl = vamb.encode.make_dataloader( 157 | rpkm_copy, tnfs_copy, self.lens, batchsize=16, destroy=True 158 | ) 159 | (di, ti, ai, we) = next(iter(dl)) 160 | do, to, ao, mu = vae(di, ti, ai) 161 | start_loss = vae.calc_loss(di, do, ti, to, ao, ai, mu, we)[0].data.item() 162 | 163 | with tempfile.TemporaryFile() as file: 164 | # Loss drops with training 165 | vae.trainmodel(dl, nepochs=3, batchsteps=[1, 2], modelfile=file) 166 | do, to, ao, mu = vae(di, ti, ai) 167 | end_loss = vae.calc_loss(di, do, ti, to, ao, ai, mu, we)[0].data.item() 168 | self.assertLess(end_loss, start_loss) 169 | 170 | # Also test save/load 171 | before_encoding = vae.encode(dl) 172 | file.flush() 173 | file.seek(0) 174 | vae_2 = vamb.encode.VAE.load(file) 175 | 176 | after_encoding = vae_2.encode(dl) 177 | self.assertTrue(np.all(np.abs(before_encoding - after_encoding) < 1e-6)) 178 | 179 | def test_encoding(self): 180 | nlatent = 15 181 | vae = vamb.encode.VAE(self.rpkm.shape[1], nlatent=nlatent) 182 | dl = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=32) 183 | encoding = vae.encode(dl) 184 | self.assertEqual(encoding.dtype, np.float32) 185 | self.assertEqual(encoding.shape, (len(self.rpkm), nlatent)) 186 | -------------------------------------------------------------------------------- /test/test_parsebam.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | import io 3 | import numpy as np 4 | import tempfile 5 | from pathlib import Path 6 | 7 | import vamb 8 | import testtools 9 | from vamb.parsecontigs import CompositionMetaData 10 | 11 | 12 | class TestParseBam(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | minlen = 3000 16 | mask = np.array( 17 | list(map(lambda x: x >= minlen, testtools.BAM_SEQ_LENS)), dtype=bool 18 | ) 19 | cls.comp_metadata = CompositionMetaData( 20 | np.array( 21 | [i for (i, m) in zip(testtools.BAM_NAMES, mask) if m], dtype=object 22 | ), 23 | np.array([i for (i, m) in zip(testtools.BAM_SEQ_LENS, mask) if m]), 24 | mask, 25 | minlen, 26 | ) 27 | 28 | cls.abundance = vamb.parsebam.Abundance.from_files( 29 | testtools.BAM_FILES, "/tmp/bam_tmpfile", cls.comp_metadata, True, 0.0, 2 30 | ) 31 | 32 | def test_refhash(self): 33 | m = self.comp_metadata 34 | cp = CompositionMetaData(m.identifiers, m.lengths, m.mask, m.minlength) 35 | # Change the refnames slighty 36 | cp.identifiers = cp.identifiers.copy() 37 | cp.identifiers[3] = cp.identifiers[3] + "w" 38 | cp.refhash = vamb.vambtools.RefHasher.hash_refnames(cp.identifiers) 39 | with self.assertRaises(ValueError): 40 | vamb.parsebam.Abundance.from_files( 41 | testtools.BAM_FILES, None, cp, True, 0.97, 4 42 | ) 43 | 44 | ab2 = vamb.parsebam.Abundance.from_files( 45 | testtools.BAM_FILES, None, cp, False, 0.97, 4 46 | ) 47 | self.assertEqual(self.abundance.refhash, ab2.refhash) 48 | 49 | def test_bad_metadata_mask(self): 50 | m = self.comp_metadata 51 | 52 | # If last element of mask is False, then the invariants of CompositionMetaData will 53 | # not hold after removing the last element of its mask, and that is NOT what we 54 | # are testing here. 
55 | assert list(m.mask[-3:]) == [True, False, False] 56 | cp = CompositionMetaData( 57 | m.identifiers[:-1], m.lengths[:-1], m.mask[:-3], m.minlength 58 | ) 59 | with self.assertRaises(ValueError): 60 | vamb.parsebam.Abundance.from_files( 61 | testtools.BAM_FILES, None, cp, True, 0.97, 4 62 | ) 63 | 64 | def test_badfile(self): 65 | with self.assertRaises(BaseException): 66 | vamb.parsebam.Abundance.from_files( 67 | ["noexist"], None, self.comp_metadata, True, 0.97, 1 68 | ) 69 | 70 | # Minid too high 71 | def test_minid_off(self): 72 | with self.assertRaises(ValueError): 73 | vamb.parsebam.Abundance.from_files( 74 | testtools.BAM_FILES, None, self.comp_metadata, True, 1.01, 4 75 | ) 76 | 77 | def test_parse(self): 78 | nm = sum(self.comp_metadata.mask) 79 | self.assertEqual(nm, 12) 80 | 81 | self.assertEqual(self.abundance.matrix.shape, (nm, 3)) 82 | self.assertEqual(self.abundance.nseqs, nm) 83 | self.assertEqual(self.abundance.matrix.dtype, np.float32) 84 | self.assertEqual(self.abundance.nsamples, 3) 85 | 86 | def test_minid(self): 87 | abundance = vamb.parsebam.Abundance.from_files( 88 | testtools.BAM_FILES, None, self.comp_metadata, True, 0.95, 3 89 | ) 90 | self.assertTrue(np.any(abundance.matrix < self.abundance.matrix)) 91 | 92 | def test_save_load(self): 93 | buf = io.BytesIO() 94 | self.abundance.save(buf) 95 | buf.seek(0) 96 | 97 | # Bad refhash 98 | with self.assertRaises(ValueError): 99 | abundance2 = vamb.parsebam.Abundance.load(buf, b"a" * 32) 100 | 101 | buf.seek(0) 102 | abundance2 = vamb.parsebam.Abundance.load(buf, self.abundance.refhash) 103 | self.assertTrue(np.all(abundance2.matrix == self.abundance.matrix)) 104 | self.assertTrue(np.all(abundance2.samplenames == self.abundance.samplenames)) 105 | self.assertEqual(abundance2.refhash, self.abundance.refhash) 106 | self.assertEqual(abundance2.minid, self.abundance.minid) 107 | 108 | def test_parse_from_tsv(self): 109 | # Check it parses 110 | with open(testtools.AEMB_FILES[0]) as file: 111 | lines = [s.rstrip() for s in file] 112 | for path in testtools.AEMB_FILES[1:]: 113 | with open(path) as file: 114 | for i, existing in enumerate(file): 115 | lines[i] += "\t" + existing.split("\t")[1].rstrip() 116 | 117 | # Add in lines with zeros corresponding to the masked contigs 118 | unmasked_lines = [] 119 | i = 0 120 | for keep in self.comp_metadata.mask: 121 | if not keep: 122 | unmasked_lines.append("\t0.0\t0.0\t0.0") 123 | else: 124 | unmasked_lines.append(lines[i]) 125 | i += 1 126 | 127 | with tempfile.NamedTemporaryFile(mode="w+") as file: 128 | print("contigname\tfile1\tfile2\tfile3", file=file) 129 | for line in unmasked_lines: 130 | print(line, file=file) 131 | file.seek(0) 132 | abundance = vamb.parsebam.Abundance.from_tsv( 133 | Path(file.name), self.comp_metadata 134 | ) 135 | 136 | self.assertEqual(abundance.refhash, self.comp_metadata.refhash) 137 | self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"]) 138 | 139 | # Check values are alright 140 | M = np.zeros_like(abundance.matrix) 141 | for row, line in enumerate(lines): 142 | for col, cell in enumerate(line.split("\t")[1:]): 143 | M[row, col] = float(cell) 144 | self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all()) 145 | 146 | # Bad header order errors 147 | lines[5], lines[4] = lines[4], lines[5] 148 | 149 | with tempfile.NamedTemporaryFile(mode="w+") as file: 150 | print("contigname\tfile1\tfile2\tfile3", file=file) 151 | for line in lines: 152 | print(line, file=file) 153 | file.seek(0) 154 | with 
self.assertRaises(ValueError): 155 | vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata) 156 | 157 | # Restore 158 | lines[5], lines[4] = lines[4], lines[5] 159 | 160 | # Too many lines 161 | with tempfile.NamedTemporaryFile(mode="w+") as file: 162 | print("contigname\tfile1\tfile2\tfile3", file=file) 163 | for line in lines: 164 | print(line, file=file) 165 | print(lines[-2], file=file) 166 | file.seek(0) 167 | with self.assertRaises(ValueError): 168 | vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata) 169 | -------------------------------------------------------------------------------- /test/test_parsecontigs.py: -------------------------------------------------------------------------------- 1 | import io 2 | import unittest 3 | import random 4 | import numpy as np 5 | 6 | import testtools 7 | from vamb.parsecontigs import Composition, CompositionMetaData 8 | 9 | 10 | class TestReadContigs(unittest.TestCase): 11 | records = [] 12 | large_io = io.BytesIO() 13 | io = io.BytesIO() 14 | 15 | @classmethod 16 | def setUpClass(cls): 17 | rng = random.Random() 18 | for i in range(random.randrange(1400, 1500)): 19 | cls.records.append(testtools.make_randseq(rng, 400, 600)) 20 | 21 | for i in cls.records: 22 | cls.io.write(i.format().encode()) 23 | cls.io.write(b"\n") 24 | 25 | for i in range(25_000): 26 | record = testtools.make_randseq(rng, 250, 300) 27 | cls.large_io.write(record.format().encode()) 28 | cls.large_io.write(b"\n") 29 | 30 | def setUp(self): 31 | self.io.seek(0) 32 | self.large_io.seek(0) 33 | 34 | def test_only_ns(self): 35 | file = io.BytesIO() 36 | file.write(b">abc\n") 37 | file.write(b"N" * 2500) 38 | file.write(b"\n") 39 | file.seek(0) 40 | 41 | with self.assertRaises(ValueError): 42 | Composition.from_file(file, None) 43 | 44 | def test_unique_names(self): 45 | with self.assertRaises(ValueError): 46 | CompositionMetaData( 47 | np.array(["foo", "foo"], dtype=object), 48 | np.array([1000, 1000]), 49 | np.array([True, True], dtype=bool), 50 | 1000, 51 | ) 52 | 53 | def test_filter_minlength(self): 54 | minlen = 500 55 | composition = Composition.from_file(self.io, None, minlength=450) 56 | md = composition.metadata 57 | hash1 = md.refhash 58 | 59 | composition.filter_min_length(minlen) 60 | n_initial_seq = md.nseqs 61 | 62 | hash2 = md.refhash 63 | self.assertNotEqual(hash1, hash2) 64 | self.assertEqual(len(md.identifiers), len(md.lengths)) 65 | self.assertEqual(md.nseqs, md.mask.sum()) 66 | self.assertLessEqual(minlen, composition.metadata.lengths.min(initial=minlen)) 67 | self.assertEqual(len(md.mask), len(self.records)) 68 | 69 | # NB: Here we filter metadata without filtering the composition. 70 | # That means from this point on, the metadata and comp is out of sync, 71 | # and comp is invalid. 
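        # (Hedged aside, not exercised by this test: the in-sync operation is
        # composition.filter_min_length(...), which applies the same length mask to
        # both the metadata and the TNF matrix via numpy_inplace_maskarray.)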
72 | md.filter_min_length(minlen + 50) 73 | self.assertEqual(len(md.identifiers), len(md.lengths)) 74 | self.assertEqual(md.nseqs, md.mask.sum()) 75 | self.assertLessEqual( 76 | minlen, composition.metadata.lengths.min(initial=minlen + 50) 77 | ) 78 | self.assertEqual(len(md.mask), len(self.records)) 79 | self.assertLess(md.nseqs, n_initial_seq) 80 | 81 | hash3 = md.refhash 82 | md.filter_min_length(minlen - 50) 83 | self.assertEqual(hash3, md.refhash) 84 | 85 | md.filter_min_length(50000000000) 86 | self.assertEqual(md.nseqs, 0) 87 | self.assertFalse(np.any(md.mask)) 88 | 89 | def test_minlength(self): 90 | with self.assertRaises(ValueError): 91 | Composition.from_file(self.io, None, minlength=3) 92 | 93 | def test_properties(self): 94 | composition = Composition.from_file(self.io, None, minlength=420) 95 | passed = list(filter(lambda x: len(x.sequence) >= 420, self.records)) 96 | 97 | self.assertEqual(composition.nseqs, len(composition.metadata.identifiers)) 98 | self.assertEqual(composition.nseqs, len(composition.metadata.lengths)) 99 | 100 | self.assertTrue(composition.matrix.dtype, np.float32) 101 | self.assertEqual(composition.matrix.shape, (len(passed), 103)) 102 | 103 | # Names 104 | self.assertEqual( 105 | list(composition.metadata.identifiers), [i.header for i in passed] 106 | ) 107 | 108 | # Lengths 109 | self.assertTrue(np.issubdtype(composition.metadata.lengths.dtype, np.integer)) 110 | self.assertEqual( 111 | [len(i.sequence) for i in passed], list(composition.metadata.lengths) 112 | ) 113 | 114 | def test_save_load(self): 115 | buf = io.BytesIO() 116 | composition_1 = Composition.from_file(self.io, None) 117 | md1 = composition_1.metadata 118 | composition_1.save(buf) 119 | buf.seek(0) 120 | composition_2 = Composition.load(buf) 121 | md2 = composition_2.metadata 122 | 123 | self.assertTrue(np.all(composition_1.matrix == composition_2.matrix)) 124 | self.assertTrue(np.all(md1.identifiers == md2.identifiers)) 125 | self.assertTrue(np.all(md1.lengths == md2.lengths)) 126 | self.assertTrue(np.all(md1.refhash == md2.refhash)) 127 | self.assertTrue(np.all(md1.minlength == md2.minlength)) 128 | 129 | def test_windows_newlines(self): 130 | rng = random.Random() 131 | buf1 = io.BytesIO() 132 | buf2 = io.BytesIO() 133 | for i in range(10): 134 | record = testtools.make_randseq(rng, 10, 20) 135 | buf1.write(b">" + record.header.encode()) 136 | buf2.write(b">" + record.header.encode()) 137 | buf1.write(b"\r\n") 138 | buf2.write(b"\n") 139 | buf1.write(record.sequence) 140 | buf2.write(record.sequence) 141 | buf1.write(b"\r\n") 142 | buf2.write(b"\n") 143 | 144 | buf1.seek(0) 145 | buf2.seek(0) 146 | comp1 = Composition.from_file(buf1, None) 147 | comp2 = Composition.from_file(buf2, None) 148 | 149 | self.assertEqual(comp1.metadata.refhash, comp2.metadata.refhash) 150 | self.assertTrue(np.all(comp1.matrix == comp2.matrix)) 151 | -------------------------------------------------------------------------------- /test/test_parsemarkers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import vamb 3 | import testtools 4 | from pathlib import Path 5 | import tempfile 6 | import shutil 7 | import io 8 | 9 | 10 | class TestParseMarkers(unittest.TestCase): 11 | def test_instantiate(self): 12 | tmp = tempfile.mkdtemp() 13 | tmp_path = Path(tmp) 14 | shutil.rmtree(tmp) 15 | markers = vamb.parsemarkers.Markers.from_files( 16 | Path(testtools.DATADIR).joinpath("marker.fna"), 17 | 
Path(testtools.PARENTDIR).joinpath("vamb").joinpath("marker.hmm"), 18 | ["abc"], 19 | tmp_path, 20 | 4, 21 | None, 22 | ) 23 | self.assertIsNotNone(markers.markers[0]) 24 | self.assertEqual(len(markers.markers), 1) 25 | self.assertEqual(set(markers.markers[0]), {39}) 26 | self.assertEqual( 27 | markers.refhash, vamb.vambtools.RefHasher.hash_refnames(["abc"]) 28 | ) 29 | 30 | buf = io.StringIO() 31 | markers.save(buf) 32 | buf.seek(0) 33 | 34 | markers2 = vamb.parsemarkers.Markers.load(buf, markers.refhash) 35 | self.assertEqual(len(markers.markers), len(markers2.markers)) 36 | self.assertEqual(set(markers.markers[0]), set(markers2.markers[0])) 37 | self.assertEqual(markers.marker_names, markers2.marker_names) 38 | -------------------------------------------------------------------------------- /test/test_reclustering.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | # For CAMI dataset, compute comp, abundance, taxonomy, markers 4 | # Subset to e.g. 5 genera plus a few unclassified contigs 5 | 6 | # FASTA 7 | # Comp 8 | # Abundance 9 | # Markers 10 | # Taxonomy 11 | # Latent 12 | # Refined taxonomy 13 | 14 | 15 | class TestKmeansReclustering(unittest.TestCase): 16 | pass 17 | # Make markers + lengths 18 | # Make taxonomy 19 | # Create latent 20 | 21 | # Initial clustering 22 | 23 | 24 | class TestDBScanReclustering(unittest.TestCase): 25 | # It produces disjoint clusters, a subset of the input points 26 | pass 27 | -------------------------------------------------------------------------------- /test/test_results.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import io 3 | import random 4 | import numpy as np 5 | import torch 6 | from hashlib import sha256 7 | 8 | import vamb 9 | import testtools 10 | 11 | # PyTorch cannot be made stable, so we cannot run CI on the result 12 | # of pytorch training. 13 | # Nonetheless, you can enable it locally with this switch here, 14 | # which can be useful sometimes. 
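# To try the hash-based checks locally, flip the switch below to True and run e.g.
#   python -m pytest test/test_results.py
# Even then, the hardcoded digests should only be expected to match on a setup
# close to the one that produced them (PyTorch version and hardware both matter).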
15 | TEST_UNSTABLE_HASHES = False 16 | 17 | 18 | class TestCompositionResult(unittest.TestCase): 19 | io = io.BytesIO() 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | rng = random.Random(15) 24 | for _ in range(4): 25 | seq = testtools.make_randseq(rng, 400, 600) 26 | cls.io.write(seq.format().encode()) 27 | cls.io.write(b"\n") 28 | 29 | def setUp(self): 30 | self.io.seek(0) 31 | 32 | def test_runs(self): 33 | comp = vamb.parsecontigs.Composition.from_file(self.io, None) 34 | self.assertIsInstance(comp, vamb.parsecontigs.Composition) 35 | 36 | if TEST_UNSTABLE_HASHES: 37 | 38 | def test_result(self): 39 | comp = vamb.parsecontigs.Composition.from_file(self.io, None) 40 | self.assertEqual( 41 | sha256(comp.matrix.data.tobytes()).digest().hex(), 42 | "9e9a2d7b021654e874894722bdd6cd3eda18bed03fabd32a9440e806a8ab1bd1", 43 | ) 44 | 45 | 46 | class TestAbundanceResult(unittest.TestCase): 47 | @classmethod 48 | def setUpClass(cls): 49 | cls.comp_metadata = vamb.parsecontigs.CompositionMetaData( 50 | np.array(testtools.BAM_NAMES, dtype=object), 51 | np.array(testtools.BAM_SEQ_LENS), 52 | np.ones(len(testtools.BAM_SEQ_LENS), dtype=bool), 53 | 2000, 54 | ) 55 | 56 | def test_runs(self): 57 | abundance = vamb.parsebam.Abundance.from_files( 58 | testtools.BAM_FILES, None, self.comp_metadata, True, 0.9, 4 59 | ) 60 | self.assertIsInstance(abundance, vamb.parsebam.Abundance) 61 | 62 | if TEST_UNSTABLE_HASHES: 63 | 64 | def test_result(self): 65 | abundance = vamb.parsebam.Abundance.from_files( 66 | testtools.BAM_FILES, "/tmp/tmpbam", self.comp_metadata, True, 0.9, 2 67 | ) 68 | self.assertEqual( 69 | sha256(abundance.matrix.data.tobytes()).digest().hex(), 70 | "c346abb53b62423fe95ed4b2eb5988d77141b2d7a5c58c03fdf09abc6476df78", 71 | ) 72 | abundance2 = vamb.parsebam.Abundance.from_files( 73 | testtools.BAM_FILES, None, self.comp_metadata, True, 0.9, 4 74 | ) 75 | self.assertTrue(np.all(np.abs(abundance.matrix - abundance2.matrix) < 1e-5)) 76 | 77 | 78 | class TestEncodingResult(unittest.TestCase): 79 | @classmethod 80 | def setUpClass(cls): 81 | torch.manual_seed(0) 82 | rng = np.random.RandomState(15) 83 | cls.tnfs = rng.random((200, 103)).astype(np.float32) 84 | cls.rpkm = rng.random((200, 6)).astype(np.float32) 85 | cls.lens = rng.randint(2000, 5000, 200) 86 | 87 | def test_runs(self): 88 | self.assertEqual( 89 | sha256(self.lens.data.tobytes()).digest().hex(), 90 | "68894f01cc435a5f032a655faecddd817cd35a71397129296a11f8c40bd29fcb", 91 | ) 92 | 93 | vae = vamb.encode.VAE(6) 94 | dl = vamb.encode.make_dataloader( 95 | self.rpkm.copy(), self.tnfs, self.lens, batchsize=16 96 | ) 97 | vae.trainmodel(dl, nepochs=3, batchsteps=[1, 2]) 98 | latent = vae.encode(dl) 99 | 100 | self.assertIsInstance(latent, np.ndarray) 101 | 102 | if TEST_UNSTABLE_HASHES: 103 | 104 | def test_result(self): 105 | torch.manual_seed(0) 106 | torch.use_deterministic_algorithms(True) 107 | np.random.seed(0) 108 | random.seed(0) 109 | vae = vamb.encode.VAE(6) 110 | dl = vamb.encode.make_dataloader( 111 | self.rpkm, self.tnfs, self.lens, batchsize=16 112 | ) 113 | vae.trainmodel(dl, nepochs=3, batchsteps=[1, 2]) 114 | latent = vae.encode(dl) 115 | 116 | self.assertEqual( 117 | sha256(latent.data.tobytes()).digest().hex(), 118 | "0148ec0767e88c756615340d6fd0b31ca07aa6b4b172a1874fb7de7179acb57d", 119 | ) 120 | 121 | self.assertEqual( 122 | sha256(torch.rand(10).numpy().tobytes()).digest().hex(), 123 | "c417b9722e14e854fbe79cc5c797cc6653360c1e6536064205ca0c073f41eaf6", 124 | ) 125 | 126 | 127 | class 
TestClusterResult(unittest.TestCase): 128 | @classmethod 129 | def setUpClass(cls): 130 | rng = np.random.RandomState(15) 131 | cls.latent = rng.random((1000, 3)).astype(np.float32) - 0.5 132 | 133 | def test_runs(self): 134 | self.assertEqual( 135 | sha256(self.latent.tobytes()).digest().hex(), 136 | "630a98a4b44c3754a3f423e915847f44767bb69fb13ea5901dc512428aee9811", 137 | ) 138 | 139 | if TEST_UNSTABLE_HASHES: 140 | 141 | def test_result(self): 142 | hash = sha256() 143 | 144 | # Use this to check that the clustering used in this test produces 145 | # a reasonable cluster size, and that it doesn't just pass because 146 | # it always clusters everything in 1-point clusters. 147 | # Uncomment when updating this test. 148 | # lens = list() 149 | for cluster in vamb.cluster.ClusterGenerator(self.latent.copy()): 150 | medoid = cluster.metadata.medoid 151 | points = set(cluster.members) 152 | # Set hashing may differ from run to run, so turn into sorted arrays 153 | arr = np.array(list(points)) 154 | arr.sort() 155 | # lens.append(arr) 156 | hash.update(medoid.to_bytes(4, "big")) 157 | hash.update(arr.data) 158 | 159 | # self.assertGreater(len(list(map(lambda x: len(lens) > 1))), 3) 160 | self.assertEqual( 161 | hash.digest().hex(), 162 | "2b3caf674ff1d1906a831219e0953b2d9f1b78ecefec709b70c672280af49aee", 163 | ) 164 | -------------------------------------------------------------------------------- /test/test_semisupervised_encode.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import tempfile 4 | import vamb 5 | 6 | 7 | class TestDataLoader(unittest.TestCase): 8 | def test_permute_indices(self): 9 | indices = vamb.semisupervised_encode.permute_indices(10, 25, seed=1) 10 | set_10 = set(range(10)) 11 | self.assertTrue(len(indices) == 25) 12 | self.assertTrue(set(indices[:10]) == set_10) 13 | self.assertTrue(set(indices[10:20]) == set_10) 14 | self.assertTrue(set(indices[20:]).issubset(set_10)) 15 | 16 | 17 | class TestVAEVAE(unittest.TestCase): 18 | N_contigs = 111 19 | tnfs = np.random.random((N_contigs, 103)).astype(np.float32) 20 | rpkms = np.random.random((N_contigs, 14)).astype(np.float32) 21 | domain = "d_Bacteria" 22 | phyla = ["f_1", "f_2", "f_3"] 23 | classes = { 24 | "f_1": ["c_11", "c_21", "c_31"], 25 | "f_2": ["c_12", "c_22", "c_32"], 26 | "f_3": ["c_13", "c_23", "c_33"], 27 | } 28 | lengths = np.random.randint(2000, 5000, size=N_contigs) 29 | 30 | def make_random_annotation(self): 31 | phylum = np.random.choice(self.phyla, 1)[0] 32 | clas = np.random.choice(self.classes[phylum], 1)[0] 33 | if np.random.random() <= 0.2: 34 | return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( 35 | ";".join([self.domain]) 36 | ) 37 | if 0.2 < np.random.random() <= 0.5: 38 | return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( 39 | ";".join([self.domain, phylum]) 40 | ) 41 | return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( 42 | ";".join([self.domain, phylum, clas]) 43 | ) 44 | 45 | def make_random_annotations(self): 46 | return [self.make_random_annotation() for _ in range(self.N_contigs)] 47 | 48 | def test_make_graph(self): 49 | annotations = self.make_random_annotations() 50 | nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(annotations) 51 | print(nodes, ind_nodes, table_parent) 52 | self.assertTrue( 53 | set(nodes).issubset( 54 | set( 55 | [ 56 | "root", 57 | "d_Bacteria", 58 | "f_1", 59 | "f_2", 60 | "f_3", 61 | "c_11", 62 | "c_21", 63 | "c_31", 64 | "c_12", 65 | "c_22", 66 | "c_32", 
67 | "c_13", 68 | "c_23", 69 | "c_33", 70 | ] 71 | ) 72 | ) 73 | ) 74 | for p, cls in self.classes.items(): 75 | for c in cls: 76 | for f in self.phyla: 77 | # Since the taxonomy is generated randomly, we can't guarantee 78 | # that each run will have all the clades. 79 | if any(i not in ind_nodes for i in (p, c, f)): 80 | continue 81 | self.assertTrue(ind_nodes.get(f, -666) < ind_nodes.get(c, 666)) 82 | self.assertTrue(table_parent[ind_nodes[f]] == 1) 83 | self.assertTrue(table_parent[ind_nodes[c]] == ind_nodes[p]) 84 | 85 | def test_encoding(self): 86 | nlatent = 10 87 | batchsize = 10 88 | nepochs = 2 89 | annotations = self.make_random_annotations() 90 | nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(annotations) 91 | 92 | classes_order = np.array([a.ranks[-1] for a in annotations]) 93 | targets = np.array([ind_nodes[i] for i in classes_order]) 94 | 95 | vae = vamb.taxvamb_encode.VAEVAEHLoss( 96 | self.rpkms.shape[1], 97 | len(nodes), 98 | nodes, 99 | table_parent, 100 | nlatent=nlatent, 101 | cuda=False, 102 | ) 103 | 104 | dataloader_vamb = vamb.encode.make_dataloader( 105 | self.rpkms, 106 | self.tnfs, 107 | self.lengths, 108 | batchsize=batchsize, 109 | cuda=False, 110 | ) 111 | dataloader_joint = vamb.taxvamb_encode.make_dataloader_concat_hloss( 112 | self.rpkms, 113 | self.tnfs, 114 | self.lengths, 115 | targets, 116 | len(nodes), 117 | table_parent, 118 | batchsize=batchsize, 119 | cuda=False, 120 | ) 121 | dataloader_labels = vamb.taxvamb_encode.make_dataloader_labels_hloss( 122 | self.rpkms, 123 | self.tnfs, 124 | self.lengths, 125 | targets, 126 | len(nodes), 127 | table_parent, 128 | batchsize=batchsize, 129 | cuda=False, 130 | ) 131 | 132 | shapes = (self.rpkms.shape[1], 103, 1, len(nodes)) 133 | dataloader = vamb.taxvamb_encode.make_dataloader_semisupervised_hloss( 134 | dataloader_joint, 135 | dataloader_vamb, 136 | dataloader_labels, 137 | len(nodes), 138 | table_parent, 139 | shapes, 140 | 666, 141 | batchsize=batchsize, 142 | cuda=False, 143 | ) 144 | with tempfile.TemporaryFile() as modelfile: 145 | vae.trainmodel( 146 | dataloader, 147 | nepochs=nepochs, 148 | modelfile=modelfile, 149 | batchsteps=[], 150 | ) 151 | 152 | latent_both = vae.VAEJoint.encode(dataloader_joint) 153 | self.assertEqual(latent_both.dtype, np.float32) 154 | self.assertEqual(latent_both.shape, (len(self.rpkms), nlatent)) 155 | -------------------------------------------------------------------------------- /test/testtools.py: -------------------------------------------------------------------------------- 1 | import string 2 | import os 3 | import pathlib 4 | 5 | import vamb 6 | 7 | PARENTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | DATADIR = os.path.join(PARENTDIR, "test", "data") 9 | BAM_FILES = sorted( 10 | [ 11 | pathlib.Path(DATADIR).joinpath("bam").joinpath(i) 12 | for i in os.listdir(os.path.join(DATADIR, "bam")) 13 | ] 14 | ) 15 | AEMB_DIR = os.path.join(DATADIR, "aemb") 16 | AEMB_FILES = sorted([pathlib.Path(AEMB_DIR).joinpath(i) for i in os.listdir(AEMB_DIR)]) 17 | 18 | BAM_NAMES = [ 19 | "S27C175628", 20 | "S27C95602", 21 | "S27C25358", 22 | "S26C115410", 23 | "S4C529736", 24 | "S27C181335", 25 | "S4C222286", 26 | "S27C38468", 27 | "S11C13125", 28 | "S4C480978", 29 | "S27C255582", 30 | "S27C170328", 31 | "S7C221395", 32 | "S26C281881", 33 | "S12C228927", 34 | "S26C86604", 35 | "S27C93037", 36 | "S9C124493", 37 | "S27C236159", 38 | "S27C214882", 39 | "S7C273086", 40 | "S8C93079", 41 | "S12C85159", 42 | "S10C72456", 43 | "S27C19079", 44 | ] 45 
| 46 | BAM_SEQ_LENS = [ 47 | 2271, 48 | 3235, 49 | 3816, 50 | 2625, 51 | 2716, 52 | 4035, 53 | 3001, 54 | 2583, 55 | 5962, 56 | 3774, 57 | 2150, 58 | 2161, 59 | 2218, 60 | 2047, 61 | 5772, 62 | 2633, 63 | 3400, 64 | 3502, 65 | 2103, 66 | 4308, 67 | 3061, 68 | 2464, 69 | 4099, 70 | 2640, 71 | 2449, 72 | ] 73 | 74 | 75 | def make_randseq(rng, frm: int, to: int) -> vamb.vambtools.FastaEntry: 76 | name = rng.choice(string.ascii_uppercase) + "".join( 77 | rng.choices(string.ascii_lowercase, k=11) 78 | ) 79 | seq = "".join( 80 | rng.choices( 81 | "acgtACGTnNywsdbK", 82 | weights=[0.12] * 8 + [0.005] * 8, 83 | k=rng.randrange(frm, to), 84 | ) 85 | ) 86 | return vamb.vambtools.FastaEntry(name.encode(), bytearray(seq.encode())) 87 | -------------------------------------------------------------------------------- /vamb/__init__.py: -------------------------------------------------------------------------------- 1 | """Vamb - Variational Autoencoders for Metagenomic Binning 2 | Documentation: https://github.com/RasmussenLab/vamb/ 3 | """ 4 | 5 | # TODO: Pyhmmer is compiled with -funsafe-math-optimizations, which toggles some 6 | # flag in the CPU controlling float subnormal behaviour. 7 | # This causes a warning in NumPy. 8 | # This is not an issue in Vamb (I think), so we silence the warning here as a 9 | # temporary fix. 10 | # See https://github.com/althonos/pyhmmer/issues/71 11 | import warnings 12 | 13 | warnings.filterwarnings("ignore", category=UserWarning, module="numpy") 14 | 15 | from . import vambtools 16 | from . import parsebam 17 | from . import parsecontigs 18 | from . import parsemarkers 19 | from . import taxonomy 20 | from . import cluster 21 | from . import encode 22 | from . import aamb_encode 23 | from . import semisupervised_encode 24 | from . import hloss_misc 25 | from . import taxvamb_encode 26 | from . import reclustering 27 | 28 | from importlib.metadata import version as get_version 29 | from loguru import logger 30 | 31 | __version_str__ = get_version("vamb") 32 | logger.remove() 33 | 34 | __all__ = [ 35 | "vambtools", 36 | "parsebam", 37 | "parsecontigs", 38 | "parsemarkers", 39 | "taxonomy", 40 | "cluster", 41 | "encode", 42 | "aamb_encode", 43 | "semisupervised_encode", 44 | "taxvamb_encode", 45 | "hloss_misc", 46 | "reclustering", 47 | ] 48 | -------------------------------------------------------------------------------- /vamb/kernel.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/vamb/kernel.npz -------------------------------------------------------------------------------- /vamb/parsebam.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Estimate depths from BAM files of reads mapped to contigs. 
2 | 3 | Usage: 4 | >>> bampaths = ['/path/to/bam1.bam', '/path/to/bam2.bam', '/path/to/bam3.bam'] 5 | >>> rpkms = Abundance.from_file(bampaths, metadata, True, 0.1, 3) 6 | """ 7 | 8 | import pycoverm 9 | import os as _os 10 | import numpy as _np 11 | from math import isfinite 12 | from vamb.parsecontigs import CompositionMetaData 13 | from vamb import vambtools 14 | from typing import Optional, TypeVar, Union, IO, Sequence, Iterable 15 | from pathlib import Path 16 | from itertools import zip_longest 17 | import shutil 18 | 19 | _ncpu = _os.cpu_count() 20 | DEFAULT_THREADS = 8 if _ncpu is None else _ncpu 21 | 22 | A = TypeVar("A", bound="Abundance") 23 | 24 | 25 | class Abundance: 26 | "Object representing contig abundance. Contains a matrix and refhash." 27 | 28 | __slots__ = ["matrix", "samplenames", "minid", "refhash"] 29 | 30 | def __init__( 31 | self, 32 | matrix: _np.ndarray, 33 | samplenames: Sequence[str], 34 | minid: float, 35 | refhash: bytes, 36 | ): 37 | assert matrix.dtype == _np.float32 38 | assert matrix.ndim == 2 39 | assert matrix.shape[1] == len(samplenames) 40 | assert isfinite(minid) and 0.0 <= minid and minid <= 1.0 41 | 42 | self.matrix = matrix 43 | self.samplenames = _np.array(samplenames, dtype=object) 44 | self.minid = minid 45 | self.refhash = refhash 46 | 47 | @property 48 | def nseqs(self) -> int: 49 | return len(self.matrix) 50 | 51 | @property 52 | def nsamples(self) -> int: 53 | return len(self.samplenames) 54 | 55 | def save(self, io: Union[Path, IO[bytes]]): 56 | _np.savez_compressed( 57 | io, 58 | matrix=self.matrix, 59 | samplenames=self.samplenames, 60 | minid=self.minid, 61 | refhash=self.refhash, 62 | ) 63 | 64 | @classmethod 65 | def load( 66 | cls: type[A], io: Union[str, Path, IO[bytes]], refhash: Optional[bytes] 67 | ) -> A: 68 | arrs = _np.load(io, allow_pickle=True) 69 | if "arr_0" in arrs.keys(): 70 | return arrs["arr_0"] # old format 71 | abundance = cls( 72 | vambtools.validate_input_array(arrs["matrix"]), 73 | arrs["samplenames"], 74 | arrs["minid"].item(), 75 | arrs["refhash"].item(), 76 | ) 77 | if refhash is not None: 78 | vambtools.RefHasher.verify_refhash( 79 | abundance.refhash, 80 | refhash, 81 | "the loaded Abundance object", 82 | "the given refhash", 83 | None, 84 | ) 85 | 86 | return abundance 87 | 88 | @classmethod 89 | def from_files( 90 | cls: type[A], 91 | paths: list[Path], 92 | cache_directory: Optional[Path], 93 | comp_metadata: CompositionMetaData, 94 | verify_refhash: bool, 95 | minid: float, 96 | nthreads: int, 97 | ) -> A: 98 | """Input: 99 | paths: List of paths to BAM files 100 | cache_directory: Where to store temp parts of the larger matrix, if reading multiple 101 | BAM files in chunks. Required if len(paths) > min(16, nthreads) 102 | comp_metadata: CompositionMetaData of sequence catalogue used to make BAM files 103 | verify_refhash: Whether to verify composition and BAM references are the same 104 | minid: Discard any reads with nucleotide identity less than this 105 | nthreads: Use this number of threads for coverage estimation 106 | """ 107 | if minid < 0 or minid > 1: 108 | raise ValueError(f"minid must be between 0 and 1, not {minid}") 109 | 110 | # Workaround: Currently pycoverm has a bug where it filters contigs when mindid == 0 111 | # (issue #7). 
Can be solved by setting it to a low value 112 | minid = minid if minid > 0.001 else 0.001 113 | 114 | if nthreads < 1: 115 | raise ValueError(f"nthreads must be > 0, not {nthreads}") 116 | 117 | chunksize = min(nthreads, len(paths)) 118 | 119 | # We cap it to 16 threads, max. This will prevent pycoverm from consuming a huge amount 120 | # of memory if given a crapload of threads, and most programs will probably be IO bound 121 | # when reading 16 files at a time. 122 | chunksize = min(chunksize, 16) 123 | 124 | # If it can be done in memory, do so 125 | if chunksize >= len(paths): 126 | (matrix, refhash) = cls.run_pycoverm( 127 | paths, 128 | minid, 129 | comp_metadata.refhash if verify_refhash else None, 130 | comp_metadata.identifiers if verify_refhash else None, 131 | comp_metadata.mask, 132 | ) 133 | vambtools.mask_lower_bits(matrix, 12) 134 | return cls(matrix, [str(p) for p in paths], minid, refhash) 135 | # Else, we load it in chunks, then assemble afterwards 136 | else: 137 | if cache_directory is None: 138 | raise ValueError( 139 | "If min(16, nthreads) < len(paths), cache_directory must not be None" 140 | ) 141 | return cls.chunkwise_loading( 142 | paths, 143 | cache_directory, 144 | chunksize, 145 | minid, 146 | comp_metadata.refhash if verify_refhash else None, 147 | comp_metadata.identifiers if verify_refhash else None, 148 | comp_metadata.mask, 149 | ) 150 | 151 | @classmethod 152 | def chunkwise_loading( 153 | cls: type[A], 154 | paths: list[Path], 155 | cache_directory: Path, 156 | nthreads: int, 157 | minid: float, 158 | target_refhash: Optional[bytes], 159 | target_identifiers: Optional[Iterable[str]], 160 | mask: _np.ndarray, 161 | ) -> A: 162 | _os.makedirs(cache_directory) 163 | 164 | chunks = [ 165 | (i, min(len(paths), i + nthreads)) for i in range(0, len(paths), nthreads) 166 | ] 167 | filenames = [ 168 | _os.path.join(cache_directory, str(i) + ".npz") for i in range(len(chunks)) 169 | ] 170 | assert len(chunks) > 1 171 | 172 | # Load from BAM and store them chunkwise 173 | refhash = None 174 | for filename, (chunkstart, chunkstop) in zip(filenames, chunks): 175 | (matrix, refhash) = cls.run_pycoverm( 176 | paths[chunkstart:chunkstop], 177 | minid, 178 | target_refhash, 179 | target_identifiers, 180 | mask, 181 | ) 182 | vambtools.write_npz(filename, matrix) 183 | 184 | # Initialize matrix, the load them chunkwise. Delete the temp files when done 185 | matrix = _np.empty((mask.sum(), len(paths)), dtype=_np.float32) 186 | for filename, (chunkstart, chunkstop) in zip(filenames, chunks): 187 | matrix[:, chunkstart:chunkstop] = vambtools.read_npz(filename) 188 | vambtools.mask_lower_bits(matrix, 12) 189 | 190 | shutil.rmtree(cache_directory) 191 | 192 | assert refhash is not None 193 | return cls(matrix, [str(p) for p in paths], minid, refhash) 194 | 195 | @staticmethod 196 | def run_pycoverm( 197 | paths: list[Path], 198 | minid: float, 199 | target_refhash: Optional[bytes], 200 | target_identifiers: Optional[Iterable[str]], 201 | mask: _np.ndarray, 202 | ) -> tuple[_np.ndarray, bytes]: 203 | (headers, coverage) = pycoverm.get_coverages_from_bam( 204 | [str(p) for p in paths], 205 | threads=len(paths), 206 | min_identity=minid, 207 | # Note: pycoverm's trim_upper=0.1 is same as CoverM trim-upper 90. 
208 | trim_upper=0.1, 209 | trim_lower=0.1, 210 | ) 211 | 212 | assert coverage.shape == (len(headers), len(paths)) 213 | 214 | # Filter length, using comp_metadata's mask, which has been set by minlength 215 | if len(mask) != len(headers): 216 | raise ValueError( 217 | f"CompositionMetaData used to create Abundance object was created with {len(mask)} sequences, " 218 | f"but number of reference sequences in BAM files are {len(headers)}. " 219 | "Make sure the BAM files were created by mapping to the same FASTA file " 220 | "which you used to create the Composition object." 221 | ) 222 | 223 | headers = [h for (h, m) in zip(headers, mask) if m] 224 | vambtools.numpy_inplace_maskarray(coverage, mask) 225 | refhash = vambtools.RefHasher.hash_refnames(headers) 226 | 227 | if target_identifiers is None: 228 | identifier_pairs = None 229 | else: 230 | identifier_pairs = (headers, target_identifiers) 231 | 232 | if target_refhash is not None: 233 | vambtools.RefHasher.verify_refhash( 234 | refhash, target_refhash, "FASTA file", "BAM", identifier_pairs 235 | ) 236 | 237 | return (coverage, refhash) 238 | 239 | @classmethod 240 | def from_tsv(cls: type[A], path: Path, comp_metadata: CompositionMetaData) -> A: 241 | seen_identifiers: list[str] = [] 242 | with open(path) as file: 243 | try: 244 | header = next(file) 245 | except StopIteration: 246 | err = ValueError(f"Found no TSV header in abundance file '{path}'") 247 | raise err from None 248 | columns = header.rstrip("\r\n").split("\t") 249 | if len(columns) < 2: 250 | raise ValueError( 251 | f'Expected at least 2 columns in abundance TSV file at "{path}"' 252 | ) 253 | if columns[0] != "contigname": 254 | raise ValueError('First column in header must be "contigname"') 255 | samples = columns[1:] 256 | n_samples = len(samples) 257 | matrix = _np.empty((comp_metadata.nseqs, n_samples), dtype=_np.float32) 258 | matrix_row = 0 259 | 260 | # Line number minus two since we already read header, and Python is zero-indexed 261 | for line_number_minus_two, (line, should_keep) in enumerate( 262 | zip_longest(file, comp_metadata.mask) 263 | ): 264 | if line is None: 265 | # If line is none, there are too few lines in file 266 | raise ValueError( 267 | f'Too few rows in abundance TSV file "{path}", expected ' 268 | f"{len(comp_metadata.mask) + 1}, got {line_number_minus_two + 1}" 269 | ) 270 | 271 | line = line.rstrip() 272 | 273 | if not line: 274 | for next_line in file: 275 | if next_line.rstrip(): 276 | raise ValueError( 277 | "Found an empty line not at end of abundance TSV file" 278 | f'"{path}"' 279 | ) 280 | break 281 | 282 | if should_keep is None: 283 | raise ValueError( 284 | f'Too many rows in abundance TSV file "{path}", expected ' 285 | f"{len(comp_metadata.mask) + 1} sequences, got at least " 286 | f"{line_number_minus_two + 2}" 287 | ) 288 | 289 | if not should_keep: 290 | continue 291 | 292 | fields = line.split("\t") 293 | if len(fields) != n_samples + 1: 294 | raise ValueError( 295 | f'In abundance TSV file "{path}", on line {line_number_minus_two + 2}' 296 | f", expected {n_samples + 1} columns, found {len(fields)}" 297 | ) 298 | for i in range(n_samples): 299 | matrix[matrix_row, i] = float(fields[i + 1]) 300 | matrix_row += 1 301 | seen_identifiers.append(fields[0]) 302 | 303 | vambtools.RefHasher.verify_refhash( 304 | vambtools.RefHasher.hash_refnames(seen_identifiers), 305 | comp_metadata.refhash, 306 | "abundance TSV", 307 | "composition", 308 | (seen_identifiers, comp_metadata.identifiers), 309 | ) 310 | 311 | return cls(matrix, 
samples, 0.0, comp_metadata.refhash) 312 | -------------------------------------------------------------------------------- /vamb/parsecontigs.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | import numpy as _np 3 | import vamb.vambtools as _vambtools 4 | from collections.abc import Iterable, Sequence 5 | from typing import IO, Union, TypeVar, Optional 6 | from pathlib import Path 7 | 8 | # This kernel is created in src/create_kernel.py. See that file for explanation 9 | _KERNEL: _np.ndarray = _vambtools.read_npz( 10 | _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "kernel.npz") 11 | ) 12 | 13 | 14 | class CompositionMetaData: 15 | """A class containing metadata of sequence composition. 16 | Current fields are: 17 | * identifiers: A Numpy array of objects, str identifiers of kept sequences 18 | * lengths: A Numpy vector of 32-bit uint lengths of kept sequences 19 | * mask: A boolean Numpy vector of which sequences were kept in original file 20 | * refhash: A bytes object representing the hash of the identifiers 21 | * minlength: The minimum contig length used for filtering 22 | """ 23 | 24 | __slots__ = ["identifiers", "lengths", "mask", "refhash", "minlength"] 25 | 26 | def __init__( 27 | self, 28 | identifiers: _np.ndarray, 29 | lengths: _np.ndarray, 30 | mask: _np.ndarray, 31 | minlength: int, 32 | ): 33 | assert len(identifiers) == len(lengths) 34 | assert identifiers.dtype == _np.dtype("O") 35 | assert _np.issubdtype(lengths.dtype, _np.integer) 36 | assert mask.dtype == bool 37 | assert mask.sum() == len(lengths) 38 | assert lengths.min(initial=minlength) >= minlength 39 | 40 | if len(set(identifiers)) < len(identifiers): 41 | raise ValueError( 42 | "Sequence names must be unique, but are not. " 43 | "Vamb only uses the identifier (e.g. header before whitespace) as " 44 | "sequence identifiers. Verify identifier uniqueness." 45 | ) 46 | 47 | self.identifiers = identifiers 48 | self.lengths = lengths 49 | self.mask = mask 50 | self.minlength = minlength 51 | self.refhash = _vambtools.RefHasher.hash_refnames(identifiers) 52 | 53 | @property 54 | def nseqs(self) -> int: 55 | "Number of sequences after filtering" 56 | return len(self.identifiers) 57 | 58 | def filter_mask(self, mask: Sequence[bool]): 59 | "Filter contigs given a mask whose length should be nseqs" 60 | assert len(mask) == self.nseqs 61 | ind = 0 62 | for i in range(len(self.mask)): 63 | if self.mask[i]: 64 | self.mask[i] &= mask[ind] 65 | ind += 1 66 | 67 | self.identifiers = self.identifiers[mask] 68 | self.lengths = self.lengths[mask] 69 | self.refhash = _vambtools.RefHasher.hash_refnames(self.identifiers) 70 | 71 | def filter_min_length(self, length: int): 72 | "Set or reset minlength of this object" 73 | if length <= self.minlength: 74 | return None 75 | 76 | self.filter_mask(self.lengths >= length) # type:ignore 77 | self.minlength = length 78 | 79 | 80 | C = TypeVar("C", bound="Composition") 81 | 82 | 83 | class Composition: 84 | """A class containing a CompositionMetaData and its TNF matrix. 
85 | Current fields are: 86 | * metadata: A CompositionMetaData object 87 | * matrix: The composition matrix itself 88 | """ 89 | 90 | __slots__ = ["metadata", "matrix"] 91 | 92 | def __init__(self, metadata: CompositionMetaData, matrix: _np.ndarray): 93 | assert matrix.dtype == _np.float32 94 | assert matrix.shape == (metadata.nseqs, 103) 95 | 96 | self.metadata = metadata 97 | self.matrix = matrix 98 | 99 | def count_bases(self) -> int: 100 | return self.metadata.lengths.sum() 101 | 102 | @property 103 | def nseqs(self) -> int: 104 | return self.metadata.nseqs 105 | 106 | def save(self, io: Union[str, Path, IO[bytes]]): 107 | _np.savez_compressed( 108 | io, 109 | matrix=self.matrix, 110 | identifiers=self.metadata.identifiers, 111 | lengths=self.metadata.lengths, 112 | mask=self.metadata.mask, 113 | minlength=self.metadata.minlength, 114 | ) 115 | 116 | @classmethod 117 | def load(cls, io: Union[str, IO[bytes], Path]): 118 | arrs = _np.load(io, allow_pickle=True) 119 | metadata = CompositionMetaData( 120 | _vambtools.validate_input_array(arrs["identifiers"]), 121 | _vambtools.validate_input_array(arrs["lengths"]), 122 | _vambtools.validate_input_array(arrs["mask"]), 123 | arrs["minlength"].item(), 124 | ) 125 | return cls(metadata, _vambtools.validate_input_array(arrs["matrix"])) 126 | 127 | def filter_min_length(self, length: int): 128 | if length <= self.metadata.minlength: 129 | return None 130 | 131 | mask = self.metadata.lengths >= length 132 | self.metadata.filter_mask(mask) 133 | self.metadata.minlength = length 134 | _vambtools.numpy_inplace_maskarray(self.matrix, mask) 135 | 136 | @staticmethod 137 | def _project(fourmers: _np.ndarray, kernel: _np.ndarray = _KERNEL) -> _np.ndarray: 138 | "Project fourmers down in dimensionality" 139 | s = fourmers.sum(axis=1).reshape(-1, 1) 140 | s[s == 0] = 1.0 141 | fourmers *= 1 / s 142 | fourmers += -(1 / 256) 143 | return _np.dot(fourmers, kernel) 144 | 145 | @staticmethod 146 | def _convert(raw: _vambtools.PushArray, projected: _vambtools.PushArray): 147 | "Move data from raw PushArray to projected PushArray, converting it." 148 | raw_mat = raw.take().reshape(-1, 256) 149 | projected_mat = Composition._project(raw_mat) 150 | projected.extend(projected_mat.ravel()) 151 | raw.clear() 152 | 153 | @classmethod 154 | def from_file( 155 | cls: type[C], 156 | filehandle: Iterable[bytes], 157 | filename: Optional[str], 158 | minlength: int = 2000, 159 | ) -> C: 160 | """Parses a FASTA file open in binary reading mode, returning Composition. 161 | 162 | Input: 163 | filehandle: Filehandle open in binary mode of a FASTA file 164 | minlength: Ignore any references shorter than N bases [2000] 165 | """ 166 | 167 | if minlength < 4: 168 | raise ValueError(f"Minlength must be at least 4, not {minlength}") 169 | 170 | raw = _vambtools.PushArray(_np.float32) 171 | projected = _vambtools.PushArray(_np.float32) 172 | lengths = _vambtools.PushArray(_np.int32) 173 | mask = bytearray() # we convert to Numpy at end 174 | contignames: list[str] = list() 175 | entries = _vambtools.byte_iterfasta(filehandle, filename) 176 | 177 | for entry in entries: 178 | length = len(entry) 179 | skip = length < minlength 180 | mask.append(not skip) 181 | 182 | if skip: 183 | continue 184 | 185 | counts = entry.kmercounts() 186 | if counts.sum() == 0: 187 | raise ValueError( 188 | f'TNF value of contig "{entry.header}" is all zeros. ' 189 | + "This implies that the sequence contained no 4-mers of A, C, G, T or U, " 190 | + "making this sequence uninformative. 
This is probably a mistake. " 191 | + "Verify that the sequence contains usable information (e.g. is not all N's)" 192 | ) 193 | raw.extend(counts) 194 | 195 | if len(raw) > 256000: 196 | Composition._convert(raw, projected) 197 | 198 | lengths.append(len(entry)) 199 | contignames.append(entry.identifier) 200 | 201 | # Convert rest of contigs 202 | Composition._convert(raw, projected) 203 | tnfs_arr = projected.take() 204 | _vambtools.mask_lower_bits(tnfs_arr, 12) 205 | 206 | # Don't use reshape since it creates a new array object with shared memory 207 | tnfs_arr.shape = (len(tnfs_arr) // 103, 103) 208 | lengths_arr = lengths.take() 209 | 210 | metadata = CompositionMetaData( 211 | _np.array(contignames, dtype=object), 212 | lengths_arr, 213 | _np.array(mask, dtype=bool), 214 | minlength, 215 | ) 216 | return cls(metadata, tnfs_arr) 217 | -------------------------------------------------------------------------------- /vamb/parsemarkers.py: -------------------------------------------------------------------------------- 1 | # Overview 2 | # We use pyrodigal to predict genes in every contigs not filtered away by 3 | # the given mask, then use pyhmmer to predict single copy marker genes (SCGs) 4 | # on the genes, hence getting a contig => list[SCG] mapping. 5 | # Pyrodigal/pyhmmer is a bottleneck, so we run in parallel processes. 6 | # To avoid inter-process communication overhead, we first split the input 7 | # FASTA files to N files, then we have each process work on the files independently. 8 | 9 | from vamb.vambtools import FastaEntry, Reader, RefHasher, byte_iterfasta 10 | import pyrodigal 11 | import pyhmmer 12 | from multiprocessing.pool import Pool 13 | import os 14 | import itertools 15 | from pathlib import Path 16 | from typing import NewType, Sequence, Union, IO, Optional, Iterable 17 | import shutil 18 | from collections import defaultdict 19 | import json 20 | import numpy as np 21 | from loguru import logger 22 | 23 | MarkerID = NewType("MarkerID", int) 24 | MarkerName = NewType("MarkerName", str) 25 | ContigID = NewType("ContigID", int) 26 | ContigName = NewType("ContigName", str) 27 | 28 | 29 | class Markers: 30 | """ 31 | The set of marker genes predicted for a collection of contigs. 32 | Instantiate using `Markers.from_files`, or load using `Markers.load`. 33 | Like Abundance objects, Markers carry a refhash to check that the markers correspond 34 | to the same sequences used to create the markers. 35 | Access the markers with `markers.markers`, a `list[Optional[np.array]]`, with one 36 | element for each contig. The element is `None` if there are no markers, else a list 37 | of marker genes present in the contig. 38 | The marker genes are stored as integers - the name of a marker `i` can be gotten using 39 | `markers.marker_names[i]`. 40 | In each contig, markers are deduplicated, so at most 1 of each marker is found 41 | in each contig. 
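    A usage sketch (the paths and contig names below are placeholders, not files
    shipped with Vamb):
    >>> markers = Markers.from_files(
    ...     Path("contigs.fna.gz"),    # FASTA file with all contigs
    ...     Path("markers.hmm"),       # HMMER profiles of the marker genes
    ...     ["contig_1", "contig_2"],  # identifiers of contigs to consider
    ...     Path("tmp_markers"),       # scratch directory, created and removed
    ...     4,                         # number of worker processes
    ...     None,                      # optional refhash to verify against
    ... )
    >>> markers.markers[0]       # None, or an array of MarkerIDs found on contig_1
    >>> markers.marker_names[0]  # list of marker names sharing MarkerID 0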
42 | """ 43 | 44 | __slots__ = ["markers", "marker_names", "refhash"] 45 | 46 | def __init__( 47 | self, 48 | markers: list[Optional[np.ndarray]], 49 | # Some IDs map to multiple names, if they act as the same SCG in the cell 50 | marker_names: list[list[MarkerName]], 51 | refhash: bytes, 52 | ): 53 | if len(set(itertools.chain.from_iterable(marker_names))) != sum( 54 | len(i) for i in marker_names 55 | ): 56 | raise ValueError("Marker names are not unique, but must be") 57 | 58 | self.markers = markers 59 | self.marker_names = marker_names 60 | self.refhash = refhash 61 | 62 | @property 63 | def n_markers(self): 64 | return len(self.marker_names) 65 | 66 | @property 67 | def n_seqs(self): 68 | return len(self.markers) 69 | 70 | def score_bin(self, indices: Iterable[int]) -> tuple[float, float]: 71 | counts = np.zeros(self.n_markers, dtype=np.uint8) 72 | for i in indices: 73 | mkrs = self.markers[i] 74 | if mkrs is None: 75 | continue 76 | for m in mkrs: 77 | counts[m] += 1 78 | 79 | n_unique = (counts > 0).sum() 80 | completeness = n_unique / self.n_markers 81 | contamination = (counts.sum() - n_unique) / self.n_markers 82 | return (completeness, contamination) 83 | 84 | def save(self, io: Union[Path, str, IO[str]]): 85 | representation = { 86 | "markers": [i if i is None else i.tolist() for i in self.markers], 87 | "marker_names": self.marker_names, 88 | "refhash": self.refhash.hex(), 89 | } 90 | # Check we didn't forget any fields 91 | assert len(representation) == len(self.__slots__) 92 | if isinstance(io, Path) or isinstance(io, str): 93 | with open(io, "w") as file: 94 | json.dump(representation, file) 95 | 96 | else: 97 | json.dump(representation, io) 98 | 99 | @classmethod 100 | def load(cls, io: Union[Path, str, IO[str]], refhash: Optional[bytes]): 101 | if isinstance(io, Path) or isinstance(io, str): 102 | with open(io, "rb") as file: 103 | representation = json.load(file) 104 | else: 105 | representation = json.load(io) 106 | observed_refhash = bytes.fromhex(representation["refhash"]) 107 | if refhash is not None: 108 | RefHasher.verify_refhash( 109 | refhash=observed_refhash, 110 | target_refhash=refhash, 111 | observed_name="Loaded markers", 112 | target_name=None, 113 | identifiers=None, 114 | ) 115 | markers_as_arrays = [ 116 | i if i is None else np.array(i, dtype=np.uint8) 117 | for i in representation["markers"] 118 | ] 119 | 120 | return cls(markers_as_arrays, representation["marker_names"], observed_refhash) 121 | 122 | @classmethod 123 | def from_files( 124 | cls, 125 | contigs: Path, 126 | hmm_path: Path, 127 | contignames: Sequence[str], 128 | tmpdir_to_create: Path, 129 | n_processes: int, 130 | target_refhash: Optional[bytes], 131 | ): 132 | """ 133 | Create the Markers from input files: 134 | `contigs`: Path to a FASTA file with all contigs, gzipped or not. 135 | `hmm_path`: Path to a HMMER .hmm file with the markers. Note: Currently, 136 | this file can contain at most 256 markers, though this restriction can 137 | be lifted if necessary 138 | 139 | The `fasta_entry_mask` is a boolean mask of which contigs in the FASTA 140 | file to include. This affects the refhash which is only computed for 141 | the contigs not filtered away. 142 | If the target refhash is not None, and the computed reference hash does not 143 | match, an exception is thrown. See vamb.vambtools.RefHasher. 
144 | """ 145 | n_processes = cap_processes(n_processes) 146 | with open(hmm_path, "rb") as file: 147 | hmms = list(pyhmmer.plan7.HMMFile(file)) 148 | (_, marker_names) = get_name_to_id(hmms) 149 | 150 | (refhash, paths) = split_file( 151 | contigs, 152 | contignames, 153 | tmpdir_to_create, 154 | n_processes, 155 | ) 156 | 157 | if target_refhash is not None: 158 | RefHasher.verify_refhash( 159 | refhash, target_refhash, "Markers FASTA file", None, None 160 | ) 161 | 162 | index_of_name = { 163 | ContigName(n): ContigID(i) for (i, n) in enumerate(contignames) 164 | } 165 | marker_list: list[Optional[np.ndarray]] = [None] * len(contignames) 166 | with Pool(n_processes) as pool: 167 | for sub_result in pool.imap_unordered( 168 | work_per_process, 169 | list(zip(paths, itertools.repeat(hmms))), 170 | ): 171 | for contig_name, markers in sub_result: 172 | marker_list[index_of_name[contig_name]] = markers 173 | 174 | shutil.rmtree(tmpdir_to_create) 175 | markers = cls(marker_list, marker_names, refhash) 176 | 177 | return markers 178 | 179 | 180 | def cap_processes(processes: int) -> int: 181 | if processes < 1: 182 | raise ValueError(f"Must use at least 1 process, not {processes}") 183 | # Cap processes, because most OSs cap the number of open file handles, 184 | # and we need one file per process when splitting FASTA file 185 | elif processes > 64: 186 | logger.warning(f"Processes set to {processes}, capping to 64") 187 | return 64 188 | return processes 189 | 190 | 191 | # Some markers have different names, but should be treated as the same SCG. 192 | NORMALIZE_MARKER_TRANS_DICT = { 193 | "TIGR00388": "TIGR00389", 194 | "TIGR00471": "TIGR00472", 195 | "TIGR00408": "TIGR00409", 196 | "TIGR02386": "TIGR02387", 197 | } 198 | 199 | 200 | def split_file( 201 | input: Path, 202 | contignames: Sequence[str], 203 | tmpdir_to_create: Path, 204 | n_splits: int, 205 | ) -> tuple[bytes, list[Path]]: 206 | names = set(contignames) 207 | os.mkdir(tmpdir_to_create) 208 | paths = [tmpdir_to_create.joinpath(str(i)) for i in range(n_splits)] 209 | filehandles = [open(path, "w") for path in paths] 210 | refhasher = RefHasher() 211 | with Reader(input) as infile: 212 | for i, (outfile, record) in enumerate( 213 | zip( 214 | itertools.cycle(filehandles), 215 | filter(lambda x: x.identifier in names, byte_iterfasta(infile, None)), 216 | ) 217 | ): 218 | refhasher.add_refname(record.identifier) 219 | print(record.format(), file=outfile) 220 | 221 | for filehandle in filehandles: 222 | filehandle.close() 223 | refhash = refhasher.digest() 224 | return (refhash, paths) 225 | 226 | 227 | def process_chunk( 228 | chunk: list[FastaEntry], 229 | hmms: list[pyhmmer.plan7.HMM], 230 | name_to_id: dict[MarkerName, MarkerID], 231 | finder: pyrodigal.GeneFinder, 232 | ) -> list[tuple[ContigName, np.ndarray]]: 233 | # We temporarily store them as sets in order to deduplicate. While single contigs 234 | # may have duplicate markers, it makes no sense to count this as contamination, 235 | # because we are not about to second-guess the assembler's job of avoiding 236 | # chimeric sequences. 
237 | markers: defaultdict[ContigName, set[MarkerID]] = defaultdict(set) 238 | alphabet = pyhmmer.easel.Alphabet.amino() 239 | digitized: list[pyhmmer.easel.DigitalSequence] = [] 240 | for record in chunk: 241 | for gene in finder.find_genes(record.sequence): 242 | seq = pyhmmer.easel.TextSequence( 243 | name=record.identifier.encode(), sequence=gene.translate() 244 | ).digitize(alphabet) 245 | digitized.append(seq) 246 | 247 | for hmm, top_hits in zip(hmms, pyhmmer.hmmsearch(hmms, digitized)): 248 | marker_name = MarkerName(hmm.name.decode()) 249 | marker_id = name_to_id[marker_name] 250 | # We need this score cutoff, which is stored in the HMM file to remove the large 251 | # number of false positives from HMMER 252 | score_cutoff = hmm.cutoffs.trusted1 253 | assert score_cutoff is not None 254 | for hit in top_hits: 255 | if hit.score >= score_cutoff: 256 | markers[ContigName(hit.name.decode())].add(marker_id) 257 | 258 | return [ 259 | (name, np.array(list(ids), dtype=np.uint8)) for (name, ids) in markers.items() 260 | ] 261 | 262 | 263 | def work_per_process( 264 | args: tuple[Path, list[pyhmmer.plan7.HMM]], 265 | ) -> list[tuple[ContigName, np.ndarray]]: 266 | (contig_path, hmms) = args 267 | 268 | (name_to_id, _) = get_name_to_id(hmms) 269 | 270 | # Chunk up the FASTA file for memory efficiency reasons, while still 271 | # allowing pyhmmer to scan multiple sequences at once for speed 272 | chunk: list[FastaEntry] = [] 273 | result: list[tuple[ContigName, np.ndarray]] = [] 274 | finder = pyrodigal.GeneFinder(meta=True) 275 | with open(contig_path, "rb") as file: 276 | for record in byte_iterfasta(file, None): 277 | chunk.append(record) 278 | if len(chunk) == 2048: 279 | result.extend(process_chunk(chunk, hmms, name_to_id, finder)) 280 | chunk.clear() 281 | result.extend(process_chunk(chunk, hmms, name_to_id, finder)) 282 | 283 | return result 284 | 285 | 286 | def get_name_to_id( 287 | hmms: list[pyhmmer.plan7.HMM], 288 | ) -> tuple[dict[MarkerName, MarkerID], list[list[MarkerName]]]: 289 | name_to_id: dict[MarkerName, MarkerID] = dict() 290 | for hmm in hmms: 291 | name = hmm.name.decode() 292 | if name in NORMALIZE_MARKER_TRANS_DICT: 293 | continue 294 | name_to_id[MarkerName(name)] = MarkerID(len(name_to_id)) 295 | for old_name, new_name in NORMALIZE_MARKER_TRANS_DICT.items(): 296 | name_to_id[MarkerName(old_name)] = name_to_id[MarkerName(new_name)] 297 | 298 | if len(set(name_to_id.values())) > 256: 299 | raise ValueError("Maximum 256 marker IDs") 300 | 301 | id_to_names: defaultdict[MarkerID, list[MarkerName]] = defaultdict(list) 302 | for n, i in name_to_id.items(): 303 | id_to_names[i].append(n) 304 | marker_names = [id_to_names[MarkerID(i)] for i in range(len(id_to_names))] 305 | 306 | return name_to_id, marker_names 307 | -------------------------------------------------------------------------------- /vamb/reclustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following code is based on the k-means based reclustering algorithm first published at https://github.com/BigDataBiology/SemiBin 3 | The original code is distributed under MIT License. 
4 | """ 5 | 6 | from sklearn.cluster import KMeans 7 | import numpy as np 8 | from collections import defaultdict 9 | from sklearn.cluster import DBSCAN 10 | from sklearn.metrics import pairwise_distances 11 | from vamb.taxonomy import Taxonomy 12 | from vamb.parsemarkers import Markers, MarkerID 13 | from vamb.parsecontigs import CompositionMetaData 14 | from vamb.vambtools import RefHasher 15 | from collections.abc import Sequence, Iterable 16 | from typing import NewType, Optional, Union 17 | 18 | # We use these aliases to be able to work with integers, which is faster. 19 | ContigId = NewType("ContigId", int) 20 | BinId = NewType("BinId", int) 21 | 22 | # TODO: We might want to benchmark the best value for this constant. 23 | # Right now, we do too much duplicated work by clustering 18 times. 24 | EPS_VALUES = np.arange(0.01, 0.35, 0.02) 25 | 26 | 27 | class KmeansAlgorithm: 28 | "Arguments needed specifically when using the KMeans algorithm" 29 | 30 | def __init__( 31 | self, clusters: list[set[ContigId]], random_seed: int, contiglengths: np.ndarray 32 | ): 33 | assert np.issubdtype(contiglengths.dtype, np.integer) 34 | self.contiglengths = contiglengths 35 | self.clusters = clusters 36 | self.random_seed = random_seed 37 | 38 | 39 | class DBScanAlgorithm: 40 | "Arguments needed specifically when using the DBScan algorithm" 41 | 42 | def __init__( 43 | self, comp_metadata: CompositionMetaData, taxonomy: Taxonomy, n_processes: int 44 | ): 45 | if not taxonomy.is_canonical: 46 | raise ValueError( 47 | "Can only run DBScan on a Taxonomy object with is_canonical set" 48 | ) 49 | RefHasher.verify_refhash( 50 | taxonomy.refhash, 51 | comp_metadata.refhash, 52 | "taxonomy", 53 | "composition", 54 | None, 55 | ) 56 | self.contiglengths = comp_metadata.lengths 57 | self.taxonomy = taxonomy 58 | self.n_processes = n_processes 59 | 60 | 61 | def recluster_bins( 62 | markers: Markers, 63 | latent: np.ndarray, 64 | algorithm: Union[KmeansAlgorithm, DBScanAlgorithm], 65 | ) -> list[set[ContigId]]: 66 | assert np.issubdtype(algorithm.contiglengths.dtype, np.integer) 67 | assert np.issubdtype(latent.dtype, np.floating) 68 | 69 | if not (len(algorithm.contiglengths) == markers.n_seqs == len(latent)): 70 | raise ValueError( 71 | "Number of elements in contiglengths, markers and latent must match" 72 | ) 73 | 74 | # Simply dispatch to the right implementation based on the algorithm used 75 | if isinstance(algorithm, KmeansAlgorithm): 76 | return recluster_kmeans( 77 | algorithm.clusters, 78 | latent, 79 | algorithm.contiglengths, 80 | markers, 81 | algorithm.random_seed, 82 | ) 83 | elif isinstance(algorithm, DBScanAlgorithm): 84 | assert len(algorithm.taxonomy.contig_taxonomies) == markers.n_seqs 85 | return recluster_dbscan( 86 | algorithm.taxonomy, 87 | latent, 88 | algorithm.contiglengths, 89 | markers, 90 | algorithm.n_processes, 91 | ) 92 | 93 | 94 | def recluster_kmeans( 95 | clusters: list[set[ContigId]], 96 | latent: np.ndarray, 97 | contiglengths: np.ndarray, 98 | markers: Markers, 99 | random_seed: int, 100 | ) -> list[set[ContigId]]: 101 | assert len(latent) == len(contiglengths) == markers.n_seqs 102 | assert np.issubdtype(contiglengths.dtype, np.integer) 103 | assert np.issubdtype(latent.dtype, np.floating) 104 | assert latent.ndim == 2 105 | 106 | result: list[set[ContigId]] = [] 107 | indices_by_medoid: dict[int, set[ContigId]] = defaultdict(set) 108 | # We loop over all existing clusters, and determine if they should be split, 109 | # by looking at the median number of single-copy genes 
in the cluster 110 | for cluster in clusters: 111 | # All clusters with 1 contig by definition cannot have multiple single-copy 112 | # genes (because SCGs are deduplicated within a single contig) 113 | if len(cluster) == 1: 114 | result.append(cluster) 115 | continue 116 | # Get a count of each marker and compute the median count of SCGs 117 | counts = count_markers(cluster, markers) 118 | cp = counts.copy() 119 | cp.sort() 120 | median_counts: int = cp[len(cp) // 2] 121 | # If we have less than 2 SCGs on average, the cluster should not be split, 122 | # and we emit it unchanged 123 | if median_counts < 2: 124 | result.append(cluster) 125 | continue 126 | 127 | # Run K-means with the median number of SCGs to split the contig. 128 | # We weigh the contigs by length. 129 | seeds = get_kmeans_seeds( 130 | cluster, 131 | markers, 132 | contiglengths, # type: ignore 133 | counts, 134 | median_counts, 135 | ) 136 | 137 | cluster_indices = np.array(list(cluster)) 138 | cluter_latent = latent[cluster_indices] 139 | cluster_lengths = contiglengths[cluster_indices] 140 | seed_latent = latent[seeds] 141 | kmeans = KMeans( 142 | n_clusters=median_counts, 143 | init=seed_latent, 144 | n_init=1, 145 | random_state=random_seed, 146 | ) 147 | kmeans.fit(cluter_latent, sample_weight=cluster_lengths) 148 | indices_by_medoid.clear() 149 | for cluster_label, index in zip(kmeans.labels_, cluster_indices): 150 | indices_by_medoid[cluster_label].add(ContigId(index)) 151 | result.extend(indices_by_medoid.values()) 152 | 153 | return result 154 | 155 | 156 | # Get a vector of counts, of each SCG, 157 | # where if MarkerID(5) is seen 9 times, then counts[5] == 9. 158 | def count_markers( 159 | contigs: Iterable[ContigId], 160 | markers: Markers, 161 | ) -> np.ndarray: 162 | counts = np.zeros(markers.n_markers, dtype=np.int32) 163 | for contig in contigs: 164 | m = markers.markers[contig] 165 | if m is not None: 166 | counts[m] += 1 167 | return counts 168 | 169 | 170 | # Same as above, but once we see a very high number of marker genes, 171 | # we bail. This is because a large fraction of time spent in this module 172 | # would otherwise be counting markers of huge clusters, long after we already 173 | # know it's hopelessly contaminated 174 | def count_markers_saturated( 175 | contigs: Iterable[ContigId], 176 | markers: Markers, 177 | ) -> Optional[np.ndarray]: 178 | counts = np.zeros(markers.n_markers, dtype=np.int32) 179 | n_markers = 0 180 | n_unique = 0 181 | # This implies contamination == 1.0 182 | max_duplicates = 1 * markers.n_markers 183 | for contig in contigs: 184 | m = markers.markers[contig] 185 | if m is not None: 186 | n_markers += len(m) 187 | for i in m: 188 | existing = counts[i] 189 | n_unique += existing == 0 190 | counts[i] = existing + 1 191 | 192 | if (n_markers - n_unique) > max_duplicates: 193 | return None 194 | return counts 195 | 196 | 197 | # This is not very effectively implemented, but I assume it does not matter. 198 | # This function looks at all markers that occur exactly `median` times, each of these 199 | # markers corresponding to a list of `median` number of contigs. 200 | # It picks the marker for which the smallest contig that contains it is largest. 201 | # The idea here is that long contigs, which contain one of the SCGs that exist exactly 202 | # `median` times are most likely to be close to the actual medoid 203 | # that Kmeans needs to find. 204 | # This is just one possible seeding strategy. We could also plausibly choose e.g. 
205 | # the trio of contigs that have the most SCGs. 206 | def get_kmeans_seeds( 207 | contigs: Iterable[ContigId], 208 | markers: Markers, 209 | contiglengths: Sequence[int], 210 | counts: np.ndarray, 211 | median: int, 212 | ) -> list[ContigId]: 213 | considered_markers = {MarkerID(i) for (i, c) in enumerate(counts) if c == median} 214 | contigs_of_markers: dict[MarkerID, list[ContigId]] = defaultdict(list) 215 | for contig in contigs: 216 | m = markers.markers[contig] 217 | if m is None: 218 | continue 219 | for mid in m: 220 | if mid not in considered_markers: 221 | continue 222 | contigs_of_markers[MarkerID(mid)].append(contig) 223 | 224 | candidate_list = list(contigs_of_markers.items()) 225 | pair = max(candidate_list, key=lambda x: min(contiglengths[i] for i in x[1])) 226 | result = pair[1] 227 | assert len(result) == median 228 | return result 229 | 230 | 231 | def get_completeness_contamination(counts: np.ndarray) -> tuple[float, float]: 232 | n_total = counts.sum() 233 | n_unique = (counts > 0).sum() 234 | completeness = n_unique / len(counts) 235 | contamination = (n_total - n_unique) / len(counts) 236 | return (completeness, contamination) 237 | 238 | 239 | def recluster_dbscan( 240 | taxonomy: Taxonomy, 241 | latent: np.ndarray, 242 | contiglengths: np.ndarray, 243 | markers: Markers, 244 | num_processes: int, 245 | ) -> list[set[ContigId]]: 246 | # Since DBScan is computationally expensive, and scales poorly with the number 247 | # of contigs, we use taxonomy to only cluster within each genus 248 | n_worse_in_row = 0 249 | genera_indices = group_indices_by_genus(taxonomy) 250 | best_score = 0 251 | best_bins: list[set[ContigId]] = [] 252 | for eps in EPS_VALUES: 253 | bins: list[set[ContigId]] = [] 254 | for indices in genera_indices: 255 | genus_clusters = dbscan_genus( 256 | latent[indices], indices, contiglengths[indices], num_processes, eps 257 | ) 258 | bins.extend(genus_clusters) 259 | 260 | score = count_good_genomes(bins, markers) 261 | if best_score == 0 or score > best_score: 262 | best_bins = bins 263 | best_score = score 264 | 265 | if score >= best_score: 266 | n_worse_in_row = 0 267 | else: 268 | n_worse_in_row += 1 269 | if n_worse_in_row > 2: 270 | break 271 | 272 | return best_bins 273 | 274 | 275 | # DBScan within the subset of contigs that are annotated with a single genus 276 | def dbscan_genus( 277 | latent_of_genus: np.ndarray, 278 | original_indices: np.ndarray, 279 | contiglengths_of_genus: np.ndarray, 280 | num_processes: int, 281 | eps: float, 282 | ) -> list[set[ContigId]]: 283 | assert len(latent_of_genus) == len(original_indices) == len(contiglengths_of_genus) 284 | # Precompute distance matrix. This is O(N^2), but DBScan is even worse, 285 | # so this pays off. 286 | # TODO: Maybe we should emit a warning if this function is called with too 287 | # many points such that this matrix becomes huge? 288 | distance_matrix = pairwise_distances( 289 | latent_of_genus, latent_of_genus, metric="cosine" 290 | ) 291 | # The DBScan approach works by blindly clustering with different eps values 292 | # (a critical parameter for DBscan), and then using SCGs to select the best 293 | # subset of clusters. 294 | # It's ugly and wasteful, but it does work. 
295 | dbscan = DBSCAN( 296 | eps=eps, 297 | min_samples=5, 298 | n_jobs=num_processes, 299 | metric="precomputed", 300 | ) 301 | dbscan.fit(distance_matrix, sample_weight=contiglengths_of_genus) 302 | bins: dict[int, set[ContigId]] = defaultdict(set) 303 | for original_index, bin_index in zip(original_indices, dbscan.labels_): 304 | bins[bin_index].add(ContigId(original_index)) 305 | return list(bins.values()) 306 | 307 | 308 | def count_good_genomes(binning: Iterable[Iterable[ContigId]], markers: Markers) -> int: 309 | max_contamination = 0.3 310 | min_completeness = 0.75 311 | result = 0 312 | for contigs in binning: 313 | count = count_markers_saturated(contigs, markers) 314 | if count is None: 315 | continue 316 | (comp, cont) = get_completeness_contamination(count) 317 | if comp >= min_completeness and cont <= max_contamination: 318 | result += 1 319 | 320 | return result 321 | 322 | 323 | def group_indices_by_genus( 324 | taxonomy: Taxonomy, 325 | ) -> list[np.ndarray]: 326 | if not taxonomy.is_canonical: 327 | raise ValueError("Can only group by genus for a canonical taxonomy") 328 | by_genus: dict[Optional[str], list[ContigId]] = defaultdict(list) 329 | for i, tax in enumerate(taxonomy.contig_taxonomies): 330 | genus = None if tax is None else tax.genus 331 | by_genus[genus].append(ContigId(i)) 332 | return [np.array(i, dtype=np.int32) for i in by_genus.values()] 333 | -------------------------------------------------------------------------------- /vamb/taxonomy.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, IO 2 | from pathlib import Path 3 | from vamb.parsecontigs import CompositionMetaData 4 | from vamb.vambtools import strip_string_newline 5 | import numpy as np 6 | from typing import Union 7 | 8 | TAXONOMY_HEADER = "contigs\tpredictions" 9 | PREDICTED_TAXONOMY_HEADER = "contigs\tpredictions\tscores" 10 | 11 | 12 | class ContigTaxonomy: 13 | """ 14 | Hierarchical taxonomy of some contig. 15 | If `is_canonical`, the ranks are assumed to be domain, phylum, class, 16 | order, family, genus, species, in that order. 17 | The taxonomy may be arbitrarily truncated, e.g. ["Eukaryota", "Chordata"] 18 | is a valid (canonical) taxonomy for a human. 19 | """ 20 | 21 | __slots__ = ["ranks"] 22 | 23 | def __init__(self, ranks: list[str], is_canonical: bool = False): 24 | if is_canonical and len(ranks) > 7: 25 | raise ValueError( 26 | "For a canonical ContigTaxonomy, there must be at most 7 ranks" 27 | ) 28 | 29 | self.ranks = ranks 30 | 31 | @classmethod 32 | def from_semicolon_sep(cls, s: str, is_canonical: bool = False): 33 | if len(s) == 0: 34 | return cls([], is_canonical) 35 | else: 36 | return cls(s.split(";"), is_canonical) 37 | 38 | @property 39 | def genus(self) -> Optional[str]: 40 | if len(self.ranks) < 6: 41 | return None 42 | return self.ranks[5] 43 | 44 | 45 | class Taxonomy: 46 | """ 47 | * contig_taxonomies: An Optional[ContigTaxonomy] for every contig given by the 48 | CompositionMetaData used to instantiate 49 | * refhash: Refhash of CompositionMetaData used to instantiate 50 | * is_canonical: If the taxonomy uses the canonical seven ranks 51 | (domain, phylum, class, order, family, genus, species). 
52 | """ 53 | 54 | __slots__ = ["contig_taxonomies", "refhash", "is_canonical"] 55 | 56 | @property 57 | def nseqs(self) -> int: 58 | return len(self.contig_taxonomies) 59 | 60 | @classmethod 61 | def from_file( 62 | cls, tax_file: Path, metadata: CompositionMetaData, is_canonical: bool 63 | ): 64 | observed = cls.parse_tax_file(tax_file, is_canonical) 65 | return cls.from_observed(observed, metadata, is_canonical) 66 | 67 | @classmethod 68 | def from_refined_file( 69 | cls, tax_file: Path, metadata: CompositionMetaData, is_canonical: bool 70 | ): 71 | observed = PredictedTaxonomy.parse_tax_file(tax_file, is_canonical) 72 | observed = [(name, tax.contig_taxonomy) for (name, tax) in observed] 73 | return cls.from_observed(observed, metadata, is_canonical) 74 | 75 | @classmethod 76 | def from_observed( 77 | cls, 78 | observed_taxonomies: list[tuple[str, ContigTaxonomy]], 79 | metadata: CompositionMetaData, 80 | is_canonical: bool, 81 | ): 82 | index_of_contigname: dict[str, int] = { 83 | c: i for (i, c) in enumerate(metadata.identifiers) 84 | } 85 | contig_taxonomies: list[Optional[ContigTaxonomy]] = [None] * len( 86 | metadata.identifiers 87 | ) 88 | n_found = 0 89 | for contigname, taxonomy in observed_taxonomies: 90 | index = index_of_contigname.get(contigname) 91 | if index is None: 92 | continue 93 | n_found += 1 94 | existing = contig_taxonomies[index] 95 | if existing is not None: 96 | raise ValueError( 97 | f'Duplicate contigname when parsing taxonomy: "{contigname}"' 98 | ) 99 | contig_taxonomies[index] = taxonomy 100 | 101 | if n_found != metadata.nseqs: 102 | raise ValueError( 103 | f"In taxonomy file, expected {metadata.nseqs} contigs that are " 104 | f"also present in the filtered FASTA file, but found {n_found}. " 105 | "Note that this might occur because some contigs in the taxonomy " 106 | "file falls under the minimum length threshold." 107 | ) 108 | return cls(contig_taxonomies, metadata.refhash, is_canonical) 109 | 110 | def __init__( 111 | self, 112 | contig_taxonomies: list[Optional[ContigTaxonomy]], 113 | refhash: bytes, 114 | is_canonical: bool, 115 | ): 116 | self.contig_taxonomies = contig_taxonomies 117 | self.refhash = refhash 118 | self.is_canonical = is_canonical 119 | assert_unambiguous_ranks(self) 120 | 121 | @staticmethod 122 | def parse_tax_file( 123 | path: Path, force_canonical: bool 124 | ) -> list[tuple[str, ContigTaxonomy]]: 125 | with open(path) as file: 126 | result: list[tuple[str, ContigTaxonomy]] = [] 127 | header = next(file, None) 128 | header = None if header is None else header.rstrip() 129 | if header is None or header != TAXONOMY_HEADER: 130 | raise ValueError( 131 | f"In taxonomy file '{path}', expected header to be {repr(TAXONOMY_HEADER)}, " 132 | f"but found {'no header' if header is None else repr(header)}" 133 | ) 134 | # Minus two because we already read header, and because Python is zero-indexed 135 | for lineno_minus_two, line in enumerate(file): 136 | line = strip_string_newline(line) 137 | fields = line.split("\t") 138 | if len(fields) != 2: 139 | raise ValueError( 140 | f"In taxonomy file '{path}', on line {lineno_minus_two + 2}, " 141 | f"expected 2 tab-separated columns, but found {len(fields)}." 
142 | ) 143 | (contigname, taxonomy) = fields 144 | result.append( 145 | ( 146 | contigname, 147 | ContigTaxonomy.from_semicolon_sep(taxonomy, force_canonical), 148 | ) 149 | ) 150 | 151 | return result 152 | 153 | 154 | class PredictedContigTaxonomy: 155 | __slots__ = ["contig_taxonomy", "probs"] 156 | 157 | def __init__(self, tax: ContigTaxonomy, probs: np.ndarray): 158 | if len(probs) != len(tax.ranks): 159 | raise ValueError("The length of probs must equal that of ranks") 160 | # Due to floating point errors, the probabilities may be slightly outside of 0 or 1. 161 | # We could perhaps validate the values, but that's not likely to be necessary. 162 | np.clip(probs, a_min=0.0, a_max=1.0, out=probs) 163 | self.contig_taxonomy = tax 164 | self.probs = probs 165 | 166 | 167 | class PredictedTaxonomy: 168 | "Output of Taxometer" 169 | 170 | __slots__ = ["contig_taxonomies", "refhash", "is_canonical"] 171 | 172 | def __init__( 173 | self, 174 | taxonomies: list[PredictedContigTaxonomy], 175 | metadata: CompositionMetaData, 176 | is_canonical: bool, 177 | ): 178 | if len(taxonomies) != len(metadata.identifiers): 179 | raise ValueError("Length of taxonomies must match that of identifiers") 180 | 181 | self.contig_taxonomies = taxonomies 182 | self.refhash = metadata.refhash 183 | self.is_canonical = is_canonical 184 | assert_unambiguous_ranks(self) 185 | 186 | def to_taxonomy(self) -> Taxonomy: 187 | lst: list[Optional[ContigTaxonomy]] = [ 188 | p.contig_taxonomy for p in self.contig_taxonomies 189 | ] 190 | return Taxonomy(lst, self.refhash, self.is_canonical) 191 | 192 | @property 193 | def nseqs(self) -> int: 194 | return len(self.contig_taxonomies) 195 | 196 | @staticmethod 197 | def parse_tax_file( 198 | path: Path, force_canonical: bool 199 | ) -> list[tuple[str, PredictedContigTaxonomy]]: 200 | with open(path) as file: 201 | result: list[tuple[str, PredictedContigTaxonomy]] = [] 202 | lines = filter(None, map(str.rstrip, file)) 203 | header = next(lines, None) 204 | if header is None or header != PREDICTED_TAXONOMY_HEADER: 205 | raise ValueError( 206 | f"In predicted taxonomy file '{path}', " 207 | f"expected header to be {repr(PREDICTED_TAXONOMY_HEADER)}, " 208 | f"but found {'no header' if header is None else repr(header)}." 
209 | ) 210 | for linenum_minus_two, line in enumerate(lines): 211 | fields = line.split("\t") 212 | if len(fields) != 3: 213 | raise ValueError( 214 | f"Expected 3 fields in line {linenum_minus_two + 2} of file '{path}', " 215 | f"got {len(fields)}.\nLine: '{line}'" 216 | ) 217 | (contigname, taxonomy, scores) = fields 218 | contig_taxonomy = ContigTaxonomy.from_semicolon_sep( 219 | taxonomy, force_canonical 220 | ) 221 | probs = np.array([float(i) for i in scores.split(";")], dtype=float) 222 | result.append( 223 | ( 224 | contigname, 225 | PredictedContigTaxonomy(contig_taxonomy, probs), 226 | ) 227 | ) 228 | 229 | return result 230 | 231 | def write_as_tsv(self, file: IO[str], comp_metadata: CompositionMetaData): 232 | if self.refhash != comp_metadata.refhash: 233 | raise ValueError( 234 | "Refhash of comp_metadata and predicted taxonomy must match" 235 | ) 236 | assert self.nseqs == comp_metadata.nseqs 237 | print(PREDICTED_TAXONOMY_HEADER, file=file) 238 | for i in range(self.nseqs): 239 | tax = self.contig_taxonomies[i] 240 | ranks_str = ";".join(tax.contig_taxonomy.ranks) 241 | probs_str = ";".join([str(round(i, 5)) for i in tax.probs]) 242 | print( 243 | comp_metadata.identifiers[i], 244 | ranks_str, 245 | probs_str, 246 | file=file, 247 | sep="\t", 248 | ) 249 | 250 | 251 | def assert_unambiguous_ranks(taxonomy: Union[Taxonomy, PredictedTaxonomy]): 252 | """ 253 | Ensure that no rank appears at multiple levels in the taxonomy. 254 | This will mess up some of TaxVamb's algorithms since it's based on the names of 255 | taxons, and therefore, having a name on two ranks may cause it to be parsed 256 | as a graph which is not a tree. 257 | """ 258 | seen_ranks: dict[str, int] = dict() 259 | parent_of: dict[str, str] = dict() 260 | for i in taxonomy.contig_taxonomies: 261 | # May be missing from Taxonomy 262 | if i is None: 263 | continue 264 | 265 | if isinstance(i, ContigTaxonomy): 266 | ranks = i.ranks 267 | else: 268 | ranks = i.contig_taxonomy.ranks 269 | 270 | for rank, name in enumerate(ranks): 271 | if seen_ranks.setdefault(name, rank) != rank: 272 | raise ValueError( 273 | f'Taxonomy is ambiguous: "{name}" appears at multiple ranks' 274 | ) 275 | 276 | for parent, child in zip(ranks, ranks[1:]): 277 | if parent_of.setdefault(child, parent) != parent: 278 | raise ValueError( 279 | f'Taxonomy is ambiguous: "{child}" has multiple parents' 280 | ) 281 | -------------------------------------------------------------------------------- /workflow_avamb/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "contigs": "contigs.txt", 3 | "sample_data": "samples2data.tsv", 4 | "index_size": "3G", 5 | "min_contig_size": "2000", 6 | "min_bin_size": "200000", 7 | "min_identity": "0.95", 8 | "minimap_mem": "15GB", 9 | "minimap_ppn": "15", 10 | "avamb_mem": "15GB", 11 | "avamb_ppn": "30", 12 | "checkm2_mem": "15GB", 13 | "checkm2_ppn": "15", 14 | "checkm2_mem_r": "30GB", 15 | "checkm2_ppn_r": "30", 16 | "avamb_params": " --model vae-aae -o C --seed 0 ", 17 | "avamb_preload": "", 18 | "outdir": "avamb_outdir", 19 | "min_comp": "0.9", 20 | "max_cont": "0.05" 21 | } 22 | -------------------------------------------------------------------------------- /workflow_avamb/envs/avamb.yaml: -------------------------------------------------------------------------------- 1 | name: avamb 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.9.16 8 | - snakemake=7.22.0 9 | - pip=23.0.1 10 | - biopython=1.81 11 | - 
networkx=3.0 12 | 13 | - pip: 14 | - ordered-set==4.1.0 15 | -------------------------------------------------------------------------------- /workflow_avamb/envs/checkm2.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python=3.8.15 7 | - scikit-learn=0.23.2 8 | - h5py=2.10.0 9 | - numpy=1.23.2 10 | - tensorflow=2.9.1 11 | - lightgbm=3.3.2 12 | - pandas=1.4.3 13 | - scipy=1.9.0 14 | - setuptools=65.3.0 15 | - requests=2.28.1 16 | - packaging=21.3 17 | - tqdm=4.64.0 18 | - diamond=2.0.15 19 | - prodigal=2.6.3 20 | -------------------------------------------------------------------------------- /workflow_avamb/envs/minimap2.yaml: -------------------------------------------------------------------------------- 1 | name: minimap2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - minimap2 6 | - samtools 7 | -------------------------------------------------------------------------------- /workflow_avamb/envs/samtools.yaml: -------------------------------------------------------------------------------- 1 | name: samtools 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - samtools 6 | -------------------------------------------------------------------------------- /workflow_avamb/src/abundances_mask.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from vamb.vambtools import RefHasher 4 | from pathlib import Path 5 | 6 | 7 | def abundances_mask(headers: Path, mask_refhash: Path, min_contig_size: int): 8 | """# Using the headers above, compute the mask and the refhash""" 9 | 10 | mask = [] 11 | identifiers = [] 12 | 13 | with open(headers) as file: 14 | for line in file: 15 | # SN:S27C112075 LN:2239 16 | (sn, ln) = line.split("\t") 17 | if sn[:3] != "SN:" or ln[:3] != "LN:": 18 | raise ValueError("Unknown format") 19 | passed = int(ln[3:]) >= min_contig_size 20 | mask.append(passed) 21 | if passed: 22 | identifiers.append(sn[3:]) 23 | 24 | np.savez_compressed( 25 | mask_refhash, 26 | mask=np.array(mask, dtype=bool), 27 | refhash=RefHasher.hash_refnames(identifiers), 28 | ) 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--h", type=Path, help=" Headers file") 34 | parser.add_argument("--msk", type=Path, help="mask refhash") 35 | 36 | parser.add_argument("--minsize", type=int, help="min contig size") 37 | 38 | opt = parser.parse_args() 39 | 40 | abundances_mask(opt.h, opt.msk, opt.minsize) 41 | -------------------------------------------------------------------------------- /workflow_avamb/src/create_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def create_abundances( 8 | abundances: list[Path], mask_refhash: Path, min_id: float, outfile: Path 9 | ): 10 | """Merge the abundances to a single Abundance object and save it""" 11 | refhash = np.load(mask_refhash)["refhash"] 12 | 13 | n_samples = len(abundances) 14 | first = vamb.vambtools.read_npz(abundances[0]) 15 | print(len(first), n_samples) 16 | print(first.shape) 17 | matrix = np.empty((len(first), n_samples), dtype=np.float32) 18 | matrix[:, 0] = first 19 | for i, path in enumerate(abundances[1:]): 20 | matrix[:, i + 1] = vamb.vambtools.read_npz(path) 21 | abundance = vamb.parsebam.Abundance( 22 | matrix, [str(i) for i in abundances], 
min_id, refhash 23 | ) 24 | abundance.save(outfile) 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--msk", type=Path, help="mask refhash") 30 | parser.add_argument("--ab", type=Path, nargs="+", help=" abundancaes list of files") 31 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 32 | parser.add_argument("--out", type=Path, help="abundances outfile") 33 | 34 | opt = parser.parse_args() 35 | 36 | create_abundances(opt.ab, opt.msk, opt.min_id, opt.out) 37 | -------------------------------------------------------------------------------- /workflow_avamb/src/create_cluster_scores_bin_path_dict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import json 4 | import argparse 5 | 6 | from typing import cast 7 | 8 | 9 | def get_cluster_score_bin_path( 10 | path_checkm_all: str, path_bins: str, bins: set[str] 11 | ) -> tuple[dict[str, tuple[float, float]], dict[str, str]]: 12 | """Given CheckM has been run for all samples, create 2 dictionaries: 13 | - {bin:path_bin} 14 | - {bin:[completeness, contamination]}""" 15 | cluster_score: dict[str, tuple[float, float]] = dict() 16 | bin_path: dict[str, str] = dict() 17 | for sample in os.listdir(path_checkm_all): 18 | path_quality_s = os.path.join(path_checkm_all, sample, "quality_report.tsv") 19 | c_com_con = np.loadtxt( 20 | path_quality_s, 21 | delimiter="\t", 22 | skiprows=1, 23 | usecols=(0, 1, 2), 24 | dtype=str, 25 | ndmin=2, 26 | ) 27 | 28 | for row in c_com_con: 29 | cluster, com, con = row 30 | cluster = cast(str, cluster) 31 | com, con = float(com), float(con) 32 | bin_name = cluster + ".fna" 33 | if bin_name in bins: 34 | cluster_score[cluster] = (com, con) 35 | bin_path[cluster + ".fna"] = os.path.join( 36 | path_bins, sample, cluster + ".fna" 37 | ) 38 | return cluster_score, bin_path 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--s", type=str, help="path checkm2 that contains all samples") 44 | parser.add_argument("--b", type=str, help="path all bins ") 45 | parser.add_argument( 46 | "--cs_d", type=str, help="cluster_score dictionary will be stored here" 47 | ) 48 | parser.add_argument( 49 | "--bp_d", type=str, help="bin_path dictionary will be stored here " 50 | ) 51 | 52 | opt = parser.parse_args() 53 | 54 | bins_set = set() 55 | for sample in os.listdir(opt.b): 56 | for bin_ in os.listdir(os.path.join(opt.b, sample)): 57 | if ".fna" in bin_: 58 | bins_set.add(bin_) 59 | 60 | cluster_score, bin_path = get_cluster_score_bin_path(opt.s, opt.b, bins_set) 61 | with open(opt.cs_d, "w") as f: 62 | json.dump(cluster_score, f) 63 | 64 | with open(opt.bp_d, "w") as f: 65 | json.dump(bin_path, f) 66 | -------------------------------------------------------------------------------- /workflow_avamb/src/manual_drep_JN.py: -------------------------------------------------------------------------------- 1 | import vamb 2 | import numpy as np 3 | import os 4 | import itertools 5 | 6 | from typing import NewType, Union, Optional 7 | from collections.abc import Sequence, Mapping, Iterable 8 | from pathlib import Path 9 | 10 | import argparse 11 | import json 12 | 13 | ContigId = NewType("ContigId", int) 14 | BinId = NewType("BinId", int) 15 | 16 | 17 | def main( 18 | # Path to output clusters file. 
Will error if it already exists 19 | outpath: Path, 20 | # Path to composition.npz 21 | composition_path: Path, 22 | # CheckM2 quality_report data as a dict: {cluster: [completeness, contamination]} 23 | quality_report: dict[str, list], 24 | # List of paths to clusters.tsv files as output by Vamb. 25 | # Names of clusters must match those in CheckM2 quality_report, 26 | # and those of contigs match those in the composition 27 | binnings: Sequence[Path], 28 | # min fraction of the smaller bin covered by the overlap for 2 bins to be considered the same 29 | min_cov: float, 30 | # min completeness for a bin to be included in the dereplication process 31 | min_comp: float, 32 | # max contamination for a bin to be included in the dereplication process 33 | max_cont: float, 34 | bins_extension: str, 35 | min_bin_size: int, 36 | ) -> None: 37 | # Load contig names and lengths 38 | comp = vamb.parsecontigs.Composition.load(composition_path) 39 | 40 | contig_names: list[str] = list(comp.metadata.identifiers) 41 | assert isinstance(contig_names, list) 42 | assert isinstance(contig_names[0], str) 43 | 44 | lengths = comp.metadata.lengths 45 | assert len(lengths) == len(contig_names) 46 | del comp # free up memory 47 | 48 | # Load CheckM2 49 | (bin_names, qualities, bin_by_name) = load_checkm2( 50 | quality_report, min_comp, max_cont, bins_extension 51 | ) 52 | # Load bins 53 | (bin_lengths, union_bins) = load_binnings( 54 | binnings, contig_names, lengths, bin_by_name, min_bin_size 55 | ) 56 | del bin_by_name 57 | 58 | dereplicated = dereplicate(union_bins, qualities, lengths, bin_lengths, min_cov) 59 | del bin_lengths 60 | 61 | if os.path.exists(outpath): 62 | raise FileExistsError(outpath) 63 | 64 | with open(outpath, "w") as file: 65 | print(vamb.vambtools.CLUSTERS_HEADER, file=file) 66 | for bin in dereplicated: 67 | bin_name = bin_names[bin] 68 | bin_name = bin_name.replace(".fna", "") 69 | for contig in union_bins[bin]: 70 | print(bin_name, contig_names[contig], sep="\t", file=file) 71 | 72 | 73 | def load_checkm2( 74 | quality_report: dict[str, list], 75 | min_completeness: float, 76 | max_contamination: float, 77 | bins_extension: str, 78 | ) -> tuple[ 79 | list[str], # Bin names 80 | list[tuple[float, float]], # Bin qualities 81 | dict[str, Optional[BinId]], # Mapping to binid, if not skipped 82 | ]: 83 | """Extract all bin names and assign them either a BinId, or else None, 84 | if their completeness/contamination is so bad the bin should be discarded 85 | """ 86 | # This is None if the bin is to be discarded 87 | bin_by_name: dict[str, Optional[BinId]] = dict() 88 | bin_names: list[str] = [] 89 | qualities: list[tuple[float, float]] = [] 90 | 91 | # The file looks like this: 92 | # Name Completeness Contamination Completeness_Model_Used Translation_Table_Used Additional_Notes 93 | # AAE_UC_Y_1980340Ccluster_501--AAE_UC_v3_1980340Ccluster_2599 5.18 0.0 Neural Network (Specific Model) 11 None 94 | 95 | for cluster, scores in quality_report.items(): 96 | name = cluster + bins_extension 97 | comp, cont = scores 98 | completeness = float(comp) / 100 99 | contamination = float(cont) / 100 100 | assert 0.0 <= completeness <= 1.0 101 | assert 0.0 <= contamination # can be unbounded 102 | 103 | if completeness >= min_completeness and contamination <= max_contamination: 104 | bin = BinId(len(bin_names)) 105 | bin_names.append(name) 106 | qualities.append((completeness, contamination)) 107 | bin_by_name[name] = bin 108 | else: 109 | bin_by_name[name] = None 110 | 111 | assert sum(1 for i in bin_by_name.values() if isinstance(i, int)) == len(bin_names) 112 | return 
(bin_names, qualities, bin_by_name) 113 | 114 | 115 | def load_binnings( 116 | binnings: Sequence[Path], 117 | contig_names: Sequence[str], 118 | lengths: np.ndarray, 119 | bin_by_name: Mapping[str, Optional[BinId]], 120 | min_bin_size: int, 121 | ) -> tuple[list[int], list[set[ContigId]]]: 122 | """ 123 | Load clusters.tsv files from each binning, and filter away those assigned to be discarded based on CheckM2 data. 124 | Return bin length and bins, each represented as a set of ContigId 125 | """ 126 | id_len_of_contig_name: dict[str, tuple[ContigId, int]] = dict() 127 | for index, (name, length) in enumerate(zip(contig_names, lengths)): 128 | id_len_of_contig_name[name] = (ContigId(index), length) 129 | 130 | # Load binnings 131 | n_union_bins = sum(1 for i in bin_by_name.values() if i is not None) 132 | 133 | lengthof = dict(zip(contig_names, lengths)) 134 | 135 | union_bins: list[Optional[set[ContigId]]] = [None] * n_union_bins 136 | for binning_path in binnings: 137 | with open(binning_path) as file: 138 | clusters = vamb.vambtools.read_clusters(file) 139 | clusters_filtered = filterclusters(clusters, lengthof, min_bin_size) 140 | # filter by clusters larger than 200kbs 141 | for bin_name, contigs in clusters_filtered.items(): 142 | bin_name += ".fna" 143 | # None is a valid value, so we use -1 as sentinel for missing 144 | bin = bin_by_name.get(bin_name, -1) 145 | if bin == -1: 146 | raise ValueError( 147 | f"Bin {bin_name} found in binning {binning_path}, but is not scored by CheckM2" 148 | ) 149 | # Means: Below threshold, so skip it 150 | elif bin is None: 151 | continue 152 | else: 153 | ids: set[ContigId] = set() 154 | for contig in contigs: 155 | existing = id_len_of_contig_name.get(contig) 156 | if existing is None: 157 | raise KeyError( 158 | f"Cluster file {binning_path} contain contig {contig}, " 159 | "but that name is not present in provided names npz file" 160 | ) 161 | ids.add(existing[0]) 162 | union_bins[bin] = ids 163 | 164 | bin_lengths: list[int] = [] 165 | 166 | for i in union_bins: 167 | assert isinstance(i, set) 168 | union_bins_asserted: list[set[ContigId]] = union_bins # type: ignore 169 | 170 | for contigs in union_bins_asserted: 171 | bin_lengths.append(sum(lengths[contig] for contig in contigs)) 172 | 173 | return (bin_lengths, union_bins_asserted) 174 | 175 | 176 | def filterclusters( 177 | clusters: Mapping[str, set], lengthof: Mapping[str, int], min_bin_size: int 178 | ) -> Mapping[str, set]: 179 | filtered_bins = dict() 180 | for medoid, contigs in clusters.items(): 181 | binsize = sum(lengthof[contig] for contig in contigs) 182 | 183 | if binsize >= min_bin_size: 184 | filtered_bins[medoid] = contigs 185 | 186 | return filtered_bins 187 | 188 | 189 | def dereplicate( 190 | union_bins: Sequence[set[ContigId]], 191 | qualities: Sequence[tuple[float, float]], 192 | contig_lengths: np.ndarray, 193 | bin_lengths: Sequence[int], 194 | threshold: float, 195 | ) -> list[BinId]: 196 | "Removes bins if they are too similar to another bin. 
Return list of kept bins" 197 | assert len(union_bins) == len(qualities) == len(bin_lengths) 198 | 199 | overlapping_pairs = get_overlapping_bin_pairs(get_binsof(union_bins), qualities) 200 | to_remove = compute_to_remove( 201 | union_bins, overlapping_pairs, contig_lengths, bin_lengths, threshold 202 | ) 203 | return [BinId(i) for i in range(len(bin_lengths)) if BinId(i) not in to_remove] 204 | 205 | 206 | def get_binsof(union_bins: Iterable[Iterable[ContigId]]) -> dict[ContigId, list[BinId]]: 207 | "Makes a dict from contig -> list of bins the contig is present in, if in multiple bins" 208 | binsof: dict[ContigId, Union[BinId, list[BinId]]] = dict() 209 | for bin_int, contigs in enumerate(union_bins): 210 | bin = BinId(bin_int) 211 | for contig in contigs: 212 | existing = binsof.get(contig) 213 | if existing is None: 214 | binsof[contig] = bin 215 | elif isinstance(existing, int): 216 | binsof[contig] = [existing, bin] 217 | else: 218 | assert isinstance(existing, list) 219 | existing.append(bin) 220 | return {k: v for (k, v) in binsof.items() if isinstance(v, list)} 221 | 222 | 223 | def bin_score(completeness: float, contamination: float) -> float: 224 | return completeness - 5 * contamination 225 | 226 | 227 | def get_overlapping_bin_pairs( 228 | binsof: Mapping[ContigId, list[BinId]], qualities: Sequence[tuple[float, float]] 229 | ) -> Sequence[tuple[BinId, BinId]]: 230 | "Get a list of pairs of bins that share at least one contig" 231 | pairs: set[tuple[BinId, BinId]] = set() 232 | for overlapping_bins in binsof.values(): 233 | for a, b in itertools.combinations(overlapping_bins, r=2): 234 | # Order them so we don't have (a, b) and (b, a) as distinct pairs 235 | if a > b: 236 | (a, b) = (b, a) 237 | pairs.add((a, b)) 238 | 239 | # Now be sure to order them as (worst, best) depending on score 240 | # If they tie, then use lexographic order (a, b) we added them 241 | # in above 242 | result: list[tuple[BinId, BinId]] = [] 243 | for a, b in pairs: 244 | score_a = bin_score(*qualities[a]) 245 | score_b = bin_score(*qualities[b]) 246 | if score_a > score_b: 247 | result.append((b, a)) 248 | else: 249 | result.append((a, b)) 250 | 251 | return result 252 | 253 | 254 | def compute_to_remove( 255 | union_bins: Sequence[set[ContigId]], 256 | overlapping_pairs: Iterable[tuple[BinId, BinId]], 257 | lengths: np.ndarray, 258 | bin_lengths: Sequence[int], 259 | threshold: float, 260 | ) -> set[BinId]: 261 | "Create a list of bins to remove because they overlap with another bin" 262 | result: set[BinId] = set() 263 | for bin_a, bin_b in overlapping_pairs: 264 | if bin_a in result or bin_b in result: 265 | continue 266 | 267 | intersection = union_bins[bin_a] & union_bins[bin_b] 268 | int_len = sum(lengths[i] for i in intersection) 269 | if int_len / min(bin_lengths[bin_a], bin_lengths[bin_b]) >= threshold: 270 | # We remove an arbitrary one 271 | result.add(bin_a) 272 | return result 273 | 274 | 275 | if __name__ == "__main__": 276 | parser = argparse.ArgumentParser() 277 | parser.add_argument("--cs_d", type=str, help="path bins_scores dictionary") 278 | parser.add_argument( 279 | "--composition", type=Path, help="Path to the composition.npz file" 280 | ) 281 | parser.add_argument( 282 | "--output", 283 | type=str, 284 | help="Path output clusters generated by dereplicating bins", 285 | ) 286 | parser.add_argument( 287 | "--clusters", 288 | type=str, 289 | nargs="*", 290 | help="Path input clusters generated by aamb and vamb", 291 | ) 292 | parser.add_argument("--cov", type=float, default=0.75, 
help="Min coverage ") 293 | parser.add_argument("--comp", type=float, default=0.9, help="Min completeness ") 294 | parser.add_argument("--cont", type=float, default=0.05, help="Max contamination ") 295 | parser.add_argument( 296 | "--bins_extension", type=str, default=".fna", help="Extension of the bins " 297 | ) 298 | parser.add_argument( 299 | "--min_bin_size", 300 | type=int, 301 | help="Min bin length to be considered for dereplication ", 302 | ) 303 | 304 | opt = parser.parse_args() 305 | args = vars(parser.parse_args()) 306 | with open(opt.cs_d) as f: 307 | cluster_scores = json.load(f) 308 | 309 | main( 310 | outpath=opt.output, 311 | composition_path=opt.composition, 312 | quality_report=cluster_scores, 313 | binnings=opt.clusters, 314 | min_cov=opt.cov, 315 | min_comp=opt.comp, 316 | max_cont=opt.cont, 317 | bins_extension=opt.bins_extension, 318 | min_bin_size=opt.min_bin_size, 319 | ) 320 | -------------------------------------------------------------------------------- /workflow_avamb/src/mv_bins_from_mdrep_clusters.py: -------------------------------------------------------------------------------- 1 | import vamb 2 | import argparse 3 | import shutil 4 | import os 5 | import json 6 | 7 | from typing import Optional 8 | 9 | 10 | def main( 11 | cluster_scores: dict[str, tuple[float, float]], 12 | cluster_contigs: dict[str, set[str]], 13 | bin_separator: Optional[str], 14 | path_nc_bins_folder: str, 15 | path_bins_folder: str, 16 | path_nc_clusters: str, 17 | min_comp: float = 0.9, 18 | max_cont: float = 0.05, 19 | ): 20 | cluster_sample = get_cluster_sample(cluster_contigs, bin_separator) 21 | nc_cluster_scores = get_nc_cluster_scores( 22 | cluster_scores, cluster_sample, min_comp, max_cont 23 | ) 24 | create_nc_sample_folders(nc_cluster_scores, cluster_sample, path_nc_bins_folder) 25 | write_nc_bins_from_mdrep_clusters( 26 | nc_cluster_scores, cluster_sample, path_nc_bins_folder, path_bins_folder 27 | ) 28 | write_quality_report(nc_cluster_scores, path_nc_bins_folder) 29 | write_final_nc_clusters(nc_cluster_scores, cluster_contigs, path_nc_clusters) 30 | 31 | 32 | def get_nc_cluster_scores( 33 | cluster_scores: dict[str, tuple[float, float]], 34 | cluster_sample: dict[str, str], 35 | min_comp: float, 36 | max_cont: float, 37 | ) -> dict[str, tuple[float, float]]: 38 | nc_cluster_scores: dict[str, tuple[float, float]] = dict() 39 | for cluster, scores in cluster_scores.items(): 40 | comp, cont = scores 41 | comp, cont = float(comp), float(cont) 42 | comp, cont = comp / 100, cont / 100 43 | if cluster not in cluster_sample.keys(): 44 | continue 45 | if comp >= min_comp and cont <= max_cont: 46 | nc_cluster_scores[cluster] = (comp, cont) 47 | 48 | return nc_cluster_scores 49 | 50 | 51 | def get_cluster_sample( 52 | cluster_contigs: dict[str, set[str]], bin_separator: Optional[str] 53 | ) -> dict[str, str]: 54 | cluster_sample: dict[str, str] = dict() 55 | for cluster_ in cluster_contigs.keys(): 56 | contigs = cluster_contigs[cluster_] 57 | contig_i = next(iter(contigs)) 58 | sample = contig_i.split(bin_separator)[0] 59 | cluster_sample[cluster_] = sample 60 | 61 | return cluster_sample 62 | 63 | 64 | def create_nc_sample_folders( 65 | cluster_scores: dict[str, tuple[float, float]], 66 | cluster_sample: dict[str, str], 67 | path_nc_bins_folder: str, 68 | ): 69 | nc_samples: set[str] = set() 70 | for cluster in cluster_scores.keys(): 71 | sample = cluster_sample[cluster] 72 | nc_samples.add(sample) 73 | 74 | for sample in nc_samples: 75 | try: 76 | 
os.mkdir(os.path.join(path_nc_bins_folder, sample)) 77 | except FileExistsError: 78 | pass 79 | 80 | 81 | def write_nc_bins_from_mdrep_clusters( 82 | cluster_scores: dict[str, tuple[float, float]], 83 | cluster_sample: dict[str, str], 84 | path_nc_bins_folder: str, 85 | path_bins_folder: str, 86 | ): 87 | for cluster in cluster_scores.keys(): 88 | sample = cluster_sample[cluster] 89 | src_bin = os.path.join(path_bins_folder, sample, cluster + ".fna") 90 | trg_bin = os.path.join(path_nc_bins_folder, sample, cluster + ".fna") 91 | shutil.move(src_bin, trg_bin) 92 | 93 | 94 | def write_quality_report( 95 | cluster_scores: dict[str, tuple[float, float]], path_nc_bins_folder: str 96 | ): 97 | with open(os.path.join(path_nc_bins_folder, "quality_report.tsv"), "w") as file: 98 | print("Name", "completeness", "contamination", sep="\t", file=file) 99 | file.flush() 100 | for nc_cluster, (completeness, contamination) in cluster_scores.items(): 101 | print(nc_cluster, completeness, contamination, sep="\t", file=file) 102 | file.flush() 103 | 104 | 105 | def write_final_nc_clusters( 106 | cluster_scores: dict[str, tuple[float, float]], 107 | cluster_contigs: dict[str, set[str]], 108 | path_nc_clusters: str, 109 | ): 110 | with open(path_nc_clusters, "w") as file: 111 | print(vamb.vambtools.CLUSTERS_HEADER, file=file) 112 | for nc_cluster in cluster_scores.keys(): 113 | nc_contigs = cluster_contigs[nc_cluster] 114 | for nc_contig in nc_contigs: 115 | print(nc_cluster, nc_contig, sep="\t", file=file) 116 | 117 | 118 | if __name__ == "__main__": 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("--c", type=str, help="path clusters file from tmp folder") 121 | parser.add_argument("--cf", type=str, help="path clusters file final") 122 | parser.add_argument("--cs_d", type=str, help="cluster_scores dictionary path ") 123 | parser.add_argument("--b", type=str, help="path all bins ") 124 | parser.add_argument( 125 | "--d", type=str, help="path to folder that will contain all nc bins" 126 | ) 127 | parser.add_argument("--bin_separator", type=str, help="separator ") 128 | parser.add_argument("--comp", type=float, default=0.9, help="Min completeness ") 129 | parser.add_argument("--cont", type=float, default=0.05, help="Max contamination ") 130 | 131 | opt = parser.parse_args() 132 | 133 | with open(opt.c) as clusters_file: 134 | cluster_contigs = vamb.vambtools.read_clusters(clusters_file) 135 | 136 | with open(opt.cs_d) as f: 137 | cluster_scores = json.load(f) 138 | 139 | main( 140 | cluster_scores, 141 | cluster_contigs, 142 | opt.bin_separator, 143 | opt.d, 144 | opt.b, 145 | opt.cf, 146 | opt.comp, 147 | opt.cont, 148 | ) 149 | -------------------------------------------------------------------------------- /workflow_avamb/src/update_cluster_scores_dict_after_ripping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import argparse 4 | 5 | 6 | def update_cluster_score_bin_path( 7 | path_checkm_ripped: str, cluster_score: dict[str, tuple[float, float]] 8 | ): 9 | c_com_con = np.loadtxt( 10 | path_checkm_ripped, 11 | delimiter="\t", 12 | skiprows=1, 13 | usecols=(0, 1, 2), 14 | dtype=str, 15 | ndmin=2, 16 | ) 17 | for row in c_com_con: 18 | cluster, com, con = row 19 | if "--" in cluster: 20 | continue 21 | com, con = float(com), float(con) 22 | print(cluster, "scores were", cluster_score[cluster]) 23 | 24 | cluster_score[cluster] = (com, con) 25 | print("and now are", cluster_score[cluster]) 26 | return cluster_score 27 | 
28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "--s", 33 | type=str, 34 | help="path to the checkm2 output that contains the quality_report.tsv file for ripped bins", 35 | ) 36 | parser.add_argument( 37 | "--cs_d", 38 | type=str, 39 | help="cluster_score dictionary path ", 40 | ) 41 | parser.add_argument( 42 | "--cs_d_o", 43 | type=str, 44 | help="cluster_score dictionary output path, updated with the information for clusters that were ripped either because of meaningless edges or when making the component length <= 2 ", 45 | ) 46 | 47 | opt = parser.parse_args() 48 | 49 | with open(opt.cs_d) as f: 50 | cluster_score = json.load(f) 51 | 52 | cluster_score_ = update_cluster_score_bin_path(opt.s, cluster_score) 53 | 54 | with open(opt.cs_d_o, "w") as f: 55 | json.dump(cluster_score_, f) 56 | -------------------------------------------------------------------------------- /workflow_avamb/src/workflow_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from typing import cast 4 | 5 | 6 | def get_cluster_score_bin_path( 7 | path_checkm_all: str, path_bins: str, bins: set[str] 8 | ) -> tuple[dict[str, tuple[float, float]], dict[str, str]]: 9 | """Given CheckM has been run for all samples, create 2 dictionaries: 10 | - {bin:path_bin} 11 | - {bin:[completeness, contamination]}""" 12 | cluster_score: dict[str, tuple[float, float]] = dict() 13 | bin_path: dict[str, str] = dict() 14 | for sample in os.listdir(path_checkm_all): 15 | path_quality_s = os.path.join(path_checkm_all, sample, "quality_report.tsv") 16 | c_com_con = np.loadtxt( 17 | path_quality_s, 18 | delimiter="\t", 19 | skiprows=1, 20 | usecols=(0, 1, 2), 21 | dtype=str, 22 | ndmin=2, 23 | ) 24 | 25 | for row in c_com_con: 26 | cluster, com, con = row 27 | cluster = cast(str, cluster) 28 | com, con = float(com), float(con) 29 | bin_name = cluster + ".fna" 30 | if bin_name in bins: 31 | cluster_score[cluster] = (com, con) 32 | bin_path[cluster + ".fna"] = os.path.join( 33 | path_bins, sample, cluster + ".fna" 34 | ) 35 | return cluster_score, bin_path 36 | 37 | 38 | def update_cluster_score_bin_path( 39 | path_checkm_ripped: str, cluster_score: dict[str, tuple[float, float]] 40 | ) -> dict[str, tuple[float, float]]: 41 | c_com_con = np.loadtxt( 42 | path_checkm_ripped, 43 | delimiter="\t", 44 | skiprows=1, 45 | usecols=(0, 1, 2), 46 | dtype=str, 47 | ndmin=2, 48 | ) 49 | for row in c_com_con: 50 | cluster, com, con = row 51 | if "--" in cluster: 52 | continue 53 | com, con = float(com), float(con) 54 | print(cluster, "scores were", cluster_score[cluster]) 55 | 56 | cluster_score[cluster] = (com, con) 57 | print("and now are", cluster_score[cluster]) 58 | return cluster_score 59 | -------------------------------------------------------------------------------- /workflow_avamb/src/write_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def write_abundances( 8 | mask_refhash: Path, bampath: Path, min_identity: float, outfile: Path 9 | ): 10 | """For every sample, compute the abundances given the mask and refhashes""" 11 | loadnpz = np.load(mask_refhash) 12 | refhash = loadnpz["refhash"] 13 | mask = loadnpz["mask"] 14 | refhash = refhash.reshape(1)[0] 15 | (abundance, _) = vamb.parsebam.Abundance.run_pycoverm( 16 | paths=[bampath], 17 | minid=min_identity, 18 | 
target_refhash=refhash, 19 | target_identifiers=None, 20 | mask=mask, 21 | ) 22 | vamb.vambtools.write_npz(outfile, abundance.ravel()) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--msk", type=Path, help="mask refhash") 28 | parser.add_argument("--b", type=Path, help=" bam path") 29 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 30 | parser.add_argument("--out", type=Path, help="abundances outfile") 31 | 32 | opt = parser.parse_args() 33 | 34 | write_abundances(opt.msk, opt.b, opt.min_id, opt.out) 35 | -------------------------------------------------------------------------------- /workflow_avamb/src/write_clusters_from_dereplicated_and_ripped_bins.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | while getopts "d:o:" opt; do 3 | case $opt in 4 | d) drep_dir=$OPTARG ;; 5 | o) clusters_file=$OPTARG ;; 6 | *) echo 'error' >&2 7 | exit 1 8 | esac 9 | done 10 | output_file=$(pwd)/${clusters_file}/avamb/avamb_manual_drep_disjoint_clusters.tsv 11 | echo 'creating z y v clusters from the final set of bins' 12 | for s in $(ls $drep_dir) 13 | do 14 | s="$drep_dir"/"$s"/ 15 | if [ -d "$s" ] 16 | then 17 | cd $s 18 | for bin in $(ls . 2> /dev/null) 19 | 20 | do 21 | if [[ $bin == **".fna" ]] 22 | then 23 | 24 | cluster_name=$(echo $bin | sed 's=.fna==g' | sed 's=.fa==g') 25 | 26 | echo -e "clustername\tcontigname" >> $output_file 27 | for contig in $(grep '>' $bin | sed 's=>==g') 28 | do 29 | echo -e "$cluster_name""\t""$contig" >> $output_file 30 | done 31 | 32 | 33 | fi 34 | done 35 | 36 | fi 37 | done 38 | --------------------------------------------------------------------------------
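A minimal Python sketch of the same step as the shell script above, for readers who prefer to stay in Python: it assumes the same layout of per-sample directories of dereplicated and ripped bins containing .fna/.fa files, and the write_clusters_tsv helper name is hypothetical (it is not part of the workflow). As a design choice, the clustername/contigname header row is written once at the top of the output file.

# Hypothetical helper, not part of the workflow: walks <drep_dir>/<sample>/<bin>.fna
# and writes one "clustername<TAB>contigname" row per contig header found in each bin.
import os
import sys


def write_clusters_tsv(drep_dir: str, out_path: str) -> None:
    with open(out_path, "w") as out:
        # Header row, written once
        print("clustername", "contigname", sep="\t", file=out)
        for sample in sorted(os.listdir(drep_dir)):
            sample_dir = os.path.join(drep_dir, sample)
            if not os.path.isdir(sample_dir):
                continue
            for bin_file in sorted(os.listdir(sample_dir)):
                if not bin_file.endswith((".fna", ".fa")):
                    continue
                # The cluster name is the bin file name without its FASTA extension
                cluster_name = bin_file.rsplit(".", 1)[0]
                with open(os.path.join(sample_dir, bin_file)) as fasta:
                    for line in fasta:
                        if line.startswith(">"):
                            print(cluster_name, line[1:].strip(), sep="\t", file=out)


if __name__ == "__main__":
    write_clusters_tsv(sys.argv[1], sys.argv[2])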