├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── cli_vamb.yml │ ├── lint.yml │ ├── snakemake_avamb.yml │ └── unittest.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── Project.toml ├── benchmark.png ├── benchmarks.csv └── make_plots.jl ├── doc ├── README.md ├── conf.py ├── front_page.md ├── histogram.npz ├── how_to_run.md ├── index.rst ├── inputs_outputs.md ├── installation.md └── tips.md ├── pyproject.toml ├── setup.cfg ├── src ├── concatenate.py ├── create_fasta.py ├── create_kernel.py └── merge_aemb.py ├── test ├── data │ ├── aemb │ │ ├── 6.aemb.tsv │ │ ├── 7.aemb.tsv │ │ └── 8.aemb.tsv │ ├── bam │ │ ├── 10.bam │ │ ├── 11.bam │ │ └── 12.bam │ ├── fasta.fna │ └── marker.fna ├── test_aamb_encode.py ├── test_cluster.py ├── test_encode.py ├── test_parsebam.py ├── test_parsecontigs.py ├── test_parsemarkers.py ├── test_reclustering.py ├── test_results.py ├── test_semisupervised_encode.py ├── test_vambtools.py └── testtools.py ├── vamb ├── __init__.py ├── __main__.py ├── aamb_encode.py ├── cluster.py ├── encode.py ├── hloss_misc.py ├── kernel.npz ├── marker.hmm ├── parsebam.py ├── parsecontigs.py ├── parsemarkers.py ├── reclustering.py ├── semisupervised_encode.py ├── taxonomy.py ├── taxvamb_encode.py └── vambtools.py └── workflow_avamb ├── README.md ├── avamb.snake.conda.smk ├── config.json ├── envs ├── avamb.yaml ├── checkm2.yml ├── minimap2.yaml └── samtools.yaml └── src ├── abundances_mask.py ├── create_abundances.py ├── create_cluster_scores_bin_path_dict.py ├── manual_drep_JN.py ├── mv_bins_from_mdrep_clusters.py ├── rip_bins.py ├── transfer_contigs_and_aggregate_all_nc_bins.py ├── update_cluster_scores_dict_after_ripping.py ├── workflow_tools.py ├── write_abundances.py └── write_clusters_from_dereplicated_and_ripped_bins.sh /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | e6480cefc77abcbd06d86e438504e8db9e8276eb 2 | 910f64c9fc294fa5c92aa19015a02a376c1d5ecc 3 | 5a0cc3a4d9dd8ddcb74bdebd9119cffacbe25942 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Thank you for making an issue. 2 | If you are submitting a bug report, it will help us if you include the following information: 3 | 4 | - Your version of Python and Vamb. 
5 | - The log file (called `log.txt`) from the output directory 6 | - The full error message produced by Vamb, if any 7 | -------------------------------------------------------------------------------- /.github/workflows/cli_vamb.yml: -------------------------------------------------------------------------------- 1 | name: Command line interface tests 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | workflow_dispatch: 9 | inputs: 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ["3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: 'pip' # caching pip dependencies 28 | cache-dependency-path: '**/pyproject.toml' 29 | - name: Download fixtures 30 | run: | 31 | wget https://www.dropbox.com/scl/fi/10tdf0w0kf70pf46hy8ks/ci_data.zip\?rlkey\=smlcinkesuwiw557zulgbb59l\&st\=hhokiqma\&dl\=0 -O ci_data.zip 32 | unzip -o ci_data.zip 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install flake8 pytest 37 | pip install -e . 38 | - name: Run VAMB 39 | run: | 40 | vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10 41 | ls -la outdir_vamb 42 | cat outdir_vamb/log.txt 43 | - name: Run TaxVAMB 44 | run: | 45 | vamb bin taxvamb --outdir outdir_taxvamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 -e 10 -q 2 3 -t 10 -o C --minfasta 200000 46 | ls -la outdir_taxvamb 47 | cat outdir_taxvamb/log.txt 48 | vamb bin taxvamb --outdir outdir_taxvamb_no_predict --no_predictor --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -e 10 -q 2 3 -t 10 -o C --minfasta 200000 49 | ls -la outdir_taxvamb_no_predict 50 | cat outdir_taxvamb_no_predict/log.txt 51 | vamb bin taxvamb --outdir outdir_taxvamb_preds --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --no_predictor --taxonomy outdir_taxvamb/results_taxometer.tsv -e 10 -q 2 -t 10 -o C --minfasta 200000 52 | ls -la outdir_taxvamb_preds 53 | cat outdir_taxvamb_preds/log.txt 54 | - name: Run Taxometer 55 | run: | 56 | vamb taxometer --outdir outdir_taxometer --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 57 | ls -la outdir_taxometer 58 | cat outdir_taxometer/log.txt 59 | - name: Run k-means reclustering 60 | run: | 61 | vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --markers markers_mock.npz --algorithm kmeans --minfasta 200000 62 | ls -la outdir_recluster 63 | cat outdir_recluster/log.txt 64 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | branches: [ "master" ] 6 | 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: astral-sh/ruff-action@v3 13 | with: 14 | version: "0.11.x" 15 | 16 | 17 | format: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: 
actions/checkout@v4 21 | - uses: astral-sh/ruff-action@v3 22 | with: 23 | version: "0.11.x" 24 | args: 'format --check' 25 | -------------------------------------------------------------------------------- /.github/workflows/snakemake_avamb.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install snakemake and AVAMB dependencies and run the AVAMB snakemake pipeline 2 | 3 | name: AVAMB snakemake - runs daily 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | schedule: 9 | - cron: "0 1 * * *" 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: 'pip' 28 | cache-dependency-path: '**/pyproject.toml' 29 | - uses: mamba-org/setup-micromamba@v1 30 | with: 31 | micromamba-version: '1.3.1-0' 32 | environment-file: workflow_avamb/envs/avamb.yaml 33 | environment-name: avamb 34 | create-args: >- 35 | python=3.9.16 36 | init-shell: >- 37 | bash 38 | cache-environment: true 39 | post-cleanup: 'all' 40 | - name: Install dependencies to avamb environment 41 | run: | 42 | which pip 43 | pip install -e . 44 | pip freeze 45 | git clone https://github.com/chklovski/CheckM2.git 46 | shell: micromamba-shell {0} 47 | - name: Install CheckM2 environment 48 | run: | 49 | micromamba create -n checkm2 python=3.8.15 pandas=2.1.1 50 | micromamba env update -n checkm2 --file workflow_avamb/envs/checkm2.yml 51 | eval "$(micromamba shell hook --shell=bash)" 52 | micromamba activate checkm2 53 | cd CheckM2 && git checkout e563159 && python setup.py install && cd .. 54 | checkm2 database --download 55 | shell: micromamba-shell {0} 56 | - name: Download fixtures 57 | run: | 58 | wget https://www.dropbox.com/scl/fi/q54wfho3ultb0otq5z3rh/testset_snakemake.zip\?rlkey\=7tbsc2giff0s42ppdmeb706fa\&dl\=0 -O testset_snakemake.zip 59 | unzip testset_snakemake.zip 60 | ls -la 61 | ls -la testset_snakemake 62 | pwd 63 | - name: Snakemake 64 | uses: snakemake/snakemake-github-action@v1.25.1 65 | with: 66 | directory: '.test' 67 | snakefile: 'workflow_avamb/avamb.snake.conda.smk' 68 | args: '--cores 4 --configfile workflow_avamb/config.json --use-conda' 69 | -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | # There is a bug in Python 3.13.0 which breaks Vamb's tests, fixed in 3.13.1 18 | python-version: ["3.10", "3.11", "3.12", "3.13.1"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | cache: 'pip' # caching pip dependencies 27 | cache-dependency-path: '**/pyproject.toml' 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install pytest 32 | pip install -e . 
33 | - name: Run tests 34 | run: python -m pytest test 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | __pycache__ 3 | .coverage 4 | *.o 5 | src/_vambtools.cpp 6 | **.c 7 | .eggs 8 | *~ 9 | vamb.egg-info 10 | changelog 11 | .DS_Store 12 | .ipynb_checkpoints 13 | TODO.md 14 | build/ 15 | dist/ 16 | **.vscode 17 | # doc 18 | doc/_build 19 | doc/reference 20 | target 21 | Manifest.toml 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: doc/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | python: 31 | install: 32 | - method: pip 33 | path: . 34 | extra_requirements: 35 | - docs 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v5.0.0 [UNRELEASED] 4 | Version 5 is a major release that includes several breaking changes to the API, 5 | as well as new types of models, improved binning accuracy, and more user 6 | friendliness. 7 | 8 | ### Added 9 | * Added the TaxVamb binner - a semi-supervised model that can augment binning 10 | using taxonomic assignment from e.g. mmseqs2 of some of the input contigs. 11 | TaxVamb is state-of-the-art, and significantly outperforms all other Vamb 12 | models when the taxonomic assignment is reasonably good. 13 | TaxVamb is available from command-line using `vamb bin taxvamb` 14 | * Added the Taxometer annotation refiner. This program enhances taxonomic 15 | assignment of metagenomic contigs using composition and abundance. 16 | TaxVamb will automatically run Taxometer to increase accuracy. 17 | Taxometer is available from command-line using `vamb taxometer` 18 | * [EXPERIMENTAL] Added reclustering functionality, which reclusters an existing 19 | binning using single-copy genes, using a technique inspired by the SemiBin2 20 | binner. This improves bacterial bins. 21 | We may remove this feature in future versions of Vamb. 22 | 23 | ### Breaking changes 24 | * The command-line interface of Vamb has been changed, such that the different 25 | functionality should be used through subcommands. For example, the binners in 26 | Vamb are accesible through `vamb bin`. 27 | Also, a few command-line flags have been removed. 28 | * All output files ending in `.tsv` is now actually in TSV format. Previously, 29 | Vamb did not include a header in the file, as the TSV format requires. 30 | In version 5, the header is included. 
31 | * The file `mask.npz` is no longer output, because the encoder no longer masks 32 | any sequences. 33 | * The names of the output cluster files have been changed. When binsplitting is 34 | used, Vamb now outputs both the split and the unsplit clusters. 35 | The names of the output files are now: 36 | - `vae_clusters_split.tsv` 37 | - `vae_clusters_unsplit.tsv` 38 | And similarly for e.g. `vaevae_clusters_split.tsv`. 39 | When binsplitting is not used, only the unsplit clusters are output. 40 | * The `benchmark` module of Vamb has been removed, as it is superseded by our 41 | new benchmarking tool https://github.com/jakobnissen/BinBencher.jl 42 | 43 | ### Other changes 44 | * Several details of the clustering algorithm have been overhauled. 45 | It now returns more accurate clusters and may be faster in some circumstances. 46 | However, GPU clustering may be significantly slower. (#198) 47 | * Vamb now uses both relative and absolute abundances in the encoder, compared 48 | to only the relative ones before. This improves binning, especially when using 49 | a low number of samples (#210) 50 | * Vamb now binsplits with `-o C` by default. 51 | - To disable binsplitting, pass `-o` without an argument 52 | * Vamb now supports passing abundances in TSV format. This TSV can be created very 53 | efficiently using the `strobealign` aligner with the `--aemb` flag. 54 | * If passing abundances in BAM format, it is now recommended to pass in a 55 | directory with all the BAM files using the --bamdir flag, instead of using 56 | the old --bamfiles flag. 57 | * Vamb no longer errors when the batch size is too large. 58 | * Several errors and warnings have been improved: 59 | - The user is warned if any sequences are filtered away for falling below 60 | the contig size cutoff (flag `-m`). 61 | - Improved the error message when the FASTA and BAM headers do not match. 62 | - Vamb now errors early if the binsplit separator (flag `-o`) is not found 63 | in the parsed contig identifiers. 64 | If the binsplit separator is not set explicitly and defaults to `-o C`, 65 | Vamb will instead warn the user and disable binsplitting. 66 | * Vamb now writes its log to both stderr and to the logfile. Every line in the 67 | log is now timestamped and formatted better. 68 | * Vamb now outputs metadata about the unsplit clusters in the output TSV file 69 | `vae_clusters_metadata.tsv`. 70 | * Vamb now correctly uses a random seed on each invocation (#213) 71 | * Fixed various bugs and undoubtedly introduced some fresh ones. 72 | 73 | ## v4.1.3 74 | * Fix a bug that resulted in poor clustering results (#179) 75 | 76 | ## v4.1.2 77 | * Fix a bug in src/create_fasta.py 78 | * Bugfix: Make seeding the RNG work from command line 79 | * Bump compatible Cython version 80 | 81 | ## v4.1.1 82 | * Create tmp directory in parsebam if needed for pycoverm (issue #167) 83 | 84 | ## v4.1.0 85 | * Fix typo in output AAE_Z cluster names. They are now called e.g. "aae_z_1" 86 | instead of "aae_z1" 87 | * Clean up the directory structure of the Avamb workflow. 88 | * Fix the CheckM2 dependencies to allow CheckM2 to be installed 89 | * Allow the Avamb workflow to be run on Slurm clusters 90 | * Fix issue #161: Mismatched refhash when spaces in FASTA headers 91 | * Allow setting the RNG seed from command line 92 | 93 | ## v4.0.1 94 | * Fix Random.choice for Tensor on Python 3.11. See issue #148 95 | 96 | ## v4.0.0 97 | Version 4 is a thorough rewrite of major parts of Vamb that has taken more than a year. 
98 | Vamb now ships with an upgraded dual variational autoencoder (VAE) and 99 | adversarial autoencoder (AAE) model, usable in a CheckM-based workflow. 100 | The code quality and test suite have gotten significant upgrades, making Vamb 101 | more stable and robust to bugs. 102 | This version of Vamb is slightly faster and produces better bins than v3. 103 | The user interface has only seen limited changes. 104 | 105 | ### Breaking changes 106 | * The official API of Vamb is now defined only in terms of its command-line 107 | interface. This means that from now on, Vamb can freely change and modify its 108 | internal functions, even in minor releases or patch releases. 109 | If you are using Vamb as a Python package, it means you should precisely 110 | specify the full version of Vamb used in order to ensure reproducibility. 111 | * Benchmark procedure has been changed, so benchmark results are incompatible 112 | with results from v3. Benchmarking is now considered an implementation detail, 113 | and is not stable across releases. 114 | * Vamb no longer outputs TNF, sequence names and sequence lengths as .npz files. 115 | Instead, it produces a `composition.npz` that contains all this information 116 | and more. 117 | As a consequence, the command-line options `--tnfs`, `--names` and `--lengths` 118 | have been removed, and replaced with the single `--composition` option. 119 | * The output .npz array `rpkm.npz` has been changed in a backwards incompatible 120 | way. From version 4, the content of the output .npz files is considered an 121 | implementation detail. 122 | * The depths input option `--jgi` has been removed. To use depths computed by 123 | an external program, construct an instance of the `Abundance` class from your 124 | depths and save it using its `.save` method to an `rpkm.npz` file 125 | (though read the Notable changes section below). 126 | 127 | ### New features 128 | * Vamb now includes an optional AAE model alongside the VAE model. 129 | Users may run the VAE model, where it behaves similarly to v3, or run the mixed 130 | VAE/AAE model, in which both models will be run on the same dataset. 131 | * The Snakemake workflow has been overhauled, and now defaults to using 132 | the VAE/AAE combined model, using CheckM to dereplicate. 133 | * Vamb is now more easily installed via pip: `pip install vamb`. We have fixed 134 | a bunch of issues that caused installation problems. 135 | * By default, Vamb gzip compresses FASTA files written using the `--minfasta` 136 | flag. 137 | 138 | ### Notable other changes 139 | * Using the combined VAE-AAE workflow, the user can get significantly better bins. 140 | * Vamb now uses `CoverM` internally to calculate abundances. This means it is 141 | significantly faster and more accurate than before. 142 | Thus, we no longer recommend that users compute depths with MetaBAT2's JGI tool. 143 | * Lots of bugfixes may have changed Vamb's behaviour in a backwards incompatible 144 | way for certain edge cases. For example, FASTA identifiers are now required to 145 | match the name specification in the SAM format to ensure the identifiers are 146 | the same in FASTA and BAM files. 
147 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Vamb 2 | The Git repository is currently hosted at https://github.com/RasmussenLab/vamb 3 | 4 | ## Git workflow 5 | In order for your contribution to be easily integrated into a package that is concurrently worked on by multiple people, it's important that you adhere to the Git workflow that we use for this repo. 6 | 7 | #### Feature branches 8 | We never push directly to master. Instead, create a new feature branch on your own fork, and make a PR from your fork to master. 9 | A feature branch is any branch that contains new code that will eventually be merged to master, whether that is an actual feature, a bugfix, or something else. 10 | We recommend creating your feature branch from an updated version of master, to make it easier to merge into master again. 11 | 12 | For large features, feature branches can contain huge changes, and be in development over months. Rebase on master as often as possible. 13 | However, where feasible, keep your feature branches' diff relative to master small. If your feature branch contains multiple independent changes, instead make multiple different PRs on different feature branches. This is easier to review, and to bisect if necessary. 14 | 15 | Make sure to squash your commits on your feature branches as necessary to keep the history clean. 16 | A good rule of thumb is that 1 commit = 1 PR, but there may be exceptions. 17 | Also, please delete your feature branches after they've been merged to master so they don't accumulate. 18 | 19 | #### Release branches 20 | Releases are only cut from release branches. 21 | The purpose of release branches is to keep a version of Vamb that is more stable than the development version found on master. 22 | This stability is achieved by only adding bugfixes to release branches, not new features. Over time, the bugfixes will accumulate, while the new features (which are mostly where new bugs come from) are added to master only. 23 | Release branches are named "release", plus the major and minor version, like so: "release-4.1". They are always cut from master. 24 | We only backport bugfixes to one or a few release branches at a time, so old release branches quickly get outdated. However, we will not remove them. 25 | 26 | Release branches are never merged back to master. If commits from master are needed in a release branch, you may cherry-pick them from master (see the sketch at the end of this section). 27 | This is the only case where commits may be duplicated on two different branches. 28 | 29 | #### Tags 30 | Each release of Vamb (from a release branch) is tagged with a lowercase "v", then a SemVer 2.0 version, e.g. "v4.1.3". 31 | A tag unambiguously refers to a commit, and is never removed. 32 | Ideally, the tagged commit should be the one that updates the version in `vamb/__init__.py`. 33 | 34 | #### Testing 35 | Our CI pipeline uses a formatter and a linter to check for issues (currently, the Ruff formatter and linter). 36 | To speed up development, you can install these locally and catch issues before they are caught in CI. 37 | 38 | #### Dependencies 39 | Please avoid adding new dependencies if at all practical. 40 | We already have lots of issues with our dependencies, and don't want any more. 
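As a concrete sketch of the release-branch workflow described above, backporting a bugfix from master could look like the following. The release branch name, the commit hash and the `upstream` remote are placeholders; substitute your own.

```shell
$ git fetch upstream                  # get the latest upstream branches
$ git switch release-4.1              # check out the release branch that should receive the fix
$ git cherry-pick 0123abc             # copy the bugfix commit from master onto the release branch
$ git push upstream release-4.1       # publish the backported fix
```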
41 | 42 | ## Example commands 43 | We assume: 44 | - The https://github.com/RasmussenLab/vamb repo is added as a remote with the name `upstream` 45 | - Your own fork of Vamb is added as a remote called `origin` 46 | ### Making a new PR 47 | Synchronize the master branches between your repo and upstream. 48 | Do this before making any new branches from master. 49 | ```shell 50 | $ git switch master 51 | $ git pull upstream master 52 | $ git push origin master 53 | ``` 54 | 55 | Make a new branch, with a feature, here for example "kmer-compression". 56 | Name your branch accordingly. 57 | ```shell 58 | $ git switch -c kmer-compression 59 | ``` 60 | 61 | Write your code, then test it. 62 | This requires you to have installed Vamb (preferably with `pip install -e .`), 63 | and installed `pytest` and `ruff`: 64 | ```shell 65 | $ python -m pytest # test the code 66 | $ ruff check . # run the linter 67 | $ ruff format . # run the formatter 68 | ``` 69 | 70 | Commit it, then push to `origin`: 71 | ```shell 72 | $ git add * # add your files 73 | $ git status # check you didn't add spurious unneeded files 74 | $ git commit # then write a commit message 75 | $ git push -u origin kmer-compression 76 | ``` 77 | 78 | Navigate to Vamb's GitHub repo (the `upstream` one), then on the Pull Requests tab make a new PR from `kmer-compression` 79 | to `upstream`'s master. 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 University of Copenhagen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include vamb/kernel.npz 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vamb 2 | [![Read the Doc](https://readthedocs.org/projects/vamb/badge/?version=latest)](https://vamb.readthedocs.io/en/latest/) 3 | 4 | Read the documentation on how to use Vamb here: https://vamb.readthedocs.io/en/latest/ 5 | 6 | Vamb is a family of metagenomic binners which feeds kmer composition and abundance into a variational autoencoder and clusters the embedding to form bins. 
7 | Its binners perform excellently with multiple samples, and pretty well on single-sample data. 8 | 9 | ## Programs in Vamb 10 | The Vamb package contains several programs, including three binners: 11 | 12 | * __TaxVamb__: A semi-supervised binner that uses taxonomy information from e.g. `mmseqs taxonomy`. 13 | TaxVamb produces the best results, but requires that you have run a taxonomic annotation workflow. 14 | [Link to article](https://doi.org/10.1101/2024.10.25.620172). 15 | * __Vamb__: The original binner based on variational autoencoders. 16 | This has been upgraded significantly since its original release. 17 | Vamb strikes a good balance between speed and accuracy. 18 | [Link to article](https://doi.org/10.1038/s41587-020-00777-4). 19 | * __Avamb__: An obsolete ensemble model based on Vamb and adversarial autoencoders. 20 | Avamb has an accuracy in between Vamb and TaxVamb, but is more computationally demanding than either. 21 | We don't recommend running Avamb: If you have the compute to run it, you should instead run TaxVamb. 22 | See the [Avamb README page](https://github.com/RasmussenLab/avamb/tree/avamb_new/workflow_avamb) for more information. 23 | [Link to article](https://doi.org/10.1038/s42003-023-05452-3). 24 | 25 | And a taxonomy predictor: 26 | * __Taxometer__: This tool refines arbitrary taxonomy predictions (e.g. from `mmseqs taxonomy`) using kmer composition and co-abundance. 27 | [Link to article](https://www.nature.com/articles/s41467-024-52771-y) 28 | 29 | See also [our tool BinBencher.jl](https://github.com/jakobnissen/BinBencher.jl) for evaluating metagenomic bins when a ground truth is available, 30 | e.g. for simulated data or a mock microbiome. 31 | 32 | ## Quickstart 33 | For more details, and for how to run on an example dataset, [see the documentation](https://vamb.readthedocs.io/en/latest/). 34 | 35 | ```shell 36 | # Assemble your reads, one assembly per sample, e.g. with SPAdes 37 | for sample in 1 2 3; do 38 | spades.py --meta ${sample}.{fw,rv}.fq.gz -t 24 -m 100gb -o asm_${sample}; 39 | done 40 | 41 | # Concatenate your assemblies, and rename the contigs to the naming scheme 42 | # S{sample}C{original contig name}. This can be done with a script provided by Vamb 43 | # in the vamb/src directory 44 | python src/concatenate.py contigs.fna.gz asm_{1,2,3}/contigs.fasta 45 | 46 | # Estimate sample-wise abundance by mapping reads to the contigs. 
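# (The loop below assumes paired read files named {sample}.fw.fq.gz and {sample}.rv.fq.gz, matching the assembly step above; adjust the paths if your files are named differently.)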
47 | # Any mapper will do, but we recommend strobealign with the --aemb flag 48 | mkdir aemb 49 | for sample in 1 2 3; do 50 | strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv; 51 | done 52 | 53 | # Create an abundance TSV file from --aemb outputs using the script in vamb/src dir 54 | python src/merge_aemb.py aemb abundance.tsv 55 | 56 | # Run Vamb using the contigs and the directory with abundance files 57 | vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv 58 | ``` 59 | -------------------------------------------------------------------------------- /benchmark/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" 3 | CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" 4 | CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" 5 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 6 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 7 | 8 | [compat] 9 | CSV = "=0.10.15" 10 | CairoMakie = "=0.13.2" 11 | CategoricalArrays = "=0.10.8" 12 | DataFrames = "=1.7.0" 13 | Statistics = "=1.11.1" 14 | julia = "1.11" -------------------------------------------------------------------------------- /benchmark/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/benchmark/benchmark.png -------------------------------------------------------------------------------- /benchmark/benchmarks.csv: -------------------------------------------------------------------------------- 1 | version,dataset,run,nc,mq,seconds 2 | v3.0.6,Airways,3,55,100,9877.15 3 | v3.0.6,Airways,1,55,100,10829.63 4 | v3.0.6,Airways,2,55,100,10230.52 5 | v3.0.6,Gastrointestinal,1,83,103,4872.34 6 | v3.0.6,Gastrointestinal,3,83,103,4449.65 7 | v3.0.6,Gastrointestinal,2,83,103,4776.16 8 | v3.0.6,Oral,2,119,159,11790.15 9 | v3.0.6,Oral,1,119,159,11546.48 10 | v3.0.6,Oral,3,119,159,11500.1 11 | v3.0.6,Skin,2,79,128,10384.21 12 | v3.0.6,Skin,3,79,128,10302.17 13 | v3.0.6,Skin,1,79,128,10241.69 14 | v3.0.6,Urogenital,2,78,98,3606.18 15 | v3.0.6,Urogenital,3,78,98,3585.2 16 | v3.0.6,Urogenital,1,78,98,3614.02 17 | v4.1.3,Airways,2,58,103,2017.4 18 | v4.1.3,Airways,3,58,103,2045.9 19 | v4.1.3,Airways,1,58,103,2020.46 20 | v4.1.3,Gastrointestinal,2,88,109,1040.9 21 | v4.1.3,Gastrointestinal,1,88,109,992.93 22 | v4.1.3,Gastrointestinal,3,88,109,985.28 23 | v4.1.3,Oral,1,125,160,2498.71 24 | v4.1.3,Oral,2,125,160,2230.58 25 | v4.1.3,Oral,3,125,160,2163.1 26 | v4.1.3,Skin,2,77,131,2101.47 27 | v4.1.3,Skin,3,77,131,2180.22 28 | v4.1.3,Skin,1,77,131,2154.07 29 | v4.1.3,Urogenital,3,76,101,1691.71 30 | v4.1.3,Urogenital,1,76,101,1848.71 31 | v4.1.3,Urogenital,2,76,101,1892.25 32 | v5.0.2,Airways,3,74,111,2178.08 33 | v5.0.2,Airways,1,78,117,2805.76 34 | v5.0.2,Airways,2,73,114,2203.5 35 | v5.0.2,Gastrointestinal,1,123,145,1176.34 36 | v5.0.2,Gastrointestinal,3,123,145,998.33 37 | v5.0.2,Gastrointestinal,2,121,143,962.34 38 | v5.0.2,Oral,2,145,190,2623.77 39 | v5.0.2,Oral,1,144,179,3067.2 40 | v5.0.2,Oral,3,146,182,1990.03 41 | v5.0.2,Skin,2,102,152,2076.27 42 | v5.0.2,Skin,3,106,157,1511.35 43 | v5.0.2,Skin,1,98,149,2003.8 44 | v5.0.2,Urogenital,2,102,127,684.11 45 | v5.0.2,Urogenital,1,106,129,856.93 46 | v5.0.2,Urogenital,3,104,129,717.86 47 | -------------------------------------------------------------------------------- 
/benchmark/make_plots.jl: -------------------------------------------------------------------------------- 1 | using CairoMakie 2 | using DataFrames 3 | using CSV 4 | using Statistics 5 | using CategoricalArrays 6 | 7 | df = CSV.read("benchmarks.csv", DataFrame) 8 | 9 | sort!(df, [:version, :dataset]) 10 | 11 | df.version = categorical(df.version) 12 | df.dataset = categorical(df.dataset) 13 | 14 | n_versions = length(levels(df.version)) 15 | n_datasets = length(levels(df.dataset)) 16 | 17 | summary = combine( 18 | groupby(df, [:version, :dataset]), 19 | :nc => mean => :nc_mean, 20 | :mq => mean => :mq_mean, 21 | :seconds => (i -> minimum(i) / 3600) => :hours_min, 22 | ) 23 | 24 | combined = combine( 25 | groupby(summary, [:version]), 26 | :hours_min => sum => :hours_total 27 | ) 28 | 29 | colors = Makie.wong_colors(); 30 | fig = Figure(); 31 | 32 | grid = fig[1, 1:2] = GridLayout() 33 | 34 | # Plot accuracy 35 | let 36 | global ax_accuracy = Axis( 37 | grid[1, 1], 38 | title = "Accuracy", 39 | xticks = ( 40 | (((n_versions - 1) / 2) + 1):(n_versions + 1):(n_datasets * (n_versions + 1)), 41 | levels(summary.dataset), 42 | ), 43 | limits = (nothing, nothing, 0, nothing), 44 | ylabel = "Genomes recovered as NC / MQ bins", 45 | ylabelpadding = 0, 46 | xlabelpadding = -15, 47 | xticklabelrotation = 0.2, 48 | xticklabelpad = 10, 49 | xticklabelalign = (:center, :center), 50 | xlabel = "Dataset", 51 | ) 52 | 53 | xs = Float64[] 54 | ncs = Float64[] 55 | mqs = Float64[] 56 | for row in eachrow(summary) 57 | dataset_offset = (levelcode(row.dataset) - 1) * (n_versions + 1) 58 | version_offset = levelcode(row.version) - 1 59 | push!(xs, dataset_offset + version_offset + 1) 60 | push!(ncs, row.nc_mean) 61 | push!(mqs, row.mq_mean) 62 | end 63 | for (index, label) in enumerate(levels(summary.version)) 64 | indices = ((index - 1) * n_datasets + 1):(index * n_datasets) 65 | for (ys, alpha) in [(mqs, 0.6), (ncs, 1.0)] 66 | barplot!( 67 | ax_accuracy, 68 | xs[indices], 69 | ys[indices]; 70 | color = colors[index], 71 | alpha = alpha, 72 | width = 1, 73 | label = alpha == 1 ? label : nothing, 74 | ) 75 | end 76 | end 77 | end 78 | 79 | # Plot timing 80 | let 81 | global ax_runtime = Axis( 82 | grid[1, 2], 83 | title = "Runtime", 84 | limits = (nothing, nothing, 0, nothing), 85 | ylabel = "Runtime (all datasets, hours)", 86 | yticks = 1:11, 87 | xticksvisible = false, 88 | xticklabelsvisible = false, 89 | ) 90 | 91 | sort!(combined, [:version]; by = levelcode) 92 | for i in 1:n_versions 93 | barplot!( 94 | ax_runtime, 95 | [i], 96 | [combined.hours_total[i]], 97 | color = colors[i], 98 | width = 1, 99 | ) 100 | end 101 | end 102 | 103 | colgap!(grid, 10) 104 | colsize!(grid, 2, Relative(0.2)) 105 | 106 | axislegend(ax_accuracy, "Vamb version", position = :cb) 107 | 108 | save("benchmark.png", fig) 109 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Docs creation 2 | 3 | In order to build the docs you need to 4 | 5 | 1. install sphinx and additional support packages 6 | 2. build the package reference files 7 | 3. run sphinx to create a local html version 8 | 9 | The documentation is build using readthedocs automatically. 
10 | 11 | Install the docs dependencies of the package (as speciefied in toml): 12 | 13 | ```bash 14 | # in main folder 15 | pip install '.[docs]' 16 | ``` 17 | 18 | ## Build docs using Sphinx command line tools 19 | 20 | Command to be run from `path/to/doc`, i.e. from within the `doc` folder: 21 | 22 | Options: 23 | - `--separate` to build separate pages for each (sub-)module 24 | 25 | ```bash 26 | # pwd: doc 27 | # apidoc 28 | # sphinx-apidoc --force --implicit-namespaces --module-first -o reference ../vamb 29 | # build docs 30 | sphinx-build -n -W --keep-going -b html ./ ./_build/ 31 | ``` 32 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | from importlib import metadata 15 | 16 | 17 | # -- Project information ----------------------------------------------------- 18 | 19 | project = "vamb" 20 | copyright = "2024, Jakob Nybo Nissen, Simon Rasmussen" # ! please update 21 | author = "Jakob Nybo Nissen, Simon Rasmussen" 22 | PACKAGE_VERSION = metadata.version("vamb") 23 | version = PACKAGE_VERSION 24 | release = PACKAGE_VERSION 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | "sphinx.ext.autodoc", 34 | "sphinx.ext.autodoc.typehints", 35 | "sphinx.ext.viewcode", 36 | "sphinx.ext.napoleon", 37 | "sphinx.ext.intersphinx", 38 | "sphinx_new_tab_link", 39 | "myst_nb", 40 | ] 41 | 42 | # https://myst-nb.readthedocs.io/en/latest/computation/execute.html 43 | nb_execution_mode = "auto" 44 | 45 | myst_enable_extensions = ["dollarmath", "amsmath"] 46 | 47 | # Plolty support through require javascript library 48 | # https://myst-nb.readthedocs.io/en/latest/render/interactive.html#plotly 49 | html_js_files = [ 50 | "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" 51 | ] 52 | 53 | # https://myst-nb.readthedocs.io/en/latest/configuration.html 54 | # Execution 55 | nb_execution_raise_on_error = True 56 | # Rendering 57 | nb_merge_streams = True 58 | 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ["_templates"] 61 | 62 | # List of patterns, relative to source directory, that match files and 63 | # directories to ignore when looking for source files. 64 | # This pattern also affects html_static_path and html_extra_path. 
65 | exclude_patterns = [ 66 | "_build", 67 | "Thumbs.db", 68 | ".DS_Store", 69 | ".npz", 70 | ] 71 | 72 | 73 | # Intersphinx options 74 | intersphinx_mapping = { 75 | "python": ("https://docs.python.org/3", None), 76 | "torch": ("https://pytorch.org/docs/stable/index.html", None), 77 | "numpy": ("https://numpy.org/doc/stable/", None), 78 | # "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 79 | # "scikit-learn": ("https://scikit-learn.org/stable/", None), 80 | # "matplotlib": ("https://matplotlib.org/stable/", None), 81 | } 82 | 83 | # -- Options for HTML output ------------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # See: 88 | # https://github.com/executablebooks/MyST-NB/blob/master/docs/conf.py 89 | # html_title = "" 90 | html_theme = "sphinx_book_theme" 91 | # html_logo = "_static/logo-wide.svg" 92 | # html_favicon = "_static/logo-square.svg" 93 | html_theme_options = { 94 | "github_url": "https://github.com/RasmussenLab/vamb", 95 | "repository_url": "https://github.com/RasmussenLab/vamb", 96 | "repository_branch": "main", 97 | "home_page_in_toc": True, 98 | "path_to_docs": "docs", 99 | "show_navbar_depth": 2, 100 | "use_edit_page_button": True, 101 | "use_repository_button": True, 102 | "use_download_button": True, 103 | "launch_buttons": { 104 | "colab_url": "https://colab.research.google.com" 105 | # "binderhub_url": "https://mybinder.org", 106 | # "notebook_interface": "jupyterlab", 107 | }, 108 | "navigation_with_keys": False, 109 | } 110 | 111 | # Add any paths that contain custom static files (such as style sheets) here, 112 | # relative to this directory. They are copied after the builtin static files, 113 | # so a file named "default.css" will overwrite the builtin "default.css". 114 | # html_static_path = ["_static"] 115 | 116 | 117 | # -- Setup for sphinx-apidoc ------------------------------------------------- 118 | 119 | # Read the Docs doesn't support running arbitrary commands like tox. 120 | # sphinx-apidoc needs to be called manually if Sphinx is running there. 121 | # https://github.com/readthedocs/readthedocs.org/issues/1139 122 | 123 | if os.environ.get("READTHEDOCS") == "True": 124 | from pathlib import Path 125 | 126 | PROJECT_ROOT = Path(__file__).parent.parent 127 | PACKAGE_ROOT = PROJECT_ROOT / "vamb" 128 | 129 | # def run_apidoc(_): 130 | # from sphinx.ext import apidoc 131 | # 132 | # apidoc.main( 133 | # [ 134 | # "--force", 135 | # "--implicit-namespaces", 136 | # "--module-first", 137 | # "--separate", 138 | # "-o", 139 | # str(PROJECT_ROOT / "doc" / "reference"), 140 | # str(PACKAGE_ROOT), 141 | # str(PACKAGE_ROOT / "*.c"), 142 | # str(PACKAGE_ROOT / "*.so"), 143 | # ] 144 | # ) 145 | # 146 | # def setup(app): 147 | # app.connect("builder-inited", run_apidoc) 148 | -------------------------------------------------------------------------------- /doc/front_page.md: -------------------------------------------------------------------------------- 1 | # Variational Autoencoders for Metagenomic Binning (Vamb) 2 | 3 | Vamb is a family of metagenomic binners which feeds kmer composition and abundance into a variational autoencoder and clusters the embedding to form bins. 4 | Its binners perform excellently with multiple samples, and pretty good on single-sample data. 
5 | 6 | ## Programs in Vamb 7 | The Vamb package contains several programs, including three binners: 8 | 9 | * __TaxVamb__: A semi-supervised binner that uses taxonomy information from e.g. `mmseqs taxonomy`. 10 | TaxVamb produces the best results, but requires that you have run a taxonomic annotation workflow. 11 | [Link to article](https://doi.org/10.1101/2024.10.25.620172). 12 | * __Vamb__: The original binner based on variational autoencoders. 13 | This has been upgraded significantly since its original release. 14 | Vamb strikes a good balance between speed and accuracy. 15 | [Link to article](https://doi.org/10.1038/s41587-020-00777-4). 16 | * __Avamb__: An obsolete ensemble model based on Vamb and adversarial autoencoders. 17 | Avamb has an accuracy in between Vamb and TaxVamb, but is more computationally demanding than either. 18 | We don't recommend running Avamb: If you have the compute to run it, you should instead run TaxVamb. 19 | See the [Avamb README page](https://github.com/RasmussenLab/vamb/tree/master/workflow_avamb) for more information. 20 | [Link to article](https://doi.org/10.1038/s42003-023-05452-3). 21 | 22 | And a taxonomy predictor: 23 | * __Taxometer__: This tool refines arbitrary taxonomy predictions (e.g. from `mmseqs taxonomy`) using kmer composition and co-abundance. 24 | [Link to article](https://www.nature.com/articles/s41467-024-52771-y) 25 | 26 | See also [our tool BinBencher.jl](https://github.com/jakobnissen/BinBencher.jl) for evaluating metagenomic bins when a ground truth is available, 27 | e.g. for simulated data or a mock microbiome. 28 | -------------------------------------------------------------------------------- /doc/histogram.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/doc/histogram.npz -------------------------------------------------------------------------------- /doc/how_to_run.md: -------------------------------------------------------------------------------- 1 | # Running Vamb 2 | Most users will want to copy and change the commands from the quickstart section below. 3 | Users with more advanced data, or who really want to dig into Vamb to get the most out of it, should read the in-depth sections below. 4 | 5 | First figure out what you want to run: 6 | * Do you have contigs plus reads plus a taxonomic annotation of the contigs? Use __TaxVamb__ 7 | * Do you only have contigs plus reads and want a decent, fast binner? Use __Vamb__ 8 | 9 | We also support the now-obsolete __AVAMB__ binner. Its performance is in between TaxVamb and Vamb, 10 | but it requires more compute than either. 11 | I recommend new users run either TaxVamb or Vamb. 12 | 13 | ## Quickstart 14 | The general workflow looks like this. 15 | For more detailed information, see the documentation page on Vamb's inputs and outputs, as well as the page with tips on how to run Vamb. 16 | 17 | ```shell 18 | # Assemble your reads, one assembly per sample, e.g. with SPAdes 19 | for sample in 1 2 3; do 20 | spades.py --meta ${sample}.{fw,rv}.fq.gz -t 24 -m 100gb -o asm_${sample}; 21 | done 22 | 23 | # Concatenate your assemblies, and rename the contigs to the naming scheme 24 | # S{sample}C{original contig name}. This can be done with a script provided by Vamb 25 | # in the vamb/src directory 26 | python src/concatenate.py contigs.fna.gz asm_{1,2,3}/contigs.fasta 27 | 28 | # Estimate sample-wise abundance by mapping reads to the contigs. 
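# (Each aemb/<sample>.tsv written below should contain one row per contig with that contig's estimated abundance in the sample.)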
29 | # Any mapper will do, but we recommend strobealign with the --aemb flag 30 | mkdir aemb 31 | for sample in 1 2 3; do 32 | strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv; 33 | done 34 | 35 | # Create an abundance TSV file from --aemb outputs using the script in vamb/src dir 36 | python src/merge_aemb.py aemb abundance.tsv 37 | 38 | # Run Vamb using the contigs and the directory with abundance files 39 | vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv 40 | ``` 41 | 42 | ## Running with test data 43 | We provide example data under the "releases" section on the Vamb Github repository: https://github.com/RasmussenLab/vamb/releases/download/input_data/inputs.tar.gz 44 | 45 | After downloading, extract its content: 46 | ```shell 47 | $ tar -xzf inputs.tar.gz 48 | ``` 49 | 50 | This data is only for demonstrating the Vamb commands, and test running Vamb, and does not reflect a realistic metagenome. It is not suitable for benchmarking the accuracy of any binner. 51 | 52 | The following commands makes use of these example files. You can substitute those files with your own in the commands. 53 | 54 | 55 | ### Vamb 56 | Default command: 57 | 58 | ```shell 59 | $ vamb bin default --outdir out1 --fasta contigs.fna.gz --abundance_tsv abundances.tsv 60 | ``` 61 | 62 | ### TaxVamb 63 | For TaxVamb, it's almost the same, but we also provide the taxonomy file: 64 | 65 | ```shell 66 | $ vamb bin taxvamb --outdir out2 --fasta contigs.fna.gz --abundance_tsv abundances.tsv --taxonomy taxonomy.tsv 67 | ``` 68 | 69 | ### Taxometer 70 | Same default arguments as TaxVamb: 71 | 72 | ```shell 73 | $ vamb taxometer --outdir out3 --fasta contigs.fna.gz --abundance_tsv abundances.tsv --taxonomy taxonomy.tsv 74 | ``` 75 | 76 | ### AVAMB 77 | See the README.md file in the `workflow_avamb` directory. 78 | 79 | ### Reducing the number of epochs for testing 80 | For testing purposes, e.g. when running on the test data, it may be useful to reduce the number of training epochs, so Vamb finishes faster. 81 | This will cause Vamb's models to be severely underfitted and perform terribly, so doing it is only recommended for testing. 82 | 83 | * For Vamb: Add flags `-e 5 -q 2 3` 84 | * For TaxVamb: Add flags `-e 5 -q 2 3 -pe 5` 85 | * For Taxometer: Add flags `-pe 5` 86 | 87 | ## Explanation of command-line options 88 | Each program in Vamb only has a subset of the following options. 89 | 90 | * `-h, --help`: Print help and exit 91 | * `--version`: Print version to stdout and exit 92 | * `--outdir`: Output directory to create. Must not exist. Parent directory must exist. 93 | * `-m`: Ignore contigs shorter than this value. Too short contigs have an unstable kmer composition 94 | and abundance signal, and therefore adds too much noise to the binning process. 95 | * `-p` Number of threads to use. Note that Vamb has limited control over the number of threads used by 96 | its underlying libraries such as PyTorch, NumPy and BLAS. Although Vamb tries its best to limit the 97 | number of threads to the number specified, that might not always work. 98 | * `--norefcheck`: Disable reference hash checking between composition, abundance and taxonomic inputs. 99 | See the section on reference hash checking in the input section. 100 | * `--cuda`: Use a graphical processing unit for model training and clustering. 101 | Must have a CUDA-compatible version of PyTorch installed, and an NVIDIA GPU which supports CUDA. 
102 | * `--seed`: Pass an integer seed for the random number generation. Vamb will use this seed to attempt reproducibility. Note that PyTorch does not support reproducible training of models, so passing this seed does not guarantee that Vamb will produce the same results from the same data. 103 | * `--minfasta`: Output all bins with a total size (sum of contig lengths) greater than or equal to this 104 | number. The bins will be output in a directory called `bins` under the output directory, and each bin 105 | will be a FASTA file with the same name as the bin, suffixed by ".fna". 106 | * `-o` Set binsplit separator. See the section on binsplitting in "tips for running Vamb" section for its meaning. 107 | If not passed, defaults to `C` if 'C' is present in all identifiers. 108 | To disable binsplitting, pass `-o` without an argument. 109 | * `--no_predictor`: When running TaxVamb, if this flag is not set, TaxVamb will automatically run 110 | Taxometer when given an unrefined input taxonomy to refine it. 111 | Using a refined taxonomy usually improves the accuracy of TaxVamb. 112 | * `--fasta`: FASTA input file. See section on Vamb inputs and outputs. 113 | * `--composition`: NPZ composition input file. See section on Vamb inputs and outputs. 114 | * `--bamdir`: Directory with BAM files to use for abundance. See section on Vamb inputs and outputs. 115 | * `--abundance_tsv`: TSV file with precomputed abundances. See section on Vamb inputs and outputs. 116 | * `--abundance`: NPZ abundance input file. See section on Vamb inputs and outputs. 117 | * `--taxonomy`: TSV file with refined or unrefined taxonomy. See section on Vamb inputs and outputs. 118 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: front_page.md 2 | :parser: myst_parser.sphinx_ 3 | 4 | Table of contents 5 | ================== 6 | 7 | .. toctree:: 8 | :maxdepth: 3 9 | 10 | installation.md 11 | 12 | .. toctree:: 13 | :maxdepth: 3 14 | 15 | how_to_run.md 16 | 17 | .. toctree:: 18 | :maxdepth: 3 19 | 20 | inputs_outputs.md 21 | 22 | .. toctree:: 23 | :maxdepth: 3 24 | 25 | tips.md 26 | 27 | Indices and tables 28 | ================== 29 | 30 | * :ref:`genindex` 31 | * :ref:`modindex` 32 | * :ref:`search` 33 | -------------------------------------------------------------------------------- /doc/inputs_outputs.md: -------------------------------------------------------------------------------- 1 | # Vamb inputs and outputs 2 | All modes of Vamb takes various _inputs_ and produces various _outputs_. 3 | Currently, all modes take the following two central inputs: 4 | 5 | * The kmer-composition of the sequence (the _composition_). 6 | * The abundance of the contigs in each sample (the _abundance_). 7 | 8 | For inputs that take significant time to produce, Vamb will serialize the parsed input to a file, such that future runs of Vamb can use that instead of re-computing it. 9 | 10 | ## Composition 11 | The composition is computed from the input contig file in FASTA format (the 'catalogue'). 12 | From command line, this looks like: 13 | 14 | ```shell 15 | --fasta contigs.fna.gz 16 | ``` 17 | 18 | Where the catalogue may be either gzipped or a plain FASTA file. 19 | 20 | Vamb produces the best results when run with the "multi-split" workflow, as demonstrated in the quickstart section in "how to run Vamb". 
21 | In this workflow, samples are assembled independently, and the resulting contigs are concatenated to a single FASTA file before binning. 22 | After binning, the bins can be split into sample-wise pure bins. 23 | 24 | To do this splitting (termed "binsplitting"), Vamb needs to know which contig came from which sample. 25 | Therefore, it's recommended that the FASTA headers are formatted in the following pattern: 26 | `{samplename}C{contigname}` 27 | 28 | Where `{samplename}` is some text that uniquely identifies each sample, and `{contigname}` uniquely identifies each contig within a sample. 29 | For example, if the samples are named S1, S2, S3, etc., and the contigs are named 1, 2, 3, etc, a FASTA header may be `S3C119`. 30 | 31 | After the `composition.npz` has been created, Vamb will write the composition in the output file `composition.npz`. 32 | Future runs of Vamb can then instead use the following option to load the composition directly: 33 | 34 | ```shell 35 | --composition composition.npz 36 | ``` 37 | 38 | ## Abundance 39 | The abundance may be computed from either: 40 | * A TSV file with the header being "contigname" followed by one sample name per sample, 41 | and the values in the TSV file being precomputed abundances. 42 | These may be derived from `paste`ing together outputs from the tool `strobealign --aemb`, and concatenating the result to a TSV header. 43 | * A directory of sorted BAM files generated by mapping the reads of each sample to the contig catalogue. 44 | 45 | On the command line, abundance input can be specified as: 46 | ```shell 47 | --abundance_tsv abundance.tsv 48 | ``` 49 | or 50 | ```shell 51 | --bamdir dir_with_bam_files 52 | ``` 53 | 54 | Once the abundance has been parsed, Vamb will produce the file `abundance.npz`, which can be used for future 55 | Vamb runs instead: 56 | ```shell 57 | --abundance abundance.npz 58 | ``` 59 | 60 | ### Abundance TSV format 61 | This follows the ordinary `.tsv` format, with a mandatory header, and disallowing `\t` in contig names. 62 | The header of the first column name must be exactly `contigname`. 63 | 64 | The abundance TSV file in the correct format can be created from the output of `strobealign --aemb` using the script in `src/merge_aemb.py`: 65 | 66 | ```shell 67 | python src/merge_aemb.py input_dir abundance.tsv 68 | ``` 69 | 70 | Example file: 71 | ``` 72 | contigname S1 S2 S3 73 | S1C1 1.53 1.11 4.1e2 74 | S1C2 0.94 9.2 5.1e2 75 | S2C1 1.2e-3 0 9.2 76 | S3C1 88.21 51.2 12.1e3 77 | S3C2 14.1 90.1 13.01 78 | ``` 79 | 80 | ### Abundance as BAM files 81 | If you don't want to compute abundance using `strobealign --aemb` and create a TSV file from its output (recommended), 82 | Vamb can compute abundances from BAM files. 83 | 84 | To do this: 85 | 86 | * Create the FASTA contig catalogue as described in the section of "composition". 87 | * Map the reads for each sample to the catalogue, to obtain on BAM file per sample. 88 | 89 | Using the aligner [minimap2](https://github.com/lh3/minimap2) as well as [samstrip](https://github.com/jakobnissen/samstrip) and [samtools](https://github.com/samtools/samtools), the commands may be: 90 | 91 | ```shell 92 | # Index the FASTA file so it only has to be done once instead of on every mapping 93 | minimap2 -I 32G -d catalogue.mmi catalogue.fasta; 94 | 95 | # Map each sample. Here, using 8 threads, using paired short reads. 
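# (samstrip removes fields not needed downstream to shrink the files; samtools view -F 3584 drops alignments flagged as QC-fail (512), duplicate (1024) or supplementary (2048).)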
96 | minimap2 -t 8 -ax sr catalogue.mmi s1.fw.fq.gz s1.rv.fq.gz | samstrip | samtools view -F 3584 -b - > s1.bam
97 | ```
98 | 
99 | _Note that if you use minimap2 specifically, be aware of [this bug in minimap2](https://github.com/lh3/minimap2/issues/37), where, if the index
100 | is not large enough, the output will be an invalid SAM file. To get around this, use enough RAM when indexing (e.g. set option `-I` appropriately)_
101 | 
102 | ### Reference hash checking
103 | To ensure the integrity of the data, Vamb will compare the identifiers in the composition (ultimately: headers in the FASTA file) with the contig names from the abundance input (TSV file contig names, or BAM sequence names) and, if provided, those of the taxonomic input.
104 | 
105 | To do this efficiently, the identifiers are _hashed_ to produce a _reference hash_ (refhash), which is compared, and an error is thrown if the hashes differ.
106 | 
107 | If you, for some reason, can't create input files with matching identifiers, and you are 100% sure the order of the sequences is identical in the composition input and abundance input, you can disable this reference hashing with the `--norefcheck` option.
108 | 
109 | ## Taxonomy
110 | Vamb operates with two kinds of taxonomies:
111 | * _Unrefined_ taxonomies give the taxonomic annotation for each contig
112 | * _Refined_ taxonomies give the taxonomic annotation _plus a probability estimate_ for each contig
113 | 
114 | Vamb's __Taxometer__ tool can be used to refine a taxonomy.
115 | It takes an unrefined taxonomy as input, and outputs a refined taxonomy.
116 | TaxVamb usually achieves better results if its taxonomy input is refined with Taxometer.
117 | 
118 | Both refined and unrefined taxonomies can be used for TaxVamb.
119 | By default, if TaxVamb gets an unrefined taxonomy, it will automatically refine it with Taxometer, unless `--no_predictor` is passed.
120 | 
121 | Taxonomy files are TSV files with the following format:
122 | * Header: `contigs\tpredictions` for unrefined taxonomies and `contigs\tpredictions\tscores` for refined ones.
123 | * In the `contigs` column: The FASTA identifier for every contig in the catalogue.
124 | * In the `predictions` column: A semicolon-separated string with taxonomic levels, for each of the following seven ranks, in order:
125 | domain, phylum, class, order, family, genus, species. Lower ranks may be omitted.
126 | There is no requirement that the labels are actually meaningful, i.e. that they correspond to any real taxonomic clade.
127 | * In the `scores` column: A semicolon-separated list of floats, one per element in the `predictions` column.
128 | 
129 | The following are examples of VALID rows in the prediction column:
130 | ```
131 | Bacteria;Bacillota;Clostridia
132 | Bacteria;Bacillota;Bacilli;Bacillales
133 | Bacteria;Pseudomonadota;Gammaproteobacteria;Moraxellales;Moraxellaceae;Acinetobacter;Acinetobacter sp. TTH0-4
134 | ```
135 | 
136 | The following are examples of INVALID rows in the prediction column:
137 | * Invalid: Begins with class instead of domain: `Clostridia;Eubacteriales;Lachnospiraceae;Roseburia;Roseburia hominis`
138 | * Invalid: Skips the phylum: `Bacteria;Gammaproteobacteria;Moraxellales;Moraxellaceae;Acinetobacter;Acinetobacter sp. TTH0-4`
139 | 
140 | The following is an example of a valid, unrefined taxonomy file:
141 | ```
142 | contigs predictions
143 | S18C13 Bacteria;Bacillota;Clostridia;Eubacteriales
144 | S18C25 Bacteria;Pseudomonadota
145 | S18C67 Bacteria;Bacillota;Bacilli;Bacillales;Staphylococcaceae;Staphylococcus
146 | ```
147 | 
148 | Our tool [__Taxconverter__](https://github.com/RasmussenLab/taxconverter) can be used to create unrefined taxonomy files from MMSeqs2, Centrifuge, Kraken2, Metabuli or MetaMaps output files.
149 | 
150 | # Outputs
151 | 
152 | ## Vamb
153 | - `log.txt` - A text file with information about the Vamb run. Look here (and at stderr) if you experience errors.
154 | - `composition.npz`: A Numpy .npz file that contains all kmer composition information computed by Vamb from the FASTA file.
155 | This can be provided to another run of Vamb to skip the composition calculation step.
156 | This is not produced if an existing `composition.npz` was used to run Vamb.
157 | - `abundance.npz`: Similar to `composition.npz`, but this file contains information calculated from the abundance TSV file (or BAM files).
158 | Using this as input instead of BAM files will skip re-parsing the BAM files, which takes a significant amount of time.
159 | This file is not produced if an existing `abundance.npz` was used to run Vamb.
160 | - `model.pt`: A file containing the trained VAE model. When running Vamb from a Python interpreter, the VAE can be loaded from this file to skip training.
161 | - `latent.npz`: This contains the output of the VAE model, the embedding of each of the contigs.
162 | - `bins`: If `--minfasta` is set, this is a directory with one FASTA file per bin, after binsplitting.
163 | - `vae_clusters_unsplit.tsv` - A two-column TSV file with the header `clustername\tcontigname`, then one row per sequence:
164 | Left column for the cluster (i.e. bin) name, right column for the sequence name.
165 | You can create the FASTA-file bins themselves using the script in `src/create_fasta.py`.
166 | - (if binsplitting is enabled:) `vae_clusters_split.tsv`, similar to the unsplit version, but after binsplitting.
167 | See the section on binsplitting on the page "tips for running Vamb".
168 | - `vae_clusters_metadata.tsv`: A file with some metadata about clusters.
169 |   - Name: The name of the cluster
170 |   - Radius: Cosine radius in embedding space. Clusters with a small radius are usually more likely to be pure.
171 |   - Peak/valley ratio: A small PVR means the cluster's edges are more well-defined, and hence the cluster is more likely to be pure
172 |   - Kind: Currently, Vamb produces three kinds of clusters:
173 |     - Normal: Defined by a local density in latent space. Most good clusters are of this type
174 |     - Loner: A contig far away from everything else in latent space.
175 |     - Fallback: After failing to produce good clusters for some time, these (usually poor) clusters are created
176 |     to avoid getting stuck in an infinite loop when clustering
177 |   - Bp: Sum of the lengths of all sequences in the cluster
178 |   - Ncontigs: Number of sequences in the cluster
179 |   - Medoid: Name of the contig used as the cluster's medoid, i.e. the center of the cluster
180 | 
181 | ## TaxVamb
182 | * `log.txt`, `composition.npz` and `abundance.npz`: Same as when running `Vamb`
183 | * `predictor_model.pt` and `results_taxometer.tsv`: If Taxometer was used to automatically refine TaxVamb. See the Taxometer output section.
184 | * `vaevae_clusters_{split,unsplit,metadata}.tsv`: Same as when running `Vamb`, but from TaxVamb's VAEVAE model
185 | * `vaevae_model.pt`: A PyTorch model with the trained VAEVAE model.
186 | 
187 | ## Taxometer
188 | * `log.txt`, `composition.npz` and `abundance.npz`: Same as when running `Vamb`
189 | * `predictor_model.pt`: A PyTorch model file containing the trained predictor.
190 | * `results_taxometer.tsv`: A refined taxonomy file (see the section on files on the "how to run" page)
191 | 
192 | ## AVAMB
193 | Same as Vamb, but also:
194 | - `aae_y_clusters_{split,unsplit}.tsv`: The clusters obtained from the categorical latent space
195 | - `aae_z_latent.npz`: Like `latent.npz`, but of the adversarial Z latent space
196 | - `aae_z_clusters_{metadata,split,unsplit}.tsv`: Like the corresponding `vae_clusters*` files, but from the adversarial Z latent space
197 | 
198 | 
--------------------------------------------------------------------------------
/doc/installation.md:
--------------------------------------------------------------------------------
1 | # How to install Vamb
2 | Vamb is in continuous development, and the latest versions are significantly better than older versions.
3 | For the best results, make sure to install the latest released version.
4 | 
5 | ## Recommended: Install with `pip`
6 | Recommended: Vamb can be installed with `pip` (thanks to a contribution from C. Titus Brown):
7 | ```shell
8 | pip install vamb
9 | ```
10 | 
11 | Note: Check that you've installed the latest version by comparing the installed version with [the latest version on PyPI](https://pypi.org/project/vamb/#history).
12 | 
13 | Note: An active Conda environment can hijack your system's linker, causing an error during installation.
14 | If you see issues, either deactivate `conda`, or delete the `~/miniconda/compiler_compats` directory before installing with pip.
15 | 
16 | ## Install a specific version of Vamb
17 | If you want to install the latest version from GitHub, or you want to change Vamb's source code, you should install it like this:
18 | 
19 | ```shell
20 | # Clone the desired branch from the repository, here master
21 | git clone https://github.com/RasmussenLab/vamb -b master
22 | cd vamb
23 | # The `-e` flag makes the installed Vamb reflect later changes to the source code
24 | pip install -e .
25 | ```
26 | 
27 | __Note that the master branch is work-in-progress, has not been thoroughly tested, and is expected to have more bugs.__
28 | 
29 | ## Avoid using Conda to install Vamb
30 | The version of Vamb currently on BioConda is out of date, and significantly less accurate than the latest version.
31 | We have also experienced that our users have more issues with installations from Conda.
32 | We will only be releasing new versions to be installable with `pip`.
33 | 
--------------------------------------------------------------------------------
/doc/tips.md:
--------------------------------------------------------------------------------
1 | # Tips for running Vamb
2 | 
3 | ## Use the latest released version
4 | Vamb generally gets faster and more accurate over time, so it's worth it to get the latest version.
5 | Note that Conda releases are typically (far) behind pip releases, so I recommend installation using pip.
6 | 
7 | ```{image} ../benchmark/benchmark.png
8 | :alt: Vamb gets better over time
9 | :width: 600px
10 | ```
11 | _Figure 1: Newer Vamb releases are faster and more accurate.
12 | Results are from binning the CAMI2 toy human microbiome short read gold standard assembly datasets, using the recommended multi-split workflow, and binned with Vamb using default settings._
13 | 
14 | ## Garbage in, garbage out
15 | For the best results when running Vamb, make sure the inputs to Vamb are as good as they can be.
16 | In particular, the assembly process is a major bottleneck in the total binning workflow, so improving assembly
17 | by e.g. preprocessing reads, using a better assembler, or switching to long read technology can make a big difference.
18 | 
19 | ## Postprocess your bins
20 | By design, Vamb will bin every single input contig.
21 | Currently, Vamb's bins are also _disjoint_, meaning each contig is present in only one bin.
22 | 
23 | Having to place every contig into a bin, even those with a weak binning signal,
24 | means that a large number of contigs will be binned poorly.
25 | Often, these poor-quality contigs are put in a bin of their own, or with just one or two smaller contigs.
26 | Practically speaking, this means _most bins produced by Vamb will be of poor quality_.
27 | 
28 | Hence, to use bins you can rely on, you will need to postprocess your bins:
29 | * You may filter the bins by size, if you are only looking for organisms
30 | and not e.g. plasmids.
31 | For example, removing all bins < 250,000 bp in size will remove most poor quality bins,
32 | while keeping all bacterial genomes with a reasonable level of completeness.
33 | * Using tools such as CheckM2 to score your bins, you can keep only the bins
34 | that pass some scoring criteria.
35 | * You may use the information in the `vae_clusters_metadata.tsv` file (see Output),
36 | and e.g. remove all clusters marked as "Fallback", below a certain size, or with a too
37 | high peak-valley ratio. However, this is only recommended for advanced users.
38 | 
39 | ## How binsplitting works
40 | In the recommended workflow, each sample is assembled independently, then the contigs are pooled
41 | and binned together.
42 | After Vamb has encoded the input features into the embedding (latent space), the embedding is clustered
43 | into clusters.
44 | The clusters thus may contain contigs from multiple samples, and may represent the same genome assembled
45 | in different samples.
46 | To obtain mono-sample bins from the clusters, the clusters are then split by their sample of origin in a process we call binsplitting.
47 | This reduces duplication in the output bins, and better preserves inter-sample diversity.
48 | 
49 | Binsplitting is done by looking at the identifiers (headers) of the contigs in the FASTA file:
50 | They are assumed to be named according to the scheme `{sample identifier}{separator}{contig identifier}`,
51 | where:
52 | * The sample identifier uniquely identifies the sample that the contig came from,
53 | * The separator separates the sample- and contig identifier, and is guaranteed to not be contained in the sample identifier
54 | * The contig identifier uniquely identifies the contig within the sample.
55 | When using the provided `src/concatenate.py` script, the names conform to this scheme, being named e.g.
56 | `S5C1042`, for sample 5, contig 1042. In this case, the binsplit separator is 'C'.
57 | 
58 | The separator can be set on the command line with the flag `-o`.
59 | It defaults to 'C' if all headers contain a 'C'.
60 | To disable binsplitting, pass `-o` without an argument.
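To make the mechanics concrete, here is a minimal Python sketch of the idea. It is not Vamb's actual implementation, and the `{sample}_{cluster}` naming of the output bins is invented for this example; it only illustrates how contig names are partitioned on the separator so each cluster is divided into one bin per sample:

```python
from collections import defaultdict


def binsplit(clusters: dict[str, set[str]], separator: str = "C") -> dict[str, set[str]]:
    """Split each cluster into per-sample bins, using the text before the first
    occurrence of `separator` in each contig name as the sample identifier."""
    bins: dict[str, set[str]] = defaultdict(set)
    for cluster, contigs in clusters.items():
        for contig in contigs:
            # "S5C1042".partition("C") -> ("S5", "C", "1042")
            sample, _, _ = contig.partition(separator)
            bins[f"{sample}_{cluster}"].add(contig)
    return dict(bins)


# One cluster with contigs from two samples becomes two sample-pure bins
split = binsplit({"1": {"S1C10", "S1C11", "S2C7"}})
assert split["S1_1"] == {"S1C10", "S1C11"}
assert split["S2_1"] == {"S2C7"}
```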
61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html 2 | [project] 3 | dynamic = ["version"] 4 | name = "vamb" 5 | dependencies = [ 6 | "vambcore == 0.1.2", 7 | "numpy == 1.26.4", 8 | "torch == 2.6.0", 9 | "pycoverm == 0.6.2", 10 | "networkx == 3.4.2", 11 | "scikit-learn == 1.6.1", 12 | "dadaptation == 3.2", 13 | "loguru == 0.7.3", 14 | "pyhmmer == 0.10.15", 15 | "pyrodigal == 3.6.3", 16 | ] 17 | # Currently pycoverm does not have binaries for Python > 3.13. 18 | # The dependency resolver, will not error on Python 3.14, but attempt 19 | # to build pycoverm from source, but will not get the deps required for that. 20 | requires-python = "<3.14,>=3.10.0" 21 | scripts = {vamb = "vamb.__main__:main"} 22 | 23 | [project.optional-dependencies] 24 | docs = [ 25 | "sphinx", 26 | "sphinx-book-theme", 27 | "myst-nb", 28 | "ipywidgets", 29 | "sphinx-new-tab-link!=0.2.2", 30 | ] 31 | 32 | [metadata] 33 | authors = [ 34 | {name = "Jakob Nybo Nissen", email = "jakobnybonissen@gmail.com"}, 35 | {name = "Pau Piera", email = "pau.piera@cpr.ku.dk"}, 36 | {name = "Simon Rasmussen", email = "simon.rasmussen@cpr.ku.dk"}, 37 | ] 38 | description = "Variational and Adversarial autoencoders for Metagenomic Binning" 39 | license = "MIT" 40 | readme = {file = "README.md"} 41 | url = "https://github.com/RasmussenLab/vamb" 42 | 43 | [build-system] 44 | requires = [ 45 | "setuptools ~= 70.1", 46 | "setuptools-scm >= 8.0", 47 | ] 48 | build-backend = "setuptools.build_meta" 49 | 50 | [tool.ruff] 51 | lint.ignore = [ 52 | "E722", # Use bare except. 53 | "E402", # import not at top - needed for the hack in __init__.py 54 | ] 55 | 56 | # pyproject.toml 57 | [tool.pytest.ini_options] 58 | filterwarnings = [ 59 | "error", 60 | "ignore::DeprecationWarning", 61 | "ignore::UserWarning", 62 | ] 63 | 64 | [tool.setuptools_scm] 65 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options] 2 | packages = find: 3 | include_package_data = True 4 | -------------------------------------------------------------------------------- /src/concatenate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import argparse 5 | import gzip 6 | import vamb 7 | 8 | parser = argparse.ArgumentParser( 9 | description="""Creates the input FASTA file for Vamb. 10 | Input should be one or more FASTA files, each from a sample-specific assembly. 
11 | If keepnames is False, resulting FASTA can be binsplit with separator 'C'.""", 12 | formatter_class=argparse.RawDescriptionHelpFormatter, 13 | add_help=True, 14 | ) 15 | 16 | parser.add_argument("outpath", help="Path to output FASTA file") 17 | parser.add_argument("inpaths", help="Paths to input FASTA file(s)", nargs="+") 18 | parser.add_argument( 19 | "-m", 20 | dest="minlength", 21 | metavar="", 22 | type=int, 23 | default=2000, 24 | help="Discard sequences below this length [2000]", 25 | ) 26 | parser.add_argument( 27 | "--keepnames", action="store_true", help="Do not rename sequences [False]" 28 | ) 29 | parser.add_argument("--nozip", action="store_true", help="Do not gzip output [False]") 30 | 31 | args = parser.parse_args() 32 | 33 | # Check inputs 34 | for path in args.inpaths: 35 | if not os.path.isfile(path): 36 | raise FileNotFoundError(path) 37 | 38 | if os.path.exists(args.outpath): 39 | raise FileExistsError(args.outpath) 40 | 41 | outpath = os.path.normpath(args.outpath) 42 | parent = os.path.dirname(outpath) 43 | if parent != "" and not os.path.isdir(parent): 44 | raise NotADirectoryError( 45 | f'Output file cannot be created: Parent directory "{parent}" is not an existing directory' 46 | ) 47 | 48 | # Run the code. Compressing DNA is easy, this is not much bigger than level 9, but 49 | # many times faster 50 | filehandle = ( 51 | open(outpath, "w") if args.nozip else gzip.open(outpath, "wt", compresslevel=1) 52 | ) 53 | try: 54 | vamb.vambtools.concatenate_fasta( 55 | filehandle, args.inpaths, minlength=args.minlength, rename=(not args.keepnames) 56 | ) 57 | except: 58 | filehandle.close() 59 | raise 60 | -------------------------------------------------------------------------------- /src/create_fasta.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import vamb 4 | import pathlib 5 | 6 | parser = argparse.ArgumentParser( 7 | description="""Command-line bin creator. 8 | Will read the entire content of the FASTA file into memory - beware.""", 9 | formatter_class=argparse.RawDescriptionHelpFormatter, 10 | add_help=False, 11 | ) 12 | 13 | parser.add_argument("fastapath", help="Path to FASTA file") 14 | parser.add_argument("clusterspath", help="Path to clusters.tsv") 15 | parser.add_argument("minsize", help="Minimum size of bin in bp", type=int, default=0) 16 | parser.add_argument("outdir", help="Directory to create") 17 | 18 | if len(sys.argv) == 1: 19 | parser.print_help() 20 | sys.exit() 21 | 22 | args = parser.parse_args() 23 | 24 | # Read in FASTA files only to get its length. 
This way, we can avoid storing 25 | # in memory contigs for sequences that will never get output anyway 26 | lens: dict[str, int] = dict() 27 | with vamb.vambtools.Reader(args.fastapath) as file: 28 | for record in vamb.vambtools.byte_iterfasta(file, args.fastapath): 29 | lens[record.identifier] = len(record) 30 | 31 | with open(args.clusterspath) as file: 32 | clusters = vamb.vambtools.read_clusters(file) 33 | 34 | clusters = { 35 | cluster: contigs 36 | for (cluster, contigs) in clusters.items() 37 | if sum(lens[c] for c in contigs) >= args.minsize 38 | } 39 | 40 | with vamb.vambtools.Reader(args.fastapath) as file: 41 | vamb.vambtools.write_bins(pathlib.Path(args.outdir), clusters, file, maxbins=None) 42 | -------------------------------------------------------------------------------- /src/create_kernel.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Create kernel for use in kmer frequencies. 2 | Method copied from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2765972/ 3 | 4 | Principle: 5 | There are 256 tetranucleotides, so a frequency distribution (tetranucleotide frequency, TNF) 6 | is a length 256 vector. But the individual TNFs are not independent. For example, AAAT 7 | must correlate highly with AATA. The TNFs are subject to at least 3 linear constrains: 8 | 9 | 1) The vector must sum to one. We simply shift the TNF down by 1/256 to make it sum to zero 10 | for simplicity instead. 11 | 2) We cannot distinguish between a kmer and its reverse complement because the sequencede 12 | strand is arbitrary. So we must count e.g. AGAT as one half of AGAT and one half ATCT. 13 | So each kmer's frequency is the same as its reverse-complement. 14 | 3) Every time a kmer is observed, the next kmer must have three overlapping nucleotides. 15 | E.g. every observation of AGAT is followed by GATA, GATC, GATG or GATT. Same for previous 16 | kmer. in other words, sum(xABC) = sum(ABCx). 17 | This is not true right at the ends of the sequences because the kmers stop eventually, but 18 | that can be considered a measurement error, and we don't care about it. 19 | 20 | We list these linear constrains and produce kernel L that works on tnf matrix T such that 21 | TL = P, a smaller projected TNF space. 22 | 23 | Notably, for constraint 2 to be true, we need to average the frequency between a kmer 24 | and its reverse complement. We can do this with a matrix multiply with an averaging kernel 25 | R. So: 26 | 27 | P = (TR)L = T(RL) = TK 28 | 29 | We thus calculate K = RL and save this for use in Vamb for projection. 
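(Added note, not part of the original docstring: as a dimension check, if T is an
(n x 256) matrix of TNFs for n sequences, R is the (256 x 256) reverse-complement
averaging kernel and L is the (256 x 103) projection kernel from the null space
below, then K = RL is (256 x 103) and P = TK is (n x 103); the (256, 103) shape is
what create_projection_kernel asserts for L.)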
30 | """ 31 | 32 | from os.path import abspath, dirname, join 33 | import numpy as np 34 | import itertools 35 | from scipy.linalg import null_space 36 | 37 | 38 | def reverse_complement(nuc): 39 | table = str.maketrans("ACGT", "TGCA") 40 | return nuc[::-1].translate(table) 41 | 42 | 43 | def all_kmers(k): 44 | for i in itertools.product("ACGT", repeat=k): 45 | yield ("".join(i)) 46 | 47 | 48 | def create_projection_kernel(): 49 | indexof = {kmer: i for i, kmer in enumerate(all_kmers(4))} 50 | linear_equations = list() 51 | 52 | # Constraint one: Frequencies sum to one (or in this scaled case, zero) 53 | linear_equations.append([1] * 256) 54 | 55 | # Constaint two: Frequencies are same as that of reverse complement 56 | for kmer in all_kmers(4): 57 | revcomp = reverse_complement(kmer) 58 | 59 | # Only look at canonical kmers - this makes no difference 60 | if kmer >= revcomp: 61 | continue 62 | 63 | line = [0] * 256 64 | line[indexof[kmer]] = 1 65 | line[indexof[revcomp]] = -1 66 | linear_equations.append(line) 67 | 68 | # Constraint three: sum(ABCx) = sum(xABC) 69 | for trimer in all_kmers(3): 70 | line = [0] * 256 71 | for suffix in "ACGT": 72 | line[indexof[trimer + suffix]] += 1 73 | for prefix in "ACGT": 74 | line[indexof[prefix + trimer]] += -1 75 | linear_equations.append(line) 76 | 77 | linear_equations = np.array(linear_equations) 78 | kernel = null_space(linear_equations).astype(np.float32) 79 | assert kernel.shape == (256, 103) 80 | return kernel 81 | 82 | 83 | def create_rc_kernel(): 84 | indexof = {kmer: i for i, kmer in enumerate(all_kmers(4))} 85 | rc_matrix = np.zeros((256, 256), dtype=np.float32) 86 | for col, kmer in enumerate(all_kmers(4)): 87 | revcomp = reverse_complement(kmer) 88 | rc_matrix[indexof[kmer], col] += 0.5 89 | rc_matrix[indexof[revcomp], col] += 0.5 90 | 91 | return rc_matrix 92 | 93 | 94 | def create_dual_kernel(): 95 | return np.dot(create_rc_kernel(), create_projection_kernel()) 96 | 97 | 98 | dual_kernel = create_dual_kernel() 99 | 100 | # Prevent overwriting kernel when running tests 101 | if __name__ == "__main__": 102 | path = join(dirname(dirname(abspath(__file__))), "vamb", "kernel.npz") 103 | np.savez_compressed(path, dual_kernel) 104 | -------------------------------------------------------------------------------- /src/merge_aemb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | from math import isinf, isnan 5 | from pathlib import Path 6 | 7 | parser = argparse.ArgumentParser( 8 | description="""Merge output files of `strobealign --aemb` to a single abundance TSV file. 9 | The sample names will be the basenames of the paths in the input directory.""", 10 | formatter_class=argparse.RawDescriptionHelpFormatter, 11 | add_help=True, 12 | ) 13 | 14 | parser.add_argument("input_dir", help="Path to directory of --aemb output files") 15 | parser.add_argument( 16 | "output_file", help="Path to write output TSV file (must not exist)" 17 | ) 18 | 19 | args = parser.parse_args() 20 | 21 | 22 | def exit_with(message: str): 23 | print(message, file=sys.stderr) 24 | exit(1) 25 | 26 | 27 | # Check input directory exists 28 | input = Path(args.input_dir) 29 | output = Path(args.output_file) 30 | 31 | if not input.is_dir(): 32 | exit_with(f"Error: Input is not an existing directory: '{input}'") 33 | 34 | # Check output file's parent is an existing directory. 
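# (Added note, not in the original: this check, and the exists-check just below, run
# before any aemb file is parsed, so a bad output path fails fast rather than after
# all input files have been read.)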
35 | if not output.parent.is_dir(): 36 | exit_with( 37 | f"Error: Output file cannot be created: Parent directory '{output.parent}' is not an existing directory" 38 | ) 39 | 40 | if output.exists(): 41 | exit_with(f"Error: Output file already exists: '{output}'") 42 | 43 | files = sorted(input.iterdir()) 44 | 45 | for file in files: 46 | for char in ("\n", "\r", "\t", "\v"): 47 | if char in file.name: 48 | exit_with( 49 | f"Error: File name '{file.name}' contains a char {repr(char)}, which is not permitted in Vamb" 50 | ) 51 | 52 | 53 | def exit_on_line(path: Path, line: int, message: str): 54 | exit_with(f"Error: {message}, in file '{path}' on line {line}") 55 | 56 | 57 | # We allow an empty directory, but let's warn the user since it's an easy mistake 58 | # to make. 59 | if len(files) == 0: 60 | # N.B: We don't use exitwith here, because we want the exit code to be 0 61 | # indicating this is not an error. 62 | print("Warning: No files in input directory", sys.stderr) 63 | exit(0) 64 | 65 | 66 | # Parses an --aemb file, yielding (identifier, depth), where depth is a non-negative, 67 | # non-inf, non-nan float. 68 | def parse_lines(path: Path): 69 | with open(path) as file: 70 | for lineno_minus_one, line in enumerate(file): 71 | line = line.rstrip() 72 | 73 | # If line is empty or whitespace-only, it must be the last line. 74 | # --aemb does not produce trailing whitespace 75 | if not line: 76 | for next_line in file: 77 | if next_line.rstrip(): 78 | exit_on_line( 79 | path, lineno_minus_one + 1, "Found non-trailing empty line" 80 | ) 81 | return 82 | 83 | fields = line.split("\t") 84 | 85 | # Currently --aemb only outputs two columns, but they document explicitly 86 | # that they may add other columns in the future 87 | if len(fields) < 2: 88 | exit_on_line( 89 | path, lineno_minus_one + 1, "Not at least two tab-separated columns" 90 | ) 91 | 92 | (identifier, depth_str) = (fields[0], fields[1]) 93 | try: 94 | depth = float(depth_str) 95 | except ValueError: 96 | exit_on_line( 97 | path, lineno_minus_one + 1, "Depth cannot be parsed as float" 98 | ) 99 | except: 100 | raise 101 | 102 | if isnan(depth) or isinf(depth) or depth < 0.0: 103 | exit_on_line( 104 | path, lineno_minus_one + 1, "Depth is negative, NaN or infinite" 105 | ) 106 | 107 | yield (identifier, depth) 108 | 109 | 110 | # We allow the order of rows to differ between the files, so we need to be able 111 | # to convert an identifier into a row index for subsequent files 112 | identifier_to_index: dict[str, int] = dict() 113 | 114 | # We store depths in a matrix, but we need to have parsed the first file to know 115 | # how big to make the matrix 116 | first_depths: list[float] = [] 117 | identifiers: list[str] = [] 118 | for identifier, depth in parse_lines(files[0]): 119 | length = len(identifier_to_index) 120 | identifier_to_index[identifier] = length 121 | # If the identifier has previously been seen, the dict entry will be overwritten 122 | if len(identifier_to_index) == length: 123 | exit_with( 124 | f"Duplicate sequence name found in file '{files[0]}': '{identifier}'", 125 | ) 126 | first_depths.append(depth) 127 | identifiers.append(identifier) 128 | 129 | # Initialize with -1, so we can search for it at the end and make sure no entries 130 | # are uninitialized 131 | matrix = np.full((len(identifiers), len(files)), -1.0, dtype=np.float32) 132 | matrix[:, 0] = first_depths 133 | 134 | del first_depths 135 | 136 | # Fill in the rest of the files 137 | for col_minus_one, file in enumerate(files[1:]): 138 | 
n_seen_identifiers = 0 139 | for identifier, depth in parse_lines(file): 140 | n_seen_identifiers += 1 141 | index = identifier_to_index.get(identifier) 142 | 143 | # Ensure all entries in this file have a known index (i.e. are also 144 | # in the first file) 145 | if index is None: 146 | exit_with( 147 | f"Error: Identifier '{identifier}' found in file '{file}' " 148 | "but not present in all files.", 149 | ) 150 | 151 | if matrix[index, col_minus_one + 1] != -1.0: 152 | exit_with( 153 | f"Error: Identifier '{identifier}' present multiple times in file '{file}'" 154 | ) 155 | 156 | matrix[index, col_minus_one + 1] = depth 157 | 158 | # Check that this file does not have a strict subset of identifiers from the first 159 | # file. After that, we know the set of identifiers is exactly the same 160 | if n_seen_identifiers != len(identifiers): 161 | exit_with( 162 | f"Error: File '{file}' does not have all identifiers of file '{files[0]}'." 163 | ) 164 | 165 | assert -1.0 not in matrix, ( 166 | "Matrix not full; this is a bug in the script and should never happen" 167 | ) 168 | 169 | with open(output, "w") as file: 170 | # We already checked this, but let's check it again 171 | assert len(matrix) == len(identifiers) 172 | print("contigname", "\t".join([p.name for p in files]), sep="\t", file=file) 173 | for identifier, row in zip(identifiers, matrix): 174 | print(identifier, "\t".join([str(i) for i in row]), sep="\t", file=file) 175 | -------------------------------------------------------------------------------- /test/data/aemb/6.aemb.tsv: -------------------------------------------------------------------------------- 1 | S27C95602 5.988746 2 | S27C25358 25.066412 3 | S27C181335 1.159981 4 | S4C222286 33.167842 5 | S11C13125 6.825609 6 | S4C480978 6.578677 7 | S12C228927 6.716019 8 | S27C93037 13.361650 9 | S9C124493 16.475576 10 | S27C214882 6.249275 11 | S7C273086 3.115955 12 | S12C85159 3.793851 13 | -------------------------------------------------------------------------------- /test/data/aemb/7.aemb.tsv: -------------------------------------------------------------------------------- 1 | S27C95602 0.000000 2 | S27C25358 40.494834 3 | S27C181335 1.123731 4 | S4C222286 41.094294 5 | S11C13125 0.000000 6 | S4C480978 0.000617 7 | S12C228927 0.000000 8 | S27C93037 0.084345 9 | S9C124493 0.000000 10 | S27C214882 1.003976 11 | S7C273086 2.705933 12 | S12C85159 4.306267 13 | -------------------------------------------------------------------------------- /test/data/aemb/8.aemb.tsv: -------------------------------------------------------------------------------- 1 | S27C95602 0.000000 2 | S27C25358 2.157007 3 | S27C181335 5.691155 4 | S4C222286 35.064668 5 | S11C13125 0.000000 6 | S4C480978 0.000000 7 | S12C228927 0.000000 8 | S27C93037 0.099672 9 | S9C124493 0.578131 10 | S27C214882 0.000000 11 | S7C273086 1.028035 12 | S12C85159 2.950335 13 | -------------------------------------------------------------------------------- /test/data/bam/10.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/test/data/bam/10.bam -------------------------------------------------------------------------------- /test/data/bam/11.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/test/data/bam/11.bam 
-------------------------------------------------------------------------------- /test/data/bam/12.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/test/data/bam/12.bam -------------------------------------------------------------------------------- /test/data/fasta.fna: -------------------------------------------------------------------------------- 1 | >Sequence1_100nt_no_special 2 | GGAAGGTAGCCGTACCGAGTTCTTTGGAGAGAATCGTCCACTGAGCAGTACGGATCTCAGAATTAGTCGCGATACATGTATGGGCTTAGCGTACCTAGGC 3 | >Sequence2 100nt whitespace in header 4 | # This is a random comment 5 | CTTCCGCATGAGCCCGGGAGC 6 | CAGATGAGCATGCAGCAATTG 7 | CACCTGGTTGACCACGCTGGC 8 | CCGGAGCGGGAGGTGTCTGCA 9 | CCGTGTTGTGTCCCAA 10 | >Sequence3 150 nt, all ambiguous bases 11 | TTCTCTAGAGTTAGTTATATWGGTACCTSCTATGTTMAGTCACCCRATGAACTCKGCTCCCCTAYACCTGCGTTNTAGAAABAACGCGDTAGGGAGVCCCGAHACGATCTCAATTTCCTAACGCCTGAGCGCGCTAGCCAGTGGGCTTCA 12 | >Sequence4 99 nt, no tetranucleotides 13 | GCCNGGCNTCCNTGGNTGTNATTNTTANCCCNGCTNCGANTTGNTCTNATTNGACNCCTNCAANGGCNAATNTTTNCTGNTCTNGATNTGTNGGGNTAC 14 | >Sequence5 Empty seq 15 | >Sequence6 150 nt, same as seq4 but mixed case 16 | tTctCTaGAgTtagTtATAtwGGtaCctsCtAtGTTmAgtcACcCrAtGaAcTCkGCTCCcctAYacctGcgttNtaGAaABaacGCgDtAggGAgVcCCgAHACgaTCTCaaTTTCCTAAcgccTGagCgCGcTAgccAGtgggCTtCa 17 | -------------------------------------------------------------------------------- /test/data/marker.fna: -------------------------------------------------------------------------------- 1 | >abc 2 | GTGGTGAGTGCTTGGACGTGTGTGGTGATGCTCATGGTGTCTTCCCCCCTTCCCCGTGTTGTTACCACCCACTCTACCCGCTCACACTTTCGAATACAAGGGTATTTTTCGAAATGGAAAAGCTGTCGAGCACTCACATGTTCGACAGCTTCATATGTCCTTGAACTACGTAACTTTTTCCTTACAATCAGCGCAAAGTCCAAAGATTTCGGCCTCGTGACTACTGAGAGCAAAGCCGTTTTTAGTGGCAATTTCCTGTGCCCATGTCTCTACTGGACCGCCATCGATTTCGACTGTGTGACCGCAATTGGTGCAGACCAAGTGATGGTGGTGTCCCTCGTCGTGGCATTGGCGGTACAGAGTTTCTCCACCTGTGACGGTAAGTACGTCGACTGCCCTGATGTCCGCGAGGGATTGGAGGGTTCGGTAGACGGTTGTGAGGCCGACGTTGTGTTCCCTGGTGGATAGCTCGTGATGGATTTCTTTGGCGGAAGCGAAGTTATCGATTTCCTCAAGAACGTCAATTACGGCTTTTCGCTGTCTGGTGCTTCGCACTCCCAGCTTCGGGGCAGAGCCTTGGCTGATGCGATTGATACCCACCGTTGATCCTCCTCAATGACACAAAATGTACTTCGATAGTCTACCCAGATGTGTCAACCCCTGCGTTTAGTGCCAGGAGAAGTATGTCGAGGACGAGTGGTTCGGCAAGTGAATAAGTCATTTGCCGGCCTTGACGTTCTGCGTCGACGATACCTGCAGTTTTAAGGACTTTGAGGTGTTGGCTCACTAGTGGTTGCGAACTTTTTACTAGCTTGACCAATTCGTGGACGTAGTGGGGCCTTTCGTTGAGGGCGAGGATGATTTCGATTCTTAAGGGGGAATCTAGTGCCCTAATCAGCAGGCTGATCGCTTTGATGTTTTTTGCTGTTGCAAGTTTCTGAAGCTCAGCTGATGCTGTGGATTCGGACTCTTCTGCAGGGGTGACGAAATTCCGATTTGAGTGTTGAGCCACGGGGAAGTCCTTCCGTCCTTAGGCTAGGTCTGGAATGGATCTAGCACGCTTGCTATTTTACCTTCTATATAAACCTTTTATGAGGGAAATGAAAAAATAGTTATTAGAACTAGTTTACATCGCGAAGGCCGCAAAATGACGGGGTCAGCGGAAGCAACATCGTTAGTTGGGCTAGGATTGGTTGGGTATGTCCTAAAAGGGACGGTTATTTTTTCATTCGACGTGGAGGAGAGCATCCGACGTGGCTCAGCAATCGATCATCGACACCGTGGTTAACCTGTGTAAACGACGTGGACTGGTGTACCCCTGTGGTGAGATCTACGGCGGTACCCGCTCTGCGTGGGACTACGGCCCGCTGGGTGTGGAGCTGAAGGAAAACATCAAGCGCCAGTGGTGGCGTTCTATGGTTACTTCCCGCCCAGATGTTGTGGGTGTTGATACTTCTGTCATCCTTCCTCGCCAGGTGTGGGTAACTTCCGGCCACGTTGAGGTCTTCACTGACCCACTGGTTGAGTCTTTGAACACCCACAAGCGTTACCGTGCGGACCACCTGCTGGAGCAGTACGAAGAGAAGCATGGTCACCCACCTGTAAACGGCTTGGCTGACATCAACGATCCAGAGACCGGCCAGCCAGGTAACTGGACTGAGCCTAAGGCGTTCTCTGGTCTTCTGAAGACTTTCTTGGGACCTGTGGACGACGAAGAGGGTCTGCACTACCTGCGCCCTGAAACTGCTCAGGGTATCTTCGTGAACTTCAAGAACGTGATGAACACTTCACGCATGAAGCCACCTTTCGGTATCGCGAACATCGGTAAGTCTTTCCGTAACGAGATCACCCCAGGTAACTTCATTTTCCGTACTCGTGAGTTCGAGCAGATGGAGATGGAGTTCTTCGTCAAGCCTGGTGAGGACGAAGAGTGGCACCAGCACTGGATTGAT
ACTCGCCTGCAGTGGTACATCAACCTGGGCATTAAGCCTGAGAACCTGCGTCTGTACGAGCACCCTCAGGAGAAGCTGTCTCACTACTCCAAGCGCACTGTTGATATTGAGTACGCATTCAACTTTGCTAACACCAAGTGGGGCGAGTTAGAGGGTATCGCGAACCGTACTGATTACGATCTTCGCGTGCACTCTGAGGGCTCTGGTGAGGACCTGTCATTCTTCGATCAGGAGACCAATGAGCGTTGGATTCCTTTCGTAATCGAGCCTGCTGCAGGTCTTGGTCGCGCAATGATGATGTTCCTGATGGATGCTTATCACGAGGACGAGGCACCAAACTCAAAGGGTGGCGTCGATAAGCGTGTTGTTCTGAAGCTTGACCGTCGCCTTGCGCCGGTTAAGGTTGCGGTCTTGCCGCTGTCAAAGAAGGACACTTTGACGCCTTTGGCGGAAAAGCTCGCAGCAGAGCTGCGTGAATTCTGGAACGTTGATTACGACACTTCAGGTGCGATTGGTCGCCGTTACCGTCGTCAGGACGAGATCGGTACTCCATTCTGCGTCACCGTTGACTTTGATTCTCTCGAGGACAACGCTGTGACCGTGCGTGAGCGCGACACCATGGAGCAGGTTCGTGTTCCACTTGATGAGCTGCAGGGTTACTTGGCTCAGCGCCTCATCGGCTGCTAAACGGCAACCAATAGAGCGATAATTCGCTAAGACGAATGTAATCGCAGCAACATATAGCACCGGCTTAACAGGCCGGTGCTATTCTGTTCGCATGACTTCGAAGGATCTGATTGTGACCTCCTATACGTCTTGGGGCAAGCGTTTCAAGAATGACGGGAAGCTTTTTATTAACCTACTTCGCAGCACCACTGATAGTGCTGATGAAAAGGTTTTAGCCACTTTCGGTGAAGTTCCCAGCAAATCATTTGAAACCACCGCAACGGTTGATGAGCAGCAGTGGGAACTGTCCTTCAATATTGATGGAACGGCAACTGCCAAGCTTCCTGATGGTCGTGTGTTCAGCGCGAATGCAGGTGAGAAGACCTTTACCAAGTCCAAGCGGATTGAAATCGACATGGACGGCACCGCGATGGCTGCTGTTAATGAAGATAAAAACAATTGGATTATCGACGATTCTGAAGAGAATAAAGTCGCTCAGTTTACCGGTATGAACAACGGTGTGCGTCGCGCGATTGTGGAGTTTGAGCCTGACGTAGAAGTCACCCAGGAGCAGGAAATTTTCTTGTCGTGGGTTGCTCGGAAAACTCTGGAATCCCGCATGTTGGGCTCCAGTTGGGGACTGACTCTGTTTTTGATCATTTTGACGCCAATCATTATTTTTCTCACTTTCAGCTAAAAGGACCATGCAATGGTAGACGCTCAGCGCCCCAAAGCAGGCATCTTCGGTAGCCACACAGAAGAAACATGGGTGTGGCTCGGTAATGAACTTTTCGACGAGTCCGGCGAGGTCATCGCCGACGTTCGCTCCGACGTCCTCTACGTGGATCGCGAACGACTACTCATCGAATCCACCCCCGGCACCATGCGTTTTCGTTGCCGCGCAACACTGTCCGGGGGTGAGGTCTATACAATGACTCAGAATTCTTTCACTGTGGGGGATCTCACTGCGGTGTGCGGGCGCCGGACGTATTCGCTAAAAAGGGTGTCGCCGTGGCGTAAAGAACGCCTGATCACCAACAATGGGGTGGAAGTGGCGCGACTTCGCCCGATGACCAGCGGTAAAGTCGAATTCATTGTGGGCACCGCGGACAGCGAGGCGTTGCCGTTCGTCGACGCAGTATTTTTGAGCTGGGCGTGCGTCCTGGTGGATTCGGCCGTGCGCCGGCCGAAAATTTAAAAGCTTTTTGCTTATCGACGCACCCCTCCACCTGTTTTTTGTAGCCGGGGGATCATTTCCTTTGAAGGATCCAATCTCCGCACTTAGTTTCCTTCGGTGTGAAGGAAAGAGTTCCGTAAAGACCTCTATCTCATTTAAAGAAGTGGAGGATTAGGGTCGTTGACTCGCCTTCGGCACTAATTTGAGCCAAGTTCAAGTTTGCTGCCATCCCAGGTGACCGAAAATGTCCTATGCGAGGTCTCTTCGGTCACTTGGTTTTGCTCGTTTCAGGCTAGAAGCGGCCTCCGCGGAACCCTCCTCCGCCACCGCCACCACCGCTGAAGCCGCCACCGCCTCCACCGAAGCCTCCACCGAAACCGCCACCGCGGCCGCTGTTGAGAATCGAGTTGATCACCATGCCGGTGACAATCGCACCGGTGGTTCCGCCACCGGAATTGTGGCGATTGTTGTAGGTGGTGATGTCGTTTTGTGCTGACTTGCTGGCGCGTTGGGCTGCGACTGCTGCTTGACGTCCGTAATCAATTCCTGCACGGGTGTCGCGGGTGCGGTTTTGTTGTGCCATGGCGTACAGTTTTTGTGCGTTGGCCAGGTGGGTGCGGGCTTCGGATTTTACGATGCGACCGCGGGTGGAGATGAGGTCTTCGGCCTTTTGGATTTGGCTTCTTGCAGATTGCAGCTGTTGGTCGAATACGCGTAGCTGGCGGGCTTGATCAGCTGCGGTGGCGCGAAGTGTGTCAAGTTGAGTGTCGAGGGCGGAGTCGACATCGACAAGTTCTGTGTAGGTTCCGAGCGGATCCTTTTCGGCGTCTGCTGATGCGGTGGTTAGTGCTGCGCTGGCTGCGCGGACAGCATCGTCGAGGGAGGCCCAGTCGGCACGGGCACCGTCGGCTCCTGCGCTTTGTTTGAGTTGGCCGGCTTCGTTGATTTCGTCTGAGATTTCTTGAATCAGATCGGCAACGTTTGCTTTGGCTGTGGAGATGTTTTCATCGGCATGCTCGACGCCCTCGAGGAGTTTGTCTGCGGTAGTGATGGCGTGCTCGATGTGACGGATCGCGTCGATAAGCCCGCCCTGCTCGCCTGCGGGCATGGACTCTATCTTGTACGCCTGTGGCAGGACTTCTTCTGCTTCGTCGAGCGAAGCGCTGGCGAGGTCGACGTTGTCGTCGATGCTTTCAAGGACCTCTGCTGAGTAGCGAGCGCGCAGGCCAGCGAGTGTTTCTTGAGCCTTGGGGAGGCGGGTGCGCAGGTCGACGGATTTTTGGGTGAGAGCATCCAATTTGCTGCCCGCGTTGATCAGCAGGTTGCGCATATCGGCAAAGTTTTGGGCCTCGGCGTCGAGGGCATCGTCGGCTTGGCCACAGGATGAAATGATTTCTACCAGCATGGATCGACGTTCGGCTTCGGATTCTGGGATAGAATCGTTGAGGCGCTGCTGAATCTCAAAGGCTTTTTGCAGGGTGCCGGTGGAGTGGTTCATGGCGCGGTTGAAGCTGCGGGTGCGCTCTGGTCCGAACTCGGAGGTAGCGATAGCGAGCTCTTCTTTTCCGCGACGGATGGAGTCATCAGTGGAGGTGAGCTCTTCTTGGGCAAGGTGTTCGAGAGTTTCCATGGGAAGCTGCATGAGGCGGTTGGTATCGCGAGGGTCGATCTCACGTGCATCTTCCAAGGTTGCAGCACTTGTTTTCTTCTTG
CGGCTGCGGGAATAGGCCCAAATTCCGCCACCAGCGGCCACTGTGCCAACGCCCGCAGCAGCCAACCAAGCGCCGGAAGATCCAGAAGAGCCTGAGGTTCCTGAGGCACCAGAACTAGAACCAACTGATTCTGCCAGCGCTAGTGCGGAGCCTGCCCAATCTTCTTGGGAAAGCGCCTGGAAAGCAGCGTTGTTGGCGGCGTCGAGTTCAGCGTCGGTCCATTGAGTACCACCTTGGATGCCGTACTGCCGTTCCTCGGGAGCGAGTGCATAAACCAAGACGTTTCCGCCGCCGTTGGCTTGGAGTGCTTGCTGCGTCCACGTTTCAGGGTCAACTCCGTCGAAAGAGCTTAGGAAAACAACGAAAATAACCTTTTGTTCAGATGCCTTTACATCATCGATGGCAGCCTGAATGTTGGTGATATCGGACGAGGAAATCTGGCCGGTGTAGTCAGTGACATTGTCTTGGTAAAATTCTGGTGATTCAGCCAAGACATATGTTTCTGTGGCTTCTGCAGTGTGAGCAGTAAAAAATGGTCCACTGATAAGGAGCGCGCCAGCTCCAATTGCCACAGTGACCGATACACGGCGGACGTTTTCCCGAAGATGCACCAAACTAAAGTTCATGGTCCCCACCTTAGACGAGTCCAGCTGGCACACTAGTTAACGTGAGAAGATTTTTAGCCAAGAGTTTACTCTTAACCGCAGTAGCGCAACCAGCCCTGAGGGTGGTCGCGTATTCGATGCTCAGAACGCCTAATAATCGGCACAAAATTGATTCAATTTTGGTGTTGGGCACAGCTCAATATGATGGGGTTCCATCGAGGCAGTTTGCTGCTCGTTTGAGGCATGCCGCGAAGCTGTGGCGTCTTCATGAAATCCAGCATGTATATACTGTCGGCGGAAAACTTCCTGGTGATCGTTTCACCGAAGCAGAAGTCGCGCGGGAGTATTTGATCAAAGAGGGCGTGGATCCGGATCTGATTTTTGTCTCTGCAGTTGGCAATGACACTGTCTCCTCCTATGAGGCGCTTGATCCGGAAAAGCTTGGTCGGGTGCTGATTGTTACTGATCCGAACCATTCGTATCGGGCGGTGCGCATCGCGCGACGCATGGGCTTTGACGCGAAACCTTCCCCGACAACCTATAGTCCCGCGAAGTTTCCGTCGATAGTTTATTTTCTGACCTTGTCCCATGAGTGGGGCGGGGTAGTGGTACAGGACGTGTCGTGGCTCTTGGGCGAACGGGTGGCCGATAAGGTGGAAGCATCTTTGCGAACTATCCAAGGCCTGCTGCGCCCTTCGAGGCGTGCGCGCCATGAGCAACTTCGGAGGCTGAAAAAGTAGATGTACCCCTATTCCGACGCAGACGCTTTTCGACGCCACCCTGAGCGCGCCAAGTCCAGCCAACTGCGCACCAGCGCCGTAGACACCCGCAGCGCGTTCGCCCGCGACCGGGCTCGCGTGCTGCATTCTGCTGCTCTTCGACGCCTCGCGGATAAAACCCAAGTGGTTGGCCCCAATGATGGTGATACTCCGCGCACCCGGCTGACGCACTCTTTGGAAGTAGCTCAAATTGCACGGGGAATCGGAGCTGGACTGGATTTGGATCCTGATCTGTGCGATCTGGCAGGGCTGTGCCATGACATTGGGCATCCGCCGTATGGACACAACGGTGAAAACGCGTTGAATGAAGTTGCTGCGGCCTGTGGAGGATTTGAGGGCAACGCCCAAACCTTGCGTATTCTCACGCGGCTGGAGCCAAAAATTGTCTCTGATGAGGGGGAGAGCTTTGGGCTGAACTTGTCGCGGGCTGCTCTTGATGCTGCATGTAAGTATCCGTGGGCTAAAACAAATGCGGATGGCAGTGTCAATAAGAAATACAGTGCTTATGATGAGGACGCAGAAATTCTTGCTTGGATCAGGCAAGGCCATGAAGACCTCAGACCACCAATCGAAGCTCAGGTCATGGACTTTTCCGATGATATTGCCTACTCAGTACACGATGTAGAAGACGGCATTGTTTCCGGTCGCATCGATTTGAAAGTGCTGTGGGACCTGGTGGAATTAGCAGCACTGGCGGACAAAGGCGCAGCAGCTTTCGGAGGCTCGCCTGCAGAACTCATCGAGGGCGCAGCCTCGTTGCGGGAGCTTCCTGTGGTAGCGGCCGCTGCAGATTTTGATTTCTCACTGCGTTCCTACGCTGCGCTGAAGGCCATGACCTCAGAACTAGTGGGAAGATACGTTGGCTCTACCATCGAGTCAACAAAGAAAACACACGCCGGCATTGATGTGGGACGCATGCACGGCGATTTGATCATTCCAGAAACAGCGGCCAGTGAAGTAAAACTGCTCAAAACGTTAGCGGTTCTCTATGTGATGGATGACCCAGGGCACCTTGCGCGCCAAAACAGGCAACGGGATCGTATCTTCCGGGTTTTTGACTACCTGGTGCTGGGGGCTCCGGGATCGTTGGATCCGATGTATCGCCAGTGGTTTATTGAAGCGGATTCAGAATCGGAACAGATCCGTGTGATTGTTGATCAGATTGCGTCGATGACGGAGTCTCGTCTGGAACGCCTTGCCCGGAATGCTGCTGACATCTCAGGATTCTTGGGATAGTTGGTTAGAGCAGCAGCGATTTTTAGTAAGGCCAATAACATGTTTTGGCTTAAACCTGTGTCGTGTCAGATGGTGGCGAAGTAGAGTTCGCAAAGCTAGCGAACATGAATTCGTGTTCAGGAACTTAACAGGGATCAAACAGAGAACAGAGAACAGATCACGCTGCCCAAAAATCGCACTTTTAAGGTTTGTGGGCGTCTGTGTGTGGTTTGCCGCTGTAAAGTATCACCACGTTATGCGCCCTGGTGTGATCAAGCGTTCGTTCTGGGTCGAAACCCCAAAAGTCACAATTCCCCAGAAGCGGGTCAAACCCATTTAGCTTATTGCTTACATATCGAGGGTTTAGAAAAGTGATTTGTCGGATCAGTCGGTTTCTGCCAAGTAAATAGAACTTTATAAATTTTGTGGCTCTCAAATCTTAGGCCACGGCTTCCGATTTGAACCGGAGGTTCAAAAGGCTTATATAGACAAGATTCTGCATCGTCTCACGAGCCCCTCATTGCCTGACACGGTCAATCGTGTGGGAGGTACCAATCCGTGAGATTTCTGCCAACGAGCGATTCATTGGCCCCGCTGCAGAGCTGGCAGAACACGGACATAACCCAAATAATCTGAGGTCTGCCGTTTGCAGCAGCATTAGCGTTTGATGTGGAAGGTGATGCAGAGGCTGTTGATCTGCAAGCGCGTCTTTCCCAAGCACGGGGGAACCCTGAAGCATCGGATGCTCTAGTTGCTGAGCTGACTGGTGTTACTGCTAATCATCCGTTGGTCAGTGCTTGTCTGAAGTTTCCGCTCAATCCTAAGCTTCTCAAGATTTCGTAAAAAAGCTGCCAACTACCGTAAAACCGCACTACTAGAGGAGTGCGTTTTTCGTTCCTGAACACATTGCGTGCTGCAACTTAATTATGGTCCTCCCAGCTCAGTGTGCTGTGTGG
ATTGTTTATTCTCGTCCATTAAGTGATCGAGAAAAAGTTGTTGTAAAGTCATGCGCATGTGTGGAATTGTTGGATATATTGGCCAGGCGGGCGACTCCCGTGATTACTTTGCGCTTGACGTCGTTTTAGAAGGACTGCGCCGACTTGAATACCGCGGTTATGATTCCGCAGGTGTAGCTGTTCATGCGAACGGTGAAATCAGCTACCGAAAGAAGGCTGGAAAGGTAGCTGCGCTGGACGCTGAGATCGCTCGCGCTCCTTTGGCGGATTCCATTTTGGCTATTGGTCACACCCGGTGGGCAACTCACGGTGGACCAACCGATGCAAATGCACACCCCCATGTTGTTGATGGCGGCAAGTTAGCTGTCGTACACAACGGTATTATTGAAAACTTTGCAGAGCTGCGCGCAGAGCTTTCAGCTAAGGGCTACAACTTTGTTTCCGTTACTGACACTGAAGTTGCCGCCACATTGCTGGCAGAAATCTACAACACCCAGGCTAATGGCGATCTGACCAAGGCTATGCAGCTTACTGGTCAGCGTCTTGAGGGTGCGTTCACCCTGCTGGCTATCCATGCTGATCATGATGATCGTATTGTTGCAGCGCGCCGTAACTCTCCTTTGGTTATTGGCTTGGGTGAAGGCGAAAACTTCCTCGGCTCTGACGTTTCTGGCTTCATCGATTACACCCGCAAGGCTGTTGAGATGGGCAACGATCAGATTGTGACCATCACTGCGAACGACTACCAGATCACCAACTTCGATGGTTCTGAGGCAACCGGAAAACCTTTCGACGTGGAGTGGGATGCGGCTGCTGCTGAAAAGGGTGGCTTTGATTCCTTCATGGATAAGGAAATCCACGACCAGCCAGCTGCAGTGCGTGACACCCTCCTCGGACGTTTAGATGAGGATGGCAAGCTGGTCCTTGATGAGCTTCGT 3 | -------------------------------------------------------------------------------- /test/test_aamb_encode.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import random 4 | import vamb 5 | 6 | 7 | class TestAAE(unittest.TestCase): 8 | tnfs = np.random.random((111, 103)).astype(np.float32) 9 | rpkm = np.random.random((111, 14)).astype(np.float32) 10 | lens = np.random.randint(2000, 5000, size=111) 11 | contignames = ["".join(random.choices("abcdefghijklmnopqrstu", k=10)) for _ in lens] 12 | nlatent_l = 32 13 | default_args = (14, 256, nlatent_l, 25, 0.5, 0.5, 0.15, False, 0) 14 | default_temperature = 0.16 15 | 16 | # Construction 17 | def test_bad_args(self): 18 | default_args = self.default_args 19 | 20 | # Test the default args work 21 | aae = vamb.aamb_encode.AAE(*default_args) 22 | self.assertIsInstance(aae, vamb.aamb_encode.AAE) 23 | 24 | with self.assertRaises(ValueError): 25 | vamb.aamb_encode.AAE(0, *default_args[1:]) 26 | 27 | with self.assertRaises(ValueError): 28 | vamb.aamb_encode.AAE(*default_args[:1], 0, *default_args[2:]) 29 | 30 | with self.assertRaises(ValueError): 31 | vamb.aamb_encode.AAE(*default_args[:2], 0, *default_args[3:]) 32 | 33 | with self.assertRaises(ValueError): 34 | vamb.aamb_encode.AAE(*default_args[:3], 0, *default_args[4:]) 35 | 36 | with self.assertRaises(ValueError): 37 | vamb.aamb_encode.AAE(*default_args[:5], float("nan"), *default_args[6:]) 38 | 39 | with self.assertRaises(ValueError): 40 | vamb.aamb_encode.AAE(*default_args[:5], -0.0001, *default_args[6:]) 41 | 42 | with self.assertRaises(ValueError): 43 | vamb.aamb_encode.AAE(*default_args[:6], float("nan"), *default_args[7:]) 44 | 45 | def test_loss_falls(self): 46 | aae = vamb.aamb_encode.AAE(*self.default_args) 47 | rpkm_copy = self.rpkm.copy() 48 | tnfs_copy = self.tnfs.copy() 49 | dl = vamb.encode.make_dataloader( 50 | rpkm_copy, tnfs_copy, self.lens, batchsize=16, destroy=True 51 | ) 52 | (di, ti, ai, we) = next(iter(dl)) 53 | mu, do, to, _, _, _, _ = aae(di, ti) 54 | start_loss = aae.calc_loss(di, do, ti, to)[0].data.item() 55 | 56 | # Loss drops with training 57 | aae.trainmodel( 58 | dl, 59 | nepochs=3, 60 | batchsteps=[1, 2], 61 | T=self.default_temperature, 62 | modelfile=None, 63 | ) 64 | mu, do, to, _, _, _, _ = aae(di, ti) 65 | end_loss = aae.calc_loss(di, do, ti, to)[0].data.item() 66 | self.assertLess(end_loss, start_loss) 67 | 68 | def test_encode(self): 69 | aae = vamb.aamb_encode.AAE(*self.default_args) 70 | dl = vamb.encode.make_dataloader( 71 | self.rpkm.copy(), 
self.tnfs.copy(), self.lens, batchsize=16, destroy=True 72 | ) 73 | (_, encoding) = aae.get_latents(self.contignames, dl) 74 | self.assertIsInstance(encoding, np.ndarray) 75 | self.assertEqual(encoding.dtype, np.float32) 76 | self.assertEqual(encoding.shape, (len(self.rpkm), self.nlatent_l)) 77 | -------------------------------------------------------------------------------- /test/test_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from hashlib import md5 4 | 5 | import vamb 6 | 7 | 8 | class TestClusterer(unittest.TestCase): 9 | # This seed has been set just so the unit tests runs faster. 10 | # How many iterations of the clustering depends on the input data 11 | rng = np.random.RandomState(5) 12 | data = rng.random((1024, 40)).astype(np.float32) 13 | lens = rng.randint(500, 1000, size=1024) 14 | 15 | def test_bad_params(self): 16 | with self.assertRaises(ValueError): 17 | vamb.cluster.ClusterGenerator(self.data.astype(np.float64), self.lens) 18 | 19 | with self.assertRaises(ValueError): 20 | vamb.cluster.ClusterGenerator(self.data, self.lens, maxsteps=0) 21 | 22 | with self.assertRaises(ValueError): 23 | vamb.cluster.ClusterGenerator(self.data, self.lens, windowsize=0) 24 | 25 | with self.assertRaises(ValueError): 26 | vamb.cluster.ClusterGenerator(self.data, self.lens, minsuccesses=0) 27 | 28 | with self.assertRaises(ValueError): 29 | vamb.cluster.ClusterGenerator( 30 | self.data, self.lens, minsuccesses=5, windowsize=4 31 | ) 32 | 33 | with self.assertRaises(ValueError): 34 | vamb.cluster.ClusterGenerator( 35 | np.random.random((0, 40)), np.array([], dtype=int) 36 | ) 37 | 38 | def test_basics(self): 39 | clstr = vamb.cluster.ClusterGenerator(self.data, self.lens) 40 | self.assertIs(clstr, iter(clstr)) 41 | 42 | x = next(clstr) 43 | self.assertIsInstance(x, vamb.cluster.Cluster) 44 | 45 | clusters = list(clstr) 46 | clusters.append(x) 47 | 48 | # All members are clustered 49 | self.assertEqual(sum(map(lambda x: len(x.members), clusters)), len(self.data)) 50 | 51 | # Elements of members are exactly the matrix row indices 52 | mems = set() 53 | for i in clusters: 54 | mems.update(i.members) 55 | self.assertEqual(mems, set(range(len(self.data)))) 56 | 57 | def test_detruction(self): 58 | copy = self.data.copy() 59 | clstr = vamb.cluster.ClusterGenerator(self.data, self.lens) 60 | self.assertTrue(np.any(np.abs(self.data - clstr.matrix.numpy()) > 0.001)) 61 | clstr = vamb.cluster.ClusterGenerator(copy, self.lens, destroy=True) 62 | self.assertTrue(np.all(np.abs(copy - clstr.matrix.numpy()) < 1e-6)) 63 | self.assertTrue(np.any(np.abs(self.data - clstr.matrix.numpy()) > 0.001)) 64 | 65 | @staticmethod 66 | def xor_rows_hash(matrix): 67 | m = np.frombuffer(matrix.copy().data, dtype=np.uint32) 68 | m.shape = matrix.shape 69 | v = m[0] 70 | for i in range(1, len(m)): 71 | v ^= m[i] 72 | return md5(v).digest().hex() 73 | 74 | def test_normalization(self): 75 | hash_before = md5(self.data.data.tobytes()).digest().hex() 76 | vamb.cluster.ClusterGenerator(self.data, self.lens) 77 | self.assertEqual(hash_before, md5(self.data.data.tobytes()).digest().hex()) 78 | cp = self.data.copy() 79 | vamb.cluster.ClusterGenerator(cp, self.lens, destroy=True) 80 | hash_after = md5(cp.data.tobytes()).digest().hex() 81 | self.assertNotEqual(hash_before, hash_after) 82 | 83 | # Rows are permuted by the clusterer. We use xor to check the rows 84 | # are still essentially the same. 
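# (Added note, not in the original: xor is commutative and associative, so xoring the
# rows' raw uint32 bit patterns gives an order-independent fingerprint; the check
# ignores the row permutation but catches any change to row contents.)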
85 | before_xor = self.xor_rows_hash(cp) 86 | vamb.cluster.ClusterGenerator(cp, self.lens, destroy=True, normalized=True) 87 | self.assertEqual(before_xor, self.xor_rows_hash(cp)) 88 | 89 | def test_cluster(self): 90 | x = next(vamb.cluster.ClusterGenerator(self.data, self.lens)) 91 | self.assertIsInstance(x.members, np.ndarray) 92 | -------------------------------------------------------------------------------- /test/test_encode.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import torch 4 | import tempfile 5 | import vamb 6 | 7 | 8 | class TestDataLoader(unittest.TestCase): 9 | tnfs = np.random.random((111, 103)).astype(np.float32) 10 | rpkm = np.random.random((111, 14)).astype(np.float32) 11 | lens = np.random.randint(2000, 5000, size=111) 12 | 13 | def nearly_same(self, A, B): 14 | self.assertTrue(np.all(np.abs(A - B) < 1e-5)) 15 | 16 | def not_nearly_same(self, A, B): 17 | self.assertTrue(np.any(np.abs(A - B) > 1e-4)) 18 | 19 | def test_bad_args(self): 20 | # Bad rpkm 21 | with self.assertRaises(ValueError): 22 | vamb.encode.make_dataloader([[1, 2, 3]], self.tnfs, self.lens, batchsize=32) 23 | 24 | # bad tnfs 25 | with self.assertRaises(ValueError): 26 | vamb.encode.make_dataloader(self.rpkm, [[1, 2, 3]], self.lens, batchsize=32) 27 | 28 | # Bad batchsize 29 | with self.assertRaises(ValueError): 30 | vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=0) 31 | 32 | # Differing lengths 33 | with self.assertRaises(ValueError): 34 | vamb.encode.make_dataloader( 35 | np.random.random((len(self.rpkm) - 1)).astype(np.float32), 36 | self.tnfs, 37 | self.lens, 38 | batchsize=32, 39 | ) 40 | 41 | # Bad dtype 42 | with self.assertRaises(ValueError): 43 | vamb.encode.make_dataloader( 44 | self.rpkm.astype(np.float64), self.tnfs, self.lens, batchsize=32 45 | ) 46 | 47 | def test_destroy(self): 48 | copy_rpkm = self.rpkm.copy() 49 | copy_tnfs = self.tnfs.copy() 50 | 51 | _ = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=32) 52 | self.nearly_same(self.rpkm, copy_rpkm) 53 | self.nearly_same(self.tnfs, copy_tnfs) 54 | 55 | _ = vamb.encode.make_dataloader( 56 | copy_rpkm, copy_tnfs, self.lens, batchsize=32, destroy=True 57 | ) 58 | self.not_nearly_same(self.rpkm, copy_rpkm) 59 | self.not_nearly_same(self.tnfs, copy_tnfs) 60 | 61 | def test_normalized(self): 62 | copy_rpkm = self.rpkm.copy() 63 | copy_tnfs = self.tnfs.copy() 64 | 65 | _ = vamb.encode.make_dataloader( 66 | copy_rpkm, copy_tnfs, self.lens, batchsize=32, destroy=True 67 | ) 68 | 69 | # TNFS: Mean of zero, std of one 70 | self.nearly_same(np.mean(copy_tnfs, axis=0), np.zeros(copy_tnfs.shape[1])) 71 | self.nearly_same(np.std(copy_tnfs, axis=0), np.ones(copy_tnfs.shape[1])) 72 | 73 | # RPKM: Sum to 1, all zero or above 74 | # print(copy_rpkm) 75 | self.nearly_same(np.sum(copy_rpkm, axis=1), np.ones(copy_rpkm.shape[0])) 76 | self.assertTrue(np.all(copy_rpkm >= 0.0)) 77 | 78 | def test_single_sample(self): 79 | single_rpkm = self.rpkm[:, [0]] 80 | copy_single = single_rpkm.copy() 81 | dl = vamb.encode.make_dataloader( 82 | single_rpkm, self.tnfs.copy(), self.lens, batchsize=32, destroy=True 83 | ) 84 | # When destroying a single sample, RPKM is set to 1.0 85 | self.assertAlmostEqual(np.abs(np.mean(single_rpkm)), 1.0) 86 | self.assertLess(abs(np.std(single_rpkm)), 1e-6) 87 | 88 | # ... and the abundance are the same abundances as before, 89 | # except normalized and scaled. 
We test that they are ordered 90 | # in the same order 91 | self.assertTrue( 92 | ( 93 | torch.argsort(dl.dataset.tensors[2], dim=0) 94 | == torch.argsort(torch.from_numpy(copy_single), dim=0) 95 | ) 96 | .all() 97 | .item() 98 | ) 99 | 100 | def test_iter(self): 101 | bs = 32 102 | dl = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=bs) 103 | 104 | # Check right element type 105 | for M in next(iter(dl)): 106 | self.assertEqual(M.dtype, torch.float32) 107 | self.assertEqual(M.shape[0], bs) 108 | 109 | # Check it iterates the right order (rpkm, tnfs) 110 | rpkm, tnfs, abundance, weights = next(iter(dl)) 111 | self.nearly_same(np.sum(rpkm.numpy(), axis=1), np.ones(bs)) 112 | 113 | def test_randomized(self): 114 | dl = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=64) 115 | rpkm, tnfs, abundances, weights = next(iter(dl)) 116 | 117 | # Test that first batch is not just the first 64 elements. 118 | # Could happen, but vanishingly unlikely. 119 | self.assertTrue(np.any(np.abs(tnfs.numpy() - self.tnfs[:64]) > 1e-3)) 120 | 121 | 122 | class TestVAE(unittest.TestCase): 123 | tnfs = np.random.random((111, 103)).astype(np.float32) 124 | rpkm = np.random.random((111, 14)).astype(np.float32) 125 | lens = np.random.randint(2000, 5000, size=111) 126 | 127 | def test_bad_args(self): 128 | with self.assertRaises(ValueError): 129 | vamb.encode.VAE(-1) 130 | 131 | with self.assertRaises(ValueError): 132 | vamb.encode.VAE(5, nlatent=0) 133 | 134 | with self.assertRaises(ValueError): 135 | vamb.encode.VAE(5, nhiddens=[128, 0]) 136 | 137 | with self.assertRaises(ValueError): 138 | vamb.encode.VAE(5, alpha=0.0) 139 | 140 | with self.assertRaises(ValueError): 141 | vamb.encode.VAE(5, alpha=1.0) 142 | 143 | with self.assertRaises(ValueError): 144 | vamb.encode.VAE(5, beta=0.0) 145 | 146 | with self.assertRaises(ValueError): 147 | vamb.encode.VAE(5, dropout=1.0) 148 | 149 | with self.assertRaises(ValueError): 150 | vamb.encode.VAE(5, dropout=-0.001) 151 | 152 | def test_loss_falls(self): 153 | vae = vamb.encode.VAE(self.rpkm.shape[1]) 154 | rpkm_copy = self.rpkm.copy() 155 | tnfs_copy = self.tnfs.copy() 156 | dl = vamb.encode.make_dataloader( 157 | rpkm_copy, tnfs_copy, self.lens, batchsize=16, destroy=True 158 | ) 159 | (di, ti, ai, we) = next(iter(dl)) 160 | do, to, ao, mu = vae(di, ti, ai) 161 | start_loss = vae.calc_loss(di, do, ti, to, ao, ai, mu, we)[0].data.item() 162 | 163 | with tempfile.TemporaryFile() as file: 164 | # Loss drops with training 165 | vae.trainmodel(dl, nepochs=3, batchsteps=[1, 2], modelfile=file) 166 | do, to, ao, mu = vae(di, ti, ai) 167 | end_loss = vae.calc_loss(di, do, ti, to, ao, ai, mu, we)[0].data.item() 168 | self.assertLess(end_loss, start_loss) 169 | 170 | # Also test save/load 171 | before_encoding = vae.encode(dl) 172 | file.flush() 173 | file.seek(0) 174 | vae_2 = vamb.encode.VAE.load(file) 175 | 176 | after_encoding = vae_2.encode(dl) 177 | self.assertTrue(np.all(np.abs(before_encoding - after_encoding) < 1e-6)) 178 | 179 | def test_encoding(self): 180 | nlatent = 15 181 | vae = vamb.encode.VAE(self.rpkm.shape[1], nlatent=nlatent) 182 | dl = vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=32) 183 | encoding = vae.encode(dl) 184 | self.assertEqual(encoding.dtype, np.float32) 185 | self.assertEqual(encoding.shape, (len(self.rpkm), nlatent)) 186 | -------------------------------------------------------------------------------- /test/test_parsebam.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | import io 3 | import numpy as np 4 | import tempfile 5 | from pathlib import Path 6 | 7 | import vamb 8 | import testtools 9 | from vamb.parsecontigs import CompositionMetaData 10 | 11 | 12 | class TestParseBam(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | minlen = 3000 16 | mask = np.array( 17 | list(map(lambda x: x >= minlen, testtools.BAM_SEQ_LENS)), dtype=bool 18 | ) 19 | cls.comp_metadata = CompositionMetaData( 20 | np.array( 21 | [i for (i, m) in zip(testtools.BAM_NAMES, mask) if m], dtype=object 22 | ), 23 | np.array([i for (i, m) in zip(testtools.BAM_SEQ_LENS, mask) if m]), 24 | mask, 25 | minlen, 26 | ) 27 | 28 | cls.abundance = vamb.parsebam.Abundance.from_files( 29 | testtools.BAM_FILES, "/tmp/bam_tmpfile", cls.comp_metadata, True, 0.0, 2 30 | ) 31 | 32 | def test_refhash(self): 33 | m = self.comp_metadata 34 | cp = CompositionMetaData(m.identifiers, m.lengths, m.mask, m.minlength) 35 | # Change the refnames slighty 36 | cp.identifiers = cp.identifiers.copy() 37 | cp.identifiers[3] = cp.identifiers[3] + "w" 38 | cp.refhash = vamb.vambtools.RefHasher.hash_refnames(cp.identifiers) 39 | with self.assertRaises(ValueError): 40 | vamb.parsebam.Abundance.from_files( 41 | testtools.BAM_FILES, None, cp, True, 0.97, 4 42 | ) 43 | 44 | ab2 = vamb.parsebam.Abundance.from_files( 45 | testtools.BAM_FILES, None, cp, False, 0.97, 4 46 | ) 47 | self.assertEqual(self.abundance.refhash, ab2.refhash) 48 | 49 | def test_bad_metadata_mask(self): 50 | m = self.comp_metadata 51 | 52 | # If last element of mask is False, then the invariants of CompositionMetaData will 53 | # not hold after removing the last element of its mask, and that is NOT what we 54 | # are testing here. 
55 | assert list(m.mask[-3:]) == [True, False, False] 56 | cp = CompositionMetaData( 57 | m.identifiers[:-1], m.lengths[:-1], m.mask[:-3], m.minlength 58 | ) 59 | with self.assertRaises(ValueError): 60 | vamb.parsebam.Abundance.from_files( 61 | testtools.BAM_FILES, None, cp, True, 0.97, 4 62 | ) 63 | 64 | def test_badfile(self): 65 | with self.assertRaises(BaseException): 66 | vamb.parsebam.Abundance.from_files( 67 | ["noexist"], None, self.comp_metadata, True, 0.97, 1 68 | ) 69 | 70 | # Minid too high 71 | def test_minid_off(self): 72 | with self.assertRaises(ValueError): 73 | vamb.parsebam.Abundance.from_files( 74 | testtools.BAM_FILES, None, self.comp_metadata, True, 1.01, 4 75 | ) 76 | 77 | def test_parse(self): 78 | nm = sum(self.comp_metadata.mask) 79 | self.assertEqual(nm, 12) 80 | 81 | self.assertEqual(self.abundance.matrix.shape, (nm, 3)) 82 | self.assertEqual(self.abundance.nseqs, nm) 83 | self.assertEqual(self.abundance.matrix.dtype, np.float32) 84 | self.assertEqual(self.abundance.nsamples, 3) 85 | 86 | def test_minid(self): 87 | abundance = vamb.parsebam.Abundance.from_files( 88 | testtools.BAM_FILES, None, self.comp_metadata, True, 0.95, 3 89 | ) 90 | self.assertTrue(np.any(abundance.matrix < self.abundance.matrix)) 91 | 92 | def test_save_load(self): 93 | buf = io.BytesIO() 94 | self.abundance.save(buf) 95 | buf.seek(0) 96 | 97 | # Bad refhash 98 | with self.assertRaises(ValueError): 99 | abundance2 = vamb.parsebam.Abundance.load(buf, b"a" * 32) 100 | 101 | buf.seek(0) 102 | abundance2 = vamb.parsebam.Abundance.load(buf, self.abundance.refhash) 103 | self.assertTrue(np.all(abundance2.matrix == self.abundance.matrix)) 104 | self.assertTrue(np.all(abundance2.samplenames == self.abundance.samplenames)) 105 | self.assertEqual(abundance2.refhash, self.abundance.refhash) 106 | self.assertEqual(abundance2.minid, self.abundance.minid) 107 | 108 | def test_parse_from_tsv(self): 109 | # Check it parses 110 | with open(testtools.AEMB_FILES[0]) as file: 111 | lines = [s.rstrip() for s in file] 112 | for path in testtools.AEMB_FILES[1:]: 113 | with open(path) as file: 114 | for i, existing in enumerate(file): 115 | lines[i] += "\t" + existing.split("\t")[1].rstrip() 116 | 117 | # Add in lines with zeros corresponding to the masked contigs 118 | unmasked_lines = [] 119 | i = 0 120 | for keep in self.comp_metadata.mask: 121 | if not keep: 122 | unmasked_lines.append("\t0.0\t0.0\t0.0") 123 | else: 124 | unmasked_lines.append(lines[i]) 125 | i += 1 126 | 127 | with tempfile.NamedTemporaryFile(mode="w+") as file: 128 | print("contigname\tfile1\tfile2\tfile3", file=file) 129 | for line in unmasked_lines: 130 | print(line, file=file) 131 | file.seek(0) 132 | abundance = vamb.parsebam.Abundance.from_tsv( 133 | Path(file.name), self.comp_metadata 134 | ) 135 | 136 | self.assertEqual(abundance.refhash, self.comp_metadata.refhash) 137 | self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"]) 138 | 139 | # Check values are alright 140 | M = np.zeros_like(abundance.matrix) 141 | for row, line in enumerate(lines): 142 | for col, cell in enumerate(line.split("\t")[1:]): 143 | M[row, col] = float(cell) 144 | self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all()) 145 | 146 | # Bad header order errors 147 | lines[5], lines[4] = lines[4], lines[5] 148 | 149 | with tempfile.NamedTemporaryFile(mode="w+") as file: 150 | print("contigname\tfile1\tfile2\tfile3", file=file) 151 | for line in lines: 152 | print(line, file=file) 153 | file.seek(0) 154 | with 
self.assertRaises(ValueError): 155 | vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata) 156 | 157 | # Restore 158 | lines[5], lines[4] = lines[4], lines[5] 159 | 160 | # Too many lines 161 | with tempfile.NamedTemporaryFile(mode="w+") as file: 162 | print("contigname\tfile1\tfile2\tfile3", file=file) 163 | for line in lines: 164 | print(line, file=file) 165 | print(lines[-2], file=file) 166 | file.seek(0) 167 | with self.assertRaises(ValueError): 168 | vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata) 169 | -------------------------------------------------------------------------------- /test/test_parsecontigs.py: -------------------------------------------------------------------------------- 1 | import io 2 | import unittest 3 | import random 4 | import numpy as np 5 | 6 | import testtools 7 | from vamb.parsecontigs import Composition, CompositionMetaData 8 | 9 | 10 | class TestReadContigs(unittest.TestCase): 11 | records = [] 12 | large_io = io.BytesIO() 13 | io = io.BytesIO() 14 | 15 | @classmethod 16 | def setUpClass(cls): 17 | rng = random.Random() 18 | for i in range(random.randrange(1400, 1500)): 19 | cls.records.append(testtools.make_randseq(rng, 400, 600)) 20 | 21 | for i in cls.records: 22 | cls.io.write(i.format().encode()) 23 | cls.io.write(b"\n") 24 | 25 | for i in range(25_000): 26 | record = testtools.make_randseq(rng, 250, 300) 27 | cls.large_io.write(record.format().encode()) 28 | cls.large_io.write(b"\n") 29 | 30 | def setUp(self): 31 | self.io.seek(0) 32 | self.large_io.seek(0) 33 | 34 | def test_only_ns(self): 35 | file = io.BytesIO() 36 | file.write(b">abc\n") 37 | file.write(b"N" * 2500) 38 | file.write(b"\n") 39 | file.seek(0) 40 | 41 | with self.assertRaises(ValueError): 42 | Composition.from_file(file, None) 43 | 44 | def test_unique_names(self): 45 | with self.assertRaises(ValueError): 46 | CompositionMetaData( 47 | np.array(["foo", "foo"], dtype=object), 48 | np.array([1000, 1000]), 49 | np.array([True, True], dtype=bool), 50 | 1000, 51 | ) 52 | 53 | def test_filter_minlength(self): 54 | minlen = 500 55 | composition = Composition.from_file(self.io, None, minlength=450) 56 | md = composition.metadata 57 | hash1 = md.refhash 58 | 59 | composition.filter_min_length(minlen) 60 | n_initial_seq = md.nseqs 61 | 62 | hash2 = md.refhash 63 | self.assertNotEqual(hash1, hash2) 64 | self.assertEqual(len(md.identifiers), len(md.lengths)) 65 | self.assertEqual(md.nseqs, md.mask.sum()) 66 | self.assertLessEqual(minlen, composition.metadata.lengths.min(initial=minlen)) 67 | self.assertEqual(len(md.mask), len(self.records)) 68 | 69 | # NB: Here we filter metadata without filtering the composition. 70 | # That means from this point on, the metadata and comp is out of sync, 71 | # and comp is invalid. 
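        # (Hedged aside, not exercised by this test: the in-sync operation is
        # composition.filter_min_length(...), which applies the same length mask to
        # both the metadata and the TNF matrix via numpy_inplace_maskarray.)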
72 | md.filter_min_length(minlen + 50) 73 | self.assertEqual(len(md.identifiers), len(md.lengths)) 74 | self.assertEqual(md.nseqs, md.mask.sum()) 75 | self.assertLessEqual( 76 | minlen, composition.metadata.lengths.min(initial=minlen + 50) 77 | ) 78 | self.assertEqual(len(md.mask), len(self.records)) 79 | self.assertLess(md.nseqs, n_initial_seq) 80 | 81 | hash3 = md.refhash 82 | md.filter_min_length(minlen - 50) 83 | self.assertEqual(hash3, md.refhash) 84 | 85 | md.filter_min_length(50000000000) 86 | self.assertEqual(md.nseqs, 0) 87 | self.assertFalse(np.any(md.mask)) 88 | 89 | def test_minlength(self): 90 | with self.assertRaises(ValueError): 91 | Composition.from_file(self.io, None, minlength=3) 92 | 93 | def test_properties(self): 94 | composition = Composition.from_file(self.io, None, minlength=420) 95 | passed = list(filter(lambda x: len(x.sequence) >= 420, self.records)) 96 | 97 | self.assertEqual(composition.nseqs, len(composition.metadata.identifiers)) 98 | self.assertEqual(composition.nseqs, len(composition.metadata.lengths)) 99 | 100 | self.assertTrue(composition.matrix.dtype, np.float32) 101 | self.assertEqual(composition.matrix.shape, (len(passed), 103)) 102 | 103 | # Names 104 | self.assertEqual( 105 | list(composition.metadata.identifiers), [i.header for i in passed] 106 | ) 107 | 108 | # Lengths 109 | self.assertTrue(np.issubdtype(composition.metadata.lengths.dtype, np.integer)) 110 | self.assertEqual( 111 | [len(i.sequence) for i in passed], list(composition.metadata.lengths) 112 | ) 113 | 114 | def test_save_load(self): 115 | buf = io.BytesIO() 116 | composition_1 = Composition.from_file(self.io, None) 117 | md1 = composition_1.metadata 118 | composition_1.save(buf) 119 | buf.seek(0) 120 | composition_2 = Composition.load(buf) 121 | md2 = composition_2.metadata 122 | 123 | self.assertTrue(np.all(composition_1.matrix == composition_2.matrix)) 124 | self.assertTrue(np.all(md1.identifiers == md2.identifiers)) 125 | self.assertTrue(np.all(md1.lengths == md2.lengths)) 126 | self.assertTrue(np.all(md1.refhash == md2.refhash)) 127 | self.assertTrue(np.all(md1.minlength == md2.minlength)) 128 | 129 | def test_windows_newlines(self): 130 | rng = random.Random() 131 | buf1 = io.BytesIO() 132 | buf2 = io.BytesIO() 133 | for i in range(10): 134 | record = testtools.make_randseq(rng, 10, 20) 135 | buf1.write(b">" + record.header.encode()) 136 | buf2.write(b">" + record.header.encode()) 137 | buf1.write(b"\r\n") 138 | buf2.write(b"\n") 139 | buf1.write(record.sequence) 140 | buf2.write(record.sequence) 141 | buf1.write(b"\r\n") 142 | buf2.write(b"\n") 143 | 144 | buf1.seek(0) 145 | buf2.seek(0) 146 | comp1 = Composition.from_file(buf1, None) 147 | comp2 = Composition.from_file(buf2, None) 148 | 149 | self.assertEqual(comp1.metadata.refhash, comp2.metadata.refhash) 150 | self.assertTrue(np.all(comp1.matrix == comp2.matrix)) 151 | -------------------------------------------------------------------------------- /test/test_parsemarkers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import vamb 3 | import testtools 4 | from pathlib import Path 5 | import tempfile 6 | import shutil 7 | import io 8 | 9 | 10 | class TestParseMarkers(unittest.TestCase): 11 | def test_instantiate(self): 12 | tmp = tempfile.mkdtemp() 13 | tmp_path = Path(tmp) 14 | shutil.rmtree(tmp) 15 | markers = vamb.parsemarkers.Markers.from_files( 16 | Path(testtools.DATADIR).joinpath("marker.fna"), 17 | 
Path(testtools.PARENTDIR).joinpath("vamb").joinpath("marker.hmm"), 18 | ["abc"], 19 | tmp_path, 20 | 4, 21 | None, 22 | ) 23 | self.assertIsNotNone(markers.markers[0]) 24 | self.assertEqual(len(markers.markers), 1) 25 | self.assertEqual(set(markers.markers[0]), {39}) 26 | self.assertEqual( 27 | markers.refhash, vamb.vambtools.RefHasher.hash_refnames(["abc"]) 28 | ) 29 | 30 | buf = io.StringIO() 31 | markers.save(buf) 32 | buf.seek(0) 33 | 34 | markers2 = vamb.parsemarkers.Markers.load(buf, markers.refhash) 35 | self.assertEqual(len(markers.markers), len(markers2.markers)) 36 | self.assertEqual(set(markers.markers[0]), set(markers2.markers[0])) 37 | self.assertEqual(markers.marker_names, markers2.marker_names) 38 | -------------------------------------------------------------------------------- /test/test_reclustering.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | # For CAMI dataset, compute comp, abundance, taxonomy, markers 4 | # Subset to e.g. 5 genera plus a few unclassified contigs 5 | 6 | # FASTA 7 | # Comp 8 | # Abundance 9 | # Markers 10 | # Taxonomy 11 | # Latent 12 | # Refined taxonomy 13 | 14 | 15 | class TestKmeansReclustering(unittest.TestCase): 16 | pass 17 | # Make markers + lengths 18 | # Make taxonomy 19 | # Create latent 20 | 21 | # Initial clustering 22 | 23 | 24 | class TestDBScanReclustering(unittest.TestCase): 25 | # It produces disjoint clusters, a subset of the input points 26 | pass 27 | -------------------------------------------------------------------------------- /test/test_results.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import io 3 | import random 4 | import numpy as np 5 | import torch 6 | from hashlib import sha256 7 | 8 | import vamb 9 | import testtools 10 | 11 | # PyTorch cannot be made stable, so we cannot run CI on the result 12 | # of pytorch training. 13 | # Nonetheless, you can enable it locally with this switch here, 14 | # which can be useful sometimes. 
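# To try the hash-based checks locally, flip the switch below to True and run e.g.
#   python -m pytest test/test_results.py
# Even then, the hardcoded digests should only be expected to match on a setup
# close to the one that produced them (PyTorch version and hardware both matter).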
15 | TEST_UNSTABLE_HASHES = False 16 | 17 | 18 | class TestCompositionResult(unittest.TestCase): 19 | io = io.BytesIO() 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | rng = random.Random(15) 24 | for _ in range(4): 25 | seq = testtools.make_randseq(rng, 400, 600) 26 | cls.io.write(seq.format().encode()) 27 | cls.io.write(b"\n") 28 | 29 | def setUp(self): 30 | self.io.seek(0) 31 | 32 | def test_runs(self): 33 | comp = vamb.parsecontigs.Composition.from_file(self.io, None) 34 | self.assertIsInstance(comp, vamb.parsecontigs.Composition) 35 | 36 | if TEST_UNSTABLE_HASHES: 37 | 38 | def test_result(self): 39 | comp = vamb.parsecontigs.Composition.from_file(self.io, None) 40 | self.assertEqual( 41 | sha256(comp.matrix.data.tobytes()).digest().hex(), 42 | "9e9a2d7b021654e874894722bdd6cd3eda18bed03fabd32a9440e806a8ab1bd1", 43 | ) 44 | 45 | 46 | class TestAbundanceResult(unittest.TestCase): 47 | @classmethod 48 | def setUpClass(cls): 49 | cls.comp_metadata = vamb.parsecontigs.CompositionMetaData( 50 | np.array(testtools.BAM_NAMES, dtype=object), 51 | np.array(testtools.BAM_SEQ_LENS), 52 | np.ones(len(testtools.BAM_SEQ_LENS), dtype=bool), 53 | 2000, 54 | ) 55 | 56 | def test_runs(self): 57 | abundance = vamb.parsebam.Abundance.from_files( 58 | testtools.BAM_FILES, None, self.comp_metadata, True, 0.9, 4 59 | ) 60 | self.assertIsInstance(abundance, vamb.parsebam.Abundance) 61 | 62 | if TEST_UNSTABLE_HASHES: 63 | 64 | def test_result(self): 65 | abundance = vamb.parsebam.Abundance.from_files( 66 | testtools.BAM_FILES, "/tmp/tmpbam", self.comp_metadata, True, 0.9, 2 67 | ) 68 | self.assertEqual( 69 | sha256(abundance.matrix.data.tobytes()).digest().hex(), 70 | "c346abb53b62423fe95ed4b2eb5988d77141b2d7a5c58c03fdf09abc6476df78", 71 | ) 72 | abundance2 = vamb.parsebam.Abundance.from_files( 73 | testtools.BAM_FILES, None, self.comp_metadata, True, 0.9, 4 74 | ) 75 | self.assertTrue(np.all(np.abs(abundance.matrix - abundance2.matrix) < 1e-5)) 76 | 77 | 78 | class TestEncodingResult(unittest.TestCase): 79 | @classmethod 80 | def setUpClass(cls): 81 | torch.manual_seed(0) 82 | rng = np.random.RandomState(15) 83 | cls.tnfs = rng.random((200, 103)).astype(np.float32) 84 | cls.rpkm = rng.random((200, 6)).astype(np.float32) 85 | cls.lens = rng.randint(2000, 5000, 200) 86 | 87 | def test_runs(self): 88 | self.assertEqual( 89 | sha256(self.lens.data.tobytes()).digest().hex(), 90 | "68894f01cc435a5f032a655faecddd817cd35a71397129296a11f8c40bd29fcb", 91 | ) 92 | 93 | vae = vamb.encode.VAE(6) 94 | dl = vamb.encode.make_dataloader( 95 | self.rpkm.copy(), self.tnfs, self.lens, batchsize=16 96 | ) 97 | vae.trainmodel(dl, nepochs=3, batchsteps=[1, 2]) 98 | latent = vae.encode(dl) 99 | 100 | self.assertIsInstance(latent, np.ndarray) 101 | 102 | if TEST_UNSTABLE_HASHES: 103 | 104 | def test_result(self): 105 | torch.manual_seed(0) 106 | torch.use_deterministic_algorithms(True) 107 | np.random.seed(0) 108 | random.seed(0) 109 | vae = vamb.encode.VAE(6) 110 | dl = vamb.encode.make_dataloader( 111 | self.rpkm, self.tnfs, self.lens, batchsize=16 112 | ) 113 | vae.trainmodel(dl, nepochs=3, batchsteps=[1, 2]) 114 | latent = vae.encode(dl) 115 | 116 | self.assertEqual( 117 | sha256(latent.data.tobytes()).digest().hex(), 118 | "0148ec0767e88c756615340d6fd0b31ca07aa6b4b172a1874fb7de7179acb57d", 119 | ) 120 | 121 | self.assertEqual( 122 | sha256(torch.rand(10).numpy().tobytes()).digest().hex(), 123 | "c417b9722e14e854fbe79cc5c797cc6653360c1e6536064205ca0c073f41eaf6", 124 | ) 125 | 126 | 127 | class 
TestClusterResult(unittest.TestCase): 128 | @classmethod 129 | def setUpClass(cls): 130 | rng = np.random.RandomState(15) 131 | cls.latent = rng.random((1000, 3)).astype(np.float32) - 0.5 132 | 133 | def test_runs(self): 134 | self.assertEqual( 135 | sha256(self.latent.tobytes()).digest().hex(), 136 | "630a98a4b44c3754a3f423e915847f44767bb69fb13ea5901dc512428aee9811", 137 | ) 138 | 139 | if TEST_UNSTABLE_HASHES: 140 | 141 | def test_result(self): 142 | hash = sha256() 143 | 144 | # Use this to check that the clustering used in this test produces 145 | # a reasonable cluster size, and that it doesn't just pass because 146 | # it always clusters everything in 1-point clusters. 147 | # Uncomment when updating this test. 148 | # lens = list() 149 | for cluster in vamb.cluster.ClusterGenerator(self.latent.copy()): 150 | medoid = cluster.metadata.medoid 151 | points = set(cluster.members) 152 | # Set hashing may differ from run to run, so turn into sorted arrays 153 | arr = np.array(list(points)) 154 | arr.sort() 155 | # lens.append(arr) 156 | hash.update(medoid.to_bytes(4, "big")) 157 | hash.update(arr.data) 158 | 159 | # self.assertGreater(len(list(map(lambda x: len(lens) > 1))), 3) 160 | self.assertEqual( 161 | hash.digest().hex(), 162 | "2b3caf674ff1d1906a831219e0953b2d9f1b78ecefec709b70c672280af49aee", 163 | ) 164 | -------------------------------------------------------------------------------- /test/test_semisupervised_encode.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import tempfile 4 | import vamb 5 | 6 | 7 | class TestDataLoader(unittest.TestCase): 8 | def test_permute_indices(self): 9 | indices = vamb.semisupervised_encode.permute_indices(10, 25, seed=1) 10 | set_10 = set(range(10)) 11 | self.assertTrue(len(indices) == 25) 12 | self.assertTrue(set(indices[:10]) == set_10) 13 | self.assertTrue(set(indices[10:20]) == set_10) 14 | self.assertTrue(set(indices[20:]).issubset(set_10)) 15 | 16 | 17 | class TestVAEVAE(unittest.TestCase): 18 | N_contigs = 111 19 | tnfs = np.random.random((N_contigs, 103)).astype(np.float32) 20 | rpkms = np.random.random((N_contigs, 14)).astype(np.float32) 21 | domain = "d_Bacteria" 22 | phyla = ["f_1", "f_2", "f_3"] 23 | classes = { 24 | "f_1": ["c_11", "c_21", "c_31"], 25 | "f_2": ["c_12", "c_22", "c_32"], 26 | "f_3": ["c_13", "c_23", "c_33"], 27 | } 28 | lengths = np.random.randint(2000, 5000, size=N_contigs) 29 | 30 | def make_random_annotation(self): 31 | phylum = np.random.choice(self.phyla, 1)[0] 32 | clas = np.random.choice(self.classes[phylum], 1)[0] 33 | if np.random.random() <= 0.2: 34 | return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( 35 | ";".join([self.domain]) 36 | ) 37 | if 0.2 < np.random.random() <= 0.5: 38 | return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( 39 | ";".join([self.domain, phylum]) 40 | ) 41 | return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( 42 | ";".join([self.domain, phylum, clas]) 43 | ) 44 | 45 | def make_random_annotations(self): 46 | return [self.make_random_annotation() for _ in range(self.N_contigs)] 47 | 48 | def test_make_graph(self): 49 | annotations = self.make_random_annotations() 50 | nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(annotations) 51 | print(nodes, ind_nodes, table_parent) 52 | self.assertTrue( 53 | set(nodes).issubset( 54 | set( 55 | [ 56 | "root", 57 | "d_Bacteria", 58 | "f_1", 59 | "f_2", 60 | "f_3", 61 | "c_11", 62 | "c_21", 63 | "c_31", 64 | "c_12", 65 | "c_22", 66 | "c_32", 
67 | "c_13", 68 | "c_23", 69 | "c_33", 70 | ] 71 | ) 72 | ) 73 | ) 74 | for p, cls in self.classes.items(): 75 | for c in cls: 76 | for f in self.phyla: 77 | # Since the taxonomy is generated randomly, we can't guarantee 78 | # that each run will have all the clades. 79 | if any(i not in ind_nodes for i in (p, c, f)): 80 | continue 81 | self.assertTrue(ind_nodes.get(f, -666) < ind_nodes.get(c, 666)) 82 | self.assertTrue(table_parent[ind_nodes[f]] == 1) 83 | self.assertTrue(table_parent[ind_nodes[c]] == ind_nodes[p]) 84 | 85 | def test_encoding(self): 86 | nlatent = 10 87 | batchsize = 10 88 | nepochs = 2 89 | annotations = self.make_random_annotations() 90 | nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(annotations) 91 | 92 | classes_order = np.array([a.ranks[-1] for a in annotations]) 93 | targets = np.array([ind_nodes[i] for i in classes_order]) 94 | 95 | vae = vamb.taxvamb_encode.VAEVAEHLoss( 96 | self.rpkms.shape[1], 97 | len(nodes), 98 | nodes, 99 | table_parent, 100 | nlatent=nlatent, 101 | cuda=False, 102 | ) 103 | 104 | dataloader_vamb = vamb.encode.make_dataloader( 105 | self.rpkms, 106 | self.tnfs, 107 | self.lengths, 108 | batchsize=batchsize, 109 | cuda=False, 110 | ) 111 | dataloader_joint = vamb.taxvamb_encode.make_dataloader_concat_hloss( 112 | self.rpkms, 113 | self.tnfs, 114 | self.lengths, 115 | targets, 116 | len(nodes), 117 | table_parent, 118 | batchsize=batchsize, 119 | cuda=False, 120 | ) 121 | dataloader_labels = vamb.taxvamb_encode.make_dataloader_labels_hloss( 122 | self.rpkms, 123 | self.tnfs, 124 | self.lengths, 125 | targets, 126 | len(nodes), 127 | table_parent, 128 | batchsize=batchsize, 129 | cuda=False, 130 | ) 131 | 132 | shapes = (self.rpkms.shape[1], 103, 1, len(nodes)) 133 | dataloader = vamb.taxvamb_encode.make_dataloader_semisupervised_hloss( 134 | dataloader_joint, 135 | dataloader_vamb, 136 | dataloader_labels, 137 | len(nodes), 138 | table_parent, 139 | shapes, 140 | 666, 141 | batchsize=batchsize, 142 | cuda=False, 143 | ) 144 | with tempfile.TemporaryFile() as modelfile: 145 | vae.trainmodel( 146 | dataloader, 147 | nepochs=nepochs, 148 | modelfile=modelfile, 149 | batchsteps=[], 150 | ) 151 | 152 | latent_both = vae.VAEJoint.encode(dataloader_joint) 153 | self.assertEqual(latent_both.dtype, np.float32) 154 | self.assertEqual(latent_both.shape, (len(self.rpkms), nlatent)) 155 | -------------------------------------------------------------------------------- /test/testtools.py: -------------------------------------------------------------------------------- 1 | import string 2 | import os 3 | import pathlib 4 | 5 | import vamb 6 | 7 | PARENTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | DATADIR = os.path.join(PARENTDIR, "test", "data") 9 | BAM_FILES = sorted( 10 | [ 11 | pathlib.Path(DATADIR).joinpath("bam").joinpath(i) 12 | for i in os.listdir(os.path.join(DATADIR, "bam")) 13 | ] 14 | ) 15 | AEMB_DIR = os.path.join(DATADIR, "aemb") 16 | AEMB_FILES = sorted([pathlib.Path(AEMB_DIR).joinpath(i) for i in os.listdir(AEMB_DIR)]) 17 | 18 | BAM_NAMES = [ 19 | "S27C175628", 20 | "S27C95602", 21 | "S27C25358", 22 | "S26C115410", 23 | "S4C529736", 24 | "S27C181335", 25 | "S4C222286", 26 | "S27C38468", 27 | "S11C13125", 28 | "S4C480978", 29 | "S27C255582", 30 | "S27C170328", 31 | "S7C221395", 32 | "S26C281881", 33 | "S12C228927", 34 | "S26C86604", 35 | "S27C93037", 36 | "S9C124493", 37 | "S27C236159", 38 | "S27C214882", 39 | "S7C273086", 40 | "S8C93079", 41 | "S12C85159", 42 | "S10C72456", 43 | "S27C19079", 44 | ] 45 
| 46 | BAM_SEQ_LENS = [ 47 | 2271, 48 | 3235, 49 | 3816, 50 | 2625, 51 | 2716, 52 | 4035, 53 | 3001, 54 | 2583, 55 | 5962, 56 | 3774, 57 | 2150, 58 | 2161, 59 | 2218, 60 | 2047, 61 | 5772, 62 | 2633, 63 | 3400, 64 | 3502, 65 | 2103, 66 | 4308, 67 | 3061, 68 | 2464, 69 | 4099, 70 | 2640, 71 | 2449, 72 | ] 73 | 74 | 75 | def make_randseq(rng, frm: int, to: int) -> vamb.vambtools.FastaEntry: 76 | name = rng.choice(string.ascii_uppercase) + "".join( 77 | rng.choices(string.ascii_lowercase, k=11) 78 | ) 79 | seq = "".join( 80 | rng.choices( 81 | "acgtACGTnNywsdbK", 82 | weights=[0.12] * 8 + [0.005] * 8, 83 | k=rng.randrange(frm, to), 84 | ) 85 | ) 86 | return vamb.vambtools.FastaEntry(name.encode(), bytearray(seq.encode())) 87 | -------------------------------------------------------------------------------- /vamb/__init__.py: -------------------------------------------------------------------------------- 1 | """Vamb - Variational Autoencoders for Metagenomic Binning 2 | Documentation: https://github.com/RasmussenLab/vamb/ 3 | """ 4 | 5 | # TODO: Pyhmmer is compiled with -funsafe-math-optimizations, which toggles some 6 | # flag in the CPU controlling float subnormal behaviour. 7 | # This causes a warning in NumPy. 8 | # This is not an issue in Vamb (I think), so we silence the warning here as a 9 | # temporary fix. 10 | # See https://github.com/althonos/pyhmmer/issues/71 11 | import warnings 12 | 13 | warnings.filterwarnings("ignore", category=UserWarning, module="numpy") 14 | 15 | from . import vambtools 16 | from . import parsebam 17 | from . import parsecontigs 18 | from . import parsemarkers 19 | from . import taxonomy 20 | from . import cluster 21 | from . import encode 22 | from . import aamb_encode 23 | from . import semisupervised_encode 24 | from . import hloss_misc 25 | from . import taxvamb_encode 26 | from . import reclustering 27 | 28 | from importlib.metadata import version as get_version 29 | from loguru import logger 30 | 31 | __version_str__ = get_version("vamb") 32 | logger.remove() 33 | 34 | __all__ = [ 35 | "vambtools", 36 | "parsebam", 37 | "parsecontigs", 38 | "parsemarkers", 39 | "taxonomy", 40 | "cluster", 41 | "encode", 42 | "aamb_encode", 43 | "semisupervised_encode", 44 | "taxvamb_encode", 45 | "hloss_misc", 46 | "reclustering", 47 | ] 48 | -------------------------------------------------------------------------------- /vamb/kernel.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RasmussenLab/vamb/c53e5419392ce3f46ffc9038d2df0004b2d94569/vamb/kernel.npz -------------------------------------------------------------------------------- /vamb/parsebam.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Estimate depths from BAM files of reads mapped to contigs. 
2 | 3 | Usage: 4 | >>> bampaths = ['/path/to/bam1.bam', '/path/to/bam2.bam', '/path/to/bam3.bam'] 5 | >>> rpkms = Abundance.from_file(bampaths, metadata, True, 0.1, 3) 6 | """ 7 | 8 | import pycoverm 9 | import os as _os 10 | import numpy as _np 11 | from math import isfinite 12 | from vamb.parsecontigs import CompositionMetaData 13 | from vamb import vambtools 14 | from typing import Optional, TypeVar, Union, IO, Sequence, Iterable 15 | from pathlib import Path 16 | from itertools import zip_longest 17 | import shutil 18 | 19 | _ncpu = _os.cpu_count() 20 | DEFAULT_THREADS = 8 if _ncpu is None else _ncpu 21 | 22 | A = TypeVar("A", bound="Abundance") 23 | 24 | 25 | class Abundance: 26 | "Object representing contig abundance. Contains a matrix and refhash." 27 | 28 | __slots__ = ["matrix", "samplenames", "minid", "refhash"] 29 | 30 | def __init__( 31 | self, 32 | matrix: _np.ndarray, 33 | samplenames: Sequence[str], 34 | minid: float, 35 | refhash: bytes, 36 | ): 37 | assert matrix.dtype == _np.float32 38 | assert matrix.ndim == 2 39 | assert matrix.shape[1] == len(samplenames) 40 | assert isfinite(minid) and 0.0 <= minid and minid <= 1.0 41 | 42 | self.matrix = matrix 43 | self.samplenames = _np.array(samplenames, dtype=object) 44 | self.minid = minid 45 | self.refhash = refhash 46 | 47 | @property 48 | def nseqs(self) -> int: 49 | return len(self.matrix) 50 | 51 | @property 52 | def nsamples(self) -> int: 53 | return len(self.samplenames) 54 | 55 | def save(self, io: Union[Path, IO[bytes]]): 56 | _np.savez_compressed( 57 | io, 58 | matrix=self.matrix, 59 | samplenames=self.samplenames, 60 | minid=self.minid, 61 | refhash=self.refhash, 62 | ) 63 | 64 | @classmethod 65 | def load( 66 | cls: type[A], io: Union[str, Path, IO[bytes]], refhash: Optional[bytes] 67 | ) -> A: 68 | arrs = _np.load(io, allow_pickle=True) 69 | if "arr_0" in arrs.keys(): 70 | return arrs["arr_0"] # old format 71 | abundance = cls( 72 | vambtools.validate_input_array(arrs["matrix"]), 73 | arrs["samplenames"], 74 | arrs["minid"].item(), 75 | arrs["refhash"].item(), 76 | ) 77 | if refhash is not None: 78 | vambtools.RefHasher.verify_refhash( 79 | abundance.refhash, 80 | refhash, 81 | "the loaded Abundance object", 82 | "the given refhash", 83 | None, 84 | ) 85 | 86 | return abundance 87 | 88 | @classmethod 89 | def from_files( 90 | cls: type[A], 91 | paths: list[Path], 92 | cache_directory: Optional[Path], 93 | comp_metadata: CompositionMetaData, 94 | verify_refhash: bool, 95 | minid: float, 96 | nthreads: int, 97 | ) -> A: 98 | """Input: 99 | paths: List of paths to BAM files 100 | cache_directory: Where to store temp parts of the larger matrix, if reading multiple 101 | BAM files in chunks. Required if len(paths) > min(16, nthreads) 102 | comp_metadata: CompositionMetaData of sequence catalogue used to make BAM files 103 | verify_refhash: Whether to verify composition and BAM references are the same 104 | minid: Discard any reads with nucleotide identity less than this 105 | nthreads: Use this number of threads for coverage estimation 106 | """ 107 | if minid < 0 or minid > 1: 108 | raise ValueError(f"minid must be between 0 and 1, not {minid}") 109 | 110 | # Workaround: Currently pycoverm has a bug where it filters contigs when mindid == 0 111 | # (issue #7). 
Can be solved by setting it to a low value 112 | minid = minid if minid > 0.001 else 0.001 113 | 114 | if nthreads < 1: 115 | raise ValueError(f"nthreads must be > 0, not {nthreads}") 116 | 117 | chunksize = min(nthreads, len(paths)) 118 | 119 | # We cap it to 16 threads, max. This will prevent pycoverm from consuming a huge amount 120 | # of memory if given a crapload of threads, and most programs will probably be IO bound 121 | # when reading 16 files at a time. 122 | chunksize = min(chunksize, 16) 123 | 124 | # If it can be done in memory, do so 125 | if chunksize >= len(paths): 126 | (matrix, refhash) = cls.run_pycoverm( 127 | paths, 128 | minid, 129 | comp_metadata.refhash if verify_refhash else None, 130 | comp_metadata.identifiers if verify_refhash else None, 131 | comp_metadata.mask, 132 | ) 133 | vambtools.mask_lower_bits(matrix, 12) 134 | return cls(matrix, [str(p) for p in paths], minid, refhash) 135 | # Else, we load it in chunks, then assemble afterwards 136 | else: 137 | if cache_directory is None: 138 | raise ValueError( 139 | "If min(16, nthreads) < len(paths), cache_directory must not be None" 140 | ) 141 | return cls.chunkwise_loading( 142 | paths, 143 | cache_directory, 144 | chunksize, 145 | minid, 146 | comp_metadata.refhash if verify_refhash else None, 147 | comp_metadata.identifiers if verify_refhash else None, 148 | comp_metadata.mask, 149 | ) 150 | 151 | @classmethod 152 | def chunkwise_loading( 153 | cls: type[A], 154 | paths: list[Path], 155 | cache_directory: Path, 156 | nthreads: int, 157 | minid: float, 158 | target_refhash: Optional[bytes], 159 | target_identifiers: Optional[Iterable[str]], 160 | mask: _np.ndarray, 161 | ) -> A: 162 | _os.makedirs(cache_directory) 163 | 164 | chunks = [ 165 | (i, min(len(paths), i + nthreads)) for i in range(0, len(paths), nthreads) 166 | ] 167 | filenames = [ 168 | _os.path.join(cache_directory, str(i) + ".npz") for i in range(len(chunks)) 169 | ] 170 | assert len(chunks) > 1 171 | 172 | # Load from BAM and store them chunkwise 173 | refhash = None 174 | for filename, (chunkstart, chunkstop) in zip(filenames, chunks): 175 | (matrix, refhash) = cls.run_pycoverm( 176 | paths[chunkstart:chunkstop], 177 | minid, 178 | target_refhash, 179 | target_identifiers, 180 | mask, 181 | ) 182 | vambtools.write_npz(filename, matrix) 183 | 184 | # Initialize matrix, the load them chunkwise. Delete the temp files when done 185 | matrix = _np.empty((mask.sum(), len(paths)), dtype=_np.float32) 186 | for filename, (chunkstart, chunkstop) in zip(filenames, chunks): 187 | matrix[:, chunkstart:chunkstop] = vambtools.read_npz(filename) 188 | vambtools.mask_lower_bits(matrix, 12) 189 | 190 | shutil.rmtree(cache_directory) 191 | 192 | assert refhash is not None 193 | return cls(matrix, [str(p) for p in paths], minid, refhash) 194 | 195 | @staticmethod 196 | def run_pycoverm( 197 | paths: list[Path], 198 | minid: float, 199 | target_refhash: Optional[bytes], 200 | target_identifiers: Optional[Iterable[str]], 201 | mask: _np.ndarray, 202 | ) -> tuple[_np.ndarray, bytes]: 203 | (headers, coverage) = pycoverm.get_coverages_from_bam( 204 | [str(p) for p in paths], 205 | threads=len(paths), 206 | min_identity=minid, 207 | # Note: pycoverm's trim_upper=0.1 is same as CoverM trim-upper 90. 
208 | trim_upper=0.1, 209 | trim_lower=0.1, 210 | ) 211 | 212 | assert coverage.shape == (len(headers), len(paths)) 213 | 214 | # Filter length, using comp_metadata's mask, which has been set by minlength 215 | if len(mask) != len(headers): 216 | raise ValueError( 217 | f"CompositionMetaData used to create Abundance object was created with {len(mask)} sequences, " 218 | f"but number of reference sequences in BAM files are {len(headers)}. " 219 | "Make sure the BAM files were created by mapping to the same FASTA file " 220 | "which you used to create the Composition object." 221 | ) 222 | 223 | headers = [h for (h, m) in zip(headers, mask) if m] 224 | vambtools.numpy_inplace_maskarray(coverage, mask) 225 | refhash = vambtools.RefHasher.hash_refnames(headers) 226 | 227 | if target_identifiers is None: 228 | identifier_pairs = None 229 | else: 230 | identifier_pairs = (headers, target_identifiers) 231 | 232 | if target_refhash is not None: 233 | vambtools.RefHasher.verify_refhash( 234 | refhash, target_refhash, "FASTA file", "BAM", identifier_pairs 235 | ) 236 | 237 | return (coverage, refhash) 238 | 239 | @classmethod 240 | def from_tsv(cls: type[A], path: Path, comp_metadata: CompositionMetaData) -> A: 241 | seen_identifiers: list[str] = [] 242 | with open(path) as file: 243 | try: 244 | header = next(file) 245 | except StopIteration: 246 | err = ValueError(f"Found no TSV header in abundance file '{path}'") 247 | raise err from None 248 | columns = header.rstrip("\r\n").split("\t") 249 | if len(columns) < 2: 250 | raise ValueError( 251 | f'Expected at least 2 columns in abundance TSV file at "{path}"' 252 | ) 253 | if columns[0] != "contigname": 254 | raise ValueError('First column in header must be "contigname"') 255 | samples = columns[1:] 256 | n_samples = len(samples) 257 | matrix = _np.empty((comp_metadata.nseqs, n_samples), dtype=_np.float32) 258 | matrix_row = 0 259 | 260 | # Line number minus two since we already read header, and Python is zero-indexed 261 | for line_number_minus_two, (line, should_keep) in enumerate( 262 | zip_longest(file, comp_metadata.mask) 263 | ): 264 | if line is None: 265 | # If line is none, there are too few lines in file 266 | raise ValueError( 267 | f'Too few rows in abundance TSV file "{path}", expected ' 268 | f"{len(comp_metadata.mask) + 1}, got {line_number_minus_two + 1}" 269 | ) 270 | 271 | line = line.rstrip() 272 | 273 | if not line: 274 | for next_line in file: 275 | if next_line.rstrip(): 276 | raise ValueError( 277 | "Found an empty line not at end of abundance TSV file" 278 | f'"{path}"' 279 | ) 280 | break 281 | 282 | if should_keep is None: 283 | raise ValueError( 284 | f'Too many rows in abundance TSV file "{path}", expected ' 285 | f"{len(comp_metadata.mask) + 1} sequences, got at least " 286 | f"{line_number_minus_two + 2}" 287 | ) 288 | 289 | if not should_keep: 290 | continue 291 | 292 | fields = line.split("\t") 293 | if len(fields) != n_samples + 1: 294 | raise ValueError( 295 | f'In abundance TSV file "{path}", on line {line_number_minus_two + 2}' 296 | f", expected {n_samples + 1} columns, found {len(fields)}" 297 | ) 298 | for i in range(n_samples): 299 | matrix[matrix_row, i] = float(fields[i + 1]) 300 | matrix_row += 1 301 | seen_identifiers.append(fields[0]) 302 | 303 | vambtools.RefHasher.verify_refhash( 304 | vambtools.RefHasher.hash_refnames(seen_identifiers), 305 | comp_metadata.refhash, 306 | "abundance TSV", 307 | "composition", 308 | (seen_identifiers, comp_metadata.identifiers), 309 | ) 310 | 311 | return cls(matrix, 
samples, 0.0, comp_metadata.refhash) 312 | -------------------------------------------------------------------------------- /vamb/parsecontigs.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | import numpy as _np 3 | import vamb.vambtools as _vambtools 4 | from collections.abc import Iterable, Sequence 5 | from typing import IO, Union, TypeVar, Optional 6 | from pathlib import Path 7 | 8 | # This kernel is created in src/create_kernel.py. See that file for explanation 9 | _KERNEL: _np.ndarray = _vambtools.read_npz( 10 | _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "kernel.npz") 11 | ) 12 | 13 | 14 | class CompositionMetaData: 15 | """A class containing metadata of sequence composition. 16 | Current fields are: 17 | * identifiers: A Numpy array of objects, str identifiers of kept sequences 18 | * lengths: A Numpy vector of 32-bit uint lengths of kept sequences 19 | * mask: A boolean Numpy vector of which sequences were kept in original file 20 | * refhash: A bytes object representing the hash of the identifiers 21 | * minlength: The minimum contig length used for filtering 22 | """ 23 | 24 | __slots__ = ["identifiers", "lengths", "mask", "refhash", "minlength"] 25 | 26 | def __init__( 27 | self, 28 | identifiers: _np.ndarray, 29 | lengths: _np.ndarray, 30 | mask: _np.ndarray, 31 | minlength: int, 32 | ): 33 | assert len(identifiers) == len(lengths) 34 | assert identifiers.dtype == _np.dtype("O") 35 | assert _np.issubdtype(lengths.dtype, _np.integer) 36 | assert mask.dtype == bool 37 | assert mask.sum() == len(lengths) 38 | assert lengths.min(initial=minlength) >= minlength 39 | 40 | if len(set(identifiers)) < len(identifiers): 41 | raise ValueError( 42 | "Sequence names must be unique, but are not. " 43 | "Vamb only uses the identifier (e.g. header before whitespace) as " 44 | "sequence identifiers. Verify identifier uniqueness." 45 | ) 46 | 47 | self.identifiers = identifiers 48 | self.lengths = lengths 49 | self.mask = mask 50 | self.minlength = minlength 51 | self.refhash = _vambtools.RefHasher.hash_refnames(identifiers) 52 | 53 | @property 54 | def nseqs(self) -> int: 55 | "Number of sequences after filtering" 56 | return len(self.identifiers) 57 | 58 | def filter_mask(self, mask: Sequence[bool]): 59 | "Filter contigs given a mask whose length should be nseqs" 60 | assert len(mask) == self.nseqs 61 | ind = 0 62 | for i in range(len(self.mask)): 63 | if self.mask[i]: 64 | self.mask[i] &= mask[ind] 65 | ind += 1 66 | 67 | self.identifiers = self.identifiers[mask] 68 | self.lengths = self.lengths[mask] 69 | self.refhash = _vambtools.RefHasher.hash_refnames(self.identifiers) 70 | 71 | def filter_min_length(self, length: int): 72 | "Set or reset minlength of this object" 73 | if length <= self.minlength: 74 | return None 75 | 76 | self.filter_mask(self.lengths >= length) # type:ignore 77 | self.minlength = length 78 | 79 | 80 | C = TypeVar("C", bound="Composition") 81 | 82 | 83 | class Composition: 84 | """A class containing a CompositionMetaData and its TNF matrix. 
85 | Current fields are: 86 | * metadata: A CompositionMetaData object 87 | * matrix: The composition matrix itself 88 | """ 89 | 90 | __slots__ = ["metadata", "matrix"] 91 | 92 | def __init__(self, metadata: CompositionMetaData, matrix: _np.ndarray): 93 | assert matrix.dtype == _np.float32 94 | assert matrix.shape == (metadata.nseqs, 103) 95 | 96 | self.metadata = metadata 97 | self.matrix = matrix 98 | 99 | def count_bases(self) -> int: 100 | return self.metadata.lengths.sum() 101 | 102 | @property 103 | def nseqs(self) -> int: 104 | return self.metadata.nseqs 105 | 106 | def save(self, io: Union[str, Path, IO[bytes]]): 107 | _np.savez_compressed( 108 | io, 109 | matrix=self.matrix, 110 | identifiers=self.metadata.identifiers, 111 | lengths=self.metadata.lengths, 112 | mask=self.metadata.mask, 113 | minlength=self.metadata.minlength, 114 | ) 115 | 116 | @classmethod 117 | def load(cls, io: Union[str, IO[bytes], Path]): 118 | arrs = _np.load(io, allow_pickle=True) 119 | metadata = CompositionMetaData( 120 | _vambtools.validate_input_array(arrs["identifiers"]), 121 | _vambtools.validate_input_array(arrs["lengths"]), 122 | _vambtools.validate_input_array(arrs["mask"]), 123 | arrs["minlength"].item(), 124 | ) 125 | return cls(metadata, _vambtools.validate_input_array(arrs["matrix"])) 126 | 127 | def filter_min_length(self, length: int): 128 | if length <= self.metadata.minlength: 129 | return None 130 | 131 | mask = self.metadata.lengths >= length 132 | self.metadata.filter_mask(mask) 133 | self.metadata.minlength = length 134 | _vambtools.numpy_inplace_maskarray(self.matrix, mask) 135 | 136 | @staticmethod 137 | def _project(fourmers: _np.ndarray, kernel: _np.ndarray = _KERNEL) -> _np.ndarray: 138 | "Project fourmers down in dimensionality" 139 | s = fourmers.sum(axis=1).reshape(-1, 1) 140 | s[s == 0] = 1.0 141 | fourmers *= 1 / s 142 | fourmers += -(1 / 256) 143 | return _np.dot(fourmers, kernel) 144 | 145 | @staticmethod 146 | def _convert(raw: _vambtools.PushArray, projected: _vambtools.PushArray): 147 | "Move data from raw PushArray to projected PushArray, converting it." 148 | raw_mat = raw.take().reshape(-1, 256) 149 | projected_mat = Composition._project(raw_mat) 150 | projected.extend(projected_mat.ravel()) 151 | raw.clear() 152 | 153 | @classmethod 154 | def from_file( 155 | cls: type[C], 156 | filehandle: Iterable[bytes], 157 | filename: Optional[str], 158 | minlength: int = 2000, 159 | ) -> C: 160 | """Parses a FASTA file open in binary reading mode, returning Composition. 161 | 162 | Input: 163 | filehandle: Filehandle open in binary mode of a FASTA file 164 | minlength: Ignore any references shorter than N bases [2000] 165 | """ 166 | 167 | if minlength < 4: 168 | raise ValueError(f"Minlength must be at least 4, not {minlength}") 169 | 170 | raw = _vambtools.PushArray(_np.float32) 171 | projected = _vambtools.PushArray(_np.float32) 172 | lengths = _vambtools.PushArray(_np.int32) 173 | mask = bytearray() # we convert to Numpy at end 174 | contignames: list[str] = list() 175 | entries = _vambtools.byte_iterfasta(filehandle, filename) 176 | 177 | for entry in entries: 178 | length = len(entry) 179 | skip = length < minlength 180 | mask.append(not skip) 181 | 182 | if skip: 183 | continue 184 | 185 | counts = entry.kmercounts() 186 | if counts.sum() == 0: 187 | raise ValueError( 188 | f'TNF value of contig "{entry.header}" is all zeros. ' 189 | + "This implies that the sequence contained no 4-mers of A, C, G, T or U, " 190 | + "making this sequence uninformative. 
This is probably a mistake. " 191 | + "Verify that the sequence contains usable information (e.g. is not all N's)" 192 | ) 193 | raw.extend(counts) 194 | 195 | if len(raw) > 256000: 196 | Composition._convert(raw, projected) 197 | 198 | lengths.append(len(entry)) 199 | contignames.append(entry.identifier) 200 | 201 | # Convert rest of contigs 202 | Composition._convert(raw, projected) 203 | tnfs_arr = projected.take() 204 | _vambtools.mask_lower_bits(tnfs_arr, 12) 205 | 206 | # Don't use reshape since it creates a new array object with shared memory 207 | tnfs_arr.shape = (len(tnfs_arr) // 103, 103) 208 | lengths_arr = lengths.take() 209 | 210 | metadata = CompositionMetaData( 211 | _np.array(contignames, dtype=object), 212 | lengths_arr, 213 | _np.array(mask, dtype=bool), 214 | minlength, 215 | ) 216 | return cls(metadata, tnfs_arr) 217 | -------------------------------------------------------------------------------- /vamb/parsemarkers.py: -------------------------------------------------------------------------------- 1 | # Overview 2 | # We use pyrodigal to predict genes in every contigs not filtered away by 3 | # the given mask, then use pyhmmer to predict single copy marker genes (SCGs) 4 | # on the genes, hence getting a contig => list[SCG] mapping. 5 | # Pyrodigal/pyhmmer is a bottleneck, so we run in parallel processes. 6 | # To avoid inter-process communication overhead, we first split the input 7 | # FASTA files to N files, then we have each process work on the files independently. 8 | 9 | from vamb.vambtools import FastaEntry, Reader, RefHasher, byte_iterfasta 10 | import pyrodigal 11 | import pyhmmer 12 | from multiprocessing.pool import Pool 13 | import os 14 | import itertools 15 | from pathlib import Path 16 | from typing import NewType, Sequence, Union, IO, Optional, Iterable 17 | import shutil 18 | from collections import defaultdict 19 | import json 20 | import numpy as np 21 | from loguru import logger 22 | 23 | MarkerID = NewType("MarkerID", int) 24 | MarkerName = NewType("MarkerName", str) 25 | ContigID = NewType("ContigID", int) 26 | ContigName = NewType("ContigName", str) 27 | 28 | 29 | class Markers: 30 | """ 31 | The set of marker genes predicted for a collection of contigs. 32 | Instantiate using `Markers.from_files`, or load using `Markers.load`. 33 | Like Abundance objects, Markers carry a refhash to check that the markers correspond 34 | to the same sequences used to create the markers. 35 | Access the markers with `markers.markers`, a `list[Optional[np.array]]`, with one 36 | element for each contig. The element is `None` if there are no markers, else a list 37 | of marker genes present in the contig. 38 | The marker genes are stored as integers - the name of a marker `i` can be gotten using 39 | `markers.marker_names[i]`. 40 | In each contig, markers are deduplicated, so at most 1 of each marker is found 41 | in each contig. 
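    A usage sketch (the paths and contig names below are placeholders, not files
    shipped with Vamb):
    >>> markers = Markers.from_files(
    ...     Path("contigs.fna.gz"),    # FASTA file with all contigs
    ...     Path("markers.hmm"),       # HMMER profiles of the marker genes
    ...     ["contig_1", "contig_2"],  # identifiers of contigs to consider
    ...     Path("tmp_markers"),       # scratch directory, created and removed
    ...     4,                         # number of worker processes
    ...     None,                      # optional refhash to verify against
    ... )
    >>> markers.markers[0]       # None, or an array of MarkerIDs found on contig_1
    >>> markers.marker_names[0]  # list of marker names sharing MarkerID 0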
42 | """ 43 | 44 | __slots__ = ["markers", "marker_names", "refhash"] 45 | 46 | def __init__( 47 | self, 48 | markers: list[Optional[np.ndarray]], 49 | # Some IDs map to multiple names, if they act as the same SCG in the cell 50 | marker_names: list[list[MarkerName]], 51 | refhash: bytes, 52 | ): 53 | if len(set(itertools.chain.from_iterable(marker_names))) != sum( 54 | len(i) for i in marker_names 55 | ): 56 | raise ValueError("Marker names are not unique, but must be") 57 | 58 | self.markers = markers 59 | self.marker_names = marker_names 60 | self.refhash = refhash 61 | 62 | @property 63 | def n_markers(self): 64 | return len(self.marker_names) 65 | 66 | @property 67 | def n_seqs(self): 68 | return len(self.markers) 69 | 70 | def score_bin(self, indices: Iterable[int]) -> tuple[float, float]: 71 | counts = np.zeros(self.n_markers, dtype=np.uint8) 72 | for i in indices: 73 | mkrs = self.markers[i] 74 | if mkrs is None: 75 | continue 76 | for m in mkrs: 77 | counts[m] += 1 78 | 79 | n_unique = (counts > 0).sum() 80 | completeness = n_unique / self.n_markers 81 | contamination = (counts.sum() - n_unique) / self.n_markers 82 | return (completeness, contamination) 83 | 84 | def save(self, io: Union[Path, str, IO[str]]): 85 | representation = { 86 | "markers": [i if i is None else i.tolist() for i in self.markers], 87 | "marker_names": self.marker_names, 88 | "refhash": self.refhash.hex(), 89 | } 90 | # Check we didn't forget any fields 91 | assert len(representation) == len(self.__slots__) 92 | if isinstance(io, Path) or isinstance(io, str): 93 | with open(io, "w") as file: 94 | json.dump(representation, file) 95 | 96 | else: 97 | json.dump(representation, io) 98 | 99 | @classmethod 100 | def load(cls, io: Union[Path, str, IO[str]], refhash: Optional[bytes]): 101 | if isinstance(io, Path) or isinstance(io, str): 102 | with open(io, "rb") as file: 103 | representation = json.load(file) 104 | else: 105 | representation = json.load(io) 106 | observed_refhash = bytes.fromhex(representation["refhash"]) 107 | if refhash is not None: 108 | RefHasher.verify_refhash( 109 | refhash=observed_refhash, 110 | target_refhash=refhash, 111 | observed_name="Loaded markers", 112 | target_name=None, 113 | identifiers=None, 114 | ) 115 | markers_as_arrays = [ 116 | i if i is None else np.array(i, dtype=np.uint8) 117 | for i in representation["markers"] 118 | ] 119 | 120 | return cls(markers_as_arrays, representation["marker_names"], observed_refhash) 121 | 122 | @classmethod 123 | def from_files( 124 | cls, 125 | contigs: Path, 126 | hmm_path: Path, 127 | contignames: Sequence[str], 128 | tmpdir_to_create: Path, 129 | n_processes: int, 130 | target_refhash: Optional[bytes], 131 | ): 132 | """ 133 | Create the Markers from input files: 134 | `contigs`: Path to a FASTA file with all contigs, gzipped or not. 135 | `hmm_path`: Path to a HMMER .hmm file with the markers. Note: Currently, 136 | this file can contain at most 256 markers, though this restriction can 137 | be lifted if necessary 138 | 139 | The `fasta_entry_mask` is a boolean mask of which contigs in the FASTA 140 | file to include. This affects the refhash which is only computed for 141 | the contigs not filtered away. 142 | If the target refhash is not None, and the computed reference hash does not 143 | match, an exception is thrown. See vamb.vambtools.RefHasher. 
144 | """ 145 | n_processes = cap_processes(n_processes) 146 | with open(hmm_path, "rb") as file: 147 | hmms = list(pyhmmer.plan7.HMMFile(file)) 148 | (_, marker_names) = get_name_to_id(hmms) 149 | 150 | (refhash, paths) = split_file( 151 | contigs, 152 | contignames, 153 | tmpdir_to_create, 154 | n_processes, 155 | ) 156 | 157 | if target_refhash is not None: 158 | RefHasher.verify_refhash( 159 | refhash, target_refhash, "Markers FASTA file", None, None 160 | ) 161 | 162 | index_of_name = { 163 | ContigName(n): ContigID(i) for (i, n) in enumerate(contignames) 164 | } 165 | marker_list: list[Optional[np.ndarray]] = [None] * len(contignames) 166 | with Pool(n_processes) as pool: 167 | for sub_result in pool.imap_unordered( 168 | work_per_process, 169 | list(zip(paths, itertools.repeat(hmms))), 170 | ): 171 | for contig_name, markers in sub_result: 172 | marker_list[index_of_name[contig_name]] = markers 173 | 174 | shutil.rmtree(tmpdir_to_create) 175 | markers = cls(marker_list, marker_names, refhash) 176 | 177 | return markers 178 | 179 | 180 | def cap_processes(processes: int) -> int: 181 | if processes < 1: 182 | raise ValueError(f"Must use at least 1 process, not {processes}") 183 | # Cap processes, because most OSs cap the number of open file handles, 184 | # and we need one file per process when splitting FASTA file 185 | elif processes > 64: 186 | logger.warning(f"Processes set to {processes}, capping to 64") 187 | return 64 188 | return processes 189 | 190 | 191 | # Some markers have different names, but should be treated as the same SCG. 192 | NORMALIZE_MARKER_TRANS_DICT = { 193 | "TIGR00388": "TIGR00389", 194 | "TIGR00471": "TIGR00472", 195 | "TIGR00408": "TIGR00409", 196 | "TIGR02386": "TIGR02387", 197 | } 198 | 199 | 200 | def split_file( 201 | input: Path, 202 | contignames: Sequence[str], 203 | tmpdir_to_create: Path, 204 | n_splits: int, 205 | ) -> tuple[bytes, list[Path]]: 206 | names = set(contignames) 207 | os.mkdir(tmpdir_to_create) 208 | paths = [tmpdir_to_create.joinpath(str(i)) for i in range(n_splits)] 209 | filehandles = [open(path, "w") for path in paths] 210 | refhasher = RefHasher() 211 | with Reader(input) as infile: 212 | for i, (outfile, record) in enumerate( 213 | zip( 214 | itertools.cycle(filehandles), 215 | filter(lambda x: x.identifier in names, byte_iterfasta(infile, None)), 216 | ) 217 | ): 218 | refhasher.add_refname(record.identifier) 219 | print(record.format(), file=outfile) 220 | 221 | for filehandle in filehandles: 222 | filehandle.close() 223 | refhash = refhasher.digest() 224 | return (refhash, paths) 225 | 226 | 227 | def process_chunk( 228 | chunk: list[FastaEntry], 229 | hmms: list[pyhmmer.plan7.HMM], 230 | name_to_id: dict[MarkerName, MarkerID], 231 | finder: pyrodigal.GeneFinder, 232 | ) -> list[tuple[ContigName, np.ndarray]]: 233 | # We temporarily store them as sets in order to deduplicate. While single contigs 234 | # may have duplicate markers, it makes no sense to count this as contamination, 235 | # because we are not about to second-guess the assembler's job of avoiding 236 | # chimeric sequences. 
237 | markers: defaultdict[ContigName, set[MarkerID]] = defaultdict(set) 238 | alphabet = pyhmmer.easel.Alphabet.amino() 239 | digitized: list[pyhmmer.easel.DigitalSequence] = [] 240 | for record in chunk: 241 | for gene in finder.find_genes(record.sequence): 242 | seq = pyhmmer.easel.TextSequence( 243 | name=record.identifier.encode(), sequence=gene.translate() 244 | ).digitize(alphabet) 245 | digitized.append(seq) 246 | 247 | for hmm, top_hits in zip(hmms, pyhmmer.hmmsearch(hmms, digitized)): 248 | marker_name = MarkerName(hmm.name.decode()) 249 | marker_id = name_to_id[marker_name] 250 | # We need this score cutoff, which is stored in the HMM file to remove the large 251 | # number of false positives from HMMER 252 | score_cutoff = hmm.cutoffs.trusted1 253 | assert score_cutoff is not None 254 | for hit in top_hits: 255 | if hit.score >= score_cutoff: 256 | markers[ContigName(hit.name.decode())].add(marker_id) 257 | 258 | return [ 259 | (name, np.array(list(ids), dtype=np.uint8)) for (name, ids) in markers.items() 260 | ] 261 | 262 | 263 | def work_per_process( 264 | args: tuple[Path, list[pyhmmer.plan7.HMM]], 265 | ) -> list[tuple[ContigName, np.ndarray]]: 266 | (contig_path, hmms) = args 267 | 268 | (name_to_id, _) = get_name_to_id(hmms) 269 | 270 | # Chunk up the FASTA file for memory efficiency reasons, while still 271 | # allowing pyhmmer to scan multiple sequences at once for speed 272 | chunk: list[FastaEntry] = [] 273 | result: list[tuple[ContigName, np.ndarray]] = [] 274 | finder = pyrodigal.GeneFinder(meta=True) 275 | with open(contig_path, "rb") as file: 276 | for record in byte_iterfasta(file, None): 277 | chunk.append(record) 278 | if len(chunk) == 2048: 279 | result.extend(process_chunk(chunk, hmms, name_to_id, finder)) 280 | chunk.clear() 281 | result.extend(process_chunk(chunk, hmms, name_to_id, finder)) 282 | 283 | return result 284 | 285 | 286 | def get_name_to_id( 287 | hmms: list[pyhmmer.plan7.HMM], 288 | ) -> tuple[dict[MarkerName, MarkerID], list[list[MarkerName]]]: 289 | name_to_id: dict[MarkerName, MarkerID] = dict() 290 | for hmm in hmms: 291 | name = hmm.name.decode() 292 | if name in NORMALIZE_MARKER_TRANS_DICT: 293 | continue 294 | name_to_id[MarkerName(name)] = MarkerID(len(name_to_id)) 295 | for old_name, new_name in NORMALIZE_MARKER_TRANS_DICT.items(): 296 | name_to_id[MarkerName(old_name)] = name_to_id[MarkerName(new_name)] 297 | 298 | if len(set(name_to_id.values())) > 256: 299 | raise ValueError("Maximum 256 marker IDs") 300 | 301 | id_to_names: defaultdict[MarkerID, list[MarkerName]] = defaultdict(list) 302 | for n, i in name_to_id.items(): 303 | id_to_names[i].append(n) 304 | marker_names = [id_to_names[MarkerID(i)] for i in range(len(id_to_names))] 305 | 306 | return name_to_id, marker_names 307 | -------------------------------------------------------------------------------- /vamb/reclustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following code is based on the k-means based reclustering algorithm first published at https://github.com/BigDataBiology/SemiBin 3 | The original code is distributed under MIT License. 
4 | """ 5 | 6 | from sklearn.cluster import KMeans 7 | import numpy as np 8 | from collections import defaultdict 9 | from sklearn.cluster import DBSCAN 10 | from sklearn.metrics import pairwise_distances 11 | from vamb.taxonomy import Taxonomy 12 | from vamb.parsemarkers import Markers, MarkerID 13 | from vamb.parsecontigs import CompositionMetaData 14 | from vamb.vambtools import RefHasher 15 | from collections.abc import Sequence, Iterable 16 | from typing import NewType, Optional, Union 17 | 18 | # We use these aliases to be able to work with integers, which is faster. 19 | ContigId = NewType("ContigId", int) 20 | BinId = NewType("BinId", int) 21 | 22 | # TODO: We might want to benchmark the best value for this constant. 23 | # Right now, we do too much duplicated work by clustering 18 times. 24 | EPS_VALUES = np.arange(0.01, 0.35, 0.02) 25 | 26 | 27 | class KmeansAlgorithm: 28 | "Arguments needed specifically when using the KMeans algorithm" 29 | 30 | def __init__( 31 | self, clusters: list[set[ContigId]], random_seed: int, contiglengths: np.ndarray 32 | ): 33 | assert np.issubdtype(contiglengths.dtype, np.integer) 34 | self.contiglengths = contiglengths 35 | self.clusters = clusters 36 | self.random_seed = random_seed 37 | 38 | 39 | class DBScanAlgorithm: 40 | "Arguments needed specifically when using the DBScan algorithm" 41 | 42 | def __init__( 43 | self, comp_metadata: CompositionMetaData, taxonomy: Taxonomy, n_processes: int 44 | ): 45 | if not taxonomy.is_canonical: 46 | raise ValueError( 47 | "Can only run DBScan on a Taxonomy object with is_canonical set" 48 | ) 49 | RefHasher.verify_refhash( 50 | taxonomy.refhash, 51 | comp_metadata.refhash, 52 | "taxonomy", 53 | "composition", 54 | None, 55 | ) 56 | self.contiglengths = comp_metadata.lengths 57 | self.taxonomy = taxonomy 58 | self.n_processes = n_processes 59 | 60 | 61 | def recluster_bins( 62 | markers: Markers, 63 | latent: np.ndarray, 64 | algorithm: Union[KmeansAlgorithm, DBScanAlgorithm], 65 | ) -> list[set[ContigId]]: 66 | assert np.issubdtype(algorithm.contiglengths.dtype, np.integer) 67 | assert np.issubdtype(latent.dtype, np.floating) 68 | 69 | if not (len(algorithm.contiglengths) == markers.n_seqs == len(latent)): 70 | raise ValueError( 71 | "Number of elements in contiglengths, markers and latent must match" 72 | ) 73 | 74 | # Simply dispatch to the right implementation based on the algorithm used 75 | if isinstance(algorithm, KmeansAlgorithm): 76 | return recluster_kmeans( 77 | algorithm.clusters, 78 | latent, 79 | algorithm.contiglengths, 80 | markers, 81 | algorithm.random_seed, 82 | ) 83 | elif isinstance(algorithm, DBScanAlgorithm): 84 | assert len(algorithm.taxonomy.contig_taxonomies) == markers.n_seqs 85 | return recluster_dbscan( 86 | algorithm.taxonomy, 87 | latent, 88 | algorithm.contiglengths, 89 | markers, 90 | algorithm.n_processes, 91 | ) 92 | 93 | 94 | def recluster_kmeans( 95 | clusters: list[set[ContigId]], 96 | latent: np.ndarray, 97 | contiglengths: np.ndarray, 98 | markers: Markers, 99 | random_seed: int, 100 | ) -> list[set[ContigId]]: 101 | assert len(latent) == len(contiglengths) == markers.n_seqs 102 | assert np.issubdtype(contiglengths.dtype, np.integer) 103 | assert np.issubdtype(latent.dtype, np.floating) 104 | assert latent.ndim == 2 105 | 106 | result: list[set[ContigId]] = [] 107 | indices_by_medoid: dict[int, set[ContigId]] = defaultdict(set) 108 | # We loop over all existing clusters, and determine if they should be split, 109 | # by looking at the median number of single-copy genes 
in the cluster 110 | for cluster in clusters: 111 | # All clusters with 1 contig by definition cannot have multiple single-copy 112 | # genes (because SCGs are deduplicated within a single contig) 113 | if len(cluster) == 1: 114 | result.append(cluster) 115 | continue 116 | # Get a count of each marker and compute the median count of SCGs 117 | counts = count_markers(cluster, markers) 118 | cp = counts.copy() 119 | cp.sort() 120 | median_counts: int = cp[len(cp) // 2] 121 | # If we have less than 2 SCGs on average, the cluster should not be split, 122 | # and we emit it unchanged 123 | if median_counts < 2: 124 | result.append(cluster) 125 | continue 126 | 127 | # Run K-means with the median number of SCGs to split the contig. 128 | # We weigh the contigs by length. 129 | seeds = get_kmeans_seeds( 130 | cluster, 131 | markers, 132 | contiglengths, # type: ignore 133 | counts, 134 | median_counts, 135 | ) 136 | 137 | cluster_indices = np.array(list(cluster)) 138 | cluter_latent = latent[cluster_indices] 139 | cluster_lengths = contiglengths[cluster_indices] 140 | seed_latent = latent[seeds] 141 | kmeans = KMeans( 142 | n_clusters=median_counts, 143 | init=seed_latent, 144 | n_init=1, 145 | random_state=random_seed, 146 | ) 147 | kmeans.fit(cluter_latent, sample_weight=cluster_lengths) 148 | indices_by_medoid.clear() 149 | for cluster_label, index in zip(kmeans.labels_, cluster_indices): 150 | indices_by_medoid[cluster_label].add(ContigId(index)) 151 | result.extend(indices_by_medoid.values()) 152 | 153 | return result 154 | 155 | 156 | # Get a vector of counts, of each SCG, 157 | # where if MarkerID(5) is seen 9 times, then counts[5] == 9. 158 | def count_markers( 159 | contigs: Iterable[ContigId], 160 | markers: Markers, 161 | ) -> np.ndarray: 162 | counts = np.zeros(markers.n_markers, dtype=np.int32) 163 | for contig in contigs: 164 | m = markers.markers[contig] 165 | if m is not None: 166 | counts[m] += 1 167 | return counts 168 | 169 | 170 | # Same as above, but once we see a very high number of marker genes, 171 | # we bail. This is because a large fraction of time spent in this module 172 | # would otherwise be counting markers of huge clusters, long after we already 173 | # know it's hopelessly contaminated 174 | def count_markers_saturated( 175 | contigs: Iterable[ContigId], 176 | markers: Markers, 177 | ) -> Optional[np.ndarray]: 178 | counts = np.zeros(markers.n_markers, dtype=np.int32) 179 | n_markers = 0 180 | n_unique = 0 181 | # This implies contamination == 1.0 182 | max_duplicates = 1 * markers.n_markers 183 | for contig in contigs: 184 | m = markers.markers[contig] 185 | if m is not None: 186 | n_markers += len(m) 187 | for i in m: 188 | existing = counts[i] 189 | n_unique += existing == 0 190 | counts[i] = existing + 1 191 | 192 | if (n_markers - n_unique) > max_duplicates: 193 | return None 194 | return counts 195 | 196 | 197 | # This is not very effectively implemented, but I assume it does not matter. 198 | # This function looks at all markers that occur exactly `median` times, each of these 199 | # markers corresponding to a list of `median` number of contigs. 200 | # It picks the marker for which the smallest contig that contains it is largest. 201 | # The idea here is that long contigs, which contain one of the SCGs that exist exactly 202 | # `median` times are most likely to be close to the actual medoid 203 | # that Kmeans needs to find. 204 | # This is just one possible seeding strategy. We could also plausibly choose e.g. 
205 | # the trio of contigs that have the most SCGs. 206 | def get_kmeans_seeds( 207 | contigs: Iterable[ContigId], 208 | markers: Markers, 209 | contiglengths: Sequence[int], 210 | counts: np.ndarray, 211 | median: int, 212 | ) -> list[ContigId]: 213 | considered_markers = {MarkerID(i) for (i, c) in enumerate(counts) if c == median} 214 | contigs_of_markers: dict[MarkerID, list[ContigId]] = defaultdict(list) 215 | for contig in contigs: 216 | m = markers.markers[contig] 217 | if m is None: 218 | continue 219 | for mid in m: 220 | if mid not in considered_markers: 221 | continue 222 | contigs_of_markers[MarkerID(mid)].append(contig) 223 | 224 | candidate_list = list(contigs_of_markers.items()) 225 | pair = max(candidate_list, key=lambda x: min(contiglengths[i] for i in x[1])) 226 | result = pair[1] 227 | assert len(result) == median 228 | return result 229 | 230 | 231 | def get_completeness_contamination(counts: np.ndarray) -> tuple[float, float]: 232 | n_total = counts.sum() 233 | n_unique = (counts > 0).sum() 234 | completeness = n_unique / len(counts) 235 | contamination = (n_total - n_unique) / len(counts) 236 | return (completeness, contamination) 237 | 238 | 239 | def recluster_dbscan( 240 | taxonomy: Taxonomy, 241 | latent: np.ndarray, 242 | contiglengths: np.ndarray, 243 | markers: Markers, 244 | num_processes: int, 245 | ) -> list[set[ContigId]]: 246 | # Since DBScan is computationally expensive, and scales poorly with the number 247 | # of contigs, we use taxonomy to only cluster within each genus 248 | n_worse_in_row = 0 249 | genera_indices = group_indices_by_genus(taxonomy) 250 | best_score = 0 251 | best_bins: list[set[ContigId]] = [] 252 | for eps in EPS_VALUES: 253 | bins: list[set[ContigId]] = [] 254 | for indices in genera_indices: 255 | genus_clusters = dbscan_genus( 256 | latent[indices], indices, contiglengths[indices], num_processes, eps 257 | ) 258 | bins.extend(genus_clusters) 259 | 260 | score = count_good_genomes(bins, markers) 261 | if best_score == 0 or score > best_score: 262 | best_bins = bins 263 | best_score = score 264 | 265 | if score >= best_score: 266 | n_worse_in_row = 0 267 | else: 268 | n_worse_in_row += 1 269 | if n_worse_in_row > 2: 270 | break 271 | 272 | return best_bins 273 | 274 | 275 | # DBScan within the subset of contigs that are annotated with a single genus 276 | def dbscan_genus( 277 | latent_of_genus: np.ndarray, 278 | original_indices: np.ndarray, 279 | contiglengths_of_genus: np.ndarray, 280 | num_processes: int, 281 | eps: float, 282 | ) -> list[set[ContigId]]: 283 | assert len(latent_of_genus) == len(original_indices) == len(contiglengths_of_genus) 284 | # Precompute distance matrix. This is O(N^2), but DBScan is even worse, 285 | # so this pays off. 286 | # TODO: Maybe we should emit a warning if this function is called with too 287 | # many points such that this matrix becomes huge? 288 | distance_matrix = pairwise_distances( 289 | latent_of_genus, latent_of_genus, metric="cosine" 290 | ) 291 | # The DBScan approach works by blindly clustering with different eps values 292 | # (a critical parameter for DBscan), and then using SCGs to select the best 293 | # subset of clusters. 294 | # It's ugly and wasteful, but it does work. 
295 | dbscan = DBSCAN( 296 | eps=eps, 297 | min_samples=5, 298 | n_jobs=num_processes, 299 | metric="precomputed", 300 | ) 301 | dbscan.fit(distance_matrix, sample_weight=contiglengths_of_genus) 302 | bins: dict[int, set[ContigId]] = defaultdict(set) 303 | for original_index, bin_index in zip(original_indices, dbscan.labels_): 304 | bins[bin_index].add(ContigId(original_index)) 305 | return list(bins.values()) 306 | 307 | 308 | def count_good_genomes(binning: Iterable[Iterable[ContigId]], markers: Markers) -> int: 309 | max_contamination = 0.3 310 | min_completeness = 0.75 311 | result = 0 312 | for contigs in binning: 313 | count = count_markers_saturated(contigs, markers) 314 | if count is None: 315 | continue 316 | (comp, cont) = get_completeness_contamination(count) 317 | if comp >= min_completeness and cont <= max_contamination: 318 | result += 1 319 | 320 | return result 321 | 322 | 323 | def group_indices_by_genus( 324 | taxonomy: Taxonomy, 325 | ) -> list[np.ndarray]: 326 | if not taxonomy.is_canonical: 327 | raise ValueError("Can only group by genus for a canonical taxonomy") 328 | by_genus: dict[Optional[str], list[ContigId]] = defaultdict(list) 329 | for i, tax in enumerate(taxonomy.contig_taxonomies): 330 | genus = None if tax is None else tax.genus 331 | by_genus[genus].append(ContigId(i)) 332 | return [np.array(i, dtype=np.int32) for i in by_genus.values()] 333 | -------------------------------------------------------------------------------- /vamb/taxonomy.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, IO 2 | from pathlib import Path 3 | from vamb.parsecontigs import CompositionMetaData 4 | from vamb.vambtools import strip_string_newline 5 | import numpy as np 6 | from typing import Union 7 | 8 | TAXONOMY_HEADER = "contigs\tpredictions" 9 | PREDICTED_TAXONOMY_HEADER = "contigs\tpredictions\tscores" 10 | 11 | 12 | class ContigTaxonomy: 13 | """ 14 | Hierarchical taxonomy of some contig. 15 | If `is_canonical`, the ranks are assumed to be domain, phylum, class, 16 | order, family, genus, species, in that order. 17 | The taxonomy may be arbitrarily truncated, e.g. ["Eukaryota", "Chordata"] 18 | is a valid (canonical) taxonomy for a human. 19 | """ 20 | 21 | __slots__ = ["ranks"] 22 | 23 | def __init__(self, ranks: list[str], is_canonical: bool = False): 24 | if is_canonical and len(ranks) > 7: 25 | raise ValueError( 26 | "For a canonical ContigTaxonomy, there must be at most 7 ranks" 27 | ) 28 | 29 | self.ranks = ranks 30 | 31 | @classmethod 32 | def from_semicolon_sep(cls, s: str, is_canonical: bool = False): 33 | if len(s) == 0: 34 | return cls([], is_canonical) 35 | else: 36 | return cls(s.split(";"), is_canonical) 37 | 38 | @property 39 | def genus(self) -> Optional[str]: 40 | if len(self.ranks) < 6: 41 | return None 42 | return self.ranks[5] 43 | 44 | 45 | class Taxonomy: 46 | """ 47 | * contig_taxonomies: An Optional[ContigTaxonomy] for every contig given by the 48 | CompositionMetaData used to instantiate 49 | * refhash: Refhash of CompositionMetaData used to instantiate 50 | * is_canonical: If the taxonomy uses the canonical seven ranks 51 | (domain, phylum, class, order, family, genus, species). 
52 | """ 53 | 54 | __slots__ = ["contig_taxonomies", "refhash", "is_canonical"] 55 | 56 | @property 57 | def nseqs(self) -> int: 58 | return len(self.contig_taxonomies) 59 | 60 | @classmethod 61 | def from_file( 62 | cls, tax_file: Path, metadata: CompositionMetaData, is_canonical: bool 63 | ): 64 | observed = cls.parse_tax_file(tax_file, is_canonical) 65 | return cls.from_observed(observed, metadata, is_canonical) 66 | 67 | @classmethod 68 | def from_refined_file( 69 | cls, tax_file: Path, metadata: CompositionMetaData, is_canonical: bool 70 | ): 71 | observed = PredictedTaxonomy.parse_tax_file(tax_file, is_canonical) 72 | observed = [(name, tax.contig_taxonomy) for (name, tax) in observed] 73 | return cls.from_observed(observed, metadata, is_canonical) 74 | 75 | @classmethod 76 | def from_observed( 77 | cls, 78 | observed_taxonomies: list[tuple[str, ContigTaxonomy]], 79 | metadata: CompositionMetaData, 80 | is_canonical: bool, 81 | ): 82 | index_of_contigname: dict[str, int] = { 83 | c: i for (i, c) in enumerate(metadata.identifiers) 84 | } 85 | contig_taxonomies: list[Optional[ContigTaxonomy]] = [None] * len( 86 | metadata.identifiers 87 | ) 88 | n_found = 0 89 | for contigname, taxonomy in observed_taxonomies: 90 | index = index_of_contigname.get(contigname) 91 | if index is None: 92 | continue 93 | n_found += 1 94 | existing = contig_taxonomies[index] 95 | if existing is not None: 96 | raise ValueError( 97 | f'Duplicate contigname when parsing taxonomy: "{contigname}"' 98 | ) 99 | contig_taxonomies[index] = taxonomy 100 | 101 | if n_found != metadata.nseqs: 102 | raise ValueError( 103 | f"In taxonomy file, expected {metadata.nseqs} contigs that are " 104 | f"also present in the filtered FASTA file, but found {n_found}. " 105 | "Note that this might occur because some contigs in the taxonomy " 106 | "file falls under the minimum length threshold." 107 | ) 108 | return cls(contig_taxonomies, metadata.refhash, is_canonical) 109 | 110 | def __init__( 111 | self, 112 | contig_taxonomies: list[Optional[ContigTaxonomy]], 113 | refhash: bytes, 114 | is_canonical: bool, 115 | ): 116 | self.contig_taxonomies = contig_taxonomies 117 | self.refhash = refhash 118 | self.is_canonical = is_canonical 119 | assert_unambiguous_ranks(self) 120 | 121 | @staticmethod 122 | def parse_tax_file( 123 | path: Path, force_canonical: bool 124 | ) -> list[tuple[str, ContigTaxonomy]]: 125 | with open(path) as file: 126 | result: list[tuple[str, ContigTaxonomy]] = [] 127 | header = next(file, None) 128 | header = None if header is None else header.rstrip() 129 | if header is None or header != TAXONOMY_HEADER: 130 | raise ValueError( 131 | f"In taxonomy file '{path}', expected header to be {repr(TAXONOMY_HEADER)}, " 132 | f"but found {'no header' if header is None else repr(header)}" 133 | ) 134 | # Minus two because we already read header, and because Python is zero-indexed 135 | for lineno_minus_two, line in enumerate(file): 136 | line = strip_string_newline(line) 137 | fields = line.split("\t") 138 | if len(fields) != 2: 139 | raise ValueError( 140 | f"In taxonomy file '{path}', on line {lineno_minus_two + 2}, " 141 | f"expected 2 tab-separated columns, but found {len(fields)}." 
142 | ) 143 | (contigname, taxonomy) = fields 144 | result.append( 145 | ( 146 | contigname, 147 | ContigTaxonomy.from_semicolon_sep(taxonomy, force_canonical), 148 | ) 149 | ) 150 | 151 | return result 152 | 153 | 154 | class PredictedContigTaxonomy: 155 | __slots__ = ["contig_taxonomy", "probs"] 156 | 157 | def __init__(self, tax: ContigTaxonomy, probs: np.ndarray): 158 | if len(probs) != len(tax.ranks): 159 | raise ValueError("The length of probs must equal that of ranks") 160 | # Due to floating point errors, the probabilities may be slightly outside of 0 or 1. 161 | # We could perhaps validate the values, but that's not likely to be necessary. 162 | np.clip(probs, a_min=0.0, a_max=1.0, out=probs) 163 | self.contig_taxonomy = tax 164 | self.probs = probs 165 | 166 | 167 | class PredictedTaxonomy: 168 | "Output of Taxometer" 169 | 170 | __slots__ = ["contig_taxonomies", "refhash", "is_canonical"] 171 | 172 | def __init__( 173 | self, 174 | taxonomies: list[PredictedContigTaxonomy], 175 | metadata: CompositionMetaData, 176 | is_canonical: bool, 177 | ): 178 | if len(taxonomies) != len(metadata.identifiers): 179 | raise ValueError("Length of taxonomies must match that of identifiers") 180 | 181 | self.contig_taxonomies = taxonomies 182 | self.refhash = metadata.refhash 183 | self.is_canonical = is_canonical 184 | assert_unambiguous_ranks(self) 185 | 186 | def to_taxonomy(self) -> Taxonomy: 187 | lst: list[Optional[ContigTaxonomy]] = [ 188 | p.contig_taxonomy for p in self.contig_taxonomies 189 | ] 190 | return Taxonomy(lst, self.refhash, self.is_canonical) 191 | 192 | @property 193 | def nseqs(self) -> int: 194 | return len(self.contig_taxonomies) 195 | 196 | @staticmethod 197 | def parse_tax_file( 198 | path: Path, force_canonical: bool 199 | ) -> list[tuple[str, PredictedContigTaxonomy]]: 200 | with open(path) as file: 201 | result: list[tuple[str, PredictedContigTaxonomy]] = [] 202 | lines = filter(None, map(str.rstrip, file)) 203 | header = next(lines, None) 204 | if header is None or header != PREDICTED_TAXONOMY_HEADER: 205 | raise ValueError( 206 | f"In predicted taxonomy file '{path}', " 207 | f"expected header to be {repr(PREDICTED_TAXONOMY_HEADER)}, " 208 | f"but found {'no header' if header is None else repr(header)}." 
209 | ) 210 | for linenum_minus_two, line in enumerate(lines): 211 | fields = line.split("\t") 212 | if len(fields) != 3: 213 | raise ValueError( 214 | f"Expected 3 fields in line {linenum_minus_two + 2} of file '{path}', " 215 | f"got {len(fields)}.\nLine: '{line}'" 216 | ) 217 | (contigname, taxonomy, scores) = fields 218 | contig_taxonomy = ContigTaxonomy.from_semicolon_sep( 219 | taxonomy, force_canonical 220 | ) 221 | probs = np.array([float(i) for i in scores.split(";")], dtype=float) 222 | result.append( 223 | ( 224 | contigname, 225 | PredictedContigTaxonomy(contig_taxonomy, probs), 226 | ) 227 | ) 228 | 229 | return result 230 | 231 | def write_as_tsv(self, file: IO[str], comp_metadata: CompositionMetaData): 232 | if self.refhash != comp_metadata.refhash: 233 | raise ValueError( 234 | "Refhash of comp_metadata and predicted taxonomy must match" 235 | ) 236 | assert self.nseqs == comp_metadata.nseqs 237 | print(PREDICTED_TAXONOMY_HEADER, file=file) 238 | for i in range(self.nseqs): 239 | tax = self.contig_taxonomies[i] 240 | ranks_str = ";".join(tax.contig_taxonomy.ranks) 241 | probs_str = ";".join([str(round(i, 5)) for i in tax.probs]) 242 | print( 243 | comp_metadata.identifiers[i], 244 | ranks_str, 245 | probs_str, 246 | file=file, 247 | sep="\t", 248 | ) 249 | 250 | 251 | def assert_unambiguous_ranks(taxonomy: Union[Taxonomy, PredictedTaxonomy]): 252 | """ 253 | Ensure that no rank appears at multiple levels in the taxonomy. 254 | This will mess up some of TaxVamb's algorithms since it's based on the names of 255 | taxons, and therefore, having a name on two ranks may cause it to be parsed 256 | as a graph which is not a tree. 257 | """ 258 | seen_ranks: dict[str, int] = dict() 259 | parent_of: dict[str, str] = dict() 260 | for i in taxonomy.contig_taxonomies: 261 | # May be missing from Taxonomy 262 | if i is None: 263 | continue 264 | 265 | if isinstance(i, ContigTaxonomy): 266 | ranks = i.ranks 267 | else: 268 | ranks = i.contig_taxonomy.ranks 269 | 270 | for rank, name in enumerate(ranks): 271 | if seen_ranks.setdefault(name, rank) != rank: 272 | raise ValueError( 273 | f'Taxonomy is ambiguous: "{name}" appears at multiple ranks' 274 | ) 275 | 276 | for parent, child in zip(ranks, ranks[1:]): 277 | if parent_of.setdefault(child, parent) != parent: 278 | raise ValueError( 279 | f'Taxonomy is ambiguous: "{child}" has multiple parents' 280 | ) 281 | -------------------------------------------------------------------------------- /workflow_avamb/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "contigs": "contigs.txt", 3 | "sample_data": "samples2data.tsv", 4 | "index_size": "3G", 5 | "min_contig_size": "2000", 6 | "min_bin_size": "200000", 7 | "min_identity": "0.95", 8 | "minimap_mem": "15GB", 9 | "minimap_ppn": "15", 10 | "avamb_mem": "15GB", 11 | "avamb_ppn": "30", 12 | "checkm2_mem": "15GB", 13 | "checkm2_ppn": "15", 14 | "checkm2_mem_r": "30GB", 15 | "checkm2_ppn_r": "30", 16 | "avamb_params": " --model vae-aae -o C --seed 0 ", 17 | "avamb_preload": "", 18 | "outdir": "avamb_outdir", 19 | "min_comp": "0.9", 20 | "max_cont": "0.05" 21 | } 22 | -------------------------------------------------------------------------------- /workflow_avamb/envs/avamb.yaml: -------------------------------------------------------------------------------- 1 | name: avamb 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.9.16 8 | - snakemake=7.22.0 9 | - pip=23.0.1 10 | - biopython=1.81 11 | - 
networkx=3.0 12 | 13 | - pip: 14 | - ordered-set==4.1.0 15 | -------------------------------------------------------------------------------- /workflow_avamb/envs/checkm2.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python=3.8.15 7 | - scikit-learn=0.23.2 8 | - h5py=2.10.0 9 | - numpy=1.23.2 10 | - tensorflow=2.9.1 11 | - lightgbm=3.3.2 12 | - pandas=1.4.3 13 | - scipy=1.9.0 14 | - setuptools=65.3.0 15 | - requests=2.28.1 16 | - packaging=21.3 17 | - tqdm=4.64.0 18 | - diamond=2.0.15 19 | - prodigal=2.6.3 20 | -------------------------------------------------------------------------------- /workflow_avamb/envs/minimap2.yaml: -------------------------------------------------------------------------------- 1 | name: minimap2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - minimap2 6 | - samtools 7 | -------------------------------------------------------------------------------- /workflow_avamb/envs/samtools.yaml: -------------------------------------------------------------------------------- 1 | name: samtools 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - samtools 6 | -------------------------------------------------------------------------------- /workflow_avamb/src/abundances_mask.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from vamb.vambtools import RefHasher 4 | from pathlib import Path 5 | 6 | 7 | def abundances_mask(headers: Path, mask_refhash: Path, min_contig_size: int): 8 | """# Using the headers above, compute the mask and the refhash""" 9 | 10 | mask = [] 11 | identifiers = [] 12 | 13 | with open(headers) as file: 14 | for line in file: 15 | # SN:S27C112075 LN:2239 16 | (sn, ln) = line.split("\t") 17 | if sn[:3] != "SN:" or ln[:3] != "LN:": 18 | raise ValueError("Unknown format") 19 | passed = int(ln[3:]) >= min_contig_size 20 | mask.append(passed) 21 | if passed: 22 | identifiers.append(sn[3:]) 23 | 24 | np.savez_compressed( 25 | mask_refhash, 26 | mask=np.array(mask, dtype=bool), 27 | refhash=RefHasher.hash_refnames(identifiers), 28 | ) 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--h", type=Path, help=" Headers file") 34 | parser.add_argument("--msk", type=Path, help="mask refhash") 35 | 36 | parser.add_argument("--minsize", type=int, help="min contig size") 37 | 38 | opt = parser.parse_args() 39 | 40 | abundances_mask(opt.h, opt.msk, opt.minsize) 41 | -------------------------------------------------------------------------------- /workflow_avamb/src/create_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def create_abundances( 8 | abundances: list[Path], mask_refhash: Path, min_id: float, outfile: Path 9 | ): 10 | """Merge the abundances to a single Abundance object and save it""" 11 | refhash = np.load(mask_refhash)["refhash"] 12 | 13 | n_samples = len(abundances) 14 | first = vamb.vambtools.read_npz(abundances[0]) 15 | print(len(first), n_samples) 16 | print(first.shape) 17 | matrix = np.empty((len(first), n_samples), dtype=np.float32) 18 | matrix[:, 0] = first 19 | for i, path in enumerate(abundances[1:]): 20 | matrix[:, i + 1] = vamb.vambtools.read_npz(path) 21 | abundance = vamb.parsebam.Abundance( 22 | matrix, [str(i) for i in abundances], 
min_id, refhash 23 | ) 24 | abundance.save(outfile) 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--msk", type=Path, help="mask refhash") 30 | parser.add_argument("--ab", type=Path, nargs="+", help=" abundancaes list of files") 31 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 32 | parser.add_argument("--out", type=Path, help="abundances outfile") 33 | 34 | opt = parser.parse_args() 35 | 36 | create_abundances(opt.ab, opt.msk, opt.min_id, opt.out) 37 | -------------------------------------------------------------------------------- /workflow_avamb/src/create_cluster_scores_bin_path_dict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import json 4 | import argparse 5 | 6 | from typing import cast 7 | 8 | 9 | def get_cluster_score_bin_path( 10 | path_checkm_all: str, path_bins: str, bins: set[str] 11 | ) -> tuple[dict[str, tuple[float, float]], dict[str, str]]: 12 | """Given CheckM has been run for all samples, create 2 dictionaries: 13 | - {bin:path_bin} 14 | - {bin:[completeness, contamination]}""" 15 | cluster_score: dict[str, tuple[float, float]] = dict() 16 | bin_path: dict[str, str] = dict() 17 | for sample in os.listdir(path_checkm_all): 18 | path_quality_s = os.path.join(path_checkm_all, sample, "quality_report.tsv") 19 | c_com_con = np.loadtxt( 20 | path_quality_s, 21 | delimiter="\t", 22 | skiprows=1, 23 | usecols=(0, 1, 2), 24 | dtype=str, 25 | ndmin=2, 26 | ) 27 | 28 | for row in c_com_con: 29 | cluster, com, con = row 30 | cluster = cast(str, cluster) 31 | com, con = float(com), float(con) 32 | bin_name = cluster + ".fna" 33 | if bin_name in bins: 34 | cluster_score[cluster] = (com, con) 35 | bin_path[cluster + ".fna"] = os.path.join( 36 | path_bins, sample, cluster + ".fna" 37 | ) 38 | return cluster_score, bin_path 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--s", type=str, help="path checkm2 that contains all samples") 44 | parser.add_argument("--b", type=str, help="path all bins ") 45 | parser.add_argument( 46 | "--cs_d", type=str, help="cluster_score dictionary will be stored here" 47 | ) 48 | parser.add_argument( 49 | "--bp_d", type=str, help="bin_path dictionary will be stored here " 50 | ) 51 | 52 | opt = parser.parse_args() 53 | 54 | bins_set = set() 55 | for sample in os.listdir(opt.b): 56 | for bin_ in os.listdir(os.path.join(opt.b, sample)): 57 | if ".fna" in bin_: 58 | bins_set.add(bin_) 59 | 60 | cluster_score, bin_path = get_cluster_score_bin_path(opt.s, opt.b, bins_set) 61 | with open(opt.cs_d, "w") as f: 62 | json.dump(cluster_score, f) 63 | 64 | with open(opt.bp_d, "w") as f: 65 | json.dump(bin_path, f) 66 | -------------------------------------------------------------------------------- /workflow_avamb/src/manual_drep_JN.py: -------------------------------------------------------------------------------- 1 | import vamb 2 | import numpy as np 3 | import os 4 | import itertools 5 | 6 | from typing import NewType, Union, Optional 7 | from collections.abc import Sequence, Mapping, Iterable 8 | from pathlib import Path 9 | 10 | import argparse 11 | import json 12 | 13 | ContigId = NewType("ContigId", int) 14 | BinId = NewType("BinId", int) 15 | 16 | 17 | def main( 18 | # Path to output clusters file. 
Will error if it already exists 19 | outpath: Path, 20 | # Path to composition.npz 21 | composition_path: Path, 22 | # CheckM2 quality_report data as a dict: {cluster: [completeness, contamination]} 23 | quality_report: dict[str, list], 24 | # List of paths to clusters.tsv files as output by Vamb. 25 | # Names of clusters must match those in CheckM2 quality_report, 26 | # and those of contigs match those in the composition 27 | binnings: Sequence[Path], 28 | # min fraction of the smaller bin covered by the overlap for 2 bins to be considered the same 29 | min_cov: float, 30 | # min completeness for a bin to be included in the dereplication process 31 | min_comp: float, 32 | # max contamination for a bin to be included in the dereplication process 33 | max_cont: float, 34 | bins_extension: str, 35 | min_bin_size: int, 36 | ) -> None: 37 | # Load contig names and lengths 38 | comp = vamb.parsecontigs.Composition.load(composition_path) 39 | 40 | contig_names: list[str] = list(comp.metadata.identifiers) 41 | assert isinstance(contig_names, list) 42 | assert isinstance(contig_names[0], str) 43 | 44 | lengths = comp.metadata.lengths 45 | assert len(lengths) == len(contig_names) 46 | del comp # free up memory 47 | 48 | # Load CheckM2 49 | (bin_names, qualities, bin_by_name) = load_checkm2( 50 | quality_report, min_comp, max_cont, bins_extension 51 | ) 52 | # Load bins 53 | (bin_lengths, union_bins) = load_binnings( 54 | binnings, contig_names, lengths, bin_by_name, min_bin_size 55 | ) 56 | del bin_by_name 57 | 58 | dereplicated = dereplicate(union_bins, qualities, lengths, bin_lengths, min_cov) 59 | del bin_lengths 60 | 61 | if os.path.exists(outpath): 62 | raise FileExistsError(outpath) 63 | 64 | with open(outpath, "w") as file: 65 | print(vamb.vambtools.CLUSTERS_HEADER, file=file) 66 | for bin in dereplicated: 67 | bin_name = bin_names[bin] 68 | bin_name = bin_name.replace(".fna", "") 69 | for contig in union_bins[bin]: 70 | print(bin_name, contig_names[contig], sep="\t", file=file) 71 | 72 | 73 | def load_checkm2( 74 | quality_report: dict[str, list], 75 | min_completeness: float, 76 | max_contamination: float, 77 | bins_extension: str, 78 | ) -> tuple[ 79 | list[str], # Bin names 80 | list[tuple[float, float]], # Bin qualities 81 | dict[str, Optional[BinId]], # Mapping to binid, if not skipped 82 | ]: 83 | """Extract all bin names and assign them either a BinId, or else None, 84 | if their completeness/contamination is so bad the bin should be discarded 85 | """ 86 | # This is None if the bin is to be discarded 87 | bin_by_name: dict[str, Optional[BinId]] = dict() 88 | bin_names: list[str] = [] 89 | qualities: list[tuple[float, float]] = [] 90 | 91 | # The file looks like this: 92 | # Name Completeness Contamination Completeness_Model_Used Translation_Table_Used Additional_Notes 93 | # AAE_UC_Y_1980340Ccluster_501--AAE_UC_v3_1980340Ccluster_2599 5.18 0.0 Neural Network (Specific Model) 11 None 94 | 95 | for cluster, scores in quality_report.items(): 96 | name = cluster + bins_extension 97 | comp, cont = scores 98 | completeness = float(comp) / 100 99 | contamination = float(cont) / 100 100 | assert 0.0 <= completeness <= 1.0 101 | assert 0.0 <= contamination # can be unbounded 102 | 103 | if completeness >= min_completeness and contamination <= max_contamination: 104 | bin = BinId(len(bin_names)) 105 | bin_names.append(name) 106 | qualities.append((completeness, contamination)) 107 | bin_by_name[name] = bin 108 | else: 109 | bin_by_name[name] = None 110 | 111 | assert sum(1 for i in bin_by_name.values() if isinstance(i, int)) == len(bin_names) 112 | return 
(bin_names, qualities, bin_by_name) 113 | 114 | 115 | def load_binnings( 116 | binnings: Sequence[Path], 117 | contig_names: Sequence[str], 118 | lengths: np.ndarray, 119 | bin_by_name: Mapping[str, Optional[BinId]], 120 | min_bin_size: int, 121 | ) -> tuple[list[int], list[set[ContigId]]]: 122 | """ 123 | Load clusters.tsv files from each binning, and filter away those assigned to be discarded based on CheckM2 data. 124 | Return bin length and bins, each represented as a set of ContigId 125 | """ 126 | id_len_of_contig_name: dict[str, tuple[ContigId, int]] = dict() 127 | for index, (name, length) in enumerate(zip(contig_names, lengths)): 128 | id_len_of_contig_name[name] = (ContigId(index), length) 129 | 130 | # Load binnings 131 | n_union_bins = sum(1 for i in bin_by_name.values() if i is not None) 132 | 133 | lengthof = dict(zip(contig_names, lengths)) 134 | 135 | union_bins: list[Optional[set[ContigId]]] = [None] * n_union_bins 136 | for binning_path in binnings: 137 | with open(binning_path) as file: 138 | clusters = vamb.vambtools.read_clusters(file) 139 | clusters_filtered = filterclusters(clusters, lengthof, min_bin_size) 140 | # filter by clusters larger than 200kbs 141 | for bin_name, contigs in clusters_filtered.items(): 142 | bin_name += ".fna" 143 | # None is a valid value, so we use -1 as sentinel for missing 144 | bin = bin_by_name.get(bin_name, -1) 145 | if bin == -1: 146 | raise ValueError( 147 | f"Bin {bin_name} found in binning {binning_path}, but is not scored by CheckM2" 148 | ) 149 | # Means: Below threshold, so skip it 150 | elif bin is None: 151 | continue 152 | else: 153 | ids: set[ContigId] = set() 154 | for contig in contigs: 155 | existing = id_len_of_contig_name.get(contig) 156 | if existing is None: 157 | raise KeyError( 158 | f"Cluster file {binning_path} contain contig {contig}, " 159 | "but that name is not present in provided names npz file" 160 | ) 161 | ids.add(existing[0]) 162 | union_bins[bin] = ids 163 | 164 | bin_lengths: list[int] = [] 165 | 166 | for i in union_bins: 167 | assert isinstance(i, set) 168 | union_bins_asserted: list[set[ContigId]] = union_bins # type: ignore 169 | 170 | for contigs in union_bins_asserted: 171 | bin_lengths.append(sum(lengths[contig] for contig in contigs)) 172 | 173 | return (bin_lengths, union_bins_asserted) 174 | 175 | 176 | def filterclusters( 177 | clusters: Mapping[str, set], lengthof: Mapping[str, int], min_bin_size: int 178 | ) -> Mapping[str, set]: 179 | filtered_bins = dict() 180 | for medoid, contigs in clusters.items(): 181 | binsize = sum(lengthof[contig] for contig in contigs) 182 | 183 | if binsize >= min_bin_size: 184 | filtered_bins[medoid] = contigs 185 | 186 | return filtered_bins 187 | 188 | 189 | def dereplicate( 190 | union_bins: Sequence[set[ContigId]], 191 | qualities: Sequence[tuple[float, float]], 192 | contig_lengths: np.ndarray, 193 | bin_lengths: Sequence[int], 194 | threshold: float, 195 | ) -> list[BinId]: 196 | "Removes bins if they are too similar to another bin. 
Return list of kept bins" 197 | assert len(union_bins) == len(qualities) == len(bin_lengths) 198 | 199 | overlapping_pairs = get_overlapping_bin_pairs(get_binsof(union_bins), qualities) 200 | to_remove = compute_to_remove( 201 | union_bins, overlapping_pairs, contig_lengths, bin_lengths, threshold 202 | ) 203 | return [BinId(i) for i in range(len(bin_lengths)) if BinId(i) not in to_remove] 204 | 205 | 206 | def get_binsof(union_bins: Iterable[Iterable[ContigId]]) -> dict[ContigId, list[BinId]]: 207 | "Makes a dict from contig -> list of bins the contig is present in, if in multiple bins" 208 | binsof: dict[ContigId, Union[BinId, list[BinId]]] = dict() 209 | for bin_int, contigs in enumerate(union_bins): 210 | bin = BinId(bin_int) 211 | for contig in contigs: 212 | existing = binsof.get(contig) 213 | if existing is None: 214 | binsof[contig] = bin 215 | elif isinstance(existing, int): 216 | binsof[contig] = [existing, bin] 217 | else: 218 | assert isinstance(existing, list) 219 | existing.append(bin) 220 | return {k: v for (k, v) in binsof.items() if isinstance(v, list)} 221 | 222 | 223 | def bin_score(completeness: float, contamination: float) -> float: 224 | return completeness - 5 * contamination 225 | 226 | 227 | def get_overlapping_bin_pairs( 228 | binsof: Mapping[ContigId, list[BinId]], qualities: Sequence[tuple[float, float]] 229 | ) -> Sequence[tuple[BinId, BinId]]: 230 | "Get a list of pairs of bins that share at least one contig" 231 | pairs: set[tuple[BinId, BinId]] = set() 232 | for overlapping_bins in binsof.values(): 233 | for a, b in itertools.combinations(overlapping_bins, r=2): 234 | # Order them so we don't have (a, b) and (b, a) as distinct pairs 235 | if a > b: 236 | (a, b) = (b, a) 237 | pairs.add((a, b)) 238 | 239 | # Now be sure to order them as (worst, best) depending on score 240 | # If they tie, then use lexographic order (a, b) we added them 241 | # in above 242 | result: list[tuple[BinId, BinId]] = [] 243 | for a, b in pairs: 244 | score_a = bin_score(*qualities[a]) 245 | score_b = bin_score(*qualities[b]) 246 | if score_a > score_b: 247 | result.append((b, a)) 248 | else: 249 | result.append((a, b)) 250 | 251 | return result 252 | 253 | 254 | def compute_to_remove( 255 | union_bins: Sequence[set[ContigId]], 256 | overlapping_pairs: Iterable[tuple[BinId, BinId]], 257 | lengths: np.ndarray, 258 | bin_lengths: Sequence[int], 259 | threshold: float, 260 | ) -> set[BinId]: 261 | "Create a list of bins to remove because they overlap with another bin" 262 | result: set[BinId] = set() 263 | for bin_a, bin_b in overlapping_pairs: 264 | if bin_a in result or bin_b in result: 265 | continue 266 | 267 | intersection = union_bins[bin_a] & union_bins[bin_b] 268 | int_len = sum(lengths[i] for i in intersection) 269 | if int_len / min(bin_lengths[bin_a], bin_lengths[bin_b]) >= threshold: 270 | # We remove an arbitrary one 271 | result.add(bin_a) 272 | return result 273 | 274 | 275 | if __name__ == "__main__": 276 | parser = argparse.ArgumentParser() 277 | parser.add_argument("--cs_d", type=str, help="path bins_scores dictionary") 278 | parser.add_argument( 279 | "--composition", type=Path, help="Path to the composition.npz file" 280 | ) 281 | parser.add_argument( 282 | "--output", 283 | type=str, 284 | help="Path output clusters generated by dereplicating bins", 285 | ) 286 | parser.add_argument( 287 | "--clusters", 288 | type=str, 289 | nargs="*", 290 | help="Path input clusters generated by aamb and vamb", 291 | ) 292 | parser.add_argument("--cov", type=float, default=0.75, 
help="Min coverage ") 293 | parser.add_argument("--comp", type=float, default=0.9, help="Min completeness ") 294 | parser.add_argument("--cont", type=float, default=0.05, help="Max contamination ") 295 | parser.add_argument( 296 | "--bins_extension", type=str, default=".fna", help="Extension of the bins " 297 | ) 298 | parser.add_argument( 299 | "--min_bin_size", 300 | type=int, 301 | help="Min bin length to be considered for dereplication ", 302 | ) 303 | 304 | opt = parser.parse_args() 305 | args = vars(parser.parse_args()) 306 | with open(opt.cs_d) as f: 307 | cluster_scores = json.load(f) 308 | 309 | main( 310 | outpath=opt.output, 311 | composition_path=opt.composition, 312 | quality_report=cluster_scores, 313 | binnings=opt.clusters, 314 | min_cov=opt.cov, 315 | min_comp=opt.comp, 316 | max_cont=opt.cont, 317 | bins_extension=opt.bins_extension, 318 | min_bin_size=opt.min_bin_size, 319 | ) 320 | -------------------------------------------------------------------------------- /workflow_avamb/src/mv_bins_from_mdrep_clusters.py: -------------------------------------------------------------------------------- 1 | import vamb 2 | import argparse 3 | import shutil 4 | import os 5 | import json 6 | 7 | from typing import Optional 8 | 9 | 10 | def main( 11 | cluster_scores: dict[str, tuple[float, float]], 12 | cluster_contigs: dict[str, set[str]], 13 | bin_separator: Optional[str], 14 | path_nc_bins_folder: str, 15 | path_bins_folder: str, 16 | path_nc_clusters: str, 17 | min_comp: float = 0.9, 18 | max_cont: float = 0.05, 19 | ): 20 | cluster_sample = get_cluster_sample(cluster_contigs, bin_separator) 21 | nc_cluster_scores = get_nc_cluster_scores( 22 | cluster_scores, cluster_sample, min_comp, max_cont 23 | ) 24 | create_nc_sample_folders(nc_cluster_scores, cluster_sample, path_nc_bins_folder) 25 | write_nc_bins_from_mdrep_clusters( 26 | nc_cluster_scores, cluster_sample, path_nc_bins_folder, path_bins_folder 27 | ) 28 | write_quality_report(nc_cluster_scores, path_nc_bins_folder) 29 | write_final_nc_clusters(nc_cluster_scores, cluster_contigs, path_nc_clusters) 30 | 31 | 32 | def get_nc_cluster_scores( 33 | cluster_scores: dict[str, tuple[float, float]], 34 | cluster_sample: dict[str, str], 35 | min_comp: float, 36 | max_cont: float, 37 | ) -> dict[str, tuple[float, float]]: 38 | nc_cluster_scores: dict[str, tuple[float, float]] = dict() 39 | for cluster, scores in cluster_scores.items(): 40 | comp, cont = scores 41 | comp, cont = float(comp), float(cont) 42 | comp, cont = comp / 100, cont / 100 43 | if cluster not in cluster_sample.keys(): 44 | continue 45 | if comp >= min_comp and cont <= max_cont: 46 | nc_cluster_scores[cluster] = (comp, cont) 47 | 48 | return nc_cluster_scores 49 | 50 | 51 | def get_cluster_sample( 52 | cluster_contigs: dict[str, set[str]], bin_separator: Optional[str] 53 | ) -> dict[str, str]: 54 | cluster_sample: dict[str, str] = dict() 55 | for cluster_ in cluster_contigs.keys(): 56 | contigs = cluster_contigs[cluster_] 57 | contig_i = next(iter(contigs)) 58 | sample = contig_i.split(bin_separator)[0] 59 | cluster_sample[cluster_] = sample 60 | 61 | return cluster_sample 62 | 63 | 64 | def create_nc_sample_folders( 65 | cluster_scores: dict[str, tuple[float, float]], 66 | cluster_sample: dict[str, str], 67 | path_nc_bins_folder: str, 68 | ): 69 | nc_samples: set[str] = set() 70 | for cluster in cluster_scores.keys(): 71 | sample = cluster_sample[cluster] 72 | nc_samples.add(sample) 73 | 74 | for sample in nc_samples: 75 | try: 76 | 
os.mkdir(os.path.join(path_nc_bins_folder, sample)) 77 | except FileExistsError: 78 | pass 79 | 80 | 81 | def write_nc_bins_from_mdrep_clusters( 82 | cluster_scores: dict[str, tuple[float, float]], 83 | cluster_sample: dict[str, str], 84 | path_nc_bins_folder: str, 85 | path_bins_folder: str, 86 | ): 87 | for cluster in cluster_scores.keys(): 88 | sample = cluster_sample[cluster] 89 | src_bin = os.path.join(path_bins_folder, sample, cluster + ".fna") 90 | trg_bin = os.path.join(path_nc_bins_folder, sample, cluster + ".fna") 91 | shutil.move(src_bin, trg_bin) 92 | 93 | 94 | def write_quality_report( 95 | cluster_scores: dict[str, tuple[float, float]], path_nc_bins_folder: str 96 | ): 97 | with open(os.path.join(path_nc_bins_folder, "quality_report.tsv"), "w") as file: 98 | print("Name", "completeness", "contamination", sep="\t", file=file) 99 | file.flush() 100 | for nc_cluster, (completeness, contamination) in cluster_scores.items(): 101 | print(nc_cluster, completeness, contamination, sep="\t", file=file) 102 | file.flush() 103 | 104 | 105 | def write_final_nc_clusters( 106 | cluster_scores: dict[str, tuple[float, float]], 107 | cluster_contigs: dict[str, set[str]], 108 | path_nc_clusters: str, 109 | ): 110 | with open(path_nc_clusters, "w") as file: 111 | print(vamb.vambtools.CLUSTERS_HEADER, file=file) 112 | for nc_cluster in cluster_scores.keys(): 113 | nc_contigs = cluster_contigs[nc_cluster] 114 | for nc_contig in nc_contigs: 115 | print(nc_cluster, nc_contig, sep="\t", file=file) 116 | 117 | 118 | if __name__ == "__main__": 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("--c", type=str, help="path clusters file from tmp folder") 121 | parser.add_argument("--cf", type=str, help="path clusters file final") 122 | parser.add_argument("--cs_d", type=str, help="cluster_scores dictionary path ") 123 | parser.add_argument("--b", type=str, help="path all bins ") 124 | parser.add_argument( 125 | "--d", type=str, help="path to folder that will contain all nc bins" 126 | ) 127 | parser.add_argument("--bin_separator", type=str, help="separator ") 128 | parser.add_argument("--comp", type=float, default=0.9, help="Min completeness ") 129 | parser.add_argument("--cont", type=float, default=0.05, help="Max contamination ") 130 | 131 | opt = parser.parse_args() 132 | 133 | with open(opt.c) as clusters_file: 134 | cluster_contigs = vamb.vambtools.read_clusters(clusters_file) 135 | 136 | with open(opt.cs_d) as f: 137 | cluster_scores = json.load(f) 138 | 139 | main( 140 | cluster_scores, 141 | cluster_contigs, 142 | opt.bin_separator, 143 | opt.d, 144 | opt.b, 145 | opt.cf, 146 | opt.comp, 147 | opt.cont, 148 | ) 149 | -------------------------------------------------------------------------------- /workflow_avamb/src/update_cluster_scores_dict_after_ripping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import argparse 4 | 5 | 6 | def update_cluster_score_bin_path( 7 | path_checkm_ripped: str, cluster_score: dict[str, tuple[float, float]] 8 | ): 9 | c_com_con = np.loadtxt( 10 | path_checkm_ripped, 11 | delimiter="\t", 12 | skiprows=1, 13 | usecols=(0, 1, 2), 14 | dtype=str, 15 | ndmin=2, 16 | ) 17 | for row in c_com_con: 18 | cluster, com, con = row 19 | if "--" in cluster: 20 | continue 21 | com, con = float(com), float(con) 22 | print(cluster, "scores were", cluster_score[cluster]) 23 | 24 | cluster_score[cluster] = (com, con) 25 | print("and now are", cluster_score[cluster]) 26 | return cluster_score 27 | 
28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "--s", 33 | type=str, 34 | help="path to the checkm2 output that contains the quality_report.tsv file for ripped bins", 35 | ) 36 | parser.add_argument( 37 | "--cs_d", 38 | type=str, 39 | help="cluster_score dictionary path ", 40 | ) 41 | parser.add_argument( 42 | "--cs_d_o", 43 | type=str, 44 | help="cluster_score dictionary output path, updated with the information for clusters that were ripped either because of meaningless edges or when making the component length <= 2 ", 45 | ) 46 | 47 | opt = parser.parse_args() 48 | 49 | with open(opt.cs_d) as f: 50 | cluster_score = json.load(f) 51 | 52 | cluster_score_ = update_cluster_score_bin_path(opt.s, cluster_score) 53 | 54 | with open(opt.cs_d_o, "w") as f: 55 | json.dump(cluster_score_, f) 56 | -------------------------------------------------------------------------------- /workflow_avamb/src/workflow_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from typing import cast 4 | 5 | 6 | def get_cluster_score_bin_path( 7 | path_checkm_all: str, path_bins: str, bins: set[str] 8 | ) -> tuple[dict[str, tuple[float, float]], dict[str, str]]: 9 | """Given CheckM has been run for all samples, create 2 dictionaries: 10 | - {bin:path_bin} 11 | - {bin:[completeness, contamination]}""" 12 | cluster_score: dict[str, tuple[float, float]] = dict() 13 | bin_path: dict[str, str] = dict() 14 | for sample in os.listdir(path_checkm_all): 15 | path_quality_s = os.path.join(path_checkm_all, sample, "quality_report.tsv") 16 | c_com_con = np.loadtxt( 17 | path_quality_s, 18 | delimiter="\t", 19 | skiprows=1, 20 | usecols=(0, 1, 2), 21 | dtype=str, 22 | ndmin=2, 23 | ) 24 | 25 | for row in c_com_con: 26 | cluster, com, con = row 27 | cluster = cast(str, cluster) 28 | com, con = float(com), float(con) 29 | bin_name = cluster + ".fna" 30 | if bin_name in bins: 31 | cluster_score[cluster] = (com, con) 32 | bin_path[cluster + ".fna"] = os.path.join( 33 | path_bins, sample, cluster + ".fna" 34 | ) 35 | return cluster_score, bin_path 36 | 37 | 38 | def update_cluster_score_bin_path( 39 | path_checkm_ripped: str, cluster_score: dict[str, tuple[float, float]] 40 | ) -> dict[str, tuple[float, float]]: 41 | c_com_con = np.loadtxt( 42 | path_checkm_ripped, 43 | delimiter="\t", 44 | skiprows=1, 45 | usecols=(0, 1, 2), 46 | dtype=str, 47 | ndmin=2, 48 | ) 49 | for row in c_com_con: 50 | cluster, com, con = row 51 | if "--" in cluster: 52 | continue 53 | com, con = float(com), float(con) 54 | print(cluster, "scores were", cluster_score[cluster]) 55 | 56 | cluster_score[cluster] = (com, con) 57 | print("and now are", cluster_score[cluster]) 58 | return cluster_score 59 | -------------------------------------------------------------------------------- /workflow_avamb/src/write_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def write_abundances( 8 | mask_refhash: Path, bampath: Path, min_identity: float, outfile: Path 9 | ): 10 | """For every sample, compute the abundances given the mask and refhashes""" 11 | loadnpz = np.load(mask_refhash) 12 | refhash = loadnpz["refhash"] 13 | mask = loadnpz["mask"] 14 | refhash = refhash.reshape(1)[0] 15 | (abundance, _) = vamb.parsebam.Abundance.run_pycoverm( 16 | paths=[bampath], 17 | minid=min_identity, 18 | 
target_refhash=refhash, 19 | target_identifiers=None, 20 | mask=mask, 21 | ) 22 | vamb.vambtools.write_npz(outfile, abundance.ravel()) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--msk", type=Path, help="mask refhash") 28 | parser.add_argument("--b", type=Path, help=" bam path") 29 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 30 | parser.add_argument("--out", type=Path, help="abundances outfile") 31 | 32 | opt = parser.parse_args() 33 | 34 | write_abundances(opt.msk, opt.b, opt.min_id, opt.out) 35 | -------------------------------------------------------------------------------- /workflow_avamb/src/write_clusters_from_dereplicated_and_ripped_bins.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | while getopts "d:o:" opt; do 3 | case $opt in 4 | d) drep_dir=$OPTARG ;; 5 | o) clusters_file=$OPTARG ;; 6 | *) echo 'error' >&2 7 | exit 1 8 | esac 9 | done 10 | output_file=$(pwd)/${clusters_file}/avamb/avamb_manual_drep_disjoint_clusters.tsv 11 | echo 'creating z y v clusters from the final set of bins' 12 | for s in $(ls $drep_dir) 13 | do 14 | s="$drep_dir"/"$s"/ 15 | if [ -d "$s" ] 16 | then 17 | cd $s 18 | for bin in $(ls . 2> /dev/null) 19 | 20 | do 21 | if [[ $bin == **".fna" ]] 22 | then 23 | 24 | cluster_name=$(echo $bin | sed 's=.fna==g' | sed 's=.fa==g') 25 | 26 | echo -e "clustername\tcontigname" >> $output_file 27 | for contig in $(grep '>' $bin | sed 's=>==g') 28 | do 29 | echo -e "$cluster_name""\t""$contig" >> $output_file 30 | done 31 | 32 | 33 | fi 34 | done 35 | 36 | fi 37 | done 38 | --------------------------------------------------------------------------------
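A minimal Python sketch of the same step as the shell script above, for readers who prefer to stay in Python: it assumes the same layout of per-sample directories of dereplicated and ripped bins containing .fna/.fa files, and the write_clusters_tsv helper name is hypothetical (it is not part of the workflow). As a design choice, the clustername/contigname header row is written once at the top of the output file.

# Hypothetical helper, not part of the workflow: walks <drep_dir>/<sample>/<bin>.fna
# and writes one "clustername<TAB>contigname" row per contig header found in each bin.
import os
import sys


def write_clusters_tsv(drep_dir: str, out_path: str) -> None:
    with open(out_path, "w") as out:
        # Header row, written once
        print("clustername", "contigname", sep="\t", file=out)
        for sample in sorted(os.listdir(drep_dir)):
            sample_dir = os.path.join(drep_dir, sample)
            if not os.path.isdir(sample_dir):
                continue
            for bin_file in sorted(os.listdir(sample_dir)):
                if not bin_file.endswith((".fna", ".fa")):
                    continue
                # The cluster name is the bin file name without its FASTA extension
                cluster_name = bin_file.rsplit(".", 1)[0]
                with open(os.path.join(sample_dir, bin_file)) as fasta:
                    for line in fasta:
                        if line.startswith(">"):
                            print(cluster_name, line[1:].strip(), sep="\t", file=out)


if __name__ == "__main__":
    write_clusters_tsv(sys.argv[1], sys.argv[2])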