├── tests ├── __init__.py └── test_pipeline.py ├── witch_msa ├── helpers │ ├── __init__.py │ ├── math_utils.py │ ├── general_tools.py │ └── pyhmmer_tools.py ├── tools │ ├── magus │ │ ├── align │ │ │ ├── __init__.py │ │ │ ├── merge │ │ │ │ ├── __init__.py │ │ │ │ ├── graph_build │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── graph_builder.py │ │ │ │ ├── graph_trace │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── naive.py │ │ │ │ │ ├── tracer.py │ │ │ │ │ ├── rg_search.py │ │ │ │ │ └── rg_fast_search.py │ │ │ │ ├── graph_cluster │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── mcl.py │ │ │ │ │ ├── rg.py │ │ │ │ │ ├── clusterer.py │ │ │ │ │ ├── mlr_mcl.py │ │ │ │ │ └── clean_clusters.py │ │ │ │ ├── merger.py │ │ │ │ └── alignment_graph.py │ │ │ ├── decompose │ │ │ │ ├── __init__.py │ │ │ │ ├── kmh.py │ │ │ │ ├── decomposer.py │ │ │ │ └── initial_tree.py │ │ │ ├── aligner.py │ │ │ └── alignment_context.py │ │ ├── helpers │ │ │ ├── __init__.py │ │ │ ├── hmmutils.py │ │ │ └── sequenceutils.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── files.py │ │ │ ├── controller.py │ │ │ ├── task.py │ │ │ └── manager.py │ │ ├── tools │ │ │ ├── __init__.py │ │ │ ├── mcl │ │ │ │ └── bin │ │ │ │ │ ├── clm │ │ │ │ │ ├── mcl │ │ │ │ │ ├── mcx │ │ │ │ │ ├── mcxi │ │ │ │ │ ├── mclcm │ │ │ │ │ ├── mcxmap │ │ │ │ │ ├── clmformat │ │ │ │ │ ├── mcxarray │ │ │ │ │ ├── mcxdump │ │ │ │ │ ├── mcxload │ │ │ │ │ ├── mcxrand │ │ │ │ │ ├── mcxsubs │ │ │ │ │ ├── mcxassemble │ │ │ │ │ └── mclblastline │ │ │ ├── mlrmcl │ │ │ │ ├── ncut │ │ │ │ ├── mlrmcl │ │ │ │ └── README │ │ │ ├── hmmer │ │ │ │ ├── hmmalign │ │ │ │ ├── hmmbuild │ │ │ │ └── hmmsearch │ │ │ ├── clustal │ │ │ │ └── clustalo │ │ │ ├── fasttree │ │ │ │ ├── FastTree │ │ │ │ └── FastTreeMP │ │ │ ├── raxmlng │ │ │ │ └── raxml-ng │ │ │ ├── mafft │ │ │ │ ├── mafftdir │ │ │ │ │ └── libexec │ │ │ │ │ │ ├── f2cl │ │ │ │ │ │ ├── dndpre │ │ │ │ │ │ ├── getlag │ │ │ │ │ │ ├── score │ │ │ │ │ │ ├── tbfast │ │ │ │ │ │ ├── addsingle │ │ │ │ │ │ ├── countlen │ │ │ │ │ │ ├── 
dndblast │ │ │ │ │ │ ├── dndfast7 │ │ │ │ │ │ ├── dvtditr │ │ │ │ │ │ ├── nodepair │ │ │ │ │ │ ├── pairash │ │ │ │ │ │ ├── replaceu │ │ │ │ │ │ ├── restoreu │ │ │ │ │ │ ├── setcore │ │ │ │ │ │ ├── sextet5 │ │ │ │ │ │ ├── version │ │ │ │ │ │ ├── dash_client │ │ │ │ │ │ ├── disttbfast │ │ │ │ │ │ ├── multi2hat3s │ │ │ │ │ │ ├── splittbfast │ │ │ │ │ │ ├── contrafoldwrap │ │ │ │ │ │ ├── hex2maffttext │ │ │ │ │ │ ├── mafft-distance │ │ │ │ │ │ ├── mafft-profile │ │ │ │ │ │ ├── maffttext2hex │ │ │ │ │ │ ├── mccaskillwrap │ │ │ │ │ │ ├── pairlocalalign │ │ │ │ │ │ ├── regtable2seq │ │ │ │ │ │ ├── seq2regtable │ │ │ │ │ │ ├── setdirection │ │ │ │ │ │ ├── makedirectionlist │ │ │ │ │ │ └── mafft-homologs.1 │ │ │ │ └── mafft │ │ │ └── external_tools.py │ │ ├── README.md │ │ ├── magus.py │ │ └── configuration.py │ └── macOS │ │ ├── mcl │ │ ├── hmmalign │ │ ├── hmmbuild │ │ ├── FastTreeMP │ │ └── hmmsearch ├── gcmm │ ├── callback.py │ ├── __init__.py │ ├── merger.py │ ├── task.py │ ├── weighting.py │ └── decompose_tree.py ├── default.config └── init_configs.py ├── examples ├── data │ ├── aligned_all.fasta.gz │ └── backbone.aln.fasta.gz ├── user.config └── run.sh ├── requirements.txt ├── witch.py ├── bin └── witch-msa ├── .gitignore ├── MANIFEST.in ├── .github └── workflows │ ├── coveralls.yml │ ├── python-publish.yml │ └── python-package.yml ├── pyproject.toml └── CHANGELOG.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/witch_msa/tools/magus/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/decompose/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_build/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_trace/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/macOS/mcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/mcl 
-------------------------------------------------------------------------------- /witch_msa/tools/macOS/hmmalign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/hmmalign -------------------------------------------------------------------------------- /witch_msa/tools/macOS/hmmbuild: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/hmmbuild -------------------------------------------------------------------------------- /witch_msa/tools/macOS/FastTreeMP: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/FastTreeMP -------------------------------------------------------------------------------- /witch_msa/tools/macOS/hmmsearch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/hmmsearch -------------------------------------------------------------------------------- /examples/data/aligned_all.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/examples/data/aligned_all.fasta.gz -------------------------------------------------------------------------------- /examples/data/backbone.aln.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/examples/data/backbone.aln.fasta.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython>=0.29 2 | configparser>=5.0.0 3 | DendroPy>=4.4.0 4 | numpy>=1.15 5 | psutil>=5.0 6 | tqdm>=4.0.0 7 | 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/clm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/clm -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcl -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcx -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxi -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mlrmcl/ncut: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mlrmcl/ncut -------------------------------------------------------------------------------- /witch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from witch_msa import witch_runner 3 | 4 | if __name__ == '__main__': 5 | witch_runner() 6 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/hmmer/hmmalign: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/hmmer/hmmalign -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/hmmer/hmmbuild: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/hmmer/hmmbuild -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mclcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mclcm -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxmap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxmap -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mlrmcl/mlrmcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mlrmcl/mlrmcl -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/clustal/clustalo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/clustal/clustalo -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/fasttree/FastTree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/fasttree/FastTree -------------------------------------------------------------------------------- 
/witch_msa/tools/magus/tools/hmmer/hmmsearch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/hmmer/hmmsearch -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/clmformat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/clmformat -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxarray: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxarray -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxdump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxdump -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxload: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxload -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxrand: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxrand -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxsubs: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxsubs -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/raxmlng/raxml-ng: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/raxmlng/raxml-ng -------------------------------------------------------------------------------- /bin/witch-msa: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from witch_msa import witch_runner 3 | 4 | if __name__ == "__main__": 5 | witch_runner() 6 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/fasttree/FastTreeMP: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/fasttree/FastTreeMP -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxassemble: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxassemble -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/f2cl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/f2cl -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndpre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndpre 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/getlag: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/getlag -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/score: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/score -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/tbfast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/tbfast -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | class TryPipeline(TestCase): 4 | def test_always_pass(self): 5 | self.assertTrue(True) 6 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/addsingle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/addsingle -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/countlen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/countlen 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndblast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndblast -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndfast7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndfast7 -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dvtditr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dvtditr -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/nodepair: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/nodepair -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairash -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/replaceu: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/replaceu -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/restoreu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/restoreu -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setcore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setcore -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/sextet5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/sextet5 -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/version: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/version -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dash_client: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dash_client -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/disttbfast: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/disttbfast -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/multi2hat3s: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/multi2hat3s -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/splittbfast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/splittbfast -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/contrafoldwrap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/contrafoldwrap -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/hex2maffttext: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/hex2maffttext -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-distance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-distance 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-profile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-profile -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/maffttext2hex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/maffttext2hex -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mccaskillwrap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mccaskillwrap -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairlocalalign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairlocalalign -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/regtable2seq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/regtable2seq -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/seq2regtable: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/seq2regtable -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setdirection: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setdirection -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/makedirectionlist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/makedirectionlist -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | __pycache__/ 3 | *.backup 4 | *witch_output 5 | *_output* 6 | main.config 7 | examples/data2 8 | dist/ 9 | *egg-info 10 | home.path 11 | .witch_msa/ 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGELOG 3 | include witch.py 4 | include witch_msa/default.config 5 | include requirements.txt 6 | graft witch_msa/tools/ 7 | graft witch_msa/gcmm/ 8 | graft witch_msa/helpers/ 9 | graft tests/ 10 | prune */__pycache__ 11 | global-exclude *.py[cod] 12 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafft: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # sh -> bash for debian. By J. R. Peterson. 2015/Jun. 
3 | 4 | pushd "`dirname "$0"`" > /dev/null 2>&1; rootdir="$PWD"; popd > /dev/null 2>&1; 5 | MAFFT_BINARIES="$rootdir/mafftdir/libexec"; export MAFFT_BINARIES; 6 | 7 | "$rootdir/mafftdir/bin/mafft" "$@" 8 | # input file name can have space 9 | -------------------------------------------------------------------------------- /witch_msa/helpers/math_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2012 3 | 4 | @author: Siavash Mirarab 5 | """ 6 | 7 | # 1.19.2022 - Copied from SEPP/UPP by Chengze 8 | 9 | 10 | def gcd(a, b): 11 | """Return greatest common divisor using Euclid's Algorithm.""" 12 | while b: 13 | a, b = b, a % b 14 | return a 15 | 16 | 17 | def lcm(a, b): 18 | """Return lowest common multiple.""" 19 | return a * b // gcd(a, b) 20 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_cluster/mcl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 14, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | from configuration import Configs 8 | from tools import external_tools 9 | 10 | 11 | def runMclClustering(graph): 12 | Configs.log("Running MCL alignment graph clustering..") 13 | external_tools.runMcl(graph.graphPath, Configs.mclInflationFactor, graph.workingDir, graph.clusterPath).run() 14 | graph.readClustersFromFile(graph.clusterPath) 15 | 16 | 17 | -------------------------------------------------------------------------------- /.github/workflows/coveralls.yml: -------------------------------------------------------------------------------- 1 | on: ["push", "pull_request"] 2 | 3 | name: Test Coveralls 4 | 5 | jobs: 6 | 7 | build: 8 | name: Build 9 | runs-on: ubuntu-latest 10 | steps: 11 | 12 | - uses: actions/checkout@v1 13 | 14 | - name: Use Node.js 16.x 15 | uses: actions/setup-node@v3 16 | with: 17 | node-version: 16.x 18 | 19 | - name: npm install, make 
test-coverage 20 | run: | 21 | npm install 22 | make test-coverage 23 | 24 | - name: Coveralls GitHub Action 25 | uses: coverallsapp/github-action@v2.2.3 26 | -------------------------------------------------------------------------------- /witch_msa/helpers/general_tools.py: -------------------------------------------------------------------------------- 1 | import psutil, os 2 | import argparse 3 | 4 | # return memory usage of python process by MB 5 | def memoryUsage(): 6 | process = psutil.Process(os.getpid()) 7 | mem = process.memory_info().rss / float(2 ** 20) 8 | return mem 9 | 10 | # reformat argparse help text formatting 11 | class SmartHelpFormatter(argparse.HelpFormatter): 12 | def _split_lines(self, text, width): 13 | if '\n' in text: 14 | temp = text.split('\n') 15 | ret = [] 16 | for _splice in [argparse.HelpFormatter._split_lines(self, x, width) 17 | for x in temp]: 18 | ret.extend(_splice) 19 | return ret 20 | return argparse.HelpFormatter._split_lines(self, text, width) 21 | -------------------------------------------------------------------------------- /examples/user.config: -------------------------------------------------------------------------------- 1 | [commandline] 2 | timeout=999999 3 | max-concurrent-jobs=1000000 4 | 5 | [Basic] 6 | alignment_size = 25 7 | #magus_path = 8 | #mafftpath = 9 | #mclpath = 10 | #fasttreepath = 11 | #hmmsearchpath = /anaconda3/bin/hmmsearch 12 | #hmmbuildpath = /anaconda3/bin/hmmbuild 13 | #hmmalignpath = /anaconda3/bin/hmmalign 14 | 15 | [Backbone] 16 | backbone_size = 500 17 | #alignment_method = magus 18 | #alignment_path = 19 | #backbone_size = 20 | #selection_strategy = median_length 21 | #tree_method = FastTree2 22 | #tree_path = 23 | 24 | [MAGUS] 25 | #inflationfactor = 26 | #graphclustermethod = 27 | #graphtracemethod = 28 | #graphtraceoptimize = 29 | #maxnumsubsets = 30 | #mafftpath = 31 | #mclpath = 32 | #fasttreepath = 33 | #hmmsearchpath = /anaconda3/bin/hmmsearch 34 | #hmmbuildpath = 
/anaconda3/bin/hmmbuild 35 | #hmmalignpath = /anaconda3/bin/hmmalign 36 | -------------------------------------------------------------------------------- /witch_msa/helpers/pyhmmer_tools.py: -------------------------------------------------------------------------------- 1 | from witch_msa.helpers.alignment_tools import Alignment 2 | from pyhmmer import easel 3 | 4 | # helper function to convert an alignment object to TextMSA object 5 | def alignmentToTextMSA(aln, name): 6 | sequences = [] 7 | for taxon, seq in aln.items(): 8 | sequences.append(easel.TextSequence(name=taxon.encode(), sequence=seq)) 9 | msa = easel.TextMSA(name=name.encode(), sequences=sequences) 10 | return msa 11 | 12 | # helper function to obtain alphabet given molecule type 13 | def moleculeToAlphabet(molecule): 14 | alphabet = None 15 | if molecule == 'amino': 16 | alphabet = easel.Alphabet.amino() 17 | elif molecule == 'dna': 18 | alphabet = easel.Alphabet.dna() 19 | elif molecule == 'rna': 20 | alphabet = easel.Alphabet.rna() 21 | else: 22 | raise ValueError(f"alphabet {molecule} is not amino, dna, or rna") 23 | return alphabet 24 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/merger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 14, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | import os 8 | import time 9 | 10 | 11 | from align.merge.graph_build.graph_builder import buildGraph 12 | from align.merge.graph_cluster.clusterer import clusterGraph 13 | from align.merge.graph_trace.tracer import findTrace 14 | from align.merge.optimizer import optimizeTrace 15 | from align.merge.alignment_writer import writeAlignment 16 | from configuration import Configs 17 | 18 | 19 | def mergeSubalignments(context): 20 | Configs.log("Merging {} subaligments..".format(len(context.subalignmentPaths))) 21 | time1 = time.time() 22 | 23 | buildGraph(context) 24 | 
def rgClustering(graph):
    """Cluster the alignment graph with the region-growing strategy.

    For each of the k subalignments the search is bounded by the half-open
    column range [subsetMatrixIdx[i], subsetMatrixIdx[i] +
    subalignmentLengths[i]); the resulting clustering is stored on the
    graph and persisted to its cluster file.
    """
    Configs.log("Building a region-growing graph clustering..")

    starts = graph.subsetMatrixIdx
    lengths = graph.subalignmentLengths
    subsetCount = len(graph.context.subalignments)
    lowerBound, upperBound = [], []
    for i in range(subsetCount):
        lowerBound.append(starts[i])
        upperBound.append(starts[i] + lengths[i])
    graph.clusters = rgCluster(graph, lowerBound, upperBound, False)
    graph.writeClustersToFile(graph.clusterPath)
def callback_queryAlignment(success, ignored, retry, i_retry,
        query, index, taxon_name, checkpoint_path):
    """Callback invoked with the result of one query-alignment job.

    Outcomes:
      * falsy ``query`` with retry budget left -> queue ``index`` for retry;
      * falsy/empty ``query`` with no retries left -> record ``taxon_name``
        as ignored (dropped from the final output);
      * otherwise append the single-sequence alignment to ``success`` and
        append its aligned row to the gzip checkpoint file.
    """
    # A falsy result with retry budget remaining: schedule another attempt.
    if (not query) and i_retry > 0:
        retry.append(index)
        return
    # Out of retries and still no usable alignment: drop from the output.
    if (not query) or len(query) == 0:
        ignored.append(taxon_name)
        return
    # Sanity guard: expect exactly one aligned sequence per query job.
    if (not isinstance(query, ExtendedAlignment)) or (len(query) != 1):
        return
    # Append this row to the gzip checkpoint so a rerun can resume.
    record = '{}\t{}\n'.format(taxon_name, query[taxon_name])
    with gzip.open(checkpoint_path, 'ab') as handle:
        handle.write(record.encode('utf-8'))
    success.append(query)
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_cluster/clusterer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 23, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | import os 8 | import time 9 | 10 | from configuration import Configs 11 | 12 | from align.merge.graph_cluster.mcl import runMclClustering 13 | from align.merge.graph_cluster.mlr_mcl import runMlrMclClustering 14 | from align.merge.graph_cluster.rg import rgClustering 15 | 16 | ''' 17 | The alignment graph is clustered, the clusters are written out as an array of node arrays. 18 | MCL is the main way to do this, but rg could be used if there are scalability issues. 
class FileLock:
    """Cross-process mutual exclusion via exclusive creation of a lock file.

    Entering the context spins until this process creates ``filePath`` in
    mode 'x' (exclusive creation, atomic on the filesystem); exiting
    removes the file so another process can acquire the lock.
    """

    def __init__(self, filePath):
        # Path of the lock file; its existence means the lock is held.
        self.filePath = filePath

    def __enter__(self):
        while True:
            try:
                # 'x' raises FileExistsError if another holder beat us to it.
                lock = open(self.filePath, 'x')
                lock.close()
                return self
            except FileExistsError:
                # Lock is held elsewhere: back off a random 50-150 ms to
                # avoid lock-step retries between competing processes.
                # (Previously a bare `except:`, which also swallowed real
                # errors such as a missing parent directory or
                # KeyboardInterrupt and spun forever.)
                time.sleep(random.random()*0.1 + 0.05)
            #time.sleep(random.random() + 0.5)

    def __exit__(self, excType, excVal, excTb):
        # Release the lock regardless of whether the body raised.
        os.remove(self.filePath)
def naiveCluster(lowerBound, upperBound):
    """Left-justified clustering: the i-th cluster holds the i-th column of
    every subalignment that still has an i-th column.

    lowerBound/upperBound give, per subalignment, the half-open range of
    node indices belonging to it.
    """
    clusters = []
    offset = 0
    while True:
        cluster = [lo + offset
                   for lo, hi in zip(lowerBound, upperBound)
                   if lo + offset < hi]
        # Stop once every subalignment has been exhausted.
        if not cluster:
            return clusters
        clusters.append(cluster)
        offset += 1
"setuptools.build_meta" 4 | 5 | [project] 6 | name = "witch-msa" 7 | dynamic = ["version", "dependencies"] 8 | description = "WITCH - A Multiple Sequence Alignment Tool" 9 | readme = "README.rst" 10 | authors = [ 11 | {name = "Chengze Shen", email = "chengze5@illinois.edu"} 12 | ] 13 | license = {file = "LICENSE"} 14 | requires-python = ">=3.7" 15 | classifiers = [ 16 | "Development Status :: 4 - Beta", 17 | "Operating System :: OS Independent", 18 | "Intended Audience :: Developers", 19 | "Intended Audience :: Science/Research", 20 | "Topic :: Scientific/Engineering :: Bio-Informatics", 21 | "Topic :: Software Development", 22 | "License :: OSI Approved :: GNU General Public License (GPL)", 23 | "Programming Language :: Python", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.7", 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12" 31 | ] 32 | #scripts = ["witch.py"] 33 | #packages = ["gcmm", "helpers"] 34 | 35 | [project.urls] 36 | Homepage = "https://github.com/c5shen/WITCH" 37 | Changelog = "https://github.com/c5shen/WITCH/blob/main/CHANGELOG.rst" 38 | 39 | [tool.setuptools.dynamic] 40 | version = {attr = "witch_msa.__version__"} 41 | dependencies = {file = ["requirements.txt"]} 42 | -------------------------------------------------------------------------------- /witch_msa/default.config: -------------------------------------------------------------------------------- 1 | [Basic] 2 | #### the FastTreeMP executable provided may not be compatible to certain 3 | #### system, such as macOS on M1 chip (arm64 instead of x86). 
In that case, 4 | #### please provide your own fasttreepath (also do the same for the [MAGUS] 5 | #### configuration below) by compiling the source code from: 6 | #### http://www.microbesonline.org/fasttree/FastTree.c 7 | #### 8 | #### command for compilation (please use gcc-10 or higher): 9 | #### gcc -DOPENMP -fopenmp -O3 -finline-functions -funroll-loops -Wall -o FastTreeMP FastTree.c -lm 10 | #### 11 | #### Other softwares used can also be self-provided if necessary. 12 | magus_path = 13 | mafftpath = 14 | mclpath = 15 | hmmsearchpath = 16 | hmmbuildpath = 17 | hmmalignpath = 18 | fasttreepath = 19 | 20 | [Backbone] 21 | #### alignment_method can be set to [mafft, magus] for now #### 22 | alignment_method = magus 23 | alignment_path = 24 | #alignment_method = mafft 25 | #alignment_path = /anaconda3/bin/mafft 26 | #### default backbone_size is min(1000, len(taxa)), but could be fewer 27 | #### if there aren't many taxa within median length (selection strategy) 28 | backbone_size = 29 | #### selection strategy can be [random, median_length]; default median_length 30 | selection_strategy = median_length 31 | tree_method = FastTree2 32 | tree_path = 33 | 34 | [MAGUS] 35 | #### settings for running MAGUS backbone specifically. Refer to MAGUS 36 | #### github page for more details. 37 | inflationfactor = 38 | graphclustermethod = 39 | graphtracemethod = 40 | graphtraceoptimize = 41 | maxnumsubsets = 42 | #### Custom binary executable paths to run MAGUS/GCM. Specifically added 43 | #### for macOS systems. 44 | #### This will be the same as the ones in [Basic] if generated from setup.py 45 | ####Please use the absolute path to each desired executable. 
46 | mafftpath = 47 | mclpath = 48 | hmmsearchpath = 49 | hmmbuildpath = 50 | hmmalignpath = 51 | fasttreepath = 52 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mclblastline: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright (C) 2003, 2004, 2005, 2006, 2007 Stijn van Dongen 4 | # 5 | # You can redistribute and/or modify this program under the terms of the GNU 6 | # General Public License; either version 3 of the License or (at your option) 7 | # any later version. 8 | 9 | use strict; 10 | 11 | $" = ' '; 12 | 13 | my $do_help = 0; 14 | my $cline = "mclpipeline --parser=mcxdeblast --parser-tag=blast"; 15 | 16 | unless (grep { $_ =~ /^--ass-r[vei]?=/; } @ARGV) { 17 | $cline .= " --ass-r=max"; 18 | } 19 | 20 | $cline .= " @ARGV"; 21 | 22 | if (grep { $_ =~ /--(help|apropos)/; } @ARGV) { 23 | $do_help = 1; 24 | } 25 | elsif (!@ARGV) { 26 | $do_help = 1; 27 | $cline .= " --help"; 28 | } 29 | 30 | if ($do_help) { 31 | print <<_help_; 32 | mcxblastline wraps around the generic mclpipeline script. It fills in the name 33 | of the BLAST parser (mcxdeblast) and the tag ('blast') used to propagate 34 | mcxdeblast options through the pipeline to mcxdeblast itself. You can freely 35 | use all mclpipeline options other than --parser= and 36 | --parser-tag=. 37 | _help_ 38 | } 39 | if (system $cline) { 40 | print "mcxblastline wrapper: pipeline failed\n"; 41 | print "cline: $cline\n"; 42 | exit(1); 43 | } 44 | if ($do_help) { 45 | print <<_help_; 46 | ________________ 47 | The above options are generic pipeline options. You can pass any mcxdeblast 48 | option by inserting the 'blast' tag in front of that particular option. For 49 | example, the mcxdeblast --score=x option (where x is 'b' or 'e') should 50 | be passed to mcxblastline as --blast-score=x. 
51 | 52 | The mcxdeblast --xo-dat option is special; it must *not* be prefixed, as it is 53 | shared with mclpipeline, as can be seen from the above listing. The mcxdeblast 54 | --xi-dat option should not be used, as it encapsulated by the mclpipeline --xi 55 | option. 56 | _help_ 57 | } 58 | 59 | -------------------------------------------------------------------------------- /examples/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | all_unaligned_path=data/unaligned_all.fasta 4 | backbone_aln_path=data/backbone.aln.fasta.gz 5 | backbone_tre_path=data/backbone.tre 6 | query_path=data/unaligned_frag.fasta 7 | outname=aligned.fasta 8 | 9 | scenario=4 10 | if [[ $1 != "" ]]; then 11 | scenario=$1 12 | fi 13 | 14 | if [[ $scenario == 1 ]]; then 15 | # Scenario A - unaligned sequences only 16 | python ../witch.py -i ${all_unaligned_path} -d scenarioA_output \ 17 | -o ${outname} 18 | elif [[ $scenario == 2 ]]; then 19 | # Scenario B - unaligned sequences only; using bit scores; 20 | # using 10 HMMs to align a sequence 21 | python ../witch.py -i ${all_unaligned_path} -d scenarioB_output \ 22 | -o ${outname} -w 0 -k 10 23 | elif [[ $scenario == 3 ]]; then 24 | # 3) Scenario C - backbone alignment available; backbone tree missing; 25 | # query sequences available; also saving weights to local 26 | python ../witch.py --num-cpus -1 -b ${backbone_aln_path} \ 27 | -q ${query_path} -d scenarioC_output -o ${outname} \ 28 | --save-weight 1 29 | elif [[ $scenario == 4 ]]; then 30 | # 4) Scenario D - backbone alignment available; backbone tree available; 31 | # query sequences available; saving weights to local; 32 | # also save decomposition results for future use (e.g., 33 | # faster rerun) 34 | python ../witch.py --num-cpus -1 -b ${backbone_aln_path} \ 35 | -e ${backbone_tre_path} \ 36 | -q ${query_path} -d scenarioD_output -o ${outname} \ 37 | --save-weight 1 --keep-decomposition 1 38 | elif [[ $scenario == 5 ]]; 
'''
Customized ProcessPoolExecutor class to handle callbacks and monitor current
progress in query alignments
'''
class WITCHProcessPoolExecutor(ProcessPoolExecutor):
    """ProcessPoolExecutor that also tracks how many jobs were submitted,
    are running, and have finished, for progress reporting."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._running_jobs = 0      # submitted but not yet marked done
        self._submitted_jobs = 0    # total number of submit() calls
        self._finished_jobs = 0     # jobs whose done-callback has fired

    def submit(self, *args, **kwargs):
        fut = super().submit(*args, **kwargs)
        self._running_jobs += 1
        self._submitted_jobs += 1
        # NOTE: the done-callback is currently disabled, so the running /
        # finished counters only move if this hook is re-enabled.
        #fut.add_done_callback(self._worker_is_done)
        return fut

    def _worker_is_done(self, future):
        # Done-callback: keep the counters in sync and echo progress.
        self._running_jobs -= 1
        self._finished_jobs += 1
        print('Finished jobs: {}/{}'.format(
            self._finished_jobs, self._submitted_jobs), end='\r', flush=True)

    def get_pool_usage(self):
        """Number of jobs submitted but not yet marked finished."""
        return self._running_jobs

    def get_finished_jobs(self):
        """Number of jobs whose done-callback has run."""
        return self._finished_jobs
'''
Simple function for sanity-checking all output files of a given list that:
    (1) they exists
    (2) they have size > 0
'''
def sanityCheckFileCreation(files):
    """Return the subset of ``files`` that are missing or empty on disk.

    An empty return value means every listed output file exists with
    size > 0.
    """
    return [path for path in files
            if not (os.path.exists(path) and os.stat(path).st_size > 0)]
def checkTaskManager():
    """Verify the background task-manager future is still running.

    If it has stopped: log the condition, surface whatever exception it
    died with (``.result()`` re-raises it), and raise explicitly if the
    manager exited without one.
    """
    if TaskManager.managerFuture.running():
        return
    Configs.error("Task manager is dead for some reason..")
    # .result() re-raises the exception that killed the manager, if any.
    TaskManager.managerFuture.result()
    raise Exception("Task manager is dead for some reason..")
def readClustersFromFile(filePath):
    """Parse MLR-MCL output (one integer cluster label per vertex, one per
    line) into a list of clusters, each a list of vertex ids.

    NOTE(review): assumes the labels form the contiguous range 0..k-1, as
    MLR-MCL emits; a gap in the labels would raise KeyError below -- confirm
    against the tool's output format.
    """
    assignments = {}
    with open(filePath) as handle:
        for vertex, line in enumerate(handle):
            label = int(line.strip())
            assignments.setdefault(label, []).append(vertex)
    clusters = [assignments[label] for label in range(len(assignments))]
    Configs.log("Found {} clusters..".format(len(clusters)))
    return clusters
def findTrace(graph):
    """Refine the graph's clusters into a trace (a clustering consistent
    with a valid MSA), honoring Configs.graphTraceMethod, and cache the
    result in graph.tracePath so reruns can skip the search."""
    startTime = time.time()

    if os.path.exists(graph.tracePath):
        Configs.log("Found existing trace file {}".format(graph.tracePath))
        graph.readClustersFromFile(graph.tracePath)
    else:
        # Clean the clustering before tracing: drop duplicate clusters and
        # elements violating the row/column uniqueness constraints.
        purgeDuplicateClusters(graph)
        purgeClusterViolations(graph)

        # Dispatch on the configured trace method; an unknown method name
        # leaves the cleaned clusters as-is (matching the original chain,
        # which had no final else).
        methods = {
            "minclusters": minClustersSearch,
            "fm": fmAlgorithm,
            "mwtgreedy": mwtGreedySearch,
            "mwtsearch": mwtSearch,
            "rg": rgSearch,
            "rgfast": rgFastSearch,
            "naive": naiveClustering,
        }
        traceMethod = methods.get(Configs.graphTraceMethod)
        if traceMethod is not None:
            traceMethod(graph)

        graph.writeClustersToFile(graph.tracePath)

    endTime = time.time()
    Configs.log("Found alignment graph trace in {} sec..".format(endTime-startTime))
    Configs.log("Found a trace with {} clusters and a total cost of {}".format(len(graph.clusters), graph.computeClusteringCost(graph.clusters)))
def purgeClusterViolations(graph):
    """Remove elements that break the trace constraints, weakest first.

    Two constraints are enforced: a cluster may hold at most one column per
    subalignment (a "column violation", keyed by (cluster, subalignment)),
    and a node may belong to at most one cluster (a "row violation", keyed
    by node). Offending elements are removed in ascending order of their
    support score, so the weakest links are sacrificed first. Mutates
    graph.clusters in place and finally drops singleton clusters.
    """
    redundantCols = {}
    redundantRows = {}
    elementScores = {}
    for a, cluster in enumerate(graph.clusters):
        for b in cluster:
            bsub, bpos = graph.matSubPosMap[b]
            # Track every (cluster, node) occurrence per column and per row
            # so multiplicities > 1 can be detected below.
            redundantCols[a, bsub] = redundantCols.get((a, bsub), []) + [(a, b)]
            redundantRows[b] = redundantRows.get(b, []) + [(a, b)]

            # Score of node b within cluster a: total edge weight to
            # cluster members from *other* subalignments.
            scoresum = 0
            for c in cluster:
                csub, cpos = graph.matSubPosMap[c]
                if bsub != csub:
                    scoresum = scoresum + graph.matrix[b].get(c,0)
            elementScores[a, b] = scoresum

    # NOTE: in the redundantCols keys, b is a subalignment index (bsub),
    # not a node id.
    problemCols = [(a,b) for a,b in redundantCols if len(redundantCols[a,b]) > 1]
    problemRows = [a for a in redundantRows if len(redundantRows[a]) > 1]
    Configs.log("Found {} row violations and {} column violations..".format(len(problemRows), len(problemCols)))

    # Sweep elements from weakest to strongest; each removal updates the
    # bookkeeping so later, stronger elements may no longer be in violation.
    sortedScores = list(elementScores.keys())
    sortedScores.sort(key = lambda x : elementScores[x])

    for a,b in sortedScores:
        bsub, bpos = graph.matSubPosMap[b]
        if len(redundantCols[a, bsub]) > 1 or len(redundantRows[b]) > 1:
            graph.clusters[a].remove(b)
            redundantCols[a, bsub].remove((a,b))
            redundantRows[b].remove((a,b))

    problemCols = [(a,b) for a,b in redundantCols if len(redundantCols[a,b]) > 1]
    problemRows = [a for a in redundantRows if len(redundantRows[a]) > 1]
    Configs.log("Finished violations sweep. Now {} row violations and {} column violations..".format(len(problemRows), len(problemCols)))

    # Clusters reduced to a single element carry no merge information.
    graph.clusters = [cluster for cluster in graph.clusters if len(cluster) > 1]
    Configs.log("Purged cluster violations. Found {} clean clusters..".format(len(graph.clusters)))
importlib.import_module(Task.functionModuleMap[self.taskType]) 58 | func = getattr(mod, self.taskType) 59 | func(**self.taskArgs) 60 | Configs.log("Completed a task, output file: {}".format(self.outputFile)) 61 | else: 62 | Configs.log("File already exists: {}".format(self.outputFile)) 63 | except Exception as exc: 64 | Configs.error("Task for {} threw an exception:\n{}".format(self.outputFile, exc)) 65 | Configs.error(traceback.format_exc()) 66 | raise 67 | finally: 68 | self.isFinished = True 69 | 70 | def checkFinished(self): 71 | if not self.isFinished: 72 | return False 73 | if self.future is not None: 74 | self.future.result() 75 | return True 76 | 77 | def toJson(self): 78 | mapper = {attr : getattr(self, attr) for attr in self.attributes} 79 | return json.dumps(mapper) 80 | 81 | def __eq__(self, other): 82 | if isinstance(other, Task): 83 | return self.outputFile == other.outputFile 84 | return NotImplemented 85 | 86 | def __hash__(self): 87 | return hash(self.outputFile) 88 | 89 | 90 | def asCompleted(tasks): 91 | yield from controller.asCompleted(tasks) 92 | 93 | def awaitTasks(tasks): 94 | controller.awaitTasks(tasks) 95 | 96 | def submitTasks(tasks): 97 | controller.submitTasks(tasks) 98 | -------------------------------------------------------------------------------- /witch_msa/gcmm/merger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 10.28.2021 by Chengze Shen 3 | 4 | Merger of all query alignments to form the final alignment. The merging step 5 | is exactly the same one from PASTA and UPP (by transitivity). 
import os, sys, re
import time
from witch_msa.configs import Configs
from witch_msa.helpers.alignment_tools import Alignment, read_fasta, \
        CompactAlignment, compact, ExtendedAlignment
from functools import partial
#from concurrent.futures.process import ProcessPoolExecutor
from math import ceil

'''
function to merge a set of input paths (to alignments) sequentially
'''
def sequential_merger(queries, inpaths):
    """Merge alignments one after another by transitivity.

    Parameters
    ----------
    queries : list
        Per-subproblem query alignments; entries may be the sentinel
        string 'skipped'.
    inpaths : list
        Paths to the corresponding alignment files, with 'skipped'
        marking subproblems that produced no alignment.

    Returns the merged CompactAlignment.

    NOTE(review): the seed alignment is read from queries[init_index]
    while the loop reads file paths from inpaths[i] -- confirm the two
    lists are parallel and interchangeable here. Also, if *every* entry
    is 'skipped', queries[init_index] raises IndexError; verify callers
    never pass an all-skipped list.
    """
    init_index = 0
    # advance to the first non-skipped entry to seed the merge
    while init_index < len(inpaths) and inpaths[init_index] == 'skipped':
        init_index += 1
    init_aln = Alignment(); init_aln.read_file_object(queries[init_index])
    new_aln = compact(init_aln)
    for i in range(init_index + 1, len(inpaths)):
        inpath = inpaths[i]

        # skip these ones
        if inpath == 'skipped':
            continue
        frag_aln = Alignment(); frag_aln.read_file_object(inpath)
        new_aln.merge_in(compact(frag_aln))
    return new_aln

'''
function to merge all subalignments to one alignment and with all singletons
in queries collapsed (in lower cases). This is the same behavior as UPP.
An additional "masked" version of the final alignment with all lower cases
removed will also be written to disk.
'''
def mergeAlignmentsCollapsed(backbone_alignment_path, queries,
        renamed_taxa, pool):
    """Merge all query alignments into the backbone (UPP-style transitivity).

    Writes the full alignment to Configs.output_path and a masked copy
    (insertion columns removed) next to it. `renamed_taxa` maps original
    taxon names to the temporary names used during alignment; they are
    renamed back before writing. `pool` is currently unused -- the
    parallel query read below is commented out.
    """
    Configs.log('(UPP-style merging) Merging all GCM subproblems ' \
            'with transitivity and singletons from queries collapsed...')
    start = time.time()
    outpath = Configs.output_path
    #masked_outpath = Configs.output_path + '.masked'

    # Updated @ 10.26.2024 by Chengze Shen
    # masked alignment output name change to .masked.fasta
    # if user gives `-o <name>.fa` or `<name>.fasta`, then the output name
    # will adapt to the correct suffix
    suffix = outpath.split('.')[-1]
    if suffix in ['fa', 'fasta']:
        masked_outpath = '.'.join(outpath.split('.')[:-1]) + '.masked.' + suffix
    else:
        masked_outpath = outpath + '.masked.fasta'

    # NOTE(review): hard exit on empty input; sys.exit would be the more
    # conventional spelling, but behavior is the same here.
    if not (len(queries) > 0):
        print('No query alignment provided to merger!')
        exit(1)

    # read in all backbone sequences/alignment
    full_aln = ExtendedAlignment([])
    full_aln.read_file_object(backbone_alignment_path)
    full_aln.from_string_to_bytearray()

    # read in queries so that insertions are marked
    #backbone_keys = {x: 1 for x in full_aln.keys()}
    #func = partial(getQueryAlignment, backbone_keys)
    #queries = list(pool.map(func, inpaths))

    # merge all queries to the backbone
    for query in queries:
        if query != 'skipped':
            full_aln.merge_in(query, False)
        #del query
    full_aln.from_bytearray_to_string()

    # rename back taxa (invert the renamed_taxa mapping)
    name_map = {v: k for k, v in renamed_taxa.items()}
    count = 0
    for name in list(name_map.keys()):
        ori_name = name_map[name]
        if name in full_aln:
            full_aln[ori_name] = full_aln[name]
            full_aln.pop(name)
            count += 1
    if count > 0:
        Configs.log('Converted {} names back to their originals'.format(count))
    Configs.log('Finished merging all GCM subproblems, output file: {}'.format(
        outpath))
    full_aln.write(outpath, 'FASTA')

    # write a masked version of full alignment
    full_aln.remove_insertion_columns()
    full_aln.write(masked_outpath, 'FASTA')
    Configs.log('Masked final alignment written to: {}'.format(masked_outpath))

    end = time.time()
    Configs.runtime('Time to merge all outputs (s): {}'.format(end - start))
'''
Not really used for anything, may be removed in the future.
'''

def buildSubsetsKMH(context, subsetsDir):
    """Decompose context.sequencesPath into subsets with the KMH strategy.

    Builds a skeleton alignment and FastTree tree, decomposes the guide
    tree, and -- if some taxa were excluded from the skeleton alignment --
    assigns them to subsets by best HMM score.

    Returns the list of subset file paths.
    """
    tempDir = os.path.join(subsetsDir, "initial_tree")

    Configs.log("Building KMH decomposition on {} with skeleton size {}/{}..".format(context.sequencesPath, Configs.decompositionSkeletonSize, 1000))
    time1 = time.time()

    initialTreePath, initialAlignPath, unusedTaxa = buildInitialTreeAlign(tempDir, context.sequencesPath)

    if len(unusedTaxa) == 0:
        # Every taxon made it into the skeleton: plain guide-tree decomposition.
        subsetPaths = treeutils.decomposeGuideTree(tempDir, initialAlignPath, initialTreePath, Configs.decompositionMaxSubsetSize, Configs.decompositionMaxNumSubsets)
    else:
        # Some taxa were left out: build seed subsets, then assign the
        # leftovers to the best-matching subset by HMM score.
        subsetSeedDir = os.path.join(subsetsDir, "seed_subsets")
        if not os.path.exists(subsetSeedDir):
            os.makedirs(subsetSeedDir)
        subsetSeedPaths = treeutils.decomposeGuideTree(subsetSeedDir, initialAlignPath, initialTreePath, None, Configs.decompositionMaxNumSubsets)
        subsetPaths = reassignTaxons(subsetsDir, subsetSeedPaths, context.unalignedSequences, unusedTaxa)

    time2 = time.time()
    Configs.log("Built KMH decomposition on {} in {} sec..".format(context.sequencesPath, time2-time1))

    return subsetPaths

def buildInitialTreeAlign(tempDir, sequencesPath):
    """Build the skeleton alignment and a FastTree tree over it.

    Returns a 3-tuple (treePath, alignPath, unusedTaxa), where unusedTaxa
    lists the taxa excluded from the skeleton alignment.
    """
    outputTreePath = os.path.join(tempDir, "initial_tree.tre")
    outputAlignPath = os.path.join(tempDir, "initial_align.txt")

    if os.path.exists(outputTreePath) and os.path.exists(outputAlignPath):
        # BUGFIX: this resume path previously returned a 2-tuple, which
        # crashed the 3-way unpacking in buildSubsetsKMH. The unused-taxa
        # information is not recoverable from the files on disk, so a
        # resumed run proceeds with the plain guide-tree decomposition.
        return outputTreePath, outputAlignPath, []
    if os.path.exists(tempDir):
        shutil.rmtree(tempDir)
    os.makedirs(tempDir)

    initialAlign, unusedTaxa = decomposer.initial_tree.buildInitialAlignment(sequencesPath, tempDir, Configs.decompositionSkeletonSize, 1000)
    sequenceutils.writeFasta(initialAlign, outputAlignPath)
    #external_tools.runRaxmlNg(outputAlignPath, tempDir, outputTreePath, 8).run()
    external_tools.runFastTree(outputAlignPath, tempDir, outputTreePath).run()

    return outputTreePath, outputAlignPath, unusedTaxa

def reassignTaxons(subsetsDir, subsetSeedPaths, sequences, unusedTaxa):
    """Assign each unused taxon to the seed subset whose HMM scores it best.

    Builds one HMM per seed subset, scores all unassigned sequences against
    every HMM, and appends each taxon to its best-scoring subset. Writes
    the final subsets to subset_<n>.txt under subsetsDir and returns the
    list of subset file paths.
    """
    unusedPath = os.path.join(subsetsDir, "unassigned_sequences.txt")
    sequenceutils.writeFasta(sequences, unusedPath, unusedTaxa)

    # One HMM model file per seed subset.
    hmmMap = {}
    for subsetPath in subsetSeedPaths:
        hmmDir = os.path.join(os.path.dirname(subsetPath), "hmm_{}".format(os.path.basename(subsetPath)).replace(".", "_"))
        if not os.path.exists(hmmDir):
            os.makedirs(hmmDir)
        hmmMap[subsetPath] = os.path.join(hmmDir, "hmm_model.txt")
    hmmTasks = hmmutils.buildHmms(hmmMap)
    task.submitTasks(hmmTasks)
    task.awaitTasks(hmmTasks)
    hmmPaths = [t.outputFile for t in hmmTasks]

    # Score every unassigned sequence against every subset HMM.
    scoreFileHmmFileMap = {}
    scoreTasks = hmmutils.buildHmmScores(hmmPaths, unusedPath, scoreFileHmmFileMap)
    task.submitTasks(scoreTasks)

    # Track, per taxon, the best score seen so far and the HMM that gave it.
    bestScores = {}
    taxonHmmMap = {}
    for scoreTask in task.asCompleted(scoreTasks):
        subsetScores = hmmutils.readSearchFile(scoreTask.outputFile)
        for taxon, scores in subsetScores.items():
            if scores[1] > bestScores.get(taxon, -float("inf")):
                bestScores[taxon] = scores[1]
                taxonHmmMap[taxon] = scoreFileHmmFileMap[scoreTask.outputFile]

    # Group reassigned taxa by HMM, then add back each seed subset's taxa.
    subsetTaxons = {path : [] for path in hmmPaths}
    for taxon, hmmPath in taxonHmmMap.items():
        subsetTaxons[hmmPath].append(taxon)
    for subsetPath, hmmPath in hmmMap.items():
        subset = sequenceutils.readFromFasta(subsetPath)
        for taxon in subset:
            subsetTaxons[hmmPath].append(taxon)

    # Write the final subsets out, numbered from 1.
    subsetPaths = []
    for i, (hmmPath, subset) in enumerate(subsetTaxons.items(), start=1):
        subsetPath = os.path.join(subsetsDir, "subset_{}.txt".format(i))
        subsetPaths.append(subsetPath)
        sequenceutils.writeFasta(sequences, subsetPath, subset)

    return subsetPaths
def chooseSkeletonTaxa(sequences, skeletonSize, mode = "fulllength"):
    """Pick skeletonSize taxa to seed the skeleton alignment.

    In "fulllength" mode, taxa whose sequence length lies within 25% of the
    top-quartile length are preferred (shuffled to the front of the
    candidate order); any other mode samples uniformly at random.

    Returns (skeletonTaxa, remainingTaxa), which together partition
    sequences.keys().
    """
    candidates = list(sequences.keys())

    if mode == "fulllength":
        # Top quartile of sequence lengths, computed without numpy.
        sortedLengths = sorted(len(sequences[t].seq) for t in sequences)
        quartileLength = sortedLengths[int(0.75 * (len(sortedLengths) - 1))]

        # Split taxa by whether their length is close to the quartile length.
        nearQuartile, farFromQuartile = [], []
        for taxon in candidates:
            isNear = abs(len(sequences[taxon].seq) - quartileLength) < 0.25 * quartileLength
            (nearQuartile if isNear else farFromQuartile).append(taxon)

        # Prefer near-quartile ("full-length") taxa, randomizing within each group.
        random.shuffle(nearQuartile)
        random.shuffle(farFromQuartile)
        candidates = nearQuartile + farFromQuartile
    else:
        random.shuffle(candidates)

    return candidates[:skeletonSize], candidates[skeletonSize:]

def randomDecomposition(subsetsDir, sequences, numSubsets):
    """Randomly partition the taxa into numSubsets round-robin subsets.

    Each subset is written to subset_<n>.txt under subsetsDir (n starting
    at 1). Returns the list of subset file paths.
    """
    shuffled = list(sequences.keys())
    random.shuffle(shuffled)

    subsetPaths = []
    for idx in range(numSubsets):
        # Round-robin slice spreads the shuffled taxa evenly across subsets.
        members = shuffled[idx :: numSubsets]
        path = os.path.join(subsetsDir, "subset_{}.txt".format(idx + 1))
        subsetPaths.append(path)
        sequenceutils.writeFasta(sequences, path, members)
    return subsetPaths
def mainAlignmentTask():
    '''
    6.1.2021 - modified by Chengze Shen
    added a new argument for backboneWeightsPath
    '''
    # Top-level entry point: package the global Configs into task arguments
    # and run the root MAGUS alignment task to completion.
    args = {"workingDir" : Configs.workingDir, "outputFile" : Configs.outputPath,
            "subalignmentPaths" : Configs.subalignmentPaths, "sequencesPath" : Configs.sequencesPath,
            "backbonePaths" : Configs.backbonePaths, "guideTree" : Configs.guideTree,
            "backboneWeightsPath": Configs.backboneWeightsPath}
    # NOTE: this local name shadows the imported `task` module within this
    # function only; createAlignmentTask resolves the module in its own scope.
    task = createAlignmentTask(args)
    task.submitTask()
    task.awaitTask()

def createAlignmentTask(args):
    # Wrap the argument dict in a Task keyed on the output file path.
    return task.Task(taskType = "runAlignmentTask", outputFile = args["outputFile"], taskArgs = args)

def runAlignmentTask(**kwargs):
    '''
    The standard MAGUS task:
    decompose the data into subsets, align each subset, and merge the subalignments.
    '''

    with AlignmentContext(**kwargs) as context:
        if context.sequencesPath is not None:
            Configs.log("Aligning sequences {}".format(context.sequencesPath))

        decomposeSequences(context)
        if Configs.onlyGuideTree:
            # Short-circuit: copy out the guide tree and skip alignment entirely.
            Configs.log("Outputting only the guide tree, as requested..")
            shutil.copyfile(os.path.join(context.workingDir, "decomposition", "initial_tree", "initial_tree.tre"), context.outputFile)
            return

        '''
        6.1.2021 - added by Chengze Shen
        a quick log for sanity check that the backbone weights path is correct
        '''
        if context.backboneWeightsPath:
            Configs.log("Backbone weights path is {}..".format(
                context.backboneWeightsPath))
        else:
            Configs.log("Heads up! All backbone alignments are treated equally..")

        alignSubsets(context)
        mergeSubalignments(context)

def alignSubsets(context):
    # Align every subset: small subsets go directly to MAFFT; large ones
    # recurse into a nested MAGUS alignment task.
    if len(context.subalignmentPaths) > 0:
        Configs.log("Subalignment paths already provided, skipping subalignments..")
        return

    Configs.log("Building {} subalignments..".format(len(context.subsetPaths)))
    subalignDir = os.path.join(context.workingDir, "subalignments")
    if not os.path.exists(subalignDir):
        os.makedirs(subalignDir)

    # Subsets at or below this size (or when recursion is disabled) are
    # aligned directly with MAFFT instead of recursing.
    mafftThreshold = max(Configs.mafftSize, Configs.decompositionMaxSubsetSize, Configs.recurseThreshold)

    for file in context.subsetPaths:
        subset = sequenceutils.readFromFasta(file)
        subalignmentPath = os.path.join(subalignDir, "subalignment_{}".format(os.path.basename(file)))
        context.subalignmentPaths.append(subalignmentPath)

        if os.path.exists(subalignmentPath):
            # Existing file: reuse it (supports resuming aborted runs).
            Configs.log("Existing subalignment file detected: {}".format(subalignmentPath))

        elif len(subset) <= mafftThreshold or not Configs.recurse:
            Configs.log("Subset has {}/{} sequences, aligning with MAFFT..".format(len(subset), mafftThreshold))
            subalignmentTask = external_tools.buildMafftAlignment(file, subalignmentPath)
            context.subalignmentTasks.append(subalignmentTask)

        else:
            Configs.log("Subset has {}/{} sequences, recursively subaligning with MAGUS..".format(len(subset), mafftThreshold))
            subalignmentDir = os.path.join(subalignDir, os.path.splitext(os.path.basename(subalignmentPath))[0])
            subalignmentTask = createAlignmentTask({"outputFile" : subalignmentPath, "workingDir" : subalignmentDir,
                                                    "sequencesPath" : file, "guideTree" : Configs.recurseGuideTree})
            context.subalignmentTasks.append(subalignmentTask)

    task.submitTasks(context.subalignmentTasks)
    Configs.log("Prepared {} subset alignment tasks..".format(len(context.subalignmentTasks)))
'''
Created on Dec 4, 2020

@author: Vlad
'''

import os
from helpers import sequenceutils
from tasks import task, manager
from configuration import Configs

'''
The AlignmentContext data structure maintains all information pertaining to a single MAGUS alignment.
The main thread keeps one context active at a time (to manage resources). Subalignments will spawn their
own contexts, which will become active when the parent context is in a blocking wait. The previously active
context resumes when the current alignment is completed.
'''

class AlignmentContext:

    def __init__(self, **kwargs):
        # Task parameters (typically overridden via kwargs below)
        self.outputFile = None
        self.workingDir = None
        self.sequencesPath = None
        self.subsetPaths = []
        self.subalignmentPaths = []
        self.backbonePaths = []
        self.guideTree = None

        # Derived state populated by the initialize* methods
        self.unalignedSequences = None
        #self.taxa = []
        self.subsets = []
        self.subalignments = []
        self.taxonSubsetMap = {}
        self.taxonSubalignmentMap = {}

        self.backboneTaxa = {}
        self.backboneExtend = set()
        self.backboneSubalignment = {}

        self.subalignmentTasks = []
        self.backboneTasks = []
        self.graph = None

        '''
        6.1.2021 - added by Chengze Shen
        a dict: key = backbone aln name, value = weight
        a path: the file path to define the dict
        '''
        self.backboneWeights = {}
        self.backboneWeightsPath = None

        # Any keyword argument overrides the corresponding attribute above.
        for attr in kwargs:
            vars(self)[attr] = kwargs.get(attr)

        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def awaitSubalignments(self):
        # Block until every subalignment task for this context has finished.
        task.awaitTasks(self.subalignmentTasks)

    def initializeSequences(self):
        # Load each subset file and build the taxon -> subset index mapping.
        self.unalignedSequences = {}
        for i, subsetPath in enumerate(self.subsetPaths):
            self.subsets.append([])
            subset = sequenceutils.readFromFastaOrdered(subsetPath, removeDashes=True)
            for sequence in subset:
                self.unalignedSequences[sequence.tag] = sequence
                self.taxonSubsetMap[sequence.tag] = i
                self.subsets[i].append(sequence.tag)

        if Configs.constrain:
            # Constrained mode: subalignments are exactly the subsets.
            self.subalignments = self.subsets
            self.taxonSubalignmentMap = self.taxonSubsetMap
        else:
            # Unconstrained mode: every taxon becomes its own singleton
            # "subalignment".
            for s in self.subsets:
                for taxon in s:
                    self.taxonSubalignmentMap[taxon] = len(self.subalignments)
                    self.subalignments.append([taxon])

    '''
    6.1.2021 - added by Chengze Shen
    a new function to initialize readings of weights from the
    backbone weights path (if such path exists)
    '''
    def initializeBackboneWeights(self):
        # No-op when no weights file was provided.
        if self.backboneWeightsPath:
            # the weights should be put in the following manner:
            # > each line denotes a weight (for a backbone)
            # > for each line, it should have the format: backbone path,weight
            with open(self.backboneWeightsPath, 'r') as f:
                # drop the trailing empty string after the final newline
                lines = f.read().split('\n')[:-1]
                for line in lines:
                    if line == '':
                        continue
                    w = [x.strip() for x in line.split(',')]
                    assert len(w) == 2
                    self.backboneWeights[w[0]] = float(w[1])
        else:
            return


    def initializeBackboneSequenceMapping(self):
        # Determine which taxa each subset contributes to the backbones.
        if len(self.backboneTaxa) == 0:
            # No explicit backbone taxa: every subset contributes all taxa.
            backboneSubsetTaxonMap = {i : subset for i, subset in enumerate(self.subsets)}
        else:
            backboneSubsetTaxonMap = {}
            for taxon in self.backboneTaxa:
                i = self.taxonSubsetMap[taxon]
                backboneSubsetTaxonMap[i] = backboneSubsetTaxonMap.get(i, [])
                backboneSubsetTaxonMap[i].append(taxon)

        if Configs.constrain:
            # Pull each backbone taxon's aligned sequence from its subalignment.
            for i, subalignPath in enumerate(self.subalignmentPaths):
                subalignment = sequenceutils.readFromFasta(subalignPath, removeDashes=False)
                for taxon in backboneSubsetTaxonMap.get(i, []):
                    self.backboneSubalignment[taxon] = subalignment[taxon]
        else:
            # Unconstrained mode uses the raw unaligned sequences directly.
            self.backboneSubalignment = self.unalignedSequences

    def __enter__(self):
        # Make this context the active one for the task manager.
        manager.TaskManager.contextStack.append(self)
        return self

    def __exit__(self, excType, excVal, excTb):
        # Restore the previously active context.
        manager.TaskManager.contextStack.pop()
Mafft\-homologs in a mafft server works like this: 24 | .sp 25 | .RS 4 26 | \h'-04' 1.\h'+02'Collect a number (50 by default) of close homologs (E=1e\-10 by default) of the input sequences. 27 | .RE 28 | .sp 29 | .RS 4 30 | \h'-04' 2.\h'+02'Align the input sequences and homologs all together using the L\-INS\-i strategy. 31 | .RE 32 | .sp 33 | .RS 4 34 | \h'-04' 3.\h'+02'Remove the homologs. 35 | .RE 36 | .RE 37 | .SH "OPTIONS" 38 | .RS 0 39 | .PP 40 | \fB\-a\fR \fI\fIn\fR\fR 41 | .RS 4 42 | The number of collected sequences (default: 50). 43 | .RE 44 | .PP 45 | \fB\-e\fR \fI\fIn\fR\fR 46 | .RS 4 47 | Threshold value (default: 1e\-10). 48 | .RE 49 | .PP 50 | \fB\-o\fR \fI\fIxxx\fR\fR 51 | .RS 4 52 | Options for mafft (default: " \-\-op 1.53 \-\-ep 0.123 \-\-maxiterate 1000 --localpair --reorder"). 53 | .RE 54 | .PP 55 | \fB\-l\fR 56 | .RS 4 57 | Locally carries out BLAST searches instead of NCBI BLAST (requires locally installed BLAST and a database). 58 | .RE 59 | .PP 60 | \fB\-f\fR 61 | .RS 4 62 | Outputs collected homologues also (default: off). 63 | .RE 64 | .PP 65 | \fB\-w\fR 66 | .RS 4 67 | entire sequences are subjected to BLAST search (default: well\-aligned region only) 68 | .RE 69 | .RE 70 | .SH "REQUIREMENTS" 71 | .RS 0 72 | .PP 73 | MAFFT version > 5.58. 74 | .PP 75 | Either of 76 | .RS 4 77 | .PP 78 | lynx (when remote BLAST server is used) 79 | .PP 80 | BLAST and a protein sequence database (when local BLAST is used) 81 | .RE 82 | .RE 83 | .SH "REFERENCES" 84 | .RS 0 85 | .PP 86 | Katoh, Kuma, Toh and Miyata (Nucleic Acids Res. 33:511\-518, 2005) MAFFT version 5: improvement in accuracy of multiple sequence alignment. 87 | .RE 88 | .SH "SEE ALSO" 89 | .RS 0 90 | .PP 91 | \fBmafft\fR(1) 92 | .RE 93 | .SH "AUTHORS" 94 | .RS 0 95 | .PP 96 | \fBKazutaka Katoh\fR <\&katoh_at_bioreg.kyushu\-u.ac.jp.\&> 97 | .sp -1n 98 | .IP "" 4 99 | Wrote Mafft. 
100 | .PP 101 | \fBCharles Plessy\fR <\&charles\-debian\-nospam@plessy.org\&> 102 | .sp -1n 103 | .IP "" 4 104 | Wrote this manpage in DocBook XML for the Debian distribution, using Mafft's homepage as a template. 105 | .RE 106 | .SH "COPYRIGHT" 107 | .RS 0 108 | Copyright \(co 2002\-2007 Kazutaka Katoh (mafft) 109 | .br 110 | Copyright \(co 2007 Charles Plessy (this manpage) 111 | .br 112 | .PP 113 | Mafft and its manpage are offered under the following conditions: 114 | .PP 115 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 116 | .sp 117 | .RS 4 118 | \h'-04' 1.\h'+02'Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 119 | .RE 120 | .sp 121 | .RS 4 122 | \h'-04' 2.\h'+02'Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 123 | .RE 124 | .sp 125 | .RS 4 126 | \h'-04' 3.\h'+02'The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 127 | .RE 128 | .PP 129 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
130 | .br 131 | .RE 132 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/README.md: -------------------------------------------------------------------------------- 1 | # MAGUS 2 | Multiple Sequence Alignment using Graph Clustering 3 | 4 | - - - - 5 | 6 | ## Purpose and Functionality 7 | MAGUS is a tool for piecewise large-scale multiple sequence alignment. 8 | The dataset is divided into subsets, which are independently aligned with a base method (currently MAFFT -linsi). These subalignments are merged together with the Graph Clustering Merger (GCM). GCM builds the final alignment by clustering an alignment graph, which is constructed from a set of backbone alignments. This process allows MAGUS to effectively boost MAFFT -linsi to over a million sequences. 9 | 10 | The basic procedure is outlined below. Steps 4-7 are GCM. 11 | 1. The input is a set of unaligned sequences. Alternatively, the user can provide a set of multiple sequence alignments and skip the next two steps. 12 | 2. The dataset is decomposed into subsets. 13 | 3. The subsets are aligned with MAFFT -linsi. 14 | 4. A set of backbone alignments are generated with MAFFT -linsi (or provided by the user). 15 | 5. The backbones are compiled into an alignment graph. 16 | 6. The graph is clustered with MCL. 17 | 7. The clusters are resolved into a final alignment. 18 | 19 | - - - - 20 | 21 | ## Dependencies 22 | MAGUS requires 23 | * Python 3 24 | * MAFFT (linux version is included) 25 | * MCL (linux version is included) 26 | * FastTree and Clustal Omega are needed if using these guide trees (linux versions included) 27 | 28 | If you would like to use some other version of MAFFT and/or MCL (for instance, if you're using Mac), 29 | you will need to edit the MAFFT/MCL paths in configuration.py 30 | (I'll pull these out into a separate config file to make it simpler). 
31 | 32 | - - - - 33 | 34 | ## Getting Started 35 | Please navigate your terminal to the "example" directory to get started with some sample data. 36 | A few basic ways of running MAGUS are shown below. 37 | Run "magus.py -h" to view the full list of arguments. 38 | 39 | **Align a set of unaligned sequences from scratch** 40 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -o magus_result.txt* 41 | 42 | *-o* specifies the output alignment path 43 | *-d* (optional) specifies the working directory for GCM's intermediate files, like the graph, clusters, log, etc. 44 | 45 | **Merge a prepared set of alignments** 46 | *python3 ../magus.py -d outputs -s subalignments -o magus_result.txt* 47 | 48 | *-s* specifies the directory with subalignment files. Alternatively, you can pass a list of file paths. 49 | 50 | - - - - 51 | 52 | ## Controlling the pipeline 53 | 54 | **Specify subset decomposition behavior** 55 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -t fasttree --maxnumsubsets 100 --maxsubsetsize 50 -o magus_result.txt* 56 | 57 | *-t* specifies the guide tree method to use, and is the main way to set the decomposition strategy. 58 | Available options are fasttree (default), parttree, clustal (recommended for very large datasets), and random. 59 | *--maxnumsubsets* sets the desired number of subsets to decompose into (default 25). 60 | *--maxsubsetsize* sets the threshold to stop decomposing subsets below this number (default 50). 61 | Decomposition proceeds until maxnumsubsets is reached OR all subsets are below maxsubsetsize. 62 | 63 | **Specify backbones for alignment graph** 64 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -r 10 -m 200 -o magus_result.txt* 65 | *python3 ../magus.py -d outputs -s subalignments -b backbones -o magus_result.txt* 66 | 67 | *-r* and *-m* specify the number of MAFFT backbones and their maximum size, respectively. They default to 10 and 200.
68 | Alternatively, the user can provide his own backbones; *-b* can be used to provide a directory or a list of files. 69 | 70 | **Specify graph trace method** 71 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt --graphtracemethod mwtgreedy -o magus_result.txt* 72 | 73 | *--graphtracemethod* is the flag that governs the graph trace method. Options are minclusters (default and recommended), fm, mwtgreedy (recommended for very large graphs), rg, or mwtsearch. 74 | 75 | **Unconstrained alignment** 76 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -c false -o magus_result.txt* 77 | 78 | By default, MAGUS constrains the merged alignment to induce all subalignments. This constraint can be disabled with *-c false*. 79 | This drastically slows MAGUS and is strongly not recommended above 200 sequences. 80 | 81 | - - - - 82 | 83 | ## Things to Keep in Mind 84 | 85 | * MAGUS will not overwrite existing backbone, graph and cluster files. 86 | Please delete them/specify a different working directory to perform a clean run. 87 | * Related issue: if MAGUS is stopped while running MAFFT, MAFFT's output backbone files will be empty. 88 | This will cause errors if MAGUS reruns and finds these empty files. 89 | * A large number of subalignments (>100) will start to significantly slow down the ordering phase, especially for very heterogenous data. 90 | I would generally disadvise using more than 100 subalignments, unless the data is expected to be well-behaved. 91 | 92 | - - - - 93 | 94 | ## Related Publications 95 | 96 | * Original MAGUS paper: ___Smirnov, V. and Warnow, T., 2020. MAGUS: Multiple Sequence Alignment using Graph Clustering. Bioinformatics. 
https://doi.org/10.1093/bioinformatics/btaa992___ 97 | * GCM-MWT paper: 98 | * MAGUS on ultra-large datasets: 99 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | WITCH v1.0.10 2 | ------------- 3 | 1. Added an adaptive schematic for inclusion of top HMMs for aligning each 4 | query sequence, when using adjusted bitscores (``--use-weights 1``). 5 | Previously, all top `k` HMMs will be used. Now, WITCH includes up to `k` 6 | HMMs, or until the sum of weights exceeds 0.999. This should keep the 7 | core design of WITCH, but this has not been tested with data yet. 8 | 2. Added type check and cast for various user-defined configs in the 9 | configuration file before passing them to the main process. 10 | 11 | WITCH v1.0.9 12 | ------------ 13 | 1. Fixed the issue with feeding FastTree2 with gzipped alignment file for 14 | tree estimation. Now use piping to pipe the alignment file (gzipped or 15 | not) as stdin to the FastTree2 executable. 16 | 2. Enforced the start method for multiprocessing on macOS to be ``fork``. 17 | This ensures WITCH usability on a macOS environment. 18 | 3. Changed the default invocation of example codes from ``python3`` to 19 | just ``python``. 20 | 21 | WITCH v1.0.8 22 | ------------ 23 | 1. Supported reading gzipped alignment file. That is, when the user supplies 24 | with their own backbone alignment and adds some other query sequences). 25 | E.g., ``backbone.fasta.gz`` or ``backbone.fa.gzip``, etc. 26 | 27 | WITCH v1.0.7 28 | ------------ 29 | 1. Added example usages to ``witch.py --help``. Also changed the default 30 | formatter from a custom one to ``argparse.RawDescriptionHelpFormatter``. 31 | 2. Now by default will bypass the initial WITCH setup step. Previously, this 32 | was achieved by giving WITCH the parameter ``-y`` (``--bypass-setup``). 33 | 3. 
Changed the default behavior of ``examples/run.sh`` to running scenario D. 34 | 4. Fixed the runtime error when home.path exists but the pointed main.config is 35 | missing. Now will regenerate ``main.config`` at the pointed location. 36 | 5. Fixed ``--bypass-setup`` not working as True by default. Now will always 37 | create the config path at ``~/.witch/main.config``. 38 | 6. Changed the default filename for the masked alignment output. Previously, 39 | it will be named as ``.fasta.masked``, as ```` supplied by the user 40 | or default to ``aligned``. Now will be written as ``.masked.fasta``. 41 | If the user gives ``-o .fa`` or ``-o .fasta``, the masked alignment 42 | will use the corresponding suffix (e.g., ``.masked.fa``). 43 | 44 | WITCH v1.0.6 45 | ------------ 46 | 1. Added Software Output Explanation to the README to avoid confusion on what 47 | alignment file to use for downstream analyses. 48 | 49 | WITCH v1.0.5 50 | ------------ 51 | 1. Added compatibility to Dendropy with version >4.5.2 and removed its 52 | requirement from requirements.txt for pip. 53 | 54 | WITCH v1.0.5b 55 | ------------- 56 | 1. Added a new parameter option allowing users to specify a customized config 57 | file to override the default ``main.config`` (usually can be found at 58 | ``~/.witch_msa/main.config``). Use ``-c `` to do so. 59 | The priority for arguments: ``commandline > user.config > main.config``. 60 | 61 | WITCH v1.0.5a 62 | ------------- 63 | 1. Added two sanity checks to HMMBuild and HMMSearch jobs: making sure all 64 | files are created correctly before proceeding. 65 | 2. Added a file number check utility function using the ``inspect`` package. 66 | 67 | WITCH v1.0.4 68 | ------------ 69 | 1. Added an additional parameter option to set an upper bound to the HMM 70 | subsets created (``-Z``, complementary to ``-A`` which is for lower bound), 71 | based on the number of sequences in a subset. 72 | 2. Changed the behavior for creating HMM subsets. 
Instead of reading in the 73 | backbone alignment at once, WITCH now reads line by line to avoid large 74 | memory consumption if the backbone is very large.
First working release across different platform and different python 112 | versions. 113 | -------------------------------------------------------------------------------- /witch_msa/gcmm/task.py: -------------------------------------------------------------------------------- 1 | import time 2 | from witch_msa.configs import Configs, tqdm_styles 3 | from tqdm import tqdm 4 | import itertools 5 | 6 | import concurrent.futures 7 | 8 | ''' 9 | A class defining a generic task object that will be used for job submission 10 | ''' 11 | class MyTask(object): 12 | # (required) the list of parameters used in the task 13 | # (optional) parent(s) of the task 14 | # (i.e., the other tasks that depends on this task) 15 | # (optional) children of this task 16 | # (i.e., the other tasks that this task depends on) 17 | def __init__(self, *args, **kwargs): 18 | self.args = tuple(*args) 19 | 20 | _valid = ['parent', 'children'] 21 | for k, v in kwargs.items(): 22 | if k in _valid: 23 | if isinstance(v, MyTask): 24 | setattr(self, k, [v]) 25 | elif isinstance(v, list): 26 | setattr(self, k, v) 27 | else: 28 | raise TypeError(type(v)) 29 | 30 | def get_args(self): 31 | return self.args 32 | 33 | # currently unused 34 | def get_parent(self): 35 | if 'parent' in self.__dict__: 36 | return self.parent 37 | else: 38 | return None 39 | 40 | # currently unused 41 | def get_children(self): 42 | if 'children' in self.__dict__: 43 | return self.children 44 | else: 45 | return None 46 | 47 | ''' 48 | Helper function to convert a list of lists of arguments to a list of MyTask objects 49 | Assumption: all elements of args are of the same length 50 | Return: a generator of MyTask 51 | ''' 52 | def getTasks(*args): 53 | total_length = len(args[0]) 54 | for i in range(total_length): 55 | _args = [x[i] for x in args] 56 | yield MyTask(_args) 57 | 58 | ''' 59 | Variant of getTasks to use given index positions to select 60 | Also append the index to each yielded element at back 61 | ''' 62 | def 
getTasksWithIndexes(indexes, *args): 63 | for i in indexes: 64 | _args = [x[i] for x in args] + [i] 65 | yield MyTask(_args) 66 | 67 | ''' 68 | Helper function to handle a single future object with any return values. 69 | Run additional callbacks with the return values and the additional callback 70 | arguments supplemented. 71 | Return: the runtime to run the additional callback 72 | ''' 73 | def handleFuture(future, success, ignored, retry, i_retry, 74 | callback_func, callback_args): 75 | s1 = time.time() 76 | ret = future.result() 77 | 78 | # first four fields of any callbacks will be: success=, 79 | # ignored=, retry=, i_retry= 80 | if callback_func: 81 | callback_func(success, ignored, retry, i_retry, 82 | *ret, *callback_args) 83 | return time.time() - s1 84 | else: 85 | # default behavior: attach ret to success 86 | success.append(ret) 87 | return 0. 88 | 89 | ''' 90 | Helper function to run tasks defined by a list of MyTask objects 91 | Required: the function that defines the task 92 | the process pool to submit to 93 | a generator of MyTask objects 94 | the number of MyTask objects 95 | Optional: max_concurrent_jobs= # default will submit all tasks at once 96 | i_retry= # number of retry for failed tasks 97 | callback_func= # callback to run after a future is handled 98 | should take future return values as the 99 | first set of arguments 100 | callback_args=<*args> # additional arguments for the callback 101 | Return: success, ignored, retry 102 | total runtime (seconds) for handling/running the tasks 103 | ''' 104 | def runTasks(func, pool, mytasks, num_tasks, **kwargs): 105 | handle_runtime = 0. 
106 | max_concurrent_jobs = kwargs.get('max_concurrent_jobs', num_tasks) 107 | i_retry = kwargs.get('i_retry', 0) 108 | callback_func = kwargs.get('callback_func', None) 109 | callback_args = kwargs.get('callback_args', []) 110 | 111 | success, ignored, retry = [], [], [] 112 | with tqdm(total=num_tasks, **tqdm_styles) as pbar: 113 | futures = { 114 | #pool.submit(func, *task.get_args()): task.get_id() 115 | pool.submit(func, *task.get_args()): task.get_parent() 116 | for task in itertools.islice(mytasks, max_concurrent_jobs) 117 | } 118 | while futures: 119 | # wait for the next future to complete 120 | done, _ = concurrent.futures.wait( 121 | futures, return_when=concurrent.futures.FIRST_COMPLETED) 122 | 123 | for future in done: 124 | # depending on kwargs, allow re-adding some failed tasks back 125 | # to queue 126 | handle_runtime += handleFuture(future, success, ignored, retry, 127 | i_retry, callback_func, callback_args) 128 | _ = futures.pop(future) 129 | pbar.update(len(done)) 130 | 131 | # schedule the next batch of tasks, no more than the number of tasks 132 | # that just finished 133 | for task in itertools.islice(mytasks, len(done)): 134 | future = pool.submit(func, *task.get_args()) 135 | futures[future] = task.get_parent() 136 | return success, ignored, retry, handle_runtime 137 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/decompose/initial_tree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 29, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | import os 8 | import shutil 9 | import time 10 | import random 11 | 12 | from align.decompose import decomposer 13 | from helpers import sequenceutils, hmmutils, treeutils 14 | from tasks import task 15 | from tools import external_tools 16 | from configuration import Configs 17 | 18 | ''' 19 | Different options for estimating a guide tree. 
def buildInitialTree(context, workingDir, treeType):
    '''
    11.16.2023 - modified by Chengze Shen
    Since the guide tree type and guide tree path are using the same arg,
    there is a chance that even user did not specify the type (default to
    "fasttree"), the code below will try to find if a file named "fasttree"
    exist in the directory where the script is run.
    Hence, if a user accidentally has a file/directory named "fasttree",
    it will be read and used as the guide tree instead of creating an initial
    tree with FastTree2.

    Modification: will not check for path existence if treeType is among
    [fasttree, fasttree-noml, parttree, clustal]
    '''
    # recognized tree builders; anything else is treated as a path to a
    # user-supplied guide tree file
    default_styles = ['fasttree', 'fasttree-noml', 'parttree', 'clustal']
    #if treeType is not None and os.path.exists(treeType):
    if treeType is not None:
        if treeType.lower() in default_styles:
            pass
        elif os.path.exists(treeType):
            # treeType is a path to an existing user guide tree; use it as-is
            Configs.log("Found user guide tree {}".format(treeType))
            return treeType
        # otherwise fall through: the dispatch below raises for an
        # unrecognized value that is not a file
    else:
        # by default use fasttree
        treeType = "fasttree"

    tempDir = os.path.join(workingDir, "initial_tree")
    outputTreePath = os.path.join(tempDir, "initial_tree.tre")
    if os.path.exists(outputTreePath):
        # reuse the tree left over from a previous run
        Configs.log("Found existing initial tree {}".format(outputTreePath))
        return outputTreePath
    if os.path.exists(tempDir):
        # start from a clean working directory
        shutil.rmtree(tempDir)
    os.makedirs(tempDir)

    time1 = time.time()

    if treeType.lower() == "fasttree":
        # PASTA-style: MAFFT-align a skeleton subset, add remaining sequences
        # via HMM, then run FastTree in "fast" mode on the result
        Configs.log("Building PASTA-style FastTree initial tree on {} with skeleton size {}..".format(context.sequencesPath, Configs.decompositionSkeletonSize))
        alignPath = os.path.join(tempDir, "initial_align.txt")
        buildInitialAlignment(context.unalignedSequences, tempDir, Configs.decompositionSkeletonSize, None, alignPath)
        external_tools.runFastTree(alignPath, tempDir, outputTreePath, "fast").run()
    elif treeType.lower() == "fasttree-noml":
        # same as above, but FastTree skips ML refinement ("noml")
        Configs.log("Building PASTA-style FastTree (NO ML) initial tree on {} with skeleton size {}..".format(context.sequencesPath, Configs.decompositionSkeletonSize))
        alignPath = os.path.join(tempDir, "initial_align.txt")
        buildInitialAlignment(context.unalignedSequences, tempDir, Configs.decompositionSkeletonSize, None, alignPath)
        external_tools.runFastTree(alignPath, tempDir, outputTreePath, "noml").run()
    elif treeType.lower() == "parttree":
        Configs.log("Building MAFFT PartTree initial tree on {}..".format(context.sequencesPath))
        taxa = list(context.unalignedSequences.keys())
        external_tools.runMafftGuideTree(context.sequencesPath, tempDir, outputTreePath, Configs.numCores).run()
        # MAFFT's guide tree labels are converted back to taxon names here
        treeutils.convertMafftGuideTree(outputTreePath, taxa)
    elif treeType.lower() == "clustal":
        Configs.log("Building Clustal Omega initial tree on {}..".format(context.sequencesPath))
        external_tools.runClustalOmegaGuideTree(context.sequencesPath, tempDir, outputTreePath, Configs.numCores).run()
    else:
        raise Exception("Guide tree {} not a file and not recognized..".format(treeType))

    time2 = time.time()
    Configs.log("Built initial tree on {} in {} sec..".format(context.sequencesPath, time2-time1))

    return outputTreePath

def buildInitialAlignment(sequences, tempDir, skeletonSize, initialAlignSize, outputAlignPath):
    # MAFFT-align a random "skeleton" subset of the sequences, then align up
    # to (initialAlignSize - skeletonSize) additional sequences against it
    # with hmmalign; the merged alignment is written to outputAlignPath.
    skeletonPath = os.path.join(tempDir, "skeleton_sequences.txt")
    queriesPath = os.path.join(tempDir, "queries.txt")
    hmmDir = os.path.join(tempDir, "skeleton_hmm")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    initialInsertPath = os.path.join(tempDir, "initial_insert_align.txt")
    if not os.path.exists(hmmDir):
        os.makedirs(hmmDir)

    # None means "use all sequences"; also clamp to the dataset size
    if initialAlignSize is None or initialAlignSize > len(sequences):
        initialAlignSize = len(sequences)

    skeletonTaxa, remainingTaxa = decomposer.chooseSkeletonTaxa(sequences, skeletonSize)
    additional = initialAlignSize-skeletonSize
    random.shuffle(remainingTaxa)
    remainingTaxa, unusedTaxa = remainingTaxa[:additional], remainingTaxa[additional:]

    sequenceutils.writeFasta(sequences, skeletonPath, skeletonTaxa)
    external_tools.runMafft(skeletonPath, None, tempDir, outputAlignPath, Configs.numCores).run()

    if len(remainingTaxa) > 0:
        sequenceutils.writeFasta(sequences, queriesPath, remainingTaxa)
        hmmutils.buildHmmOverAlignment(outputAlignPath, hmmPath).run()
        hmmTasks = hmmutils.hmmAlignQueries(hmmPath, queriesPath)
        task.submitTasks(hmmTasks)
        for hmmTask in task.asCompleted(hmmTasks):
            # fold each finished query chunk into the skeleton alignment
            hmmutils.mergeHmmAlignments([hmmTask.outputFile], outputAlignPath, includeInsertions=False)
            if Configs.graphBuildMethod == "initial":
                hmmutils.mergeHmmAlignments([hmmTask.outputFile], initialInsertPath, includeInsertions=True)
24 | 25 | - The output file can be specified using the -o option. This is 26 | optional. The file that the output is written to is printed in 27 | the stdout output of the program. 28 | 29 | - The granularity of the output clustering can be controlled 30 | using the '-c' option. This option specifies how small the 31 | graph can get before the coarsening in MLR-MCL stops. For 32 | example, if mlrmcl is run with '-c 1000', the graph is 33 | coarsened until it has no more than 1000 vertices. 34 | The default value for this option is 1000. If c is the same as 35 | the number of vertices in the graph, then no coarsening will 36 | take place at all and this is the same as R-MCL. The smaller 37 | the value of this option, the fewer clusters are output by the 38 | program. Note that if the cluster structure is especially 39 | clear (such as for synthetic graphs), 40 | the program will simply output the same clustering 41 | regardless of the parameter value. 42 | 43 | - The balance (i.e. the variance in output cluster sizes) can be 44 | controlled using the '-b' option. The default value of 0.5 45 | should be good enough in most cases. If you find that the 46 | output clustering is too balanced, you can try lower values for 47 | b (until 0), or if it is too imbalanced, you can try higher 48 | values for b such as 0.75 or 1. 49 | 50 | - The inflation parameter is specified using the '-i option' and 51 | can also be used to control the granularity of the clustering. 52 | (In the case when no coarsening is performed, i.e. for R-MCL, 53 | it is the only way to control the number of clusters.) Higher 54 | values of 'i' lead to more clusters, and the clustering also 55 | converges faster. The default is 2.0. 56 | 57 | Usage and Options for mergeClusters: 58 | ----------------------------------- 59 | 60 | - The mergeClusters program performs hierarchical agglomerative 61 | clustering. 
It may be used in situations where the user needs 62 | to exactly control the number of output clusters. First, one 63 | can run MLR-MCL, setting the options such that more clusters 64 | than required are output by the program. Subsequently one may 65 | run the mergeClusters program, specifying the number of merges 66 | to be the same as the number of additional clusters that were 67 | output by MLR-MCL. 68 | 69 | - The exact usage for mergeClusters can be seen by executing the 70 | program without any arguments. 71 | 72 | Input format: 73 | ------------ 74 | 75 | - The input format is the same as that for Metis and Graclus. A 76 | pdf document explaining this format is available at 77 | http://glaros.dtc.umn.edu/gkhome/fetch/sw/metis/manual.pdf 78 | This pdf is also present inside the Metis distribution. For 79 | convenience, we have also included a copy of this manual under 80 | the name metis.4.0.manual.pdf. 81 | 82 | Output format: 83 | ------------- 84 | 85 | - The output format is also the same as that for Metis and 86 | Graclus. Each line contains the cluster index to which the 87 | node of the corresponding line number has been assigned. (For 88 | example, if line 20 is '4', that means that the node 20 has 89 | been assigned to the cluster 4.) 90 | 91 | Examples: 92 | --------- 93 | - The 'examples' folder has two graphs: synthetic.graph, which 94 | is a synthetic graph of 1000 nodes, generated with 25 clusters 95 | and astro.graph, which is a collaboration network of astro physics 96 | researchers (see http://snap.stanford.edu). 
97 | The following are some example usages 98 | (assuming the software is compiled, and we are in the example 99 | directory): 100 | 101 | - ../mlrmcl -o synthetic.graph.out synthetic.graph 102 | - ../mlrmcl -o astro.graph.out astro.graph 103 | - ../mlrmcl -c 500 -o astro.graph.c500.out astro.graph 104 | - ../mlrmcl -c 2000 -o astro.graph.c2000.out astro.graph 105 | - ../mlrmcl -c 2000 -b 0.25 -o astro.graph.c2000.b0.25.out astro.graph 106 | - ../mlrmcl -c 20000 -i 1.8 -b 0.25 -o astro.graph.c20000.i1.8.b0.25.out astro.graph 107 | (The last example above does not perform any coarsening, since 108 | the c value 20000 is more than the number of vertices in the 109 | graph, which is 17903.) 110 | 111 | - ../mergeClusters -e astro.graph.c500.out -n 10 -o astro.graph.c500.10merges.out astro.graph 112 | (If astro.graph.c500.out represents the clustering of 113 | astro.graph into x clusters, then 114 | astro.graph.c500.10.merges.out is a clustering of x-10 115 | clusters.) 116 | 117 | 118 | References: 119 | ---------- 120 | - Venu Satuluri and Srinivasan Parthasarathy. "Scalable Graph 121 | Clustering using Stochastic Flows: Applications to Community 122 | Discovery." Proceedings of ACM SIGKDD 2009, Paris. 123 | 124 | - Venu Satuluri, Srinivasan Parthasarathy and Dugyu Ucar. "Markov 125 | Clustering of Protein Interaction Networks with Improved 126 | Balance and Scalablity". Proceedings of ACM BCB 2010, Niagara 127 | Falls. 128 | 129 | Acknowledgments: 130 | --------------- 131 | I am very grateful to the authors of Metis and Graclus for releasing 132 | the source of their softwares, as this has enabled me to 133 | implement my own software much faster than would have been 134 | possible otherwise. 
'''
Created on May 28, 2020

@author: Vlad
'''

import re
import os
import math
from tools import external_tools
from configuration import Configs
from helpers import sequenceutils

def buildHmmScores(hmmPaths, queriesPath, scoreFileHmmFileMap):
    """Prepare hmmsearch tasks scoring every query chunk against every HMM.

    The queries are split into chunks of at most 1000 sequences; each chunk
    is written under a per-query "chunks_<name>" directory, and one search
    task is produced per (HMM, chunk) pair.  scoreFileHmmFileMap is filled
    in place, mapping each score-file path back to its HMM path.
    Returns the list of (unsubmitted) search tasks.
    """
    queries = sequenceutils.readFromFasta(queriesPath, removeDashes = True)
    stem = os.path.basename(queriesPath).split('.')[0]
    chunkDir = os.path.join(os.path.dirname(queriesPath), "chunks_{}".format(stem))
    if not os.path.exists(chunkDir):
        os.makedirs(chunkDir)

    chunkSize = 1000

    names = list(queries.keys())
    triples = []
    for idx in range(math.ceil(len(names) / chunkSize)):
        block = names[idx*chunkSize : min(len(names), (idx+1)*chunkSize)]
        chunkPath = os.path.join(chunkDir, "{}_chunk_{}.txt".format(stem, idx+1))
        sequenceutils.writeFasta(queries, chunkPath, block)
        for hmmPath in hmmPaths:
            scorePath = os.path.join(
                os.path.dirname(hmmPath),
                "{}_chunk_{}_score.txt".format(stem, idx+1))
            triples.append((hmmPath, chunkPath, scorePath))
            scoreFileHmmFileMap[scorePath] = hmmPath

    return [getHmmScores(h, q, s) for h, q, s in triples]

def getHmmScores(hmmPath, queriesPath, scorePath):
    """Create a single hmmsearch task; its results land in scorePath."""
    return external_tools.runHmmSearch(
        hmmPath, queriesPath, os.path.dirname(hmmPath), scorePath)

def readHmmScores(searchFiles):
    """Map each search file to {taxon : bit score} parsed from that file."""
    return {
        path : {taxon : scores[1]
                for taxon, scores in readSearchFile(path).items()}
        for path in searchFiles
    }

def buildHmms(sequencesHmmsPathsMap):
    """One hmmbuild task per (alignment path -> HMM path) entry."""
    return [buildHmmOverAlignment(seqPath, hmmPath)
            for seqPath, hmmPath in sequencesHmmsPathsMap.items()]

def buildHmmOverAlignment(sequencePath, hmmPath):
    """Create a task that builds an HMM from the alignment at sequencePath."""
    return external_tools.runHmmBuild(
        sequencePath, os.path.dirname(hmmPath), hmmPath)

def combineHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Union all Stockholm alignments in memory, then write one FASTA file."""
    merged = {}
    for path in alignFiles:
        merged.update(sequenceutils.readFromStockholm(path, includeInsertions))
    sequenceutils.writeFasta(merged, outputAlignmentPath, None)

def mergeHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Append each Stockholm alignment to the output FASTA, file by file."""
    for path in alignFiles:
        block = sequenceutils.readFromStockholm(path, includeInsertions)
        sequenceutils.writeFasta(block, outputAlignmentPath, None, True)
def buildHmmAlignment(hmmPath, queriesPath, outputAlignmentPath):
    """Create (but do not run) an hmmalign task that aligns the queries in
    queriesPath against the HMM at hmmPath, writing outputAlignmentPath."""
    return external_tools.runHmmAlign(
        hmmPath, queriesPath, os.path.dirname(hmmPath), outputAlignmentPath)

#from PASTA repo
def readSearchFile(searchFilePath):
    """Parse an hmmsearch per-sequence score table.

    Only the first score block (between the "E-value ..." column header and
    the next blank line) is read.  Returns a dict mapping sequence name to
    a (full-sequence E-value, bit score) tuple; separator rows containing
    "--" are skipped.
    """
    hit_pattern = re.compile(r"(\S+)\s+" * 8 + r"(\S+)")
    results = {}
    in_table = False
    with open(searchFilePath, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not in_table:
                # skip everything until the column-header row
                if line.startswith("E-value"):
                    in_table = True
                continue
            if line == "":
                # a blank line terminates the score block
                break
            match = hit_pattern.search(line)
            if match is None or "--" in match.group(0):
                continue
            # columns: full-seq E-value (1), bit score (2), ..., name (9)
            results[match.group(9)] = (
                float(match.group(1)), float(match.group(2)))
    return results
def buildHeap(graph, nodeClusters, weightMap, lowerBound, upperBound):
    # Build a max-heap (negated weights, since heapq is a min-heap) of all
    # support edges between nodes of *different* subalignments lying inside
    # the [lowerBound, upperBound) window of each subalignment.
    # Side effect: weightMap[i][j] is primed with the edge weight between
    # the two endpoints' singleton clusters.
    heap = []
    for s in range(len(lowerBound)):
        for a in range(lowerBound[s], upperBound[s]):
            asub, apos = graph.matSubPosMap[a]
            i = nodeClusters[a]
            for b, value in graph.matrix[a].items():
                bsub, bpos = graph.matSubPosMap[b]
                # keep each undirected edge once (b > a), never join columns
                # of the same subalignment, and stay inside the window
                if b <= a or asub == bsub or b < lowerBound[bsub] or b >= upperBound[bsub]:
                    continue
                j = nodeClusters[b]
                weightMap[j][i] = value
                weightMap[i][j] = value
                heapq.heappush(heap, (-1 * value, a, b))

    return heap
    #baseIdx = max(range(k), key = lambda x : upperBound[x] - lowerBound[x])
    #baseLength = upperBound[baseIdx] - lowerBound[baseIdx]

def crunchHeap(graph, heap, clusters, nodeClusters, clusterPos, clusterPointers, weightMap, cantConnects, absorbed, enforceTrace):
    # Kruskal-like merging: repeatedly pop the heaviest inter-cluster edge
    # and merge its endpoint clusters (j absorbed into i) when the merge is
    # legal per checkConnect.  Pairs proven unmergeable are memoized in
    # cantConnects.  Heap entries can be stale (a fresh entry is pushed
    # whenever an accumulated weight changes), which is why endpoints are
    # re-resolved through nodeClusters on every pop.
    while len(heap) > 0:
        value, a, b = heapq.heappop(heap)
        i, j = nodeClusters[a], nodeClusters[b]
        if i == j or orderPair(i,j) in cantConnects:
            continue

        if not checkConnect(graph, i, j, clusters, clusterPos, enforceTrace):
            cantConnects.add(orderPair(i,j))
            continue

        # absorb cluster j into cluster i: move members, update each
        # member's cluster id and i's per-subalignment position map
        absorbed.add(j)
        for e in clusters[j]:
            nodeClusters[e] = i
            clusters[i].append(e)
            asub, apos = graph.matSubPosMap[e]
            clusterPos[i][asub] = e
        clusters[j] = []

        if enforceTrace:
            # splice i into the doubly-linked per-subalignment pointer
            # chains in place of j
            for s in clusterPointers[j]:
                prev, nxt = clusterPointers[j][s]
                if prev is not None:
                    clusterPointers[prev][s] = (clusterPointers[prev][s][0], i)
                if nxt is not None:
                    clusterPointers[nxt][s] = (i, clusterPointers[nxt][s][1])
                clusterPointers[i][s] = (prev, nxt)

            updateMergePointers(graph, i, clusterPointers, clusters, clusterPos)

        #print("Clusters left: {}".format(len(clusters) - len(absorbed)))
        # fold j's accumulated edge weights into i and requeue the merged
        # totals (one representative node per cluster identifies the edge)
        for n in weightMap[j]:
            if n in absorbed:
                continue
            weightMap[i][n] = weightMap[i].get(n, 0) + weightMap[j][n]
            weightMap[n][i] = weightMap[i][n]
            heapq.heappush(heap, (-1 * weightMap[i][n], clusters[i][0], clusters[n][0]))

def updateMergePointers(graph, i, clusterPointers, clusters, clusterPos):
    # After a merge, walk backwards through the predecessor pointers from
    # cluster i and, for each subalignment s that i now spans, pull down any
    # recorded position bound larger than i's — this appears to keep the
    # ordering bounds used by checkConnect consistent (NOTE(review):
    # confirm intent against the tracer's ordering invariant).
    subsets = [graph.matSubPosMap[a][0] for a in clusters[i]]

    for s in subsets:
        queue = deque([i])
        visited = set([i])

        while len(queue) > 0:
            curNode = queue.popleft()

            if clusterPos[curNode].get(s, float('inf')) > clusterPos[i][s] or curNode == i:
                clusterPos[curNode][s] = clusterPos[i][s]

            for p in clusterPointers[curNode]:
                prv, nxt = clusterPointers[curNode][p]
                if prv not in visited and prv is not None:
                    queue.append(prv)
                    visited.add(prv)

def checkConnect(graph, i, j, clusters, clusterPos, enforceTrace):
    # A legal cluster may contain at most one column per subalignment, so
    # two clusters sharing any subalignment can never merge.
    ci , cj = set([graph.matSubPosMap[a][0] for a in clusters[i]]), set([graph.matSubPosMap[a][0] for a in clusters[j]])
    for s in ci:
        if s in cj:
            return False

    if not enforceTrace:
        return True

    # trace mode: the merge must also respect column ordering — neither
    # cluster may sit at-or-before the other's recorded position in any
    # subalignment either of them touches
    for s in ci:
        if clusterPos[j].get(s, float('inf')) <= clusterPos[i][s]:
            return False
    for s in cj:
        if clusterPos[i].get(s, float('inf')) <= clusterPos[j][s]:
            return False
    return True
163 | i = nodeClusters[idx] 164 | for b in clusters[i]: 165 | bsub, bpos = graph.matSubPosMap[b] 166 | if b > frontier[bsub]: 167 | #print(bsub, b, frontier[bsub]) 168 | good = False 169 | break 170 | 171 | if good: 172 | orderedClusters.append(clusters[i]) 173 | for b in clusters[i]: 174 | bsub, bpos = graph.matSubPosMap[b] 175 | frontier[bsub] = b + 1 176 | foundGood = True 177 | break 178 | if not foundGood: 179 | break 180 | return orderedClusters 181 | 182 | def orderPair(a, b): 183 | return (min(a, b), max(a, b)) -------------------------------------------------------------------------------- /witch_msa/gcmm/weighting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 10.28.2021 by Chengze Shen 3 | 4 | Bitscore to weight calculation. 5 | ''' 6 | 7 | import os 8 | import time 9 | import numpy as np 10 | import concurrent.futures 11 | from witch_msa.configs import Configs, tqdm_styles 12 | from tqdm import tqdm 13 | 14 | class Weights(object): 15 | weights = dict() 16 | weights_map = dict() 17 | ranked_bitscores = dict() 18 | def __init__(self): 19 | pass 20 | 21 | ''' 22 | Function to read in weights from local given the taxon name 23 | ''' 24 | def readWeights(taxon): 25 | infile = Configs.outdir + '/weights/w_{}.txt'.format(taxon) 26 | if not os.path.isfile(infile): 27 | return None, None 28 | else: 29 | weights, weights_map = [], [] 30 | with open(infile, 'r') as f: 31 | line = f.read() 32 | taxon, raw = line.split(':') 33 | weights = [eval(x) for x in raw.split(';')] 34 | weights_map = {ind: w for (ind, w) in weights} 35 | return weights, weights_map 36 | 37 | ''' 38 | Function to read in bitscores from local given the taxon name 39 | ''' 40 | def readBitscores(taxon): 41 | infile = Configs.outdir + '/bitscores/b_{}.txt'.format(taxon) 42 | if not os.path.isfile(infile): 43 | return None, None 44 | else: 45 | bitscores = [] 46 | with open(infile, 'r') as f: 47 | line = f.read() 48 | taxon, raw = 
'''
Function to calculate the HMM weighting, given the bitscores and sizes
of the HMMs (for a given query taxon)
inputs: ensemble of HMMs H (with their bitscores and sizes)
outputs: weights for HMMs H
'''
def calculateWeights(packed_data):
    # packed_data: (taxon, indexes, bitscores, sizes) — parallel lists over
    # the HMMs that hit this query taxon.
    taxon, indexes, bitscores, sizes = packed_data
    weights = {}

    assert len(indexes) == len(bitscores) == len(sizes)
    for i in range(len(bitscores)):
        score_i, size_i = bitscores[i], sizes[i]
        # weight_i = 2^{b_i} / s_i normalized over all HMMs; computed with
        # exponents relative to score_i so the ratio is numerically stable.
        exponents = np.array(bitscores) - score_i \
                + np.log2(np.array(sizes) / size_i)
        denominator = np.sum(np.power(2, exponents))
        weights[indexes[i]] = 1. / denominator

    # retain only the top `num_hmms` weighted HMMs per query
    num_to_retain = min(Configs.num_hmms, len(weights))
    sorted_weights = sorted(weights.items(),
            key = lambda x: x[1], reverse=True)[:num_to_retain]
    return {taxon: tuple(sorted_weights)}

'''
Function to write a single taxon with its ranked bitscore to local
'''
def writeQueryBitscores(packed_data):
    # packed_data: (taxon, sorted_scores); emits "taxon:score;score;...".
    taxon, sorted_scores = packed_data
    str_sorted_scores = [str(x) for x in sorted_scores]

    with open(Configs.outdir + '/bitscores/b_{}.txt'.format(taxon), 'w') as f:
        f.write(taxon + ':' + ';'.join(str_sorted_scores) + '\n')
    return None

'''
Write bitscores to local (the same way as we write weights)
'''
def writeBitscores(ranked_bitscores, pool):
    # NOTE: the actual write-to-disk step is disabled; this now only trims
    # each taxon's ranked scores to the top `num_hmms` entries in memory.
    s2 = time.time()
    Configs.warning('Starting to load bitscores...')

    taxon_to_bitscores = {}
    for taxon, sorted_scores in ranked_bitscores.items():
        num_to_retain = min(Configs.num_hmms, len(sorted_scores))
        taxon_to_bitscores[taxon] = tuple(sorted_scores[:num_to_retain])

    time_write_scores = time.time() - s2
    Configs.warning('Finished loading bitscores in memory.')
    Configs.runtime(' '.join(['(writeBitscores) Time to write ranked bitscores',
            'to local (s):', str(time_write_scores)]))
    return taxon_to_bitscores

'''
Obtain and write weights to local based on bitscores
'''
def writeWeights(index_to_hmm, ranked_bitscores, pool):
    # Fan calculateWeights out over `pool` (one task per query taxon) and
    # gather the per-taxon results into a single {taxon: weights} dict.
    s2 = time.time()
    Configs.warning('Starting to calculate weights...')

    # sizes (number of taxa) of each HMM, keyed by HMM index
    all_sizes = {}
    for index, subset in index_to_hmm.items():
        all_sizes[index] = subset.num_taxa

    args = []
    for taxon, sorted_scores in ranked_bitscores.items():
        indexes = [x[0] for x in sorted_scores]
        bitscores = [x[1] for x in sorted_scores]
        sizes = [all_sizes[x] for x in indexes]
        args.append((taxon, indexes, bitscores, sizes))

    all_taxon_to_weights, futures = [], []
    for arg in args:
        futures.append(pool.submit(calculateWeights, arg))
    # progress bar over completed futures
    for future in tqdm(
            concurrent.futures.as_completed(futures),
            total=len(args), **tqdm_styles):
        res = future.result()
        if res:
            all_taxon_to_weights.append(res)

    taxon_to_weights = {}
    for item in all_taxon_to_weights:
        taxon_to_weights.update(item)

    time_obtain_weights = time.time() - s2
    Configs.warning('Finished calculating weights!')
    Configs.runtime(' '.join(['(writeWeights) Time to obtain weights',
            'given bitscores (s):', str(time_obtain_weights)]))
    return taxon_to_weights

'''
Write weights to local as [outdir]/weights.txt
'''
def writeWeightsToLocal(taxon_to_weights, path):
    # One line per taxon: "<taxon>:<tuple of (hmm index, weight) pairs>".
    # The tuple repr is exactly what readWeightsFromLocal parses back.
    Configs.log('Writing weights to {}'.format(path))
    with open(path, 'w') as f:
        for taxon, weights in taxon_to_weights.items():
            f.write('{}:{}\n'.format(taxon, weights))

'''
Function to read weights from a given weights path (e.g., ./weights.txt)
Return a dictionary of taxon to weights
'''
def readWeightsFromLocal(path):
    import ast

    Configs.log('Reading weights from {}'.format(path))
    taxon_to_weights = {}
    with open(path, 'r') as f:
        for line in f:
            # tolerate blank/trailing lines in hand-edited files
            if not line.strip():
                continue
            # split on the first ':' only, in case a taxon name contains ':'
            taxon, taxon_weight = line.split(':', 1)
            # literal_eval instead of eval: parses the stored tuple literal
            # without executing arbitrary code from a tampered weights file
            taxon_to_weights[taxon] = ast.literal_eval(taxon_weight)
    return taxon_to_weights

# ===== file: witch_msa/tools/magus/align/merge/alignment_graph.py =====
'''
Created on Apr 14, 2020

@author: Vlad
'''

import os
from helpers import sequenceutils
from configuration import Configs
import threading


'''
Data structure for dealing with alignment graphs.
Subalignment columns are mapped to graph nodes, represented by integers.
Integer nodes can be converted back to corresponding subalignment columns.
Reads/writes graph and cluster files.
'''

class AlignmentGraph:

    def __init__(self, context):
        # context: owns the subalignments and the run's working directory
        self.context = context
        self.workingDir = os.path.join(self.context.workingDir, "graph")
        self.graphPath = os.path.join(self.workingDir, "graph.txt")
        self.clusterPath = os.path.join(self.workingDir, "clusters.txt")
        self.tracePath = os.path.join(self.workingDir, "trace.txt")
        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

        # per-subalignment column counts and node-index bookkeeping
        self.subalignmentLengths = []
        self.subsetMatrixIdx = []   # first node index of each subalignment
        self.matSubPosMap = []      # node -> (subalignment, column) pair

        self.matrixSize = 0
        self.matrix = None          # adjacency: node -> {node: edge weight}
        self.matrixLock = threading.Lock()
        self.nodeEdges = None

        self.clusters = []
        self.insertions = set()

    def initializeMatrix(self):
        # Number of nodes per subalignment: alignment lengths when
        # constrained, otherwise raw (unaligned) sequence lengths.
        if Configs.constrain:
            self.subalignmentLengths = [sequenceutils.readSequenceLengthFromFasta(file) for file in self.context.subalignmentPaths]
        else:
            self.subalignmentLengths = [len(self.context.unalignedSequences[s[0]].seq) for s in self.context.subalignments]

        self.matrixSize = sum(self.subalignmentLengths)
        # prefix sums: node index where each subalignment's columns begin
        self.subsetMatrixIdx = [0] * len(self.subalignmentLengths)
        for k in range(1, len(self.subalignmentLengths)):
            self.subsetMatrixIdx[k] = self.subsetMatrixIdx[k-1] + self.subalignmentLengths[k-1]

        # inverse map: flat node index back to (subalignment, column)
        self.matSubPosMap = [0] * self.matrixSize
        i = 0
        for k in range(len(self.subalignmentLengths)):
            for j in range(self.subalignmentLengths[k]):
                self.matSubPosMap[i] = (k, j)
                i = i + 1

        self.matrix = [{} for i in range(self.matrixSize)]

    def writeGraphToFile(self, filePath):
        # One edge per line: "<node a> <node b> <weight>"
        with open(filePath, 'w') as textFile:
            for i in range(len(self.matrix)):
                for k in self.matrix[i]:
                    textFile.write("{} {} {}\n".format(i, k, self.matrix[i][k]))
        Configs.log("Wrote matrix to {}".format(filePath))

    def readGraphFromFile(self, filePath):
        # Inverse of writeGraphToFile; matrixSize must already be set.
        self.matrix = [{} for i in range(self.matrixSize)]
        with open(filePath) as f:
            for line in f:
                tokens = [int(token) for token in line.strip().split()]
                self.matrix[tokens[0]][tokens[1]] = tokens[2]
        Configs.log("Read matrix from {}".format(filePath))

    def writeClustersToFile(self, filePath):
        # One cluster per line, member nodes space-separated.
        with open(filePath, 'w') as textFile:
            for cluster in self.clusters:
                textFile.write("{}\n".format(" ".join([str(c) for c in cluster])))

    def readClustersFromFile(self, filePath):
        # Singleton lines are dropped: one-node clusters carry no merge
        # information.
        self.clusters = []
        with open(filePath) as f:
            for line in f:
                tokens = [int(token) for token in line.strip().split()]
                if len(tokens) > 1:
                    self.clusters.append(tokens)
        # NOTE(review): uses print() where the rest of the class logs via
        # Configs.log — confirm whether this should be Configs.log too
        print("Found {} clusters..".format(len(self.clusters)))

    def buildNodeEdgeDataStructure(self):
        # For each node, bucket its cross-subalignment edges by the
        # neighbor's subalignment and sort each bucket by neighbor index.
        Configs.log("Preparing node edge data structure..")
        k = len(self.subalignmentLengths)
        self.nodeEdges = {}

        for a in range(self.matrixSize):
            asub, apos = self.matSubPosMap[a]
            self.nodeEdges[a] = [[] for i in range(k)]
            for b, value in self.matrix[a].items():
                bsub, bpos = self.matSubPosMap[b]
                if asub == bsub:
                    continue
                self.nodeEdges[a][bsub].append((b, value))
            for i in range(k):
                self.nodeEdges[a][i].sort(key = lambda pair: pair[0])
        Configs.log("Prepared node edge data structure..")

    def buildNodeEdgeDataStructureFromClusters(self):
        # Same layout as buildNodeEdgeDataStructure, but only edges whose
        # endpoints share a pre-existing cluster are kept.
        Configs.log("Preparing node edge data structure..")
        k = len(self.subalignmentLengths)
        self.nodeEdges = {}

        Configs.log("Using {} pre-existing clusters to simplify alignment graph..".format(len(self.clusters)))
        for a in range(self.matrixSize):
            self.nodeEdges[a] = [[] for i in range(k)]

        for cluster in self.clusters:
            for a in cluster:
                asub, apos = self.matSubPosMap[a]
                for b in cluster:
                    bsub, bpos = self.matSubPosMap[b]
                    if asub == bsub or b not in self.matrix[a]:
                        continue
                    value = self.matrix[a][b]
                    self.nodeEdges[a][bsub].append((b, value))
                for i in range(k):
                    self.nodeEdges[a][i].sort(key = lambda pair: pair[0])
        Configs.log("Prepared node edge data structure..")

    def cutString(self, cut):
        # Convert absolute node indices of a cut into per-subalignment
        # column positions.
        stringCut = list(cut)
        for i, value in enumerate(stringCut):
            stringCut[i] = value - self.subsetMatrixIdx[i]
        return stringCut

    def computeClusteringCost(self, clusters):
        # Total weight of cross-subalignment edges whose endpoints land in
        # different clusters; each edge is visited twice, hence the final /2.
        cutCost = 0
        nodeClusters = {}

        for n, cluster in enumerate(clusters):
            for a in cluster:
                nodeClusters[a] = n

        # unassigned nodes each count as their own singleton cluster
        clusterCounter = len(clusters)
        for a in range(self.matrixSize):
            if a not in nodeClusters:
                nodeClusters[a] = clusterCounter
                clusterCounter = clusterCounter + 1

        for a in range(self.matrixSize):
            asub, apos = self.matSubPosMap[a]
            for b, value in self.matrix[a].items():
                bsub, bpos = self.matSubPosMap[b]
                if asub != bsub and nodeClusters[a] != nodeClusters[b]:
                    cutCost = cutCost + value

        return int(cutCost/2)

    def addSingletonClusters(self):
        # Pad self.clusters with singleton clusters so that every column of
        # every subalignment appears exactly once, preserving column order.
        newClusters = []

        lastIdx = list(self.subsetMatrixIdx)
        for cluster in self.clusters:
            for a in cluster:
                asub, apos = self.matSubPosMap[a]
                # emit singletons for any columns skipped before this one
                for node in range(lastIdx[asub], a):
                    newClusters.append([node])
                lastIdx[asub] = a+1
            newClusters.append(cluster)
        # trailing columns after the last clustered node of each subalignment
        for i in range(len(lastIdx)):
            for node in range(lastIdx[i], self.subsetMatrixIdx[i] + self.subalignmentLengths[i]):
                newClusters.append([node])
        self.clusters = newClusters
        return newClusters
# ===== file: witch_msa/gcmm/decompose_tree.py =====
# uym2 added
# June 2017
# utils for tree decomposition

# 1.22.2022 - Copied over and modified to accomodate GCMM from SEPP
# by Chengze Shen


from dendropy import Tree
try:
    from queue import Queue  # python 3
except ImportError:
    from Queue import Queue  # python 2
from witch_msa.configs import Configs


def decompose_by_diameter(a_tree, strategy, max_size=None, min_size=None,
                          max_diam=None):
    """
    Recursively bisect `a_tree` until every resulting subtree satisfies the
    size and diameter constraints.

    Parameters
    ----------
    a_tree : dendropy.Tree
        Tree to decompose; it is annotated and modified in place.
    strategy : str
        'centroid' (centroid-edge splits) or 'midpoint' (midpoint splits
        with centroid fallback).  Anything else raises Exception.
    max_size : int, optional
        Maximum leaves per subtree (default: whole tree, i.e. no limit).
    min_size : int, optional
        Minimum leaves per subtree after a split (default: 0).
    max_diam : float, optional
        Maximum subtree diameter (default: the input tree's diameter).

    Returns
    -------
    list of dendropy.Tree — the final subtrees, with the temporary
    decomposition attributes stripped.
    """
    def __ini_record__():
        # annotate every node with nleaf/maxdepth/diameter/anchor/bestLCA
        for node in a_tree.postorder_node_iter():
            __update_node__(node)

    def __find_midpoint_edge__(tre):
        # walk up from the anchor leaf of the diameter path until half the
        # diameter has been covered
        u = tre.seed_node.bestLCA.anchor
        uel = u.edge_length if u.edge_length else 0
        d = 0
        while d + uel < tre.seed_node.diameter / 2:
            d += uel
            u = u.parent_node
            uel = u.edge_length if u.edge_length else 0
        return u.edge

    def __find_centroid_edge__(tre):
        # descend toward the heaviest child while the balance product
        # (leaves below) * (leaves elsewhere) keeps improving
        u = tre.seed_node
        product = 0
        acc_nleaf = 0

        while not u.is_leaf():
            max_child = None
            max_child_nleaf = 0
            for ch in u.child_node_iter():
                if ch.nleaf > max_child_nleaf:
                    max_child_nleaf = ch.nleaf
                    max_child = ch
            acc_nleaf += (u.nleaf - max_child.nleaf)
            new_product = max_child.nleaf * acc_nleaf
            if new_product <= product:
                break
            product = new_product
            u = max_child

        return u.edge

    def __bisect__(tre, edg):
        # Detach the subtree below `edg` into its own tree, suppress the
        # resulting degree-2 node, and refresh annotations up to the root.
        u = edg.tail_node
        v = edg.head_node

        u.remove_child(v)
        tr1 = Tree(seed_node=v)

        if u.num_child_nodes() == 1:
            p = u.parent_node
            v = u.child_nodes()[0]
            l_v = v.edge_length if v.edge_length else 0
            u.remove_child(v)
            # u is the seed_node; this means the tree runs out of all but
            # one side
            if p is None:
                tre.seed_node = v
                return tre, tr1
            l_u = u.edge_length if u.edge_length else 0
            p.remove_child(u)
            p.add_child(v)
            v.edge_length = l_u + l_v
            u = p

        while u is not None:
            __update_node__(u)
            u = u.parent_node

        return tre, tr1

    def __clean_up__(tre):
        # strip the temporary decomposition attributes before returning
        for node in tre.postorder_node_iter():
            delattr(node, "nleaf")
            delattr(node, "anchor")
            delattr(node, "maxdepth")
            delattr(node, "diameter")
            delattr(node, "bestLCA")

    def __update_node__(node):
        # Recompute nleaf / maxdepth / diameter / anchor / bestLCA from the
        # node's children (post-order invariant).
        if node.is_leaf():
            node.anchor = node
            node.maxdepth = 0
            node.diameter = 0
            node.bestLCA = node
            node.nleaf = 1
            return

        d1 = -1          # deepest child path
        d2 = -1          # second deepest child path
        anchor1 = None   # leaf anchoring the deepest path
        node.diameter = 0
        node.bestLCA = None
        node.nleaf = 0

        for ch in node.child_node_iter():
            node.nleaf += ch.nleaf
            # BUGFIX: parenthesize the conditional expression.  The previous
            # form `ch.maxdepth + ch.edge_length if ch.edge_length else 0`
            # parsed as `(maxdepth + edge_length) if edge_length else 0`, so
            # a child with a missing/zero branch length zeroed out its whole
            # subtree depth (the sibling helpers above parenthesize the same
            # None-guard correctly).
            d = ch.maxdepth + (ch.edge_length if ch.edge_length else 0)
            if d > d1:
                d2 = d1
                d1 = d
                anchor1 = ch.anchor
            elif d > d2:
                d2 = d
            if ch.diameter > node.diameter:
                node.diameter = ch.diameter
                node.bestLCA = ch.bestLCA

        node.maxdepth = d1
        node.anchor = anchor1
        # the two deepest child paths may form a longer path through here
        if d1 + d2 > node.diameter:
            node.diameter = d1 + d2
            node.bestLCA = node

    def __get_breaking_edge__(tre, edge_type):
        # Return an edge to break on, or None when the tree already meets
        # the constraints or no split keeps both halves >= min_size.
        if tre.seed_node.nleaf <= max_size and \
                tre.seed_node.diameter <= max_diam:
            return None
        if edge_type == 'midpoint':
            ed = __find_midpoint_edge__(tre)
        elif edge_type == 'centroid':
            ed = __find_centroid_edge__(tre)
        else:
            Configs.warning(("Invalid decomposition type! Please use either "
                             "'midpoint' or 'centroid'"))
            return None

        n = ed.head_node.nleaf
        if (n < min_size) or (tre.seed_node.nleaf - n) < min_size:
            return None
        return ed

    def __check_stop__(tre):
        # NOTE: currently unused; kept for reference
        return ((tre.seed_node.nleaf <= max_size and
                 tre.seed_node.diameter <= max_diam) or
                (tre.seed_node.nleaf // 2 < min_size))

    def __break_by_MP_centroid__(tre):
        # try a midpoint split first, fall back to a centroid split
        ed = __get_breaking_edge__(tre, 'midpoint')
        if ed is None:
            ed = __get_breaking_edge__(tre, 'centroid')
        return ed

    def __break(tre):
        if strategy == "centroid":
            return __get_breaking_edge__(tre, 'centroid')
        elif strategy == "midpoint":
            return __break_by_MP_centroid__(tre)
        else:
            raise Exception("strategy not valid: %s" % strategy)

    tqueue = Queue()

    Configs.debug("Starting brlen decomposition ...")
    __ini_record__()
    # resolve defaults from the (now annotated) input tree
    min_size = min_size if min_size else 0
    max_size = max_size if max_size else a_tree.seed_node.nleaf
    max_diam = max_diam if max_diam else a_tree.seed_node.diameter

    Configs.debug(
        "Now breaking by %s with min %d and max %d sizes and diameter %f ..." %
        (strategy, min_size, max_size, max_diam))
    e = __break(a_tree)

    if e is None:
        __clean_up__(a_tree)
        return [a_tree]

    # BFS over (tree, breaking edge) pairs until no tree needs splitting
    tree_map = []
    tqueue.put((a_tree, e))
    while not tqueue.empty():
        t, e = tqueue.get()
        t1, t2 = __bisect__(t, e)
        e1 = __break(t1)
        if e1 is None:
            __clean_up__(t1)
            tree_map.append(t1)
        else:
            tqueue.put((t1, e1))
        e2 = __break(t2)
        if e2 is None:
            __clean_up__(t2)
            tree_map.append(t2)
        else:
            tqueue.put((t2, e2))

    return tree_map

# ===== file: witch_msa/init_configs.py =====
import os, sys, shutil
try:
    import configparser
except ImportError:
    import ConfigParser as configparser
from argparse import ArgumentParser, Namespace
from platform import platform

def find_main_config(homepath):
    """Read the root dir recorded in `homepath`; return
    (root_dir, main_config_path) if main.config exists there, else
    (None, None)."""
    with open(homepath, 'r') as f:
        _root_dir = f.read().strip()
    main_config_path = os.path.join(_root_dir, 'main.config')
    if os.path.exists(main_config_path):
        return _root_dir, main_config_path
    else:
        return None, None

'''
first time run, need user to initialize the main.config
if it is not installed through github (i.e., python setup.py config)
will be needed if installed through pip/pypi
'''
def init_config_file(homepath, prioritize_user_software=True):
    """Create (or locate) the user's main.config and return
    (root_dir, main_config_path)."""
    # read from sys.argv to find if "-y" or "--bypass-setup" exists
    args = sys.argv[1:]
    # NOTE: the interactive prompt is currently always bypassed; the
    # commented check below would re-enable opting in via -y/--bypass-setup
    bypass_setup = True
    #if '-y' in args or '--bypass-setup' in args:
    #    bypass_setup = True

    # initialize a home.path that points to local user main.config
    # if it exists then pass on
    if os.path.exists(homepath):
        # home.path older than this module means a stale install: regenerate
        if os.stat(homepath).st_mtime >= os.stat(__file__).st_mtime:
            _root_dir, main_config_path = find_main_config(homepath)
            if _root_dir is None:
                print('home.path exists but main.config missing, regenerating...')
            else:
                return _root_dir, main_config_path
        else:
            print('Found old home.path and regenerating...')
            os.remove(homepath)
    else:
        print('Cannot find home.path: {}'.format(homepath))

    # install to user local directory
    # bypassing the setup step to directly use the default path
    _root_dir = ''
    if not bypass_setup:
        _root_dir = input('Create main.config file at [default: ~/.witch_msa/]: ')

    if _root_dir == '':
        _root_dir = os.path.expanduser('~/.witch_msa')
    else:
        _root_dir = os.path.abspath(_root_dir)
    main_config_path = os.path.join(_root_dir, 'main.config')
    print('Initializing main configuration file: {}...'.format(main_config_path))

    # write to local for installation to system
    # will read in during runs to find the main.config file
    # makedirs(exist_ok=True): unlike the previous os.mkdir, this also
    # creates missing parents for a user-supplied nested path
    os.makedirs(_root_dir, exist_ok=True)
    with open(homepath, 'w') as f:
        f.write(_root_dir)

    # create main.config file at configfile using default.config
    _config_path = os.path.join(os.path.dirname(__file__), 'default.config')
    cparser = configparser.ConfigParser()
    cparser.optionxform = str
    assert os.path.exists('{}'.format(_config_path)), \
        "default config file {} missing! Please redownload from Github\n".format(
            _config_path)

    if os.path.exists(main_config_path):
        print('Main configuration file {} exists...'.format(main_config_path))
        print('Overwriting existing configuration file...')

    print('\n')
    # initialize main config file using default config file
    default_config = configparser.ConfigParser()
    with open(_config_path, 'r') as f:
        default_config.read_file(f)
    for section in default_config.sections():
        cparser.add_section(section)
        for k, v in default_config[section].items():
            cparser.set(section, k, v)

    # if platform is linux then we just copy the default config file
    # as main.config
    platform_name = platform()
    tools_dir = os.path.join(os.path.dirname(__file__), 'tools')
    set_sections = ['Basic', 'MAGUS']

    # copy magus directory to tools/
    magus_dir = os.path.join(tools_dir, 'magus')
    cparser.set('Basic', 'magus_path', magus_dir + '/magus.py')

    if 'macos' not in platform_name.lower():
        print('System is {}, using default config as main.config...'.format(
            platform_name))
        # use existing binaries from MAGUS subfolder (reduce redundancy of
        # duplicated binaries)
        for _section in set_sections:
            # mafftpath
            cparser.set(_section, 'mafftpath',
                    os.path.join(magus_dir, 'tools', 'mafft', 'mafft'))
            # mclpath
            cparser.set(_section, 'mclpath',
                    os.path.join(magus_dir, 'tools', 'mcl', 'bin', 'mcl'))
            # fasttreepath
            cparser.set(_section, 'fasttreepath',
                    os.path.join(magus_dir, 'tools', 'fasttree', 'FastTreeMP'))
            # hmmer packages
            for hmmer_pkg in ['hmmsearch', 'hmmalign', 'hmmbuild']:
                cparser.set(_section, '{}path'.format(hmmer_pkg),
                        os.path.join(magus_dir, 'tools', 'hmmer', hmmer_pkg))
    else:
        if 'x86' not in platform_name:
            print('Warning: system is not using x86 architecture.',
                  'Some softwares such as FastTreeMP need to be',
                  'self-provided. See {} [Basic] '.format(_config_path),
                  'section for more information.')
        print("System is {}, reconfiguring main.config...".format(platform_name))

        # configure MAGUS to use macOS compatible executables
        binaries = os.listdir(tools_dir + '/macOS')
        for binary in binaries:
            path = os.path.join(tools_dir, 'macOS', binary)
            for _section in set_sections:
                if 'FastTreeMP' in path:
                    cparser.set(_section, 'fasttreepath', path)
                else:
                    cparser.set(_section, '{}path'.format(binary), path)

    # binaries from the user's environment will be used in priority
    # if they exist
    if prioritize_user_software:
        print('Detecting existing software from the user\'s environment...')
        software = ['mafft', 'mcl',
                    'hmmsearch', 'hmmalign', 'hmmbuild', 'FastTreeMP']
        print('\tDetected:\n')
        for soft in software:
            if shutil.which(soft):
                print('\t{}: {}'.format(soft, shutil.which(soft)))
                for _section in set_sections:
                    if soft == 'FastTreeMP':
                        cparser.set(_section, 'fasttreepath',
                                shutil.which(soft))
                    elif soft == 'magus':
                        # unreachable with the current `software` list;
                        # kept for when 'magus' is added back to the scan
                        cparser.set('Basic', 'magus_path',
                                shutil.which(soft))
                    else:
                        cparser.set(_section, '{}path'.format(soft),
                                shutil.which(soft))

    with open(main_config_path, 'w') as f:
        cparser.write(f)
    print('\n(Done) main.config written to {}'.format(main_config_path))
    print('If you would like to make manual changes, please directly edit {}'.format(
        main_config_path))
    # DO NOT EXIT; can start running WITCH with any given commands now
    return _root_dir, main_config_path
#!/usr/bin/env python3
# ===== file: witch_msa/tools/magus/magus.py =====
'''
Created on Apr 14, 2020

@author: Vlad
'''

import time
import argparse
import sys
import traceback

from align.aligner import mainAlignmentTask
from configuration import buildConfigs, Configs
from tasks import manager

def main():
    '''
    Resolve the args/configs, spin up the task manager (which deals with worker threads and handles parallelism),
    and get started on the main alignment task.
    '''

    startTime = time.time()
    args = parseArgs()
    buildConfigs(args)
    Configs.log("MAGUS was run with: {}".format(" ".join(sys.argv)))

    try:
        manager.startTaskManager()
        mainAlignmentTask()
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
    # the error is logged but not re-raised, so execution continues to the
    # final log line below — confirm this is intended
    except:
        Configs.error("MAGUS aborted with an exception..")
        Configs.error(traceback.format_exc())
    finally:
        # always shut the task manager down, even on failure
        manager.stopTaskManager()

    endTime = time.time()
    Configs.log("MAGUS finished in {} seconds..".format(endTime-startTime))

def parseArgs():
    # Build the MAGUS command-line interface and return the parsed Namespace.
    # Note: boolean-ish options are declared as type=str ("true"/"false")
    # and resolved later by buildConfigs.
    parser = argparse.ArgumentParser()

    parser.add_argument("-d", "--directory", type=str,
                        help="Path to working directory", required=False, default=None)

    parser.add_argument("-i", "--sequences", type=str,
                        help="Path to input unaligned sequences", required=False, default=None)

    parser.add_argument("-s", "--subalignments", type=str, nargs="+",
                        help="Paths to input subalignment files", required=False, default=[])

    parser.add_argument("-b", "--backbones", type=str, nargs="+",
                        help="Paths to input backbone alignment files", required=False, default=[])

    parser.add_argument("-o", "--output", type=str,
                        help="Output alignment path", required=True)

    parser.add_argument("-t", "--guidetree", type=str,
                        help="Guide tree for subset decomposition. fasttree (default), fasttree-noml, clustal, parttree, or path to user guide tree",
                        required=False, default="fasttree")

    parser.add_argument("-np", "--numprocs", type=int,
                        help="Number of processors to use (default: # cpus available)",
                        required=False, default=-1)

    parser.add_argument("--maxsubsetsize", type=int,
                        help="Maximum subset size for divide-and-conquer",
                        required=False, default=50)

    parser.add_argument("--maxnumsubsets", type=int,
                        help="Maximum number of subsets for divide-and-conquer",
                        required=False, default=25)

    parser.add_argument("--decompstrategy", type=str,
                        help="Initial decomposition strategy (pastastyle or kmh)",
                        required=False, default="pastastyle")

    parser.add_argument("--decompskeletonsize", type=int,
                        help="Number of skeleton sequences for the initial decomposition strategy",
                        required=False, default=300)

    parser.add_argument("--datatype", type=str,
                        help="Data type (dna, rna, or protein). Will be inferred if not provided",
                        required=False, default=None)

    parser.add_argument("--graphbuildmethod", type=str,
                        help="Method for building the alignment graph (mafft, mafftmerge, or initial)",
                        required=False, default="mafft")

    parser.add_argument("--graphbuildrestrict", type=str,
                        help="Prevent the alignment graph from adding edges that violate subalignments (true or false)",
                        required=False, default="False")

    parser.add_argument("--graphbuildhmmextend", type=str,
                        help="Extend the alignment graph MAFFT backbones with hmmer (true or false)",
                        required=False, default="False")

    parser.add_argument("--graphclustermethod", type=str,
                        help="Method for initial clustering of the alignment graph (mcl or none)",
                        required=False, default="mcl")

    parser.add_argument("--graphtracemethod", type=str,
                        help="Method for finding a trace from the alignment graph (minclusters, fm, mwtgreedy, or mwtsearch)",
                        required=False, default="minclusters")

    parser.add_argument("--graphtraceoptimize", type=str,
                        help="Run an optimization step on the graph trace (true or false)",
                        required=False, default="False")

    parser.add_argument("-r", "--mafftruns", type=int,
                        help="Number of MAFFT runs", required=False, default=10)

    parser.add_argument("-m", "--mafftsize", type=int,
                        help="Maximum size of MAFFT alignments", required=False, default=200)

    parser.add_argument("-f", "--inflationfactor", type=float,
                        help="MCL inflation factor", required=False, default=4)

    parser.add_argument("-c", "--constrain", type=str,
                        help="Constrain MAGUS to respect subalignments (true or false)", required=False, default="true")

    parser.add_argument("--onlyguidetree", type=str,
                        help="Only output the guide tree (true or false)", required=False, default="false")

    # (parseArgs continues beyond this excerpt)
    parser.add_argument("--recurse", type=str,
help="Allow MAGUS to recurse on large subsets (true or false)", required=False, default="true") 127 | 128 | parser.add_argument("--recurseguidetree", type=str, 129 | help="If recursing, passes this argument as the guide tree option to the lower levels. (Default fasttree)", required=False, default="fasttree") 130 | 131 | parser.add_argument("--recursethreshold", type=int, 132 | help="MAGUS will recursively align subsets above this threshold size", required=False, default=200) 133 | 134 | parser.add_argument("--alignsizelimit", type=float, 135 | help="Size threshold for alignment compression (in GB)", required=False, default=100) 136 | 137 | ''' 138 | 6.1.2021 - added by Chengze Shen 139 | a new argument defining the weights of the backbone alignments 140 | ''' 141 | parser.add_argument("-w", "--backboneWeightsPath", type=str, 142 | required=False, default=None, 143 | help="Weights of the backbone alignments (a file path)") 144 | ''' 145 | 6.8.2022 - added by Chengze Shen 146 | new arguments for different versions of mcl/mafft/HMMER (e.g., MacOS) 147 | ''' 148 | parser.add_argument("--mclpath", type=str, default=None, required=False, 149 | help="custom MCL path") 150 | parser.add_argument("--mafftpath", type=str, default=None, required=False, 151 | help="custom MAFFT path") 152 | parser.add_argument("--hmmalignpath", type=str, default=None, required=False, 153 | help="custom hmmalign path") 154 | parser.add_argument("--hmmbuildpath", type=str, default=None, required=False, 155 | help="custom hmmbuild path") 156 | parser.add_argument("--hmmsearchpath", type=str, default=None, required=False, 157 | help="custom hmmsearch path") 158 | parser.add_argument("--fasttreepath", type=str, default=None, required=False, 159 | help="custom FastTree path") 160 | return parser.parse_args() 161 | 162 | if __name__ == '__main__': 163 | main() 164 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/helpers/sequenceutils.py: 
'''
Created on Sep 22, 2018

@author: Vlad
'''


class Sequence:
    """A named sequence record: `tag` is the identifier, `seq` the residue string."""

    def __init__(self, tag, seq):
        self.tag = tag
        self.seq = seq


def readFromFasta(filePath, removeDashes = False):
    """Read a FASTA file into a dict mapping tag -> Sequence (insertion order).

    When removeDashes is True, gap characters ('-') are stripped from the data.
    """
    sequences = {}
    current = None

    with open(filePath) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                # Header line: start a fresh record keyed by everything after '>'.
                current = Sequence(rawLine[1:], "")
                sequences[current.tag] = current
            else:
                fragment = rawLine.replace("-", "") if removeDashes else rawLine
                current.seq = current.seq + fragment

    print("Read {} sequences from {} ..".format(len(sequences), filePath))
    return sequences


def readFromFastaOrdered(filePath, removeDashes = False):
    """Read a FASTA file into a list of Sequence records, preserving file order."""
    sequences = []
    current = None

    with open(filePath) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                current = Sequence(rawLine[1:], "")
                sequences.append(current)
            else:
                fragment = rawLine.replace("-", "") if removeDashes else rawLine
                current.seq = current.seq + fragment

    print("Read {} sequences from {} ..".format(len(sequences), filePath))
    return sequences


def readFromPhylip(filePath, removeDashes = False):
    """Read a (possibly interleaved) PHYLIP file into a dict of tag -> Sequence.

    Repeated occurrences of a tag have their fragments concatenated.
    """
    sequences = {}

    with open(filePath) as handle:
        handle.readline()  # skip the "<num_taxa> <length>" header line
        for rawLine in handle:
            parts = rawLine.split()
            if len(parts) != 2:
                # Only "<tag> <fragment>" lines carry data; skip everything else.
                continue
            tag, fragment = parts
            if removeDashes:
                fragment = fragment.replace("-", "")
            if tag in sequences:
                sequences[tag].seq = sequences[tag].seq + fragment
            else:
                sequences[tag] = Sequence(tag, fragment)

    print("Read {} sequences from {} ..".format(len(sequences), filePath))
    return sequences


# reads match columns only
def readFromStockholm(filePath, includeInsertions = False):
    """Read a Stockholm alignment into a dict of tag -> Sequence.

    By default only match columns are kept (uppercase, non-'.'); pass
    includeInsertions=True to keep every character.
    """
    sequences = {}

    with open(filePath, 'r') as stockFile:
        for rawLine in stockFile:
            rawLine = rawLine.strip()
            if rawLine == "//":
                # End-of-alignment marker.
                break
            if rawLine == "" or rawLine[0] == "#":
                # Blank lines and '#=GF'-style annotation lines carry no residues.
                continue
            key, seq = rawLine.split()
            if key not in sequences:
                sequences[key] = Sequence(key, "")
            record = sequences[key]
            for c in seq:
                # Match columns are uppercase and not '.' (insertions are
                # lowercase or '.').
                if includeInsertions or (c == c.upper() and c != '.'):
                    record.seq = record.seq + c
    return sequences


def writeFasta(alignment, filePath, taxa = None, append = False):
    """Write alignment (tag -> sequence object) as FASTA.

    If taxa is given, only those tags are written, in that order.
    """
    mode = 'a' if append else 'w'
    with open(filePath, mode) as out:
        if taxa is not None:
            for tag in taxa:
                if tag in alignment:
                    out.write('>' + tag + '\n' + alignment[tag].seq + '\n')
        else:
            for tag in alignment:
                out.write('>' + tag + '\n' + alignment[tag].seq + '\n')


def writePhylip(alignment, filePath, taxa = None):
    """Write alignment as sequential PHYLIP ("<count> <maxlen>" header line)."""
    longest = 0
    rows = []
    for tag in alignment:
        if taxa is not None and tag not in taxa:
            continue
        seq = alignment[tag].seq
        rows.append("{} {}\n".format(tag, seq))
        longest = max(longest, len(seq))

    with open(filePath, 'w') as out:
        out.write("{} {}\n".format(len(rows), longest))
        out.writelines(rows)


def cleanGapColumns(filePath, cleanFile = None):
    """Drop all-gap columns from a FASTA alignment; write to cleanFile (or in place)."""
    alignment = readFromFasta(filePath, False)
    records = list(alignment.values())
    # A column is kept as soon as any sequence has a non-gap character in it.
    keepCols = [col for col in range(len(records[0].seq))
                if any(rec.seq[col] != '-' for rec in records)]

    print("Removing gap columns.. Kept {} out of {}..".format(len(keepCols), len(records[0].seq)))
    for rec in records:
        rec.seq = ''.join(rec.seq[idx] for idx in keepCols)

    writeFasta(alignment, cleanFile if cleanFile is not None else filePath)


def convertRnaToDna(filePath, destFile = None):
    """Replace 'U' with 'T' in every sequence; write to destFile (or in place)."""
    alignment = readFromFasta(filePath, False)
    for taxon in alignment:
        alignment[taxon].seq = alignment[taxon].seq.replace('U', 'T')
    writeFasta(alignment, destFile if destFile is not None else filePath)


def inferDataType(filePath):
    """Guess 'dna', 'rna', or 'protein' from the letter composition of a FASTA file."""
    sequences = readFromFasta(filePath, removeDashes=True)
    acg = t = u = total = 0
    for taxon in sequences:
        for letter in sequences[taxon].seq.upper():
            total = total + 1
            if letter in ('A', 'C', 'G', 'N'):
                acg = acg + 1
            elif letter == 'T':
                t = t + 1
            elif letter == 'U':
                u = u + 1

    # >90% nucleotide-like letters (and no letters of the other alphabet)
    # means DNA/RNA; otherwise assume protein.
    if u == 0 and (acg + t)/total > 0.9:
        print("Found {}% ACGT-N, assuming DNA..".format(int(100*(acg + t)/total)))
        return "dna"
    if t == 0 and (acg + u)/total > 0.9:
        print("Found {}% ACGU-N, assuming RNA..".format(int(100*(acg + u)/total)))
        return "rna"
    print("Assuming protein..")
    return "protein"


def readSequenceLengthFromFasta(filePath):
    """Return the character count of the first sequence in a FASTA file."""
    started = False
    length = 0
    with open(filePath) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                if started:
                    # Second header reached: the first sequence is complete.
                    return length
                started = True
            else:
                length = length + len(rawLine)
    return length if started else None


def countGaps(alignFile):
    """Count, per alignment column, how many sequences have a gap ('-') there."""
    columnGapCounts = []

    def _accumulate(seq):
        # Lazily size the counts from the first non-empty flushed sequence
        # (all sequences in an alignment share the same length).
        if len(columnGapCounts) == 0:
            columnGapCounts.extend([0] * len(seq))
        for col in range(len(columnGapCounts)):
            if seq[col] == '-':
                columnGapCounts[col] = columnGapCounts[col] + 1

    buffer = ""
    with open(alignFile) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                _accumulate(buffer)
                buffer = ""
            else:
                buffer = buffer + rawLine
    _accumulate(buffer)

    return columnGapCounts
os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/mlrmcl/mlrmcl")
    # Bundled external tool binaries, resolved relative to this file; each can
    # be overridden from the command line (see buildConfigs below).
    hmmalignPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/hmmer/hmmalign")
    hmmbuildPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/hmmer/hmmbuild")
    hmmsearchPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/hmmer/hmmsearch")
    fasttreePath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/fasttree/FastTreeMP")
    raxmlPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/raxmlng/raxml-ng")

    # Log file locations; populated by buildConfigs() once the working dir is known.
    logPath = None
    errorPath = None
    debugPath = None

    numCores = 1
    searchHeapLimit = 5000
    alignmentSizeLimit = 100

    @staticmethod
    def log(msg, path = None):
        """Print msg and append it (timestamped) to the main log file, or to path."""
        print(msg)
        path = Configs.logPath if path is None else path
        Configs.writeMsg(msg, path)

    @staticmethod
    def error(msg, path = None):
        """Log msg normally, then also append it to the error log (or to path)."""
        Configs.log(msg)
        path = Configs.errorPath if path is None else path
        Configs.writeMsg(msg, path)

    @staticmethod
    def debug(msg, path = None):
        # Debug messages go to the debug log only; they are not printed.
        path = Configs.debugPath if path is None else path
        Configs.writeMsg(msg, path)

    @staticmethod
    def writeMsg(msg, path):
        """Append a timestamped line to path; silently a no-op when path is None."""
        if path is not None:
            with open(path, 'a') as logFile:
                logFile.write("{} {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S"), msg))

    @staticmethod
    def inferDataType(sequencesFile):
        """Lazily infer and cache the data type (dna/rna/protein) from sequencesFile."""
        if Configs.dataType is None:
            Configs.dataType = sequenceutils.inferDataType(sequencesFile)
            Configs.log("Data type wasn't specified. Inferred data type {} from {}".format(Configs.dataType.upper(), sequencesFile))
        return Configs.dataType

def buildConfigs(args):
    """Populate the global Configs from the parsed command-line arguments.

    Also creates the working directory if needed and derives the log file paths.
    """
    Configs.outputPath = os.path.abspath(args.output)

    if args.directory is not None:
        Configs.workingDir = os.path.abspath(args.directory)
    else:
        # Default working dir sits next to the requested output file.
        Configs.workingDir = os.path.join(os.path.dirname(Configs.outputPath), "magus_working_dir")
    if not os.path.exists(Configs.workingDir):
        os.makedirs(Configs.workingDir)

    Configs.sequencesPath = os.path.abspath(args.sequences) if args.sequences is not None else Configs.sequencesPath

    '''
    11.16.2023 - modified by Chengze Shen
    Making sure that if default guide tree styles are used, then we do not
    attempt to search for the guide tree in path
    '''
    guideTree_styles = ['fasttree', 'fasttree-noml', 'parttree', 'clustal']
    if args.guidetree is not None:
        # using existing styles
        if args.guidetree.lower() in guideTree_styles:
            Configs.guideTree = args.guidetree.lower()
        # supplementing with a working path to a file (presumably a tree file)
        elif os.path.exists(os.path.abspath(args.guidetree)):
            Configs.guideTree = os.path.abspath(args.guidetree)
        # otherwise use the default Configs.guideTree value
        #Configs.guideTree = os.path.abspath(args.guidetree) if args.guidetree is not None else Configs.guideTree
        #if args.guidetree is not None:
        #    Configs.guideTree = os.path.abspath(args.guidetree) if os.path.exists(os.path.abspath(args.guidetree)) else args.guidetree

    # Subalignment arguments may be files or directories; directories are expanded.
    Configs.subalignmentPaths = []
    for p in args.subalignments:
        path = os.path.abspath(p)
        if os.path.isdir(path):
            for filename in os.listdir(path):
                Configs.subalignmentPaths.append(os.path.join(path, filename))
        else:
            Configs.subalignmentPaths.append(path)

    # Same expansion for backbone alignment arguments.
    Configs.backbonePaths = []
    for p in args.backbones:
        path = os.path.abspath(p)
        if os.path.isdir(path):
            for filename in os.listdir(path):
                Configs.backbonePaths.append(os.path.join(path, filename))
        else:
            Configs.backbonePaths.append(path)

    # numprocs <= 0 means "use every available CPU".
    if args.numprocs > 0:
        Configs.numCores = args.numprocs
    else:
        Configs.numCores = os.cpu_count()

    Configs.decompositionMaxSubsetSize = args.maxsubsetsize
    Configs.decompositionMaxNumSubsets = args.maxnumsubsets
    Configs.decompositionStrategy = args.decompstrategy
    Configs.decompositionSkeletonSize = args.decompskeletonsize
    Configs.dataType = args.datatype

    Configs.graphBuildMethod = args.graphbuildmethod
    # String flags arrive as "true"/"false" text; normalize to booleans.
    Configs.graphBuildHmmExtend = args.graphbuildhmmextend.lower() == "true"
    Configs.graphBuildRestrict = args.graphbuildrestrict.lower() == "true"
    Configs.graphClusterMethod = args.graphclustermethod
    Configs.graphTraceMethod = args.graphtracemethod
    Configs.graphTraceOptimize = args.graphtraceoptimize.lower() == "true"

    Configs.mafftRuns = args.mafftruns
    Configs.mafftSize = args.mafftsize
    Configs.mclInflationFactor = args.inflationfactor

    Configs.constrain = args.constrain.lower() == "true"
    Configs.onlyGuideTree = args.onlyguidetree.lower() == "true"
    Configs.recurse = args.recurse.lower() == "true"
    Configs.recurseGuideTree = args.recurseguidetree
    Configs.recurseThreshold = args.recursethreshold

    Configs.logPath = os.path.join(Configs.workingDir, "log.txt")
    Configs.errorPath = os.path.join(Configs.workingDir, "log_errors.txt")
    Configs.debugPath = os.path.join(Configs.workingDir, "log_debug.txt")

    Configs.alignmentSizeLimit = args.alignsizelimit

    '''
    6.1.2021 - added by Chengze Shen
    new config for taking in the backbone weights
    '''
    Configs.backboneWeightsPath = args.backboneWeightsPath
    '''
    6.8.2022 - added by Chengze Shen
    new config for customized MCL/MAFFT path
    '''
    if args.mclpath:
        Configs.mclPath = os.path.abspath(args.mclpath)
    if args.mafftpath:
        Configs.mafftPath = os.path.abspath(args.mafftpath)
    if args.hmmalignpath:
        Configs.hmmalignPath = os.path.abspath(args.hmmalignpath)
    if args.hmmbuildpath:
        Configs.hmmbuildPath = os.path.abspath(args.hmmbuildpath)
    if args.hmmsearchpath:
        Configs.hmmsearchPath = os.path.abspath(args.hmmsearchpath)
    if args.fasttreepath:
        Configs.fasttreePath = os.path.abspath(args.fasttreepath)
--------------------------------------------------------------------------------
/witch_msa/tools/magus/tools/external_tools.py:
--------------------------------------------------------------------------------
'''
Created on Apr 14, 2020

@author: Vlad
'''

import subprocess
import os
import random
import shutil
from configuration import Configs
from tasks.task import Task

def runCommand(**kwargs):
    """Run an external shell command synchronously, then move any produced files.

    kwargs: "command" (shell string), "workingDir", optional "fileCopyMap"
    ({src: dest} moved after a successful run). Raises on a nonzero exit code.
    """
    command = kwargs["command"]
    Configs.log("Running an external tool, command: {}".format(command))
    runner = subprocess.run(command, shell = True, cwd = kwargs["workingDir"], universal_newlines = True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    try:
        runner.check_returncode()
    except:
        # Log the failing command with its exit code and combined output,
        # then re-raise so the caller/task machinery sees the failure.
        Configs.error("Command encountered error: {}".format(command))
        Configs.error("Exit code: {}".format(runner.returncode))
        Configs.error("Output: {}".format(runner.stdout))
        raise
    for srcPath, destPath in kwargs.get("fileCopyMap", {}).items():
        shutil.move(srcPath, destPath)

def runClustalOmegaGuideTree(fastaPath, workingDir, outputPath, threads = 1):
    """Build a Task that runs Clustal Omega only to produce a guide tree file."""
    # Write to a temp file first; runCommand moves it to outputPath on success.
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.clustalPath]
    args.extend(["-i", fastaPath, "--max-hmm-iterations=-1", "--guidetree-out={}".format(tempPath)])
    args.extend(["--threads={}".format(threads)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def generateMafftFilePathMap(inputPaths, outputDir):
    """Map each input path to an output path "mafft_<basename>" in outputDir."""
    mafftMap = {inputPath : os.path.join(outputDir, "mafft_{}".format(os.path.basename(inputPath))) for inputPath in inputPaths}
    return mafftMap

def buildMafftAlignments(inputOutputPathMap):
    """Create one MAFFT alignment Task per (input, output) pair."""
    tasks = [buildMafftAlignment(inputPath, outputPath) for inputPath, outputPath in inputOutputPathMap.items()]
    return tasks

def buildMafftAlignment(inputPath, outputPath, subtablePath = None):
    # Uses the global working dir and core count from Configs.
    return runMafft(inputPath, subtablePath, Configs.workingDir, outputPath, Configs.numCores)

def runMafft(fastaPath, subtablePath, workingDir, outputPath, threads = 1):
    """Build a Task running MAFFT L-INS-i; optional --merge subtable constrains it."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.mafftPath, "--localpair", "--maxiterate", "1000", "--ep", "0.123",
            "--quiet", "--thread", str(threads), "--anysymbol"]
    if subtablePath is not None:
        args.extend(["--merge", subtablePath])
    # The ">" redirection works because runCommand executes with shell=True.
    args.extend([fastaPath, ">", tempPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runMafftGuideTree(fastaPath, workingDir, outputPath, threads = 1):
    """Build a Task that runs MAFFT --parttree just to emit a guide tree.

    MAFFT writes the tree next to the input as "<input>.tree"; that file (not
    the alignment redirected to tempPath) is moved to outputPath.
    """
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    treeFile = os.path.join(os.path.dirname(fastaPath), "{}.tree".format(os.path.basename(fastaPath)))
    args = [Configs.mafftPath, "--retree", "0", "--treeout", "--parttree",
            "--quiet", "--thread", str(threads), "--anysymbol"]
    args.extend(["--partsize", "1000"])
    args.extend([fastaPath, ">", tempPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {treeFile : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runMcl(matrixPath, inflation, workingDir, outputPath):
    """Build a Task that clusters an --abc format matrix file with MCL."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.mclPath, matrixPath, "--abc", "-o", tempPath]
    if inflation is not None:
        args.extend(["-I", str(inflation)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runMlrMcl(matrixPath, granularity, balance, inflation, workingDir, outputPath):
    """Build a Task that clusters with MLR-MCL (optional -c/-b/-i parameters)."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.mlrmclPath, matrixPath, "-o", tempPath]
    if granularity is not None:
        args.extend(["-c", str(granularity)])
    if balance is not None:
        args.extend(["-b", str(balance)])
    if inflation is not None:
        args.extend(["-i", str(inflation)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runFastTree(fastaFilePath, workingDir, outputPath, mode = "normal", intree = None):
    """Build a Task that infers a tree with FastTree.

    Model is chosen from the inferred data type (-lg for protein, -nt -gtr
    otherwise); mode selects speed/accuracy trade-offs.
    """
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))

    args = [Configs.fasttreePath]
    if Configs.inferDataType(fastaFilePath) == "protein":
        args.extend(["-lg"])
    else:
        args.extend(["-nt", "-gtr"])

    if intree is not None:
        args.extend(["-intree", intree])

    if mode == "fast":
        args.extend(["-fastest", "-nosupport"])
    elif mode == "faster":
        args.extend(["-fastest", "-nosupport", "-mlnni", "4" ])
    elif mode == "noml":
        args.extend(["-fastest", "-nosupport", "-noml"])

    args.extend([fastaFilePath, ">", tempPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runRaxmlNg(fastaFilePath, workingDir, outputPath, threads = 8):
    """Build a Task that infers a tree with RAxML-NG (single parsimony start tree)."""
    # raxml-ng --msa prim.phy --model GTR+G --prefix T4 --threads 2 --seed 2 --tree pars{25},rand{25}
    baseName = os.path.basename(outputPath).replace(".","")
    raxmlFile = os.path.join(workingDir, "{}.raxml.bestTree".format(baseName))
    seed = random.randint(1, 1000000)
    args = [Configs.raxmlPath,
            "--msa", fastaFilePath,
            "--prefix", baseName,
            "--threads", str(threads),
            "--seed", str(seed)]

    if Configs.inferDataType(fastaFilePath) == "protein":
        args.extend(["--model", "LG+G"])
    else:
        args.extend(["--model", "GTR+G"])

    args.extend(["--tree", "pars{{{}}}".format(1)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {raxmlFile : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runHmmBuild(alignmentPath, workingDir, outputPath):
    """Build a Task that builds an HMM from an aligned FASTA with hmmbuild."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.hmmbuildPath,'--ere', '0.59', "--cpu", "1"]
    args.extend(["--symfrac", "0.0", "--informat", "afa", tempPath, alignmentPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runHmmAlign(hmmModelPath, fragPath, workingDir, outputPath):
    """Build a Task that aligns fragments against an HMM with hmmalign."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.hmmalignPath, "-o", tempPath]
    args.extend([hmmModelPath, fragPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runHmmSearch(hmmModelPath, fragPath, workingDir, outputPath):
    """Build a Task that scores fragments against an HMM with hmmsearch."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.hmmsearchPath,"--noali", "--cpu", "1", "-o", tempPath, "-E", "99999999", "--max"]
    args.extend([hmmModelPath, fragPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)
--------------------------------------------------------------------------------
/witch_msa/tools/magus/align/merge/graph_trace/rg_fast_search.py:
--------------------------------------------------------------------------------
'''
Created on Aug 23, 2020

@author: Vlad
'''

import heapq
from collections import deque

from configuration import Configs

def rgFastSearch(graph):
    """Find a graph trace via fast recursive region-growing; sets graph.clusters."""
    Configs.log("Finding graph trace with fast region-growing search..")

    k = len(graph.context.subalignments)
    # Per-subalignment node index ranges [lowerBound[i], upperBound[i]).
    lowerBound = [graph.subsetMatrixIdx[i] for i in range(k)]
    upperBound = [graph.subsetMatrixIdx[i] + graph.subalignmentLengths[i] for i in range(k)]
    cuts = rgFastCluster(graph, lowerBound, upperBound, True)
    graph.clusters = cutsToClusters(graph, cuts)

def rgFastCluster(graph, lowerBound, upperBound, enforceTrace = True):
    """Recursively split the bounded region into cuts until no finer split exists."""
    initialCuts = initialSplit(graph, lowerBound, upperBound, enforceTrace)
    if len(initialCuts)
== 2: 24 | return initialCuts 25 | #Configs.log("Starting with {} coarse cuts..".format(len(initialCuts))) 26 | cuts = [] 27 | for i in range(len(initialCuts)-1): 28 | intervalCuts = rgFastCluster(graph, initialCuts[i], initialCuts[i+1], enforceTrace) 29 | cuts.extend(intervalCuts[:-1]) 30 | 31 | cuts.append(list(upperBound)) 32 | #Configs.log("Returning {} fine cuts..".format(len(cuts))) 33 | return cuts 34 | 35 | def initialSplit(graph, lowerBound, upperBound, enforceTrace = True): 36 | k = len(graph.context.subalignments) 37 | baseIdx = max(range(k), key = lambda x : upperBound[x] - lowerBound[x]) 38 | #baseIdx = min(range(k), key = lambda x : upperBound[x] - lowerBound[x] if upperBound[x] - lowerBound[x] >= 2 else float('inf')) 39 | baseLength = upperBound[baseIdx] - lowerBound[baseIdx] 40 | if baseLength < 2: 41 | return [list(lowerBound), list(upperBound)] 42 | 43 | clusters = initialSplitExpansion(graph, lowerBound, upperBound, baseIdx, baseLength) 44 | #clusters = initialSplitExpansionSimple(graph, lowerBound, upperBound, baseIdx, baseLength) 45 | 46 | cuts = clustersToCuts(graph, lowerBound, upperBound, clusters) 47 | 48 | return cuts 49 | 50 | def initialSplitExpansion(graph, lowerBound, upperBound, baseIdx, baseLength): 51 | k = len(graph.context.subalignments) 52 | clusters = [[lowerBound[baseIdx] + i] for i in range(baseLength)] 53 | #idxSets = [set([baseIdx]) for i in range(baseLength)] 54 | idxSets = {(i, baseIdx) : lowerBound[baseIdx] + i for i in range(baseLength)} 55 | usedNodes = set() 56 | weightMap = {} 57 | 58 | boundsMap = {} 59 | for i in range(k): 60 | boundsMap[0, i] = (lowerBound[i]-1, upperBound[i]) 61 | boundsMap[baseLength-1, i] = (lowerBound[i]-1, upperBound[i]) 62 | 63 | heap = [] 64 | for node in range(lowerBound[baseIdx], upperBound[baseIdx]): 65 | for nbr, value in graph.matrix[node].items(): 66 | i, pos = graph.matSubPosMap[nbr] 67 | #for i in range(k): 68 | # for nbr, value in graph.nodeEdges[node][i]: 69 | 70 | if nbr < 
lowerBound[i]: 71 | continue 72 | if nbr >= upperBound[i]: 73 | continue 74 | #break 75 | idx = node - lowerBound[baseIdx] 76 | if (idx, i) in idxSets: 77 | continue 78 | 79 | heapq.heappush(heap, (-1*value, node, nbr, idx)) 80 | weightMap[idx, nbr] = value 81 | 82 | while len(heap) > 0: 83 | value, a, b, idx = heapq.heappop(heap) 84 | if b in usedNodes: 85 | continue 86 | #asub, apos = graph.matSubPosMap[a] 87 | bsub, bpos = graph.matSubPosMap[b] 88 | 89 | if (idx, bsub) in idxSets: 90 | continue 91 | lower, upper = getBounds(boundsMap, baseLength, idx, bsub) 92 | if not (b > lower and b < upper): 93 | continue 94 | 95 | addBounds(graph, boundsMap, baseLength, idx, b) 96 | clusters[idx].append(b) 97 | idxSets[idx, bsub] = b 98 | usedNodes.add(b) 99 | 100 | for nbr, value in graph.matrix[b].items(): 101 | i, pos = graph.matSubPosMap[nbr] 102 | #for i in range(k): 103 | if (idx, i) in idxSets: 104 | continue 105 | lower, upper = getBounds(boundsMap, baseLength, idx, i) 106 | #for nbr, value in graph.nodeEdges[b][i]: 107 | if nbr in usedNodes: 108 | continue 109 | 110 | if nbr <= lower: 111 | continue 112 | if nbr >= upper: 113 | #break 114 | continue 115 | 116 | #print(weightMap.get((idx, nbr), 0)) 117 | weight = value + weightMap.get((idx, nbr), 0) 118 | weightMap[idx, nbr] = weight 119 | heapq.heappush(heap, (-1*weight, b, nbr, idx)) 120 | 121 | return clusters 122 | 123 | 124 | def getBounds(boundsMap, baseLength, idx, asub): 125 | a, b = 0, baseLength - 1 126 | if idx == a: 127 | return boundsMap[a, asub] 128 | if idx == b: 129 | return boundsMap[b, asub] 130 | 131 | midpoint = int((a+b)*0.5) 132 | while (midpoint, asub) in boundsMap: 133 | if idx == midpoint: 134 | return boundsMap[midpoint, asub] 135 | elif idx > midpoint: 136 | a = midpoint 137 | elif idx < midpoint: 138 | b = midpoint 139 | midpoint = int((a+b)*0.5) 140 | la, ua = boundsMap[a, asub] 141 | lb, ub = boundsMap[b, asub] 142 | return (la, ub) 143 | 144 | def addBounds(graph, boundsMap, 
baseLength, idx, node): 145 | asub, apos = graph.matSubPosMap[node] 146 | a, b = 0, baseLength - 1 147 | 148 | while True: 149 | la, ua = boundsMap[a, asub] 150 | lb, ub = boundsMap[b, asub] 151 | if idx == a: 152 | boundsMap[a, asub] = (node, node) 153 | return 154 | elif node < ua: 155 | boundsMap[a, asub] = (la, node) 156 | 157 | if idx == b: 158 | boundsMap[b, asub] = (node, node) 159 | return 160 | elif node > lb: 161 | boundsMap[b, asub] = (node, ub) 162 | 163 | midpoint = int((a+b)*0.5) 164 | if idx == midpoint: 165 | boundsMap[midpoint, asub] = (node, node) 166 | return 167 | elif (midpoint, asub) not in boundsMap: 168 | boundsMap[midpoint, asub] = (la, ub) 169 | 170 | if idx > midpoint: 171 | a = midpoint 172 | elif idx < midpoint: 173 | b = midpoint 174 | 175 | 176 | def initialSplitExpansionSimple(graph, lowerBound, upperBound, baseIdx, baseLength): 177 | k = len(graph.context.subalignments) 178 | clusters = [[lowerBound[baseIdx] + i] for i in range(baseLength)] 179 | #idxSets = [set([baseIdx]) for i in range(baseLength)] 180 | idxSets = {(i, baseIdx) : lowerBound[baseIdx] + i for i in range(baseLength)} 181 | usedNodes = set() 182 | weightMap = {} 183 | 184 | heap = [] 185 | for node in range(lowerBound[baseIdx], upperBound[baseIdx]): 186 | for i in range(k): 187 | for nbr, value in graph.nodeEdges[node][i]: 188 | if nbr < lowerBound[i]: 189 | continue 190 | if nbr >= upperBound[i]: 191 | break 192 | idx = node - lowerBound[baseIdx] 193 | heapq.heappush(heap, (-1*value, node, nbr, idx)) 194 | weightMap[idx, nbr] = value 195 | 196 | while len(heap) > 0: 197 | value, a, b, idx = heapq.heappop(heap) 198 | if b in usedNodes: 199 | continue 200 | #asub, apos = graph.matSubPosMap[a] 201 | bsub, bpos = graph.matSubPosMap[b] 202 | #if bsub in idxSets[idx]: 203 | # continue 204 | if (idx, bsub) in idxSets or idxSets.get((idx-1, bsub), 0) > b or idxSets.get((idx+1, bsub), upperBound[bsub]) < b: 205 | continue 206 | 207 | 208 | clusters[idx].append(b) 209 | 
def clustersToCuts(graph, lowerBound, upperBound, clusters):
    """
    Convert an ordered list of clusters into a list of cut vectors
    (one position per subalignment). The first cut is the lower bound, the
    last the upper bound; each intermediate cut advances every subalignment
    position to the furthest node seen in the preceding clusters.
    """
    cuts = [list(lowerBound)]
    frontier = list(lowerBound)
    for pos, cluster in enumerate(clusters):
        if pos == 0:
            # the first cluster is already covered by the lower-bound cut
            continue
        for node in cluster:
            sub, _ = graph.matSubPosMap[node]
            if node > frontier[sub]:
                frontier[sub] = node
        cuts.append(frontier)
        frontier = list(frontier)
    cuts.append(list(upperBound))
    return cuts

def cutsToClusters(graph, cuts):
    """
    Inverse of clustersToCuts: between each pair of adjacent cuts, collect
    every node of every subalignment into one cluster. *graph* is accepted
    for signature symmetry but not used.
    """
    clusters = []
    for left, right in zip(cuts, cuts[1:]):
        members = []
        for lo, hi in zip(left, right):
            members.extend(range(lo, hi))
        clusters.append(members)
    return clusters
def buildGraph(context):
    """
    Build the MAGUS alignment graph for this merge context.

    Creates the AlignmentGraph, requests backbone alignment tasks (unless a
    graph file already exists on disk), waits for the subalignments, and then
    either reads the existing graph from file or builds the sparse matrix
    from the completed backbones and writes it out.
    """
    time1 = time.time()

    context.graph = AlignmentGraph(context)
    context.initializeSequences()

    if os.path.exists(context.graph.graphPath):
        Configs.log("Found existing graph file {}".format(context.graph.graphPath))
    else:
        requestBackboneTasks(context)

    context.awaitSubalignments()
    context.graph.initializeMatrix()

    if os.path.exists(context.graph.graphPath):
        context.graph.readGraphFromFile(context.graph.graphPath)
    else:
        context.initializeBackboneSequenceMapping()
        # 6.1.2021 - added by Chengze Shen: initialize the user-defined
        # backbone weights before the matrix is built
        context.initializeBackboneWeights()

        buildMatrix(context)
        context.graph.writeGraphToFile(context.graph.graphPath)

    time2 = time.time()
    Configs.log("Built the alignment graph in {} sec..".format(time2-time1))

def requestBackboneTasks(context):
    """
    Populate context.backbonePaths / backboneTasks according to the
    configured graph-build method: user-supplied backbone files, MAFFT runs,
    HMM-extended subalignments, or the initial decomposition alignment.
    """
    if len(context.backbonePaths) > 0:
        Configs.log("Using {} user-defined backbone files..".format(len(context.backbonePaths)))
        # (a redundant self-assignment of context.backbonePaths was removed
        # here -- it had no effect)
        for path in context.backbonePaths:
            context.backboneTaxa.update(sequenceutils.readFromFasta(path))

    elif Configs.graphBuildMethod == "mafft":
        Configs.log("Using {} MAFFT backbones..".format(Configs.mafftRuns))
        requestMafftBackbones(context)

    elif Configs.graphBuildMethod == "subsethmm":
        Configs.log("Using {} HMM-extended subalignments as backbone files..".format(len(context.subalignmentPaths)))
        context.backbonePaths = context.subalignmentPaths
        context.backboneExtend.update(context.backbonePaths)

    elif Configs.graphBuildMethod == "initial":
        Configs.log("Using the initial decomposition alignment as the single backbone..")
        initialAlignPath = os.path.join(context.workingDir, "decomposition", "initial_tree", "initial_insert_align.txt")
        context.backbonePaths = [initialAlignPath]

    # unconstrained merges also feed the subalignments themselves as backbones
    if not Configs.constrain and Configs.graphBuildMethod != "subsethmm":
        context.backbonePaths.extend(context.subalignmentPaths)

def requestMafftBackbones(context):
    """
    Request one MAFFT alignment task per configured run, reusing any backbone
    file that already exists on disk. Each backbone samples roughly
    Configs.mafftSize taxa, spread evenly across the subsets.
    """
    numTaxa = max(1, int(Configs.mafftSize/len(context.subsetPaths)))

    for n in range(Configs.mafftRuns):
        unalignedFile = os.path.join(context.graph.workingDir, "backbone_{}_unalign.txt".format(n+1))
        alignedFile = os.path.join(context.graph.workingDir, "backbone_{}_mafft.txt".format(n+1))
        if os.path.exists(alignedFile):
            Configs.log("Existing backbone file found: {}".format(alignedFile))
            backbone = sequenceutils.readFromFasta(alignedFile)
            context.backbonePaths.append(alignedFile)
        else:
            backbone = assignBackboneTaxa(context, numTaxa, unalignedFile)
            backboneTask = external_tools.buildMafftAlignment(unalignedFile, alignedFile)
            context.backboneTasks.append(backboneTask)

        if Configs.graphBuildHmmExtend:
            context.backboneExtend.add(alignedFile)
        else:
            context.backboneTaxa.update(backbone)
    task.submitTasks(context.backboneTasks)

def buildMatrix(context):
    """
    Feed every backbone alignment into the graph matrix: first the MAFFT
    tasks as they complete, then any remaining user/subset backbone files
    that were not produced by a task.
    """
    addedBackbones = set()
    for backboneTask in task.asCompleted(context.backboneTasks):
        addAlignmentFileToGraph(context, backboneTask.outputFile)
        addedBackbones.add(backboneTask.outputFile)

    for backboneFile in context.backbonePaths:
        if backboneFile not in addedBackbones:
            addAlignmentFileToGraph(context, backboneFile)
def assignBackboneTaxa(context, numTaxa, unalignedFile):
    """
    Randomly sample up to *numTaxa* taxa from every subset, write their
    unaligned sequences to *unalignedFile*, and return the sampled
    {taxon: sequence} mapping.
    """
    chosen = {}
    for subset in context.subsets:
        # shuffle in place so each backbone draws a fresh random sample
        random.shuffle(subset)
        chosen.update((taxon, context.unalignedSequences[taxon]) for taxon in subset[:numTaxa])
    sequenceutils.writeFasta(chosen, unalignedFile)
    return chosen
def addAlignmentFileToGraph(context, alignedFile):
    """
    Fold one backbone alignment file into the shared graph matrix: read the
    alignment, optionally HMM-extend it with missing taxa, map its columns to
    global matrix positions, and accumulate weighted co-occurrence counts.
    """
    Configs.log("Feeding backbone {} to the graph..".format(alignedFile))
    backboneAlign = sequenceutils.readFromFasta(alignedFile)
    # every row of the alignment has equal length; read it off any entry
    alignmentLength = len(next(iter(backboneAlign.values())).seq)

    # backbones flagged for extension get taxa missing from the backbone
    # aligned in via hmmalign before being fed to the graph
    if alignedFile in context.backboneExtend:
        extensionTasks = requestHmmExtensionTasks(context, backboneAlign, alignedFile)
        task.submitTasks(extensionTasks)
        for extensionTask in task.asCompleted(extensionTasks):
            backboneAlign.update(sequenceutils.readFromStockholm(extensionTask.outputFile, includeInsertions=True))

    alignmap = backboneToAlignMap(context, backboneAlign, alignmentLength)
    Configs.log("Constructed backbone alignment map from {}".format(alignedFile))

    # 6.1.2021 - added by Chengze Shen:
    # read in the weight for the alignedFile if the weight exists
    weight = 1
    if context.backboneWeightsPath:
        # the weight has to be defined for this to work
        weight = context.backboneWeights.get(alignedFile)

    graph = context.graph
    with graph.matrixLock:
        for l in range(alignmentLength):
            for a, avalue in alignmap[l].items():
                for b, bvalue in alignmap[l].items():

                    if Configs.graphBuildRestrict:
                        asub, apos = graph.matSubPosMap[a]
                        bsub, bpos = graph.matSubPosMap[b]
                        # skip edges between different columns of the
                        # same subalignment
                        if asub == bsub and apos != bpos:
                            continue

                    # 6.1.2021 - modified by Chengze Shen:
                    # instead of a plain co-occurrence count, scale by the
                    # user-defined backbone weight (assumes the weighting is
                    # tied to a single query sequence per backbone)
                    graph.matrix[a][b] = graph.matrix[a].get(b,0) \
                            + avalue * bvalue * weight
    # 7.8.2021 - added by Chengze Shen:
    # log the weight information for the backbone too
    Configs.log("Fed backbone {} to the graph - weight = {}.".format(
        alignedFile, weight))

def backboneToAlignMap(context, backboneAlign, alignmentLength):
    """
    Map each backbone column to the global matrix positions it contains.
    Returns a list with one dict per backbone column:
    {matrix position: occurrence count}.
    """
    alignmap = [{} for i in range(alignmentLength)]
    t = 0

    for taxon in backboneAlign:
        subsetIdx = context.taxonSubalignmentMap[taxon]
        subsetseq = context.backboneSubalignment[taxon].seq
        unalignedseq = context.unalignedSequences[taxon].seq
        backboneseq = backboneAlign[taxon].seq

        # posarray[i] = subalignment column holding unaligned letter i
        i = 0
        posarray = [0] * len(unalignedseq)
        for n in range(len(subsetseq)):
            if subsetseq[n] == unalignedseq[i]:
                posarray[i] = n
                i = i + 1
                if i == len(unalignedseq):
                    break

        # walk the backbone row: n counts backbone match columns
        # (upper-case, non-'.'), i tracks the unaligned letter index
        i = 0
        n = 0
        for c in backboneseq:
            if i == len(unalignedseq):
                break
            if c == unalignedseq[i]:
                position = int(context.graph.subsetMatrixIdx[subsetIdx] + posarray[i])
                alignmap[n][position] = alignmap[n].get(position, 0) + 1
            if c.upper() == unalignedseq[i]:
                i = i + 1
            if c == c.upper() and c != '.':
                n = n + 1

        t = t + 1

    return alignmap
def requestHmmExtensionTasks(context, backbone, alignedFile):
    """
    Build an HMM over *alignedFile* and request hmmalign tasks that align
    every taxon missing from *backbone* against it. Returns the (not yet
    submitted) list of alignment tasks.
    """
    baseName = os.path.basename(alignedFile)
    hmmDir = os.path.join(context.graph.workingDir, "hmm_{}".format(baseName))
    extensionUnalignedFile = os.path.join(hmmDir, "queries.txt")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    if not os.path.exists(hmmDir):
        os.makedirs(hmmDir)

    # the query set is every sequence not already in the backbone
    backboneExtension = {}
    for taxon in context.unalignedSequences:
        if not taxon in backbone:
            backboneExtension[taxon] = context.unalignedSequences[taxon]

    sequenceutils.writeFasta(backboneExtension, extensionUnalignedFile)
    # the HMM build runs synchronously here; only the hmmalign jobs are
    # returned as tasks for the caller to submit
    buildTask = hmmutils.buildHmmOverAlignment(alignedFile, hmmPath)
    buildTask.run()
    alignTasks = hmmutils.hmmAlignQueries(hmmPath, extensionUnalignedFile)
    return alignTasks
class TaskManager():
    """
    Process-wide bookkeeping for task execution. Only the main thread may
    submit tasks; a single manager thread launches and tracks them, and a
    worker pool runs the non-serial task types (e.g. MAFFT).
    """
    # on-disk coordination files, shared between concurrent MAGUS processes
    runningTasksFile = None
    lockTasksFile = None

    # the single-threaded manager loop and its wake-up machinery
    managerPool = None
    managerFuture = None
    managerSignal = threading.Event()
    managerLock = threading.Lock()
    managerStopSignal = False

    # used to hand serial tasks to the waiting observer (main) thread
    observerSignal = threading.Event()
    observerWaiting = False
    observerTask = None

    # task bookkeeping; mutated under managerLock
    waitingTasks = {}
    submittedTasks = set()
    runningTasks = set()
    finishedTasks = set()
    failedTasks = set()

    # worker pool state
    taskPool = None
    threadsUsed = 0
    lastFilesCheckTime = 0
    lastDebugTime = 0
    # task types that must run serially on the main thread, not in the pool
    serialTaskTypes = {"runAlignmentTask", "buildInducedSubalignment", "compressSubalignment"}
    contextStack = []

def startTaskManager():
    """Create the task directories and spin up the manager loop and worker pool."""
    Configs.debug("Starting up the task manager..")

    tasksDir = os.path.join(Configs.workingDir, "tasks")
    TaskManager.pendingTasksDir = os.path.join(tasksDir, "tasks_pending")
    TaskManager.runningTasksFile = os.path.join(tasksDir, "tasks_running.txt")
    TaskManager.lockTasksFile = os.path.join(tasksDir, "tasks.lock")
    if not os.path.exists(TaskManager.pendingTasksDir):
        os.makedirs(TaskManager.pendingTasksDir)

    # one dedicated thread runs the manager loop; the pool runs worker tasks
    TaskManager.managerPool = concurrent.futures.ThreadPoolExecutor(max_workers = 1)
    TaskManager.managerFuture = TaskManager.managerPool.submit(runTaskManager)
    TaskManager.taskPool = concurrent.futures.ThreadPoolExecutor(max_workers = Configs.numCores)
    Configs.debug("Task manager is up..")

def stopTaskManager():
    """Signal the manager loop to stop, then drain running tasks and shut down."""
    TaskManager.managerStopSignal = True
    with TaskManager.managerLock:
        TaskManager.managerSignal.set()
    try:
        Configs.debug("Winding down the task manager..")
        # result() propagates any exception raised inside the manager loop
        TaskManager.managerFuture.result()
    finally:
        Configs.log("Waiting for {} tasks to finish..".format(len(TaskManager.runningTasks)))
        TaskManager.taskPool.shutdown()
        dealWithFinishedTasks()
        TaskManager.managerPool.shutdown()
        Configs.debug("Task manager stopped..")
def runTaskManager():
    """
    Manager loop: each cycle (or early wake-up via managerSignal) processes
    errors, finished tasks, pending tasks and waiting tasks under the
    manager lock. Always wakes the observer thread on exit so a crashed
    manager cannot leave the main thread blocked.
    """
    try:
        while not TaskManager.managerStopSignal:
            with TaskManager.managerLock:
                dealWithErrors()
                dealWithFinishedTasks()
                dealWithPendingTasks()
                dealWithWaitingTasks()
                # clear under the lock so a set() racing with the cycle
                # above is not lost
                TaskManager.managerSignal.clear()
            TaskManager.managerSignal.wait(5)
    finally:
        TaskManager.observerSignal.set()

def dealWithErrors():
    """Surface failed tasks: log each one and re-raise its stored exception."""
    for task in TaskManager.failedTasks:
        Configs.error("Task manager found a failed task: {}".format(task.outputFile))
        if task.future is not None:
            # result() re-raises the exception that failed the task
            task.future.result()

def dealWithFinishedTasks():
    """
    Remove finished/failed alignment tasks from the shared running-tasks
    file, requeue failed tasks as pending, and reset the per-cycle sets.
    """
    stoppedRunning = set(t for t in TaskManager.finishedTasks | TaskManager.failedTasks if t.taskType == "runAlignmentTask")
    if len(stoppedRunning) > 0:
        with files.FileLock(TaskManager.lockTasksFile):
            runningTasks = files.readTasksFromFile(TaskManager.runningTasksFile)
            stillRunningTasks = [t for t in runningTasks if t not in stoppedRunning]
            if len(stillRunningTasks) < len(runningTasks):
                files.writeTasksToFile(stillRunningTasks, TaskManager.runningTasksFile, append = False)

    if len(TaskManager.failedTasks) > 0:
        # write failed tasks back to a pending file; 0 => launch none now
        processPendingTasks(TaskManager.failedTasks, 0, None)

    TaskManager.finishedTasks = set()
    TaskManager.failedTasks = set()

def dealWithPendingTasks():
    """
    Launch newly submitted tasks, then pull more work from the pending task
    files if launch capacity remains.
    """
    # at most one launch per manager cycle, and none when no threads are free
    numToLaunch = min(1, Configs.numCores - TaskManager.threadsUsed)
    newTasks = []
    for t in TaskManager.submittedTasks:
        if os.path.exists(t.outputFile):
            # output already produced (e.g. by a previous or concurrent run)
            Configs.log("File already exists: {}".format(t.outputFile))
            t.isFinished = True
            TaskManager.observerSignal.set()
        else:
            newTasks.append(t)
            TaskManager.waitingTasks[t.outputFile] = t

    if len(newTasks) > 0:
        launchedTasks, remainingTasks = processPendingTasks(newTasks, numToLaunch, None)
        numToLaunch = numToLaunch - len(launchedTasks)
    if numToLaunch > 0:
        # shuffle so concurrent processes don't all contend on the same file
        pendingFiles = [os.path.join(TaskManager.pendingTasksDir, file) for file in os.listdir(TaskManager.pendingTasksDir) if file.endswith(".txt")]
        random.shuffle(pendingFiles)
        for taskFile in pendingFiles:
            if numToLaunch <= 0:
                break
            launchedTasks, remainingTasks = processPendingTasks(None, numToLaunch, taskFile)
            numToLaunch = numToLaunch - len(launchedTasks)

    TaskManager.submittedTasks = set()
def dealWithWaitingTasks():
    """
    Poll (at most every 5s) for output files of tasks being run elsewhere,
    marking them finished when the file appears; every 60s, log what is
    still running and still being waited on.
    """
    timeSinceFileCheck = time.time() - TaskManager.lastFilesCheckTime
    if timeSinceFileCheck >= 5:
        # a waiting task is complete once its output file appears on disk
        for file, task in list(TaskManager.waitingTasks.items()):
            if os.path.exists(file):
                Configs.debug("Detected task completion: {}".format(file))
                TaskManager.waitingTasks.pop(file)
                task.isFinished = True
                TaskManager.observerSignal.set()
        TaskManager.lastFilesCheckTime = time.time()

    timeSinceDebug = time.time() - TaskManager.lastDebugTime
    if timeSinceDebug >= 60:
        TaskManager.lastDebugTime = time.time()
        for task in TaskManager.runningTasks:
            # serial tasks may have no future, hence the N/A fallback
            Configs.debug("Still running task {}, status {}".format(task.outputFile, task.future._state if task.future is not None else "N/A"))
        for file in TaskManager.waitingTasks:
            Configs.debug("Still waiting on task {}".format(file))

def processPendingTasks(tasks, numTasksToLaunch, taskFile):
    """
    Launch up to *numTasksToLaunch* of *tasks* (or of the tasks read from
    *taskFile* when tasks is None) and write the remainder back to the
    pending file. Alignment tasks are cross-checked against the shared
    running-tasks file so concurrent MAGUS processes don't duplicate work.
    Returns (launchedTasks, remainingTasks).
    """
    if taskFile is None:
        # spread new tasks across up to 1000 randomly chosen pending files
        taskFile = os.path.join(TaskManager.pendingTasksDir, "task_file_{:03d}.txt".format(random.randint(0,999)))

    newTasks = True
    with files.FileLock(taskFile.replace(".txt", ".lock")):
        if tasks is None:
            tasks = files.readTasksFromFile(taskFile)
            newTasks = False

        alignTasks = [t for t in tasks if t.taskType == "runAlignmentTask"]
        if len(alignTasks) > 0:
            with files.FileLock(TaskManager.lockTasksFile):
                runningTasks = set(files.readTasksFromFile(TaskManager.runningTasksFile))
                if len(runningTasks) > 0:
                    # drop tasks another process is already running
                    tasks = [t for t in tasks if t not in runningTasks]
                launchedTasks, remainingTasks = launchTasks(tasks, numTasksToLaunch)
                writeRunningTasks = [t for t in launchedTasks if t.taskType == "runAlignmentTask"]
                files.writeTasksToFile(writeRunningTasks, TaskManager.runningTasksFile, append = True)
        else:
            launchedTasks, remainingTasks = launchTasks(tasks, numTasksToLaunch)

        # append when the tasks are new; overwrite when rewriting a task file
        files.writeTasksToFile(remainingTasks, taskFile, append = newTasks)
        if len(launchedTasks) > 0:
            Configs.debug("Launched {} tasks and deferred {} tasks..".format(len(launchedTasks), len(remainingTasks)))

    return launchedTasks, remainingTasks
def launchTasks(tasks, numTasksToLaunch):
    """
    Attempt to launch up to *numTasksToLaunch* of *tasks*, capped by the
    number of free worker threads. Returns (launchedTasks, remainingTasks);
    tasks that could not be launched are deferred.
    """
    launchedTasks = []
    remainingTasks = []
    freeThreads = Configs.numCores - TaskManager.threadsUsed
    budget = min(numTasksToLaunch, freeThreads)
    for task in tasks:
        # checkLaunchTask is only attempted while budget remains
        started = budget > 0 and checkLaunchTask(task)
        if started:
            Configs.debug("Launched a new task.. {}/{} threads used, type: {}, output file: {}".format(TaskManager.threadsUsed, Configs.numCores, task.taskType, task.outputFile))
            budget = budget - 1
            launchedTasks.append(task)
        else:
            remainingTasks.append(task)
    return launchedTasks, remainingTasks
def checkLaunchTask(task):
    """
    Try to start *task*. Non-serial task types are submitted to the worker
    pool; serial types are handed to the waiting observer (main) thread.
    Returns True if the task was launched or handed off.
    """
    if task.taskType not in TaskManager.serialTaskTypes:
        task.future = TaskManager.taskPool.submit(runTask, task)
        return True
    elif TaskManager.observerWaiting and TaskManager.observerTask is None:
        stack = TaskManager.contextStack
        # alignment tasks only go to the observer when they belong to the
        # alignment context currently on top of the stack
        if task.taskType != "runAlignmentTask" or len(stack) == 0 or task in stack[-1].subalignmentTasks:
            TaskManager.observerTask = task
            TaskManager.observerSignal.set()
            return True
    return False

def runTask(task):
    """
    Execute *task*, maintaining thread-count and task-set bookkeeping under
    managerLock, and signalling the manager and observer on completion.
    Exceptions from task.run() are recorded as failures and re-raised.
    """
    with TaskManager.managerLock:
        # serial tasks run on the observer thread, so they don't consume a
        # worker-pool thread
        if task.taskType not in TaskManager.serialTaskTypes:
            TaskManager.threadsUsed = TaskManager.threadsUsed + 1
        TaskManager.runningTasks.add(task)

    failed = False
    try:
        task.run()
    except:
        # remember the failure for dealWithErrors(), but let it propagate
        failed = True
        raise
    finally:
        with TaskManager.managerLock:
            if task.taskType not in TaskManager.serialTaskTypes:
                TaskManager.threadsUsed = TaskManager.threadsUsed - 1
            TaskManager.runningTasks.remove(task)
            TaskManager.failedTasks.add(task) if failed else TaskManager.finishedTasks.add(task)
            # a duplicate task waiting on this same output file is done too
            if task.outputFile in TaskManager.waitingTasks:
                t = TaskManager.waitingTasks.pop(task.outputFile)
                t.future = task.future
                t.isFinished = True
            TaskManager.managerSignal.set()
            TaskManager.observerSignal.set()