├── tests ├── __init__.py └── test_pipeline.py ├── witch_msa ├── helpers │ ├── __init__.py │ ├── math_utils.py │ ├── general_tools.py │ └── pyhmmer_tools.py ├── tools │ ├── magus │ │ ├── align │ │ │ ├── __init__.py │ │ │ ├── merge │ │ │ │ ├── __init__.py │ │ │ │ ├── graph_build │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── graph_builder.py │ │ │ │ ├── graph_trace │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── naive.py │ │ │ │ │ ├── tracer.py │ │ │ │ │ ├── rg_search.py │ │ │ │ │ └── rg_fast_search.py │ │ │ │ ├── graph_cluster │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── mcl.py │ │ │ │ │ ├── rg.py │ │ │ │ │ ├── clusterer.py │ │ │ │ │ ├── mlr_mcl.py │ │ │ │ │ └── clean_clusters.py │ │ │ │ ├── merger.py │ │ │ │ └── alignment_graph.py │ │ │ ├── decompose │ │ │ │ ├── __init__.py │ │ │ │ ├── kmh.py │ │ │ │ ├── decomposer.py │ │ │ │ └── initial_tree.py │ │ │ ├── aligner.py │ │ │ └── alignment_context.py │ │ ├── helpers │ │ │ ├── __init__.py │ │ │ ├── hmmutils.py │ │ │ └── sequenceutils.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── files.py │ │ │ ├── controller.py │ │ │ ├── task.py │ │ │ └── manager.py │ │ ├── tools │ │ │ ├── __init__.py │ │ │ ├── mcl │ │ │ │ └── bin │ │ │ │ │ ├── clm │ │ │ │ │ ├── mcl │ │ │ │ │ ├── mcx │ │ │ │ │ ├── mcxi │ │ │ │ │ ├── mclcm │ │ │ │ │ ├── mcxmap │ │ │ │ │ ├── clmformat │ │ │ │ │ ├── mcxarray │ │ │ │ │ ├── mcxdump │ │ │ │ │ ├── mcxload │ │ │ │ │ ├── mcxrand │ │ │ │ │ ├── mcxsubs │ │ │ │ │ ├── mcxassemble │ │ │ │ │ └── mclblastline │ │ │ ├── mlrmcl │ │ │ │ ├── ncut │ │ │ │ ├── mlrmcl │ │ │ │ └── README │ │ │ ├── hmmer │ │ │ │ ├── hmmalign │ │ │ │ ├── hmmbuild │ │ │ │ └── hmmsearch │ │ │ ├── clustal │ │ │ │ └── clustalo │ │ │ ├── fasttree │ │ │ │ ├── FastTree │ │ │ │ └── FastTreeMP │ │ │ ├── raxmlng │ │ │ │ └── raxml-ng │ │ │ ├── mafft │ │ │ │ ├── mafftdir │ │ │ │ │ └── libexec │ │ │ │ │ │ ├── f2cl │ │ │ │ │ │ ├── dndpre │ │ │ │ │ │ ├── getlag │ │ │ │ │ │ ├── score │ │ │ │ │ │ ├── tbfast │ │ │ │ │ │ ├── addsingle │ │ │ │ │ │ ├── countlen │ │ │ │ │ │ ├── 
dndblast │ │ │ │ │ │ ├── dndfast7 │ │ │ │ │ │ ├── dvtditr │ │ │ │ │ │ ├── nodepair │ │ │ │ │ │ ├── pairash │ │ │ │ │ │ ├── replaceu │ │ │ │ │ │ ├── restoreu │ │ │ │ │ │ ├── setcore │ │ │ │ │ │ ├── sextet5 │ │ │ │ │ │ ├── version │ │ │ │ │ │ ├── dash_client │ │ │ │ │ │ ├── disttbfast │ │ │ │ │ │ ├── multi2hat3s │ │ │ │ │ │ ├── splittbfast │ │ │ │ │ │ ├── contrafoldwrap │ │ │ │ │ │ ├── hex2maffttext │ │ │ │ │ │ ├── mafft-distance │ │ │ │ │ │ ├── mafft-profile │ │ │ │ │ │ ├── maffttext2hex │ │ │ │ │ │ ├── mccaskillwrap │ │ │ │ │ │ ├── pairlocalalign │ │ │ │ │ │ ├── regtable2seq │ │ │ │ │ │ ├── seq2regtable │ │ │ │ │ │ ├── setdirection │ │ │ │ │ │ ├── makedirectionlist │ │ │ │ │ │ └── mafft-homologs.1 │ │ │ │ └── mafft │ │ │ └── external_tools.py │ │ ├── README.md │ │ ├── magus.py │ │ └── configuration.py │ └── macOS │ │ ├── mcl │ │ ├── hmmalign │ │ ├── hmmbuild │ │ ├── FastTreeMP │ │ └── hmmsearch ├── gcmm │ ├── callback.py │ ├── __init__.py │ ├── merger.py │ ├── task.py │ ├── weighting.py │ └── decompose_tree.py ├── default.config └── init_configs.py ├── examples ├── data │ ├── aligned_all.fasta.gz │ └── backbone.aln.fasta.gz ├── user.config └── run.sh ├── requirements.txt ├── witch.py ├── bin └── witch-msa ├── .gitignore ├── MANIFEST.in ├── .github └── workflows │ ├── coveralls.yml │ ├── python-publish.yml │ └── python-package.yml ├── pyproject.toml └── CHANGELOG.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/witch_msa/tools/magus/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/decompose/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_build/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_trace/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /witch_msa/tools/macOS/mcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/mcl 
-------------------------------------------------------------------------------- /witch_msa/tools/macOS/hmmalign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/hmmalign -------------------------------------------------------------------------------- /witch_msa/tools/macOS/hmmbuild: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/hmmbuild -------------------------------------------------------------------------------- /witch_msa/tools/macOS/FastTreeMP: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/FastTreeMP -------------------------------------------------------------------------------- /witch_msa/tools/macOS/hmmsearch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/macOS/hmmsearch -------------------------------------------------------------------------------- /examples/data/aligned_all.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/examples/data/aligned_all.fasta.gz -------------------------------------------------------------------------------- /examples/data/backbone.aln.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/examples/data/backbone.aln.fasta.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython>=0.29 2 | configparser>=5.0.0 3 | DendroPy>=4.4.0 4 | numpy>=1.15 5 | psutil>=5.0 6 | tqdm>=4.0.0 7 | 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/clm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/clm -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcl -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcx -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxi -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mlrmcl/ncut: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mlrmcl/ncut -------------------------------------------------------------------------------- /witch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from witch_msa import witch_runner 3 | 4 | if __name__ == '__main__': 5 | witch_runner() 6 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/hmmer/hmmalign: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/hmmer/hmmalign -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/hmmer/hmmbuild: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/hmmer/hmmbuild -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mclcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mclcm -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxmap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxmap -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mlrmcl/mlrmcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mlrmcl/mlrmcl -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/clustal/clustalo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/clustal/clustalo -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/fasttree/FastTree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/fasttree/FastTree -------------------------------------------------------------------------------- 
/witch_msa/tools/magus/tools/hmmer/hmmsearch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/hmmer/hmmsearch -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/clmformat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/clmformat -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxarray: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxarray -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxdump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxdump -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxload: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxload -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxrand: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxrand -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxsubs: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxsubs -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/raxmlng/raxml-ng: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/raxmlng/raxml-ng -------------------------------------------------------------------------------- /bin/witch-msa: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from witch_msa import witch_runner 3 | 4 | if __name__ == "__main__": 5 | witch_runner() 6 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/fasttree/FastTreeMP: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/fasttree/FastTreeMP -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mcxassemble: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mcl/bin/mcxassemble -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/f2cl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/f2cl -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndpre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndpre 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/getlag: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/getlag -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/score: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/score -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/tbfast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/tbfast -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | class TryPipeline(TestCase): 4 | def test_always_pass(self): 5 | self.assertTrue(True) 6 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/addsingle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/addsingle -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/countlen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/countlen 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndblast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndblast -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndfast7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dndfast7 -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dvtditr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dvtditr -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/nodepair: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/nodepair -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairash -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/replaceu: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/replaceu -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/restoreu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/restoreu -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setcore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setcore -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/sextet5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/sextet5 -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/version: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/version -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dash_client: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/dash_client -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/disttbfast: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/disttbfast -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/multi2hat3s: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/multi2hat3s -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/splittbfast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/splittbfast -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/contrafoldwrap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/contrafoldwrap -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/hex2maffttext: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/hex2maffttext -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-distance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-distance 
-------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-profile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mafft-profile -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/maffttext2hex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/maffttext2hex -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mccaskillwrap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/mccaskillwrap -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairlocalalign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/pairlocalalign -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/regtable2seq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/regtable2seq -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/seq2regtable: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/seq2regtable -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setdirection: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/setdirection -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafftdir/libexec/makedirectionlist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c5shen/WITCH/HEAD/witch_msa/tools/magus/tools/mafft/mafftdir/libexec/makedirectionlist -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | __pycache__/ 3 | *.backup 4 | *witch_output 5 | *_output* 6 | main.config 7 | examples/data2 8 | dist/ 9 | *egg-info 10 | home.path 11 | .witch_msa/ 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGELOG 3 | include witch.py 4 | include witch_msa/default.config 5 | include requirements.txt 6 | graft witch_msa/tools/ 7 | graft witch_msa/gcmm/ 8 | graft witch_msa/helpers/ 9 | graft tests/ 10 | prune */__pycache__ 11 | global-exclude *.py[cod] 12 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mafft/mafft: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # sh -> bash for debian. By J. R. Peterson. 2015/Jun. 
3 | 4 | pushd "`dirname "$0"`" > /dev/null 2>&1; rootdir="$PWD"; popd > /dev/null 2>&1; 5 | MAFFT_BINARIES="$rootdir/mafftdir/libexec"; export MAFFT_BINARIES; 6 | 7 | "$rootdir/mafftdir/bin/mafft" "$@" 8 | # input file name can have space 9 | -------------------------------------------------------------------------------- /witch_msa/helpers/math_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2012 3 | 4 | @author: Siavash Mirarab 5 | """ 6 | 7 | # 1.19.2022 - Copied from SEPP/UPP by Chengze 8 | 9 | 10 | def gcd(a, b): 11 | """Return greatest common divisor using Euclid's Algorithm.""" 12 | while b: 13 | a, b = b, a % b 14 | return a 15 | 16 | 17 | def lcm(a, b): 18 | """Return lowest common multiple.""" 19 | return a * b // gcd(a, b) 20 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_cluster/mcl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 14, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | from configuration import Configs 8 | from tools import external_tools 9 | 10 | 11 | def runMclClustering(graph): 12 | Configs.log("Running MCL alignment graph clustering..") 13 | external_tools.runMcl(graph.graphPath, Configs.mclInflationFactor, graph.workingDir, graph.clusterPath).run() 14 | graph.readClustersFromFile(graph.clusterPath) 15 | 16 | 17 | -------------------------------------------------------------------------------- /.github/workflows/coveralls.yml: -------------------------------------------------------------------------------- 1 | on: ["push", "pull_request"] 2 | 3 | name: Test Coveralls 4 | 5 | jobs: 6 | 7 | build: 8 | name: Build 9 | runs-on: ubuntu-latest 10 | steps: 11 | 12 | - uses: actions/checkout@v1 13 | 14 | - name: Use Node.js 16.x 15 | uses: actions/setup-node@v3 16 | with: 17 | node-version: 16.x 18 | 19 | - name: npm install, make 
test-coverage 20 | run: | 21 | npm install 22 | make test-coverage 23 | 24 | - name: Coveralls GitHub Action 25 | uses: coverallsapp/github-action@v2.2.3 26 | -------------------------------------------------------------------------------- /witch_msa/helpers/general_tools.py: -------------------------------------------------------------------------------- 1 | import psutil, os 2 | import argparse 3 | 4 | # return memory usage of python process by MB 5 | def memoryUsage(): 6 | process = psutil.Process(os.getpid()) 7 | mem = process.memory_info().rss / float(2 ** 20) 8 | return mem 9 | 10 | # reformat argparse help text formatting 11 | class SmartHelpFormatter(argparse.HelpFormatter): 12 | def _split_lines(self, text, width): 13 | if '\n' in text: 14 | temp = text.split('\n') 15 | ret = [] 16 | for _splice in [argparse.HelpFormatter._split_lines(self, x, width) 17 | for x in temp]: 18 | ret.extend(_splice) 19 | return ret 20 | return argparse.HelpFormatter._split_lines(self, text, width) 21 | -------------------------------------------------------------------------------- /examples/user.config: -------------------------------------------------------------------------------- 1 | [commandline] 2 | timeout=999999 3 | max-concurrent-jobs=1000000 4 | 5 | [Basic] 6 | alignment_size = 25 7 | #magus_path = 8 | #mafftpath = 9 | #mclpath = 10 | #fasttreepath = 11 | #hmmsearchpath = /anaconda3/bin/hmmsearch 12 | #hmmbuildpath = /anaconda3/bin/hmmbuild 13 | #hmmalignpath = /anaconda3/bin/hmmalign 14 | 15 | [Backbone] 16 | backbone_size = 500 17 | #alignment_method = magus 18 | #alignment_path = 19 | #backbone_size = 20 | #selection_strategy = median_length 21 | #tree_method = FastTree2 22 | #tree_path = 23 | 24 | [MAGUS] 25 | #inflationfactor = 26 | #graphclustermethod = 27 | #graphtracemethod = 28 | #graphtraceoptimize = 29 | #maxnumsubsets = 30 | #mafftpath = 31 | #mclpath = 32 | #fasttreepath = 33 | #hmmsearchpath = /anaconda3/bin/hmmsearch 34 | #hmmbuildpath = 
/anaconda3/bin/hmmbuild 35 | #hmmalignpath = /anaconda3/bin/hmmalign 36 | -------------------------------------------------------------------------------- /witch_msa/helpers/pyhmmer_tools.py: -------------------------------------------------------------------------------- 1 | from witch_msa.helpers.alignment_tools import Alignment 2 | from pyhmmer import easel 3 | 4 | # helper function to convert an alignment object to TextMSA object 5 | def alignmentToTextMSA(aln, name): 6 | sequences = [] 7 | for taxon, seq in aln.items(): 8 | sequences.append(easel.TextSequence(name=taxon.encode(), sequence=seq)) 9 | msa = easel.TextMSA(name=name.encode(), sequences=sequences) 10 | return msa 11 | 12 | # helper function to obtain alphabet given molecule type 13 | def moleculeToAlphabet(molecule): 14 | alphabet = None 15 | if molecule == 'amino': 16 | alphabet = easel.Alphabet.amino() 17 | elif molecule == 'dna': 18 | alphabet = easel.Alphabet.dna() 19 | elif molecule == 'rna': 20 | alphabet = easel.Alphabet.rna() 21 | else: 22 | raise ValueError(f"alphabet {molecule} is not amino, dna, or rna") 23 | return alphabet 24 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/merger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 14, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | import os 8 | import time 9 | 10 | 11 | from align.merge.graph_build.graph_builder import buildGraph 12 | from align.merge.graph_cluster.clusterer import clusterGraph 13 | from align.merge.graph_trace.tracer import findTrace 14 | from align.merge.optimizer import optimizeTrace 15 | from align.merge.alignment_writer import writeAlignment 16 | from configuration import Configs 17 | 18 | 19 | def mergeSubalignments(context): 20 | Configs.log("Merging {} subaligments..".format(len(context.subalignmentPaths))) 21 | time1 = time.time() 22 | 23 | buildGraph(context) 24 | 
def rgClustering(graph):
    """Cluster the alignment graph with the region-growing strategy.

    For each of the k subalignments the search is bounded by the half-open
    column range [subsetMatrixIdx[i], subsetMatrixIdx[i] +
    subalignmentLengths[i]); the resulting clustering is stored on the
    graph and persisted to its cluster file.
    """
    Configs.log("Building a region-growing graph clustering..")

    starts = graph.subsetMatrixIdx
    lengths = graph.subalignmentLengths
    subsetCount = len(graph.context.subalignments)
    lowerBound, upperBound = [], []
    for i in range(subsetCount):
        lowerBound.append(starts[i])
        upperBound.append(starts[i] + lengths[i])
    graph.clusters = rgCluster(graph, lowerBound, upperBound, False)
    graph.writeClustersToFile(graph.clusterPath)
def callback_queryAlignment(success, ignored, retry, i_retry,
        query, index, taxon_name, checkpoint_path):
    """Callback invoked with the result of one query-alignment job.

    Outcomes:
      * falsy ``query`` with retry budget left -> queue ``index`` for retry;
      * falsy/empty ``query`` with no retries left -> record ``taxon_name``
        as ignored (dropped from the final output);
      * otherwise append the single-sequence alignment to ``success`` and
        append its aligned row to the gzip checkpoint file.
    """
    # A falsy result with retry budget remaining: schedule another attempt.
    if (not query) and i_retry > 0:
        retry.append(index)
        return
    # Out of retries and still no usable alignment: drop from the output.
    if (not query) or len(query) == 0:
        ignored.append(taxon_name)
        return
    # Sanity guard: expect exactly one aligned sequence per query job.
    if (not isinstance(query, ExtendedAlignment)) or (len(query) != 1):
        return
    # Append this row to the gzip checkpoint so a rerun can resume.
    record = '{}\t{}\n'.format(taxon_name, query[taxon_name])
    with gzip.open(checkpoint_path, 'ab') as handle:
        handle.write(record.encode('utf-8'))
    success.append(query)
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/merge/graph_cluster/clusterer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 23, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | import os 8 | import time 9 | 10 | from configuration import Configs 11 | 12 | from align.merge.graph_cluster.mcl import runMclClustering 13 | from align.merge.graph_cluster.mlr_mcl import runMlrMclClustering 14 | from align.merge.graph_cluster.rg import rgClustering 15 | 16 | ''' 17 | The alignment graph is clustered, the clusters are written out as an array of node arrays. 18 | MCL is the main way to do this, but rg could be used if there are scalability issues. 
class FileLock:
    """Cross-process mutual exclusion via exclusive creation of a lock file.

    Entering the context spins until this process creates ``filePath`` in
    mode 'x' (exclusive creation, atomic on the filesystem); exiting
    removes the file so another process can acquire the lock.
    """

    def __init__(self, filePath):
        # Path of the lock file; its existence means the lock is held.
        self.filePath = filePath

    def __enter__(self):
        while True:
            try:
                # 'x' raises FileExistsError if another holder beat us to it.
                lock = open(self.filePath, 'x')
                lock.close()
                return self
            except FileExistsError:
                # Lock is held elsewhere: back off a random 50-150 ms to
                # avoid lock-step retries between competing processes.
                # (Previously a bare `except:`, which also swallowed real
                # errors such as a missing parent directory or
                # KeyboardInterrupt and spun forever.)
                time.sleep(random.random()*0.1 + 0.05)
            #time.sleep(random.random() + 0.5)

    def __exit__(self, excType, excVal, excTb):
        # Release the lock regardless of whether the body raised.
        os.remove(self.filePath)
def naiveCluster(lowerBound, upperBound):
    """Left-justified clustering: the i-th cluster holds the i-th column of
    every subalignment that still has an i-th column.

    lowerBound/upperBound give, per subalignment, the half-open range of
    node indices belonging to it.
    """
    clusters = []
    offset = 0
    while True:
        cluster = [lo + offset
                   for lo, hi in zip(lowerBound, upperBound)
                   if lo + offset < hi]
        # Stop once every subalignment has been exhausted.
        if not cluster:
            return clusters
        clusters.append(cluster)
        offset += 1
"setuptools.build_meta" 4 | 5 | [project] 6 | name = "witch-msa" 7 | dynamic = ["version", "dependencies"] 8 | description = "WITCH - A Multiple Sequence Alignment Tool" 9 | readme = "README.rst" 10 | authors = [ 11 | {name = "Chengze Shen", email = "chengze5@illinois.edu"} 12 | ] 13 | license = {file = "LICENSE"} 14 | requires-python = ">=3.7" 15 | classifiers = [ 16 | "Development Status :: 4 - Beta", 17 | "Operating System :: OS Independent", 18 | "Intended Audience :: Developers", 19 | "Intended Audience :: Science/Research", 20 | "Topic :: Scientific/Engineering :: Bio-Informatics", 21 | "Topic :: Software Development", 22 | "License :: OSI Approved :: GNU General Public License (GPL)", 23 | "Programming Language :: Python", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.7", 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12" 31 | ] 32 | #scripts = ["witch.py"] 33 | #packages = ["gcmm", "helpers"] 34 | 35 | [project.urls] 36 | Homepage = "https://github.com/c5shen/WITCH" 37 | Changelog = "https://github.com/c5shen/WITCH/blob/main/CHANGELOG.rst" 38 | 39 | [tool.setuptools.dynamic] 40 | version = {attr = "witch_msa.__version__"} 41 | dependencies = {file = ["requirements.txt"]} 42 | -------------------------------------------------------------------------------- /witch_msa/default.config: -------------------------------------------------------------------------------- 1 | [Basic] 2 | #### the FastTreeMP executable provided may not be compatible to certain 3 | #### system, such as macOS on M1 chip (arm64 instead of x86). 
In that case, 4 | #### please provide your own fasttreepath (also do the same for the [MAGUS] 5 | #### configuration below) by compiling the source code from: 6 | #### http://www.microbesonline.org/fasttree/FastTree.c 7 | #### 8 | #### command for compilation (please use gcc-10 or higher): 9 | #### gcc -DOPENMP -fopenmp -O3 -finline-functions -funroll-loops -Wall -o FastTreeMP FastTree.c -lm 10 | #### 11 | #### Other softwares used can also be self-provided if necessary. 12 | magus_path = 13 | mafftpath = 14 | mclpath = 15 | hmmsearchpath = 16 | hmmbuildpath = 17 | hmmalignpath = 18 | fasttreepath = 19 | 20 | [Backbone] 21 | #### alignment_method can be set to [mafft, magus] for now #### 22 | alignment_method = magus 23 | alignment_path = 24 | #alignment_method = mafft 25 | #alignment_path = /anaconda3/bin/mafft 26 | #### default backbone_size is min(1000, len(taxa)), but could be fewer 27 | #### if there aren't many taxa within median length (selection strategy) 28 | backbone_size = 29 | #### selection strategy can be [random, median_length]; default median_length 30 | selection_strategy = median_length 31 | tree_method = FastTree2 32 | tree_path = 33 | 34 | [MAGUS] 35 | #### settings for running MAGUS backbone specifically. Refer to MAGUS 36 | #### github page for more details. 37 | inflationfactor = 38 | graphclustermethod = 39 | graphtracemethod = 40 | graphtraceoptimize = 41 | maxnumsubsets = 42 | #### Custom binary executable paths to run MAGUS/GCM. Specifically added 43 | #### for macOS systems. 44 | #### This will be the same as the ones in [Basic] if generated from setup.py 45 | ####Please use the absolute path to each desired executable. 
46 | mafftpath = 47 | mclpath = 48 | hmmsearchpath = 49 | hmmbuildpath = 50 | hmmalignpath = 51 | fasttreepath = 52 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/tools/mcl/bin/mclblastline: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright (C) 2003, 2004, 2005, 2006, 2007 Stijn van Dongen 4 | # 5 | # You can redistribute and/or modify this program under the terms of the GNU 6 | # General Public License; either version 3 of the License or (at your option) 7 | # any later version. 8 | 9 | use strict; 10 | 11 | $" = ' '; 12 | 13 | my $do_help = 0; 14 | my $cline = "mclpipeline --parser=mcxdeblast --parser-tag=blast"; 15 | 16 | unless (grep { $_ =~ /^--ass-r[vei]?=/; } @ARGV) { 17 | $cline .= " --ass-r=max"; 18 | } 19 | 20 | $cline .= " @ARGV"; 21 | 22 | if (grep { $_ =~ /--(help|apropos)/; } @ARGV) { 23 | $do_help = 1; 24 | } 25 | elsif (!@ARGV) { 26 | $do_help = 1; 27 | $cline .= " --help"; 28 | } 29 | 30 | if ($do_help) { 31 | print <<_help_; 32 | mcxblastline wraps around the generic mclpipeline script. It fills in the name 33 | of the BLAST parser (mcxdeblast) and the tag ('blast') used to propagate 34 | mcxdeblast options through the pipeline to mcxdeblast itself. You can freely 35 | use all mclpipeline options other than --parser= and 36 | --parser-tag=. 37 | _help_ 38 | } 39 | if (system $cline) { 40 | print "mcxblastline wrapper: pipeline failed\n"; 41 | print "cline: $cline\n"; 42 | exit(1); 43 | } 44 | if ($do_help) { 45 | print <<_help_; 46 | ________________ 47 | The above options are generic pipeline options. You can pass any mcxdeblast 48 | option by inserting the 'blast' tag in front of that particular option. For 49 | example, the mcxdeblast --score=x option (where x is 'b' or 'e') should 50 | be passed to mcxblastline as --blast-score=x. 
51 | 52 | The mcxdeblast --xo-dat option is special; it must *not* be prefixed, as it is 53 | shared with mclpipeline, as can be seen from the above listing. The mcxdeblast 54 | --xi-dat option should not be used, as it encapsulated by the mclpipeline --xi 55 | option. 56 | _help_ 57 | } 58 | 59 | -------------------------------------------------------------------------------- /examples/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | all_unaligned_path=data/unaligned_all.fasta 4 | backbone_aln_path=data/backbone.aln.fasta.gz 5 | backbone_tre_path=data/backbone.tre 6 | query_path=data/unaligned_frag.fasta 7 | outname=aligned.fasta 8 | 9 | scenario=4 10 | if [[ $1 != "" ]]; then 11 | scenario=$1 12 | fi 13 | 14 | if [[ $scenario == 1 ]]; then 15 | # Scenario A - unaligned sequences only 16 | python ../witch.py -i ${all_unaligned_path} -d scenarioA_output \ 17 | -o ${outname} 18 | elif [[ $scenario == 2 ]]; then 19 | # Scenario B - unaligned sequences only; using bit scores; 20 | # using 10 HMMs to align a sequence 21 | python ../witch.py -i ${all_unaligned_path} -d scenarioB_output \ 22 | -o ${outname} -w 0 -k 10 23 | elif [[ $scenario == 3 ]]; then 24 | # 3) Scenario C - backbone alignment available; backbone tree missing; 25 | # query sequences available; also saving weights to local 26 | python ../witch.py --num-cpus -1 -b ${backbone_aln_path} \ 27 | -q ${query_path} -d scenarioC_output -o ${outname} \ 28 | --save-weight 1 29 | elif [[ $scenario == 4 ]]; then 30 | # 4) Scenario D - backbone alignment available; backbone tree available; 31 | # query sequences available; saving weights to local; 32 | # also save decomposition results for future use (e.g., 33 | # faster rerun) 34 | python ../witch.py --num-cpus -1 -b ${backbone_aln_path} \ 35 | -e ${backbone_tre_path} \ 36 | -q ${query_path} -d scenarioD_output -o ${outname} \ 37 | --save-weight 1 --keep-decomposition 1 38 | elif [[ $scenario == 5 ]]; 
'''
Customized ProcessPoolExecutor class to handle callbacks and monitor current
progress in query alignments
'''
class WITCHProcessPoolExecutor(ProcessPoolExecutor):
    """ProcessPoolExecutor that also tracks how many jobs were submitted,
    are running, and have finished, for progress reporting."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._running_jobs = 0      # submitted but not yet marked done
        self._submitted_jobs = 0    # total number of submit() calls
        self._finished_jobs = 0     # jobs whose done-callback has fired

    def submit(self, *args, **kwargs):
        fut = super().submit(*args, **kwargs)
        self._running_jobs += 1
        self._submitted_jobs += 1
        # NOTE: the done-callback is currently disabled, so the running /
        # finished counters only move if this hook is re-enabled.
        #fut.add_done_callback(self._worker_is_done)
        return fut

    def _worker_is_done(self, future):
        # Done-callback: keep the counters in sync and echo progress.
        self._running_jobs -= 1
        self._finished_jobs += 1
        print('Finished jobs: {}/{}'.format(
            self._finished_jobs, self._submitted_jobs), end='\r', flush=True)

    def get_pool_usage(self):
        """Number of jobs submitted but not yet marked finished."""
        return self._running_jobs

    def get_finished_jobs(self):
        """Number of jobs whose done-callback has run."""
        return self._finished_jobs
'''
Simple function for sanity-checking all output files of a given list that:
    (1) they exists
    (2) they have size > 0
'''
def sanityCheckFileCreation(files):
    """Return the subset of ``files`` that are missing or empty on disk.

    An empty return value means every listed output file exists with
    size > 0.
    """
    return [path for path in files
            if not (os.path.exists(path) and os.stat(path).st_size > 0)]
def checkTaskManager():
    """Verify the background task-manager future is still running.

    If it has stopped: log the condition, surface whatever exception it
    died with (``.result()`` re-raises it), and raise explicitly if the
    manager exited without one.
    """
    if TaskManager.managerFuture.running():
        return
    Configs.error("Task manager is dead for some reason..")
    # .result() re-raises the exception that killed the manager, if any.
    TaskManager.managerFuture.result()
    raise Exception("Task manager is dead for some reason..")
def readClustersFromFile(filePath):
    """Parse MLR-MCL output (one integer cluster label per vertex, one per
    line) into a list of clusters, each a list of vertex ids.

    NOTE(review): assumes the labels form the contiguous range 0..k-1, as
    MLR-MCL emits; a gap in the labels would raise KeyError below -- confirm
    against the tool's output format.
    """
    assignments = {}
    with open(filePath) as handle:
        for vertex, line in enumerate(handle):
            label = int(line.strip())
            assignments.setdefault(label, []).append(vertex)
    clusters = [assignments[label] for label in range(len(assignments))]
    Configs.log("Found {} clusters..".format(len(clusters)))
    return clusters
def findTrace(graph):
    """Refine the graph's clusters into a trace (a clustering consistent
    with a valid MSA), honoring Configs.graphTraceMethod, and cache the
    result in graph.tracePath so reruns can skip the search."""
    startTime = time.time()

    if os.path.exists(graph.tracePath):
        Configs.log("Found existing trace file {}".format(graph.tracePath))
        graph.readClustersFromFile(graph.tracePath)
    else:
        # Clean the clustering before tracing: drop duplicate clusters and
        # elements violating the row/column uniqueness constraints.
        purgeDuplicateClusters(graph)
        purgeClusterViolations(graph)

        # Dispatch on the configured trace method; an unknown method name
        # leaves the cleaned clusters as-is (matching the original chain,
        # which had no final else).
        methods = {
            "minclusters": minClustersSearch,
            "fm": fmAlgorithm,
            "mwtgreedy": mwtGreedySearch,
            "mwtsearch": mwtSearch,
            "rg": rgSearch,
            "rgfast": rgFastSearch,
            "naive": naiveClustering,
        }
        traceMethod = methods.get(Configs.graphTraceMethod)
        if traceMethod is not None:
            traceMethod(graph)

        graph.writeClustersToFile(graph.tracePath)

    endTime = time.time()
    Configs.log("Found alignment graph trace in {} sec..".format(endTime-startTime))
    Configs.log("Found a trace with {} clusters and a total cost of {}".format(len(graph.clusters), graph.computeClusteringCost(graph.clusters)))
def purgeClusterViolations(graph):
    """Remove elements that break the trace constraints, weakest first.

    Two constraints are enforced: a cluster may hold at most one column per
    subalignment (a "column violation", keyed by (cluster, subalignment)),
    and a node may belong to at most one cluster (a "row violation", keyed
    by node). Offending elements are removed in ascending order of their
    support score, so the weakest links are sacrificed first. Mutates
    graph.clusters in place and finally drops singleton clusters.
    """
    redundantCols = {}
    redundantRows = {}
    elementScores = {}
    for a, cluster in enumerate(graph.clusters):
        for b in cluster:
            bsub, bpos = graph.matSubPosMap[b]
            # Track every (cluster, node) occurrence per column and per row
            # so multiplicities > 1 can be detected below.
            redundantCols[a, bsub] = redundantCols.get((a, bsub), []) + [(a, b)]
            redundantRows[b] = redundantRows.get(b, []) + [(a, b)]

            # Score of node b within cluster a: total edge weight to
            # cluster members from *other* subalignments.
            scoresum = 0
            for c in cluster:
                csub, cpos = graph.matSubPosMap[c]
                if bsub != csub:
                    scoresum = scoresum + graph.matrix[b].get(c,0)
            elementScores[a, b] = scoresum

    # NOTE: in the redundantCols keys, b is a subalignment index (bsub),
    # not a node id.
    problemCols = [(a,b) for a,b in redundantCols if len(redundantCols[a,b]) > 1]
    problemRows = [a for a in redundantRows if len(redundantRows[a]) > 1]
    Configs.log("Found {} row violations and {} column violations..".format(len(problemRows), len(problemCols)))

    # Sweep elements from weakest to strongest; each removal updates the
    # bookkeeping so later, stronger elements may no longer be in violation.
    sortedScores = list(elementScores.keys())
    sortedScores.sort(key = lambda x : elementScores[x])

    for a,b in sortedScores:
        bsub, bpos = graph.matSubPosMap[b]
        if len(redundantCols[a, bsub]) > 1 or len(redundantRows[b]) > 1:
            graph.clusters[a].remove(b)
            redundantCols[a, bsub].remove((a,b))
            redundantRows[b].remove((a,b))

    problemCols = [(a,b) for a,b in redundantCols if len(redundantCols[a,b]) > 1]
    problemRows = [a for a in redundantRows if len(redundantRows[a]) > 1]
    Configs.log("Finished violations sweep. Now {} row violations and {} column violations..".format(len(problemRows), len(problemCols)))

    # Clusters reduced to a single element carry no merge information.
    graph.clusters = [cluster for cluster in graph.clusters if len(cluster) > 1]
    Configs.log("Purged cluster violations. Found {} clean clusters..".format(len(graph.clusters)))
importlib.import_module(Task.functionModuleMap[self.taskType]) 58 | func = getattr(mod, self.taskType) 59 | func(**self.taskArgs) 60 | Configs.log("Completed a task, output file: {}".format(self.outputFile)) 61 | else: 62 | Configs.log("File already exists: {}".format(self.outputFile)) 63 | except Exception as exc: 64 | Configs.error("Task for {} threw an exception:\n{}".format(self.outputFile, exc)) 65 | Configs.error(traceback.format_exc()) 66 | raise 67 | finally: 68 | self.isFinished = True 69 | 70 | def checkFinished(self): 71 | if not self.isFinished: 72 | return False 73 | if self.future is not None: 74 | self.future.result() 75 | return True 76 | 77 | def toJson(self): 78 | mapper = {attr : getattr(self, attr) for attr in self.attributes} 79 | return json.dumps(mapper) 80 | 81 | def __eq__(self, other): 82 | if isinstance(other, Task): 83 | return self.outputFile == other.outputFile 84 | return NotImplemented 85 | 86 | def __hash__(self): 87 | return hash(self.outputFile) 88 | 89 | 90 | def asCompleted(tasks): 91 | yield from controller.asCompleted(tasks) 92 | 93 | def awaitTasks(tasks): 94 | controller.awaitTasks(tasks) 95 | 96 | def submitTasks(tasks): 97 | controller.submitTasks(tasks) 98 | -------------------------------------------------------------------------------- /witch_msa/gcmm/merger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 10.28.2021 by Chengze Shen 3 | 4 | Merger of all query alignments to form the final alignment. The merging step 5 | is exactly the same one from PASTA and UPP (by transitivity). 
import os, sys, re
import time
from witch_msa.configs import Configs
from witch_msa.helpers.alignment_tools import Alignment, read_fasta, \
        CompactAlignment, compact, ExtendedAlignment
from functools import partial
#from concurrent.futures.process import ProcessPoolExecutor
from math import ceil

'''
function to merge a set of input paths (to alignments) sequentially
'''
def sequential_merger(queries, inpaths):
    """Merge alignments one after another by transitivity.

    Parameters
    ----------
    queries : list
        Per-subproblem query alignments; entries may be the sentinel
        string 'skipped'.
    inpaths : list
        Paths to the corresponding alignment files, with 'skipped'
        marking subproblems that produced no alignment.

    Returns the merged CompactAlignment.

    NOTE(review): the seed alignment is read from queries[init_index]
    while the loop reads file paths from inpaths[i] -- confirm the two
    lists are parallel and interchangeable here. Also, if *every* entry
    is 'skipped', queries[init_index] raises IndexError; verify callers
    never pass an all-skipped list.
    """
    init_index = 0
    # advance to the first non-skipped entry to seed the merge
    while init_index < len(inpaths) and inpaths[init_index] == 'skipped':
        init_index += 1
    init_aln = Alignment(); init_aln.read_file_object(queries[init_index])
    new_aln = compact(init_aln)
    for i in range(init_index + 1, len(inpaths)):
        inpath = inpaths[i]

        # skip these ones
        if inpath == 'skipped':
            continue
        frag_aln = Alignment(); frag_aln.read_file_object(inpath)
        new_aln.merge_in(compact(frag_aln))
    return new_aln

'''
function to merge all subalignments to one alignment and with all singletons
in queries collapsed (in lower cases). This is the same behavior as UPP.
An additional "masked" version of the final alignment with all lower cases
removed will also be written to disk.
'''
def mergeAlignmentsCollapsed(backbone_alignment_path, queries,
        renamed_taxa, pool):
    """Merge all query alignments into the backbone (UPP-style transitivity).

    Writes the full alignment to Configs.output_path and a masked copy
    (insertion columns removed) next to it. `renamed_taxa` maps original
    taxon names to the temporary names used during alignment; they are
    renamed back before writing. `pool` is currently unused -- the
    parallel query read below is commented out.
    """
    Configs.log('(UPP-style merging) Merging all GCM subproblems ' \
            'with transitivity and singletons from queries collapsed...')
    start = time.time()
    outpath = Configs.output_path
    #masked_outpath = Configs.output_path + '.masked'

    # Updated @ 10.26.2024 by Chengze Shen
    # masked alignment output name change to .masked.fasta
    # if user gives `-o <name>.fa` or `<name>.fasta`, then the output name
    # will adapt to the correct suffix
    suffix = outpath.split('.')[-1]
    if suffix in ['fa', 'fasta']:
        masked_outpath = '.'.join(outpath.split('.')[:-1]) + '.masked.' + suffix
    else:
        masked_outpath = outpath + '.masked.fasta'

    # NOTE(review): hard exit on empty input; sys.exit would be the more
    # conventional spelling, but behavior is the same here.
    if not (len(queries) > 0):
        print('No query alignment provided to merger!')
        exit(1)

    # read in all backbone sequences/alignment
    full_aln = ExtendedAlignment([])
    full_aln.read_file_object(backbone_alignment_path)
    full_aln.from_string_to_bytearray()

    # read in queries so that insertions are marked
    #backbone_keys = {x: 1 for x in full_aln.keys()}
    #func = partial(getQueryAlignment, backbone_keys)
    #queries = list(pool.map(func, inpaths))

    # merge all queries to the backbone
    for query in queries:
        if query != 'skipped':
            full_aln.merge_in(query, False)
        #del query
    full_aln.from_bytearray_to_string()

    # rename back taxa (invert the renamed_taxa mapping)
    name_map = {v: k for k, v in renamed_taxa.items()}
    count = 0
    for name in list(name_map.keys()):
        ori_name = name_map[name]
        if name in full_aln:
            full_aln[ori_name] = full_aln[name]
            full_aln.pop(name)
            count += 1
    if count > 0:
        Configs.log('Converted {} names back to their originals'.format(count))
    Configs.log('Finished merging all GCM subproblems, output file: {}'.format(
        outpath))
    full_aln.write(outpath, 'FASTA')

    # write a masked version of full alignment
    full_aln.remove_insertion_columns()
    full_aln.write(masked_outpath, 'FASTA')
    Configs.log('Masked final alignment written to: {}'.format(masked_outpath))

    end = time.time()
    Configs.runtime('Time to merge all outputs (s): {}'.format(end - start))
'''
Not really used for anything, may be removed in the future.
'''

def buildSubsetsKMH(context, subsetsDir):
    """Decompose context.sequencesPath into subsets with the KMH strategy.

    Builds a skeleton alignment and FastTree tree, decomposes the guide
    tree, and -- if some taxa were excluded from the skeleton alignment --
    assigns them to subsets by best HMM score.

    Returns the list of subset file paths.
    """
    tempDir = os.path.join(subsetsDir, "initial_tree")

    Configs.log("Building KMH decomposition on {} with skeleton size {}/{}..".format(context.sequencesPath, Configs.decompositionSkeletonSize, 1000))
    time1 = time.time()

    initialTreePath, initialAlignPath, unusedTaxa = buildInitialTreeAlign(tempDir, context.sequencesPath)

    if len(unusedTaxa) == 0:
        # Every taxon made it into the skeleton: plain guide-tree decomposition.
        subsetPaths = treeutils.decomposeGuideTree(tempDir, initialAlignPath, initialTreePath, Configs.decompositionMaxSubsetSize, Configs.decompositionMaxNumSubsets)
    else:
        # Some taxa were left out: build seed subsets, then assign the
        # leftovers to the best-matching subset by HMM score.
        subsetSeedDir = os.path.join(subsetsDir, "seed_subsets")
        if not os.path.exists(subsetSeedDir):
            os.makedirs(subsetSeedDir)
        subsetSeedPaths = treeutils.decomposeGuideTree(subsetSeedDir, initialAlignPath, initialTreePath, None, Configs.decompositionMaxNumSubsets)
        subsetPaths = reassignTaxons(subsetsDir, subsetSeedPaths, context.unalignedSequences, unusedTaxa)

    time2 = time.time()
    Configs.log("Built KMH decomposition on {} in {} sec..".format(context.sequencesPath, time2-time1))

    return subsetPaths

def buildInitialTreeAlign(tempDir, sequencesPath):
    """Build the skeleton alignment and a FastTree tree over it.

    Returns a 3-tuple (treePath, alignPath, unusedTaxa), where unusedTaxa
    lists the taxa excluded from the skeleton alignment.
    """
    outputTreePath = os.path.join(tempDir, "initial_tree.tre")
    outputAlignPath = os.path.join(tempDir, "initial_align.txt")

    if os.path.exists(outputTreePath) and os.path.exists(outputAlignPath):
        # BUGFIX: this resume path previously returned a 2-tuple, which
        # crashed the 3-way unpacking in buildSubsetsKMH. The unused-taxa
        # information is not recoverable from the files on disk, so a
        # resumed run proceeds with the plain guide-tree decomposition.
        return outputTreePath, outputAlignPath, []
    if os.path.exists(tempDir):
        shutil.rmtree(tempDir)
    os.makedirs(tempDir)

    initialAlign, unusedTaxa = decomposer.initial_tree.buildInitialAlignment(sequencesPath, tempDir, Configs.decompositionSkeletonSize, 1000)
    sequenceutils.writeFasta(initialAlign, outputAlignPath)
    #external_tools.runRaxmlNg(outputAlignPath, tempDir, outputTreePath, 8).run()
    external_tools.runFastTree(outputAlignPath, tempDir, outputTreePath).run()

    return outputTreePath, outputAlignPath, unusedTaxa

def reassignTaxons(subsetsDir, subsetSeedPaths, sequences, unusedTaxa):
    """Assign each unused taxon to the seed subset whose HMM scores it best.

    Builds one HMM per seed subset, scores all unassigned sequences against
    every HMM, and appends each taxon to its best-scoring subset. Writes
    the final subsets to subset_<n>.txt under subsetsDir and returns the
    list of subset file paths.
    """
    unusedPath = os.path.join(subsetsDir, "unassigned_sequences.txt")
    sequenceutils.writeFasta(sequences, unusedPath, unusedTaxa)

    # One HMM model file per seed subset.
    hmmMap = {}
    for subsetPath in subsetSeedPaths:
        hmmDir = os.path.join(os.path.dirname(subsetPath), "hmm_{}".format(os.path.basename(subsetPath)).replace(".", "_"))
        if not os.path.exists(hmmDir):
            os.makedirs(hmmDir)
        hmmMap[subsetPath] = os.path.join(hmmDir, "hmm_model.txt")
    hmmTasks = hmmutils.buildHmms(hmmMap)
    task.submitTasks(hmmTasks)
    task.awaitTasks(hmmTasks)
    hmmPaths = [t.outputFile for t in hmmTasks]

    # Score every unassigned sequence against every subset HMM.
    scoreFileHmmFileMap = {}
    scoreTasks = hmmutils.buildHmmScores(hmmPaths, unusedPath, scoreFileHmmFileMap)
    task.submitTasks(scoreTasks)

    # Track, per taxon, the best score seen so far and the HMM that gave it.
    bestScores = {}
    taxonHmmMap = {}
    for scoreTask in task.asCompleted(scoreTasks):
        subsetScores = hmmutils.readSearchFile(scoreTask.outputFile)
        for taxon, scores in subsetScores.items():
            if scores[1] > bestScores.get(taxon, -float("inf")):
                bestScores[taxon] = scores[1]
                taxonHmmMap[taxon] = scoreFileHmmFileMap[scoreTask.outputFile]

    # Group reassigned taxa by HMM, then add back each seed subset's taxa.
    subsetTaxons = {path : [] for path in hmmPaths}
    for taxon, hmmPath in taxonHmmMap.items():
        subsetTaxons[hmmPath].append(taxon)
    for subsetPath, hmmPath in hmmMap.items():
        subset = sequenceutils.readFromFasta(subsetPath)
        for taxon in subset:
            subsetTaxons[hmmPath].append(taxon)

    # Write the final subsets out, numbered from 1.
    subsetPaths = []
    for i, (hmmPath, subset) in enumerate(subsetTaxons.items(), start=1):
        subsetPath = os.path.join(subsetsDir, "subset_{}.txt".format(i))
        subsetPaths.append(subsetPath)
        sequenceutils.writeFasta(sequences, subsetPath, subset)

    return subsetPaths
def chooseSkeletonTaxa(sequences, skeletonSize, mode = "fulllength"):
    """Pick skeletonSize taxa to seed the skeleton alignment.

    In "fulllength" mode, taxa whose sequence length lies within 25% of the
    top-quartile length are preferred (shuffled to the front of the
    candidate order); any other mode samples uniformly at random.

    Returns (skeletonTaxa, remainingTaxa), which together partition
    sequences.keys().
    """
    candidates = list(sequences.keys())

    if mode == "fulllength":
        # Top quartile of sequence lengths, computed without numpy.
        sortedLengths = sorted(len(sequences[t].seq) for t in sequences)
        quartileLength = sortedLengths[int(0.75 * (len(sortedLengths) - 1))]

        # Split taxa by whether their length is close to the quartile length.
        nearQuartile, farFromQuartile = [], []
        for taxon in candidates:
            isNear = abs(len(sequences[taxon].seq) - quartileLength) < 0.25 * quartileLength
            (nearQuartile if isNear else farFromQuartile).append(taxon)

        # Prefer near-quartile ("full-length") taxa, randomizing within each group.
        random.shuffle(nearQuartile)
        random.shuffle(farFromQuartile)
        candidates = nearQuartile + farFromQuartile
    else:
        random.shuffle(candidates)

    return candidates[:skeletonSize], candidates[skeletonSize:]

def randomDecomposition(subsetsDir, sequences, numSubsets):
    """Randomly partition the taxa into numSubsets round-robin subsets.

    Each subset is written to subset_<n>.txt under subsetsDir (n starting
    at 1). Returns the list of subset file paths.
    """
    shuffled = list(sequences.keys())
    random.shuffle(shuffled)

    subsetPaths = []
    for idx in range(numSubsets):
        # Round-robin slice spreads the shuffled taxa evenly across subsets.
        members = shuffled[idx :: numSubsets]
        path = os.path.join(subsetsDir, "subset_{}.txt".format(idx + 1))
        subsetPaths.append(path)
        sequenceutils.writeFasta(sequences, path, members)
    return subsetPaths
def mainAlignmentTask():
    '''
    6.1.2021 - modified by Chengze Shen
    added a new argument for backboneWeightsPath
    '''
    # Top-level entry point: package the global Configs into task arguments
    # and run the root MAGUS alignment task to completion.
    args = {"workingDir" : Configs.workingDir, "outputFile" : Configs.outputPath,
            "subalignmentPaths" : Configs.subalignmentPaths, "sequencesPath" : Configs.sequencesPath,
            "backbonePaths" : Configs.backbonePaths, "guideTree" : Configs.guideTree,
            "backboneWeightsPath": Configs.backboneWeightsPath}
    # NOTE: this local name shadows the imported `task` module within this
    # function only; createAlignmentTask resolves the module in its own scope.
    task = createAlignmentTask(args)
    task.submitTask()
    task.awaitTask()

def createAlignmentTask(args):
    # Wrap the argument dict in a Task keyed on the output file path.
    return task.Task(taskType = "runAlignmentTask", outputFile = args["outputFile"], taskArgs = args)

def runAlignmentTask(**kwargs):
    '''
    The standard MAGUS task:
    decompose the data into subsets, align each subset, and merge the subalignments.
    '''

    with AlignmentContext(**kwargs) as context:
        if context.sequencesPath is not None:
            Configs.log("Aligning sequences {}".format(context.sequencesPath))

        decomposeSequences(context)
        if Configs.onlyGuideTree:
            # Short-circuit: copy out the guide tree and skip alignment entirely.
            Configs.log("Outputting only the guide tree, as requested..")
            shutil.copyfile(os.path.join(context.workingDir, "decomposition", "initial_tree", "initial_tree.tre"), context.outputFile)
            return

        '''
        6.1.2021 - added by Chengze Shen
        a quick log for sanity check that the backbone weights path is correct
        '''
        if context.backboneWeightsPath:
            Configs.log("Backbone weights path is {}..".format(
                context.backboneWeightsPath))
        else:
            Configs.log("Heads up! All backbone alignments are treated equally..")

        alignSubsets(context)
        mergeSubalignments(context)

def alignSubsets(context):
    # Align every subset: small subsets go directly to MAFFT; large ones
    # recurse into a nested MAGUS alignment task.
    if len(context.subalignmentPaths) > 0:
        Configs.log("Subalignment paths already provided, skipping subalignments..")
        return

    Configs.log("Building {} subalignments..".format(len(context.subsetPaths)))
    subalignDir = os.path.join(context.workingDir, "subalignments")
    if not os.path.exists(subalignDir):
        os.makedirs(subalignDir)

    # Subsets at or below this size (or when recursion is disabled) are
    # aligned directly with MAFFT instead of recursing.
    mafftThreshold = max(Configs.mafftSize, Configs.decompositionMaxSubsetSize, Configs.recurseThreshold)

    for file in context.subsetPaths:
        subset = sequenceutils.readFromFasta(file)
        subalignmentPath = os.path.join(subalignDir, "subalignment_{}".format(os.path.basename(file)))
        context.subalignmentPaths.append(subalignmentPath)

        if os.path.exists(subalignmentPath):
            # Existing file: reuse it (supports resuming aborted runs).
            Configs.log("Existing subalignment file detected: {}".format(subalignmentPath))

        elif len(subset) <= mafftThreshold or not Configs.recurse:
            Configs.log("Subset has {}/{} sequences, aligning with MAFFT..".format(len(subset), mafftThreshold))
            subalignmentTask = external_tools.buildMafftAlignment(file, subalignmentPath)
            context.subalignmentTasks.append(subalignmentTask)

        else:
            Configs.log("Subset has {}/{} sequences, recursively subaligning with MAGUS..".format(len(subset), mafftThreshold))
            subalignmentDir = os.path.join(subalignDir, os.path.splitext(os.path.basename(subalignmentPath))[0])
            subalignmentTask = createAlignmentTask({"outputFile" : subalignmentPath, "workingDir" : subalignmentDir,
                                                    "sequencesPath" : file, "guideTree" : Configs.recurseGuideTree})
            context.subalignmentTasks.append(subalignmentTask)

    task.submitTasks(context.subalignmentTasks)
    Configs.log("Prepared {} subset alignment tasks..".format(len(context.subalignmentTasks)))
'''
Created on Dec 4, 2020

@author: Vlad
'''

import os
from helpers import sequenceutils
from tasks import task, manager
from configuration import Configs

'''
The AlignmentContext data structure maintains all information pertaining to a single MAGUS alignment.
The main thread keeps one context active at a time (to manage resources). Subalignments will spawn their
own contexts, which will become active when the parent context is in a blocking wait. The previously active
context resumes when the current alignment is completed.
'''

class AlignmentContext:

    def __init__(self, **kwargs):
        # Task parameters (typically overridden via kwargs below)
        self.outputFile = None
        self.workingDir = None
        self.sequencesPath = None
        self.subsetPaths = []
        self.subalignmentPaths = []
        self.backbonePaths = []
        self.guideTree = None

        # Derived state populated by the initialize* methods
        self.unalignedSequences = None
        #self.taxa = []
        self.subsets = []
        self.subalignments = []
        self.taxonSubsetMap = {}
        self.taxonSubalignmentMap = {}

        self.backboneTaxa = {}
        self.backboneExtend = set()
        self.backboneSubalignment = {}

        self.subalignmentTasks = []
        self.backboneTasks = []
        self.graph = None

        '''
        6.1.2021 - added by Chengze Shen
        a dict: key = backbone aln name, value = weight
        a path: the file path to define the dict
        '''
        self.backboneWeights = {}
        self.backboneWeightsPath = None

        # Any keyword argument overrides the corresponding attribute above.
        for attr in kwargs:
            vars(self)[attr] = kwargs.get(attr)

        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def awaitSubalignments(self):
        # Block until every subalignment task for this context has finished.
        task.awaitTasks(self.subalignmentTasks)

    def initializeSequences(self):
        # Load each subset file and build the taxon -> subset index mapping.
        self.unalignedSequences = {}
        for i, subsetPath in enumerate(self.subsetPaths):
            self.subsets.append([])
            subset = sequenceutils.readFromFastaOrdered(subsetPath, removeDashes=True)
            for sequence in subset:
                self.unalignedSequences[sequence.tag] = sequence
                self.taxonSubsetMap[sequence.tag] = i
                self.subsets[i].append(sequence.tag)

        if Configs.constrain:
            # Constrained mode: subalignments are exactly the subsets.
            self.subalignments = self.subsets
            self.taxonSubalignmentMap = self.taxonSubsetMap
        else:
            # Unconstrained mode: every taxon becomes its own singleton
            # "subalignment".
            for s in self.subsets:
                for taxon in s:
                    self.taxonSubalignmentMap[taxon] = len(self.subalignments)
                    self.subalignments.append([taxon])

    '''
    6.1.2021 - added by Chengze Shen
    a new function to initialize readings of weights from the
    backbone weights path (if such path exists)
    '''
    def initializeBackboneWeights(self):
        # No-op when no weights file was provided.
        if self.backboneWeightsPath:
            # the weights should be put in the following manner:
            # > each line denotes a weight (for a backbone)
            # > for each line, it should have the format: backbone path,weight
            with open(self.backboneWeightsPath, 'r') as f:
                # drop the trailing empty string after the final newline
                lines = f.read().split('\n')[:-1]
                for line in lines:
                    if line == '':
                        continue
                    w = [x.strip() for x in line.split(',')]
                    assert len(w) == 2
                    self.backboneWeights[w[0]] = float(w[1])
        else:
            return


    def initializeBackboneSequenceMapping(self):
        # Determine which taxa each subset contributes to the backbones.
        if len(self.backboneTaxa) == 0:
            # No explicit backbone taxa: every subset contributes all taxa.
            backboneSubsetTaxonMap = {i : subset for i, subset in enumerate(self.subsets)}
        else:
            backboneSubsetTaxonMap = {}
            for taxon in self.backboneTaxa:
                i = self.taxonSubsetMap[taxon]
                backboneSubsetTaxonMap[i] = backboneSubsetTaxonMap.get(i, [])
                backboneSubsetTaxonMap[i].append(taxon)

        if Configs.constrain:
            # Pull each backbone taxon's aligned sequence from its subalignment.
            for i, subalignPath in enumerate(self.subalignmentPaths):
                subalignment = sequenceutils.readFromFasta(subalignPath, removeDashes=False)
                for taxon in backboneSubsetTaxonMap.get(i, []):
                    self.backboneSubalignment[taxon] = subalignment[taxon]
        else:
            # Unconstrained mode uses the raw unaligned sequences directly.
            self.backboneSubalignment = self.unalignedSequences

    def __enter__(self):
        # Make this context the active one for the task manager.
        manager.TaskManager.contextStack.append(self)
        return self

    def __exit__(self, excType, excVal, excTb):
        # Restore the previously active context.
        manager.TaskManager.contextStack.pop()
Mafft\-homologs in a mafft server works like this: 24 | .sp 25 | .RS 4 26 | \h'-04' 1.\h'+02'Collect a number (50 by default) of close homologs (E=1e\-10 by default) of the input sequences. 27 | .RE 28 | .sp 29 | .RS 4 30 | \h'-04' 2.\h'+02'Align the input sequences and homologs all together using the L\-INS\-i strategy. 31 | .RE 32 | .sp 33 | .RS 4 34 | \h'-04' 3.\h'+02'Remove the homologs. 35 | .RE 36 | .RE 37 | .SH "OPTIONS" 38 | .RS 0 39 | .PP 40 | \fB\-a\fR \fI\fIn\fR\fR 41 | .RS 4 42 | The number of collected sequences (default: 50). 43 | .RE 44 | .PP 45 | \fB\-e\fR \fI\fIn\fR\fR 46 | .RS 4 47 | Threshold value (default: 1e\-10). 48 | .RE 49 | .PP 50 | \fB\-o\fR \fI\fIxxx\fR\fR 51 | .RS 4 52 | Options for mafft (default: " \-\-op 1.53 \-\-ep 0.123 \-\-maxiterate 1000 --localpair --reorder"). 53 | .RE 54 | .PP 55 | \fB\-l\fR 56 | .RS 4 57 | Locally carries out BLAST searches instead of NCBI BLAST (requires locally installed BLAST and a database). 58 | .RE 59 | .PP 60 | \fB\-f\fR 61 | .RS 4 62 | Outputs collected homologues also (default: off). 63 | .RE 64 | .PP 65 | \fB\-w\fR 66 | .RS 4 67 | entire sequences are subjected to BLAST search (default: well\-aligned region only) 68 | .RE 69 | .RE 70 | .SH "REQUIREMENTS" 71 | .RS 0 72 | .PP 73 | MAFFT version > 5.58. 74 | .PP 75 | Either of 76 | .RS 4 77 | .PP 78 | lynx (when remote BLAST server is used) 79 | .PP 80 | BLAST and a protein sequence database (when local BLAST is used) 81 | .RE 82 | .RE 83 | .SH "REFERENCES" 84 | .RS 0 85 | .PP 86 | Katoh, Kuma, Toh and Miyata (Nucleic Acids Res. 33:511\-518, 2005) MAFFT version 5: improvement in accuracy of multiple sequence alignment. 87 | .RE 88 | .SH "SEE ALSO" 89 | .RS 0 90 | .PP 91 | \fBmafft\fR(1) 92 | .RE 93 | .SH "AUTHORS" 94 | .RS 0 95 | .PP 96 | \fBKazutaka Katoh\fR <\&katoh_at_bioreg.kyushu\-u.ac.jp.\&> 97 | .sp -1n 98 | .IP "" 4 99 | Wrote Mafft. 
100 | .PP 101 | \fBCharles Plessy\fR <\&charles\-debian\-nospam@plessy.org\&> 102 | .sp -1n 103 | .IP "" 4 104 | Wrote this manpage in DocBook XML for the Debian distribution, using Mafft's homepage as a template. 105 | .RE 106 | .SH "COPYRIGHT" 107 | .RS 0 108 | Copyright \(co 2002\-2007 Kazutaka Katoh (mafft) 109 | .br 110 | Copyright \(co 2007 Charles Plessy (this manpage) 111 | .br 112 | .PP 113 | Mafft and its manpage are offered under the following conditions: 114 | .PP 115 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 116 | .sp 117 | .RS 4 118 | \h'-04' 1.\h'+02'Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 119 | .RE 120 | .sp 121 | .RS 4 122 | \h'-04' 2.\h'+02'Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 123 | .RE 124 | .sp 125 | .RS 4 126 | \h'-04' 3.\h'+02'The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 127 | .RE 128 | .PP 129 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
130 | .br 131 | .RE 132 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/README.md: -------------------------------------------------------------------------------- 1 | # MAGUS 2 | Multiple Sequence Alignment using Graph Clustering 3 | 4 | - - - - 5 | 6 | ## Purpose and Functionality 7 | MAGUS is a tool for piecewise large-scale multiple sequence alignment. 8 | The dataset is divided into subsets, which are independently aligned with a base method (currently MAFFT -linsi). These subalignments are merged together with the Graph Clustering Merger (GCM). GCM builds the final alignment by clustering an alignment graph, which is constructed from a set of backbone alignments. This process allows MAGUS to effectively boost MAFFT -linsi to over a million sequences. 9 | 10 | The basic procedure is outlined below. Steps 4-7 are GCM. 11 | 1. The input is a set of unaligned sequences. Alternatively, the user can provide a set of multiple sequence alignments and skip the next two steps. 12 | 2. The dataset is decomposed into subsets. 13 | 3. The subsets are aligned with MAFFT -linsi. 14 | 4. A set of backbone alignments are generated with MAFFT -linsi (or provided by the user). 15 | 5. The backbones are compiled into an alignment graph. 16 | 6. The graph is clustered with MCL. 17 | 7. The clusters are resolved into a final alignment. 18 | 19 | - - - - 20 | 21 | ## Dependencies 22 | MAGUS requires 23 | * Python 3 24 | * MAFFT (linux version is included) 25 | * MCL (linux version is included) 26 | * FastTree and Clustal Omega are needed if using these guide trees (linux versions included) 27 | 28 | If you would like to use some other version of MAFFT and/or MCL (for instance, if you're using Mac), 29 | you will need to edit the MAFFT/MCL paths in configuration.py 30 | (I'll pull these out into a separate config file to make it simpler). 
31 | 32 | - - - - 33 | 34 | ## Getting Started 35 | Please navigate your terminal to the "example" directory to get started with some sample data. 36 | A few basic ways of running MAGUS are shown below. 37 | Run "magus.py -h" to view the full list of arguments. 38 | 39 | **Align a set of unaligned sequences from scratch** 40 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -o magus_result.txt* 41 | 42 | *-o* specifies the output alignment path 43 | *-d* (optional) specifies the working directory for GCM's intermediate files, like the graph, clusters, log, etc. 44 | 45 | **Merge a prepared set of alignments** 46 | *python3 ../magus.py -d outputs -s subalignments -o magus_result.txt* 47 | 48 | *-s* specifies the directory with subalignment files. Alternatively, you can pass a list of file paths. 49 | 50 | - - - - 51 | 52 | ## Controlling the pipeline 53 | 54 | **Specify subset decomposition behavior** 55 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -t fasttree --maxnumsubsets 100 --maxsubsetsize 50 -o magus_result.txt* 56 | 57 | *-t* specifies the guide tree method to use, and is the main way to set the decomposition strategy. 58 | Available options are fasttree (default), parttree, clustal (recommended for very large datasets), and random. 59 | *--maxnumsubsets* sets the desired number of subsets to decompose into (default 25). 60 | *--maxsubsetsize* sets the threshold to stop decomposing subsets below this number (default 50). 61 | Decomposition proceeds until maxnumsubsets is reached OR all subsets are below maxsubsetsize. 62 | 63 | **Specify backbones for alignment graph** 64 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -r 10 -m 200 -o magus_result.txt* 65 | *python3 ../magus.py -d outputs -s subalignments -b backbones -o magus_result.txt* 66 | 67 | *-r* and *-m* specify the number of MAFFT backbones and their maximum size, respectively. They default to 10 and 200.
68 | Alternatively, the user can provide his own backbones; *-b* can be used to provide a directory or a list of files. 69 | 70 | **Specify graph trace method** 71 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt --graphtracemethod mwtgreedy -o magus_result.txt* 72 | 73 | *--graphtracemethod* is the flag that governs the graph trace method. Options are minclusters (default and recommended), fm, mwtgreedy (recommended for very large graphs), rg, or mwtsearch. 74 | 75 | **Unconstrained alignment** 76 | *python3 ../magus.py -d outputs -i unaligned_sequences.txt -c false -o magus_result.txt* 77 | 78 | By default, MAGUS constrains the merged alignment to induce all subalignments. This constraint can be disabled with *-c false*. 79 | This drastically slows MAGUS and is strongly not recommended above 200 sequences. 80 | 81 | - - - - 82 | 83 | ## Things to Keep in Mind 84 | 85 | * MAGUS will not overwrite existing backbone, graph and cluster files. 86 | Please delete them/specify a different working directory to perform a clean run. 87 | * Related issue: if MAGUS is stopped while running MAFFT, MAFFT's output backbone files will be empty. 88 | This will cause errors if MAGUS reruns and finds these empty files. 89 | * A large number of subalignments (>100) will start to significantly slow down the ordering phase, especially for very heterogenous data. 90 | I would generally disadvise using more than 100 subalignments, unless the data is expected to be well-behaved. 91 | 92 | - - - - 93 | 94 | ## Related Publications 95 | 96 | * Original MAGUS paper: ___Smirnov, V. and Warnow, T., 2020. MAGUS: Multiple Sequence Alignment using Graph Clustering. Bioinformatics. 
https://doi.org/10.1093/bioinformatics/btaa992___ 97 | * GCM-MWT paper: 98 | * MAGUS on ultra-large datasets: 99 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | WITCH v1.0.10 2 | ------------- 3 | 1. Added an adaptive schematic for inclusion of top HMMs for aligning each 4 | query sequence, when using adjusted bitscores (``--use-weights 1``). 5 | Previously, all top `k` HMMs will be used. Now, WITCH includes up to `k` 6 | HMMs, or until the sum of weights exceeds 0.999. This should keep the 7 | core design of WITCH, but this has not been tested with data yet. 8 | 2. Added type check and cast for various user-defined configs in the 9 | configuration file before passing them to the main process. 10 | 11 | WITCH v1.0.9 12 | ------------ 13 | 1. Fixed the issue with feeding FastTree2 with gzipped alignment file for 14 | tree estimation. Now use piping to pipe the alignment file (gzipped or 15 | not) as stdin to the FastTree2 executable. 16 | 2. Enforced the start method for multiprocessing on macOS to be ``fork``. 17 | This ensures WITCH usability on a macOS environment. 18 | 3. Changed the default invocation of example codes from ``python3`` to 19 | just ``python``. 20 | 21 | WITCH v1.0.8 22 | ------------ 23 | 1. Supported reading gzipped alignment file. That is, when the user supplies 24 | with their own backbone alignment and adds some other query sequences). 25 | E.g., ``backbone.fasta.gz`` or ``backbone.fa.gzip``, etc. 26 | 27 | WITCH v1.0.7 28 | ------------ 29 | 1. Added example usages to ``witch.py --help``. Also changed the default 30 | formatter from a custom one to ``argparse.RawDescriptionHelpFormatter``. 31 | 2. Now by default will bypass the initial WITCH setup step. Previously, this 32 | was achieved by giving WITCH the parameter ``-y`` (``--bypass-setup``). 33 | 3. 
Changed the default behavior of ``examples/run.sh`` to running scenario D. 34 | 4. Fixed the runtime error when home.path exists but the pointed main.config is 35 | missing. Now will regenerate ``main.config`` at the pointed location. 36 | 5. Fixed ``--bypass-setup`` not working as True by default. Now will always 37 | create the config path at ``~/.witch/main.config``. 38 | 6. Changed the default filename for the masked alignment output. Previously, 39 | it will be named as ``.fasta.masked``, as ```` supplied by the user 40 | or default to ``aligned``. Now will be written as ``.masked.fasta``. 41 | If the user gives ``-o .fa`` or ``-o .fasta``, the masked alignment 42 | will use the corresponding suffix (e.g., ``.masked.fa``). 43 | 44 | WITCH v1.0.6 45 | ------------ 46 | 1. Added Software Output Explanation to the README to avoid confusion on what 47 | alignment file to use for downstream analyses. 48 | 49 | WITCH v1.0.5 50 | ------------ 51 | 1. Added compatibility to Dendropy with version >4.5.2 and removed its 52 | requirement from requirements.txt for pip. 53 | 54 | WITCH v1.0.5b 55 | ------------- 56 | 1. Added a new parameter option allowing users to specify a customized config 57 | file to override the default ``main.config`` (usually can be found at 58 | ``~/.witch_msa/main.config``). Use ``-c `` to do so. 59 | The priority for arguments: ``commandline > user.config > main.config``. 60 | 61 | WITCH v1.0.5a 62 | ------------- 63 | 1. Added two sanity checks to HMMBuild and HMMSearch jobs: making sure all 64 | files are created correctly before proceeding. 65 | 2. Added a file number check utility function using the ``inspect`` package. 66 | 67 | WITCH v1.0.4 68 | ------------ 69 | 1. Added an additional parameter option to set an upper bound to the HMM 70 | subsets created (``-Z``, complementary to ``-A`` which is for lower bound), 71 | based on the number of sequences in a subset. 72 | 2. Changed the behavior for creating HMM subsets. 
Instead of reading in the 73 | backbone alignment at once, WITCH now reads line by line to avoid large 74 | memory consumption if the backbone is very large.
First working release across different platform and different python 112 | versions. 113 | -------------------------------------------------------------------------------- /witch_msa/gcmm/task.py: -------------------------------------------------------------------------------- 1 | import time 2 | from witch_msa.configs import Configs, tqdm_styles 3 | from tqdm import tqdm 4 | import itertools 5 | 6 | import concurrent.futures 7 | 8 | ''' 9 | A class defining a generic task object that will be used for job submission 10 | ''' 11 | class MyTask(object): 12 | # (required) the list of parameters used in the task 13 | # (optional) parent(s) of the task 14 | # (i.e., the other tasks that depends on this task) 15 | # (optional) children of this task 16 | # (i.e., the other tasks that this task depends on) 17 | def __init__(self, *args, **kwargs): 18 | self.args = tuple(*args) 19 | 20 | _valid = ['parent', 'children'] 21 | for k, v in kwargs.items(): 22 | if k in _valid: 23 | if isinstance(v, MyTask): 24 | setattr(self, k, [v]) 25 | elif isinstance(v, list): 26 | setattr(self, k, v) 27 | else: 28 | raise TypeError(type(v)) 29 | 30 | def get_args(self): 31 | return self.args 32 | 33 | # currently unused 34 | def get_parent(self): 35 | if 'parent' in self.__dict__: 36 | return self.parent 37 | else: 38 | return None 39 | 40 | # currently unused 41 | def get_children(self): 42 | if 'children' in self.__dict__: 43 | return self.children 44 | else: 45 | return None 46 | 47 | ''' 48 | Helper function to convert a list of lists of arguments to a list of MyTask objects 49 | Assumption: all elements of args are of the same length 50 | Return: a generator of MyTask 51 | ''' 52 | def getTasks(*args): 53 | total_length = len(args[0]) 54 | for i in range(total_length): 55 | _args = [x[i] for x in args] 56 | yield MyTask(_args) 57 | 58 | ''' 59 | Variant of getTasks to use given index positions to select 60 | Also append the index to each yielded element at back 61 | ''' 62 | def 
getTasksWithIndexes(indexes, *args): 63 | for i in indexes: 64 | _args = [x[i] for x in args] + [i] 65 | yield MyTask(_args) 66 | 67 | ''' 68 | Helper function to handle a single future object with any return values. 69 | Run additional callbacks with the return values and the additional callback 70 | arguments supplemented. 71 | Return: the runtime to run the additional callback 72 | ''' 73 | def handleFuture(future, success, ignored, retry, i_retry, 74 | callback_func, callback_args): 75 | s1 = time.time() 76 | ret = future.result() 77 | 78 | # first four fields of any callbacks will be: success=, 79 | # ignored=, retry=, i_retry= 80 | if callback_func: 81 | callback_func(success, ignored, retry, i_retry, 82 | *ret, *callback_args) 83 | return time.time() - s1 84 | else: 85 | # default behavior: attach ret to success 86 | success.append(ret) 87 | return 0. 88 | 89 | ''' 90 | Helper function to run tasks defined by a list of MyTask objects 91 | Required: the function that defines the task 92 | the process pool to submit to 93 | a generator of MyTask objects 94 | the number of MyTask objects 95 | Optional: max_concurrent_jobs= # default will submit all tasks at once 96 | i_retry= # number of retry for failed tasks 97 | callback_func= # callback to run after a future is handled 98 | should take future return values as the 99 | first set of arguments 100 | callback_args=<*args> # additional arguments for the callback 101 | Return: success, ignored, retry 102 | total runtime (seconds) for handling/running the tasks 103 | ''' 104 | def runTasks(func, pool, mytasks, num_tasks, **kwargs): 105 | handle_runtime = 0. 
106 | max_concurrent_jobs = kwargs.get('max_concurrent_jobs', num_tasks) 107 | i_retry = kwargs.get('i_retry', 0) 108 | callback_func = kwargs.get('callback_func', None) 109 | callback_args = kwargs.get('callback_args', []) 110 | 111 | success, ignored, retry = [], [], [] 112 | with tqdm(total=num_tasks, **tqdm_styles) as pbar: 113 | futures = { 114 | #pool.submit(func, *task.get_args()): task.get_id() 115 | pool.submit(func, *task.get_args()): task.get_parent() 116 | for task in itertools.islice(mytasks, max_concurrent_jobs) 117 | } 118 | while futures: 119 | # wait for the next future to complete 120 | done, _ = concurrent.futures.wait( 121 | futures, return_when=concurrent.futures.FIRST_COMPLETED) 122 | 123 | for future in done: 124 | # depending on kwargs, allow re-adding some failed tasks back 125 | # to queue 126 | handle_runtime += handleFuture(future, success, ignored, retry, 127 | i_retry, callback_func, callback_args) 128 | _ = futures.pop(future) 129 | pbar.update(len(done)) 130 | 131 | # schedule the next batch of tasks, no more than the number of tasks 132 | # that just finished 133 | for task in itertools.islice(mytasks, len(done)): 134 | future = pool.submit(func, *task.get_args()) 135 | futures[future] = task.get_parent() 136 | return success, ignored, retry, handle_runtime 137 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/align/decompose/initial_tree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 29, 2020 3 | 4 | @author: Vlad 5 | ''' 6 | 7 | import os 8 | import shutil 9 | import time 10 | import random 11 | 12 | from align.decompose import decomposer 13 | from helpers import sequenceutils, hmmutils, treeutils 14 | from tasks import task 15 | from tools import external_tools 16 | from configuration import Configs 17 | 18 | ''' 19 | Different options for estimating a guide tree. 
def buildInitialTree(context, workingDir, treeType):
    '''
    11.16.2023 - modified by Chengze Shen
    Since the guide tree type and guide tree path are using the same arg,
    there is a chance that even user did not specify the type (default to
    "fasttree"), the code below will try to find if a file named "fasttree"
    exist in the directory where the script is run.
    Hence, if a user accidentally has a file/directory named "fasttree",
    it will be read and used as the guide tree instead of creating an initial
    tree with FastTree2.

    Modification: will not check for path existence if treeType is among
    [fasttree, fasttree-noml, parttree, clustal]
    '''
    # recognized tree builders; anything else is treated as a path to a
    # user-supplied guide tree file
    default_styles = ['fasttree', 'fasttree-noml', 'parttree', 'clustal']
    #if treeType is not None and os.path.exists(treeType):
    if treeType is not None:
        if treeType.lower() in default_styles:
            pass
        elif os.path.exists(treeType):
            # treeType is a path to an existing user guide tree; use it as-is
            Configs.log("Found user guide tree {}".format(treeType))
            return treeType
        # otherwise fall through: the dispatch below raises for an
        # unrecognized value that is not a file
    else:
        # by default use fasttree
        treeType = "fasttree"

    tempDir = os.path.join(workingDir, "initial_tree")
    outputTreePath = os.path.join(tempDir, "initial_tree.tre")
    if os.path.exists(outputTreePath):
        # reuse the tree left over from a previous run
        Configs.log("Found existing initial tree {}".format(outputTreePath))
        return outputTreePath
    if os.path.exists(tempDir):
        # start from a clean working directory
        shutil.rmtree(tempDir)
    os.makedirs(tempDir)

    time1 = time.time()

    if treeType.lower() == "fasttree":
        # PASTA-style: MAFFT-align a skeleton subset, add remaining sequences
        # via HMM, then run FastTree in "fast" mode on the result
        Configs.log("Building PASTA-style FastTree initial tree on {} with skeleton size {}..".format(context.sequencesPath, Configs.decompositionSkeletonSize))
        alignPath = os.path.join(tempDir, "initial_align.txt")
        buildInitialAlignment(context.unalignedSequences, tempDir, Configs.decompositionSkeletonSize, None, alignPath)
        external_tools.runFastTree(alignPath, tempDir, outputTreePath, "fast").run()
    elif treeType.lower() == "fasttree-noml":
        # same as above, but FastTree skips ML refinement ("noml")
        Configs.log("Building PASTA-style FastTree (NO ML) initial tree on {} with skeleton size {}..".format(context.sequencesPath, Configs.decompositionSkeletonSize))
        alignPath = os.path.join(tempDir, "initial_align.txt")
        buildInitialAlignment(context.unalignedSequences, tempDir, Configs.decompositionSkeletonSize, None, alignPath)
        external_tools.runFastTree(alignPath, tempDir, outputTreePath, "noml").run()
    elif treeType.lower() == "parttree":
        Configs.log("Building MAFFT PartTree initial tree on {}..".format(context.sequencesPath))
        taxa = list(context.unalignedSequences.keys())
        external_tools.runMafftGuideTree(context.sequencesPath, tempDir, outputTreePath, Configs.numCores).run()
        # MAFFT's guide tree labels are converted back to taxon names here
        treeutils.convertMafftGuideTree(outputTreePath, taxa)
    elif treeType.lower() == "clustal":
        Configs.log("Building Clustal Omega initial tree on {}..".format(context.sequencesPath))
        external_tools.runClustalOmegaGuideTree(context.sequencesPath, tempDir, outputTreePath, Configs.numCores).run()
    else:
        raise Exception("Guide tree {} not a file and not recognized..".format(treeType))

    time2 = time.time()
    Configs.log("Built initial tree on {} in {} sec..".format(context.sequencesPath, time2-time1))

    return outputTreePath

def buildInitialAlignment(sequences, tempDir, skeletonSize, initialAlignSize, outputAlignPath):
    # MAFFT-align a random "skeleton" subset of the sequences, then align up
    # to (initialAlignSize - skeletonSize) additional sequences against it
    # with hmmalign; the merged alignment is written to outputAlignPath.
    skeletonPath = os.path.join(tempDir, "skeleton_sequences.txt")
    queriesPath = os.path.join(tempDir, "queries.txt")
    hmmDir = os.path.join(tempDir, "skeleton_hmm")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    initialInsertPath = os.path.join(tempDir, "initial_insert_align.txt")
    if not os.path.exists(hmmDir):
        os.makedirs(hmmDir)

    # None means "use all sequences"; also clamp to the dataset size
    if initialAlignSize is None or initialAlignSize > len(sequences):
        initialAlignSize = len(sequences)

    skeletonTaxa, remainingTaxa = decomposer.chooseSkeletonTaxa(sequences, skeletonSize)
    additional = initialAlignSize-skeletonSize
    random.shuffle(remainingTaxa)
    remainingTaxa, unusedTaxa = remainingTaxa[:additional], remainingTaxa[additional:]

    sequenceutils.writeFasta(sequences, skeletonPath, skeletonTaxa)
    external_tools.runMafft(skeletonPath, None, tempDir, outputAlignPath, Configs.numCores).run()

    if len(remainingTaxa) > 0:
        sequenceutils.writeFasta(sequences, queriesPath, remainingTaxa)
        hmmutils.buildHmmOverAlignment(outputAlignPath, hmmPath).run()
        hmmTasks = hmmutils.hmmAlignQueries(hmmPath, queriesPath)
        task.submitTasks(hmmTasks)
        for hmmTask in task.asCompleted(hmmTasks):
            # fold each finished query chunk into the skeleton alignment
            hmmutils.mergeHmmAlignments([hmmTask.outputFile], outputAlignPath, includeInsertions=False)
            if Configs.graphBuildMethod == "initial":
                hmmutils.mergeHmmAlignments([hmmTask.outputFile], initialInsertPath, includeInsertions=True)
24 | 25 | - The output file can be specified using the -o option. This is 26 | optional. The file that the output is written to is printed in 27 | the stdout output of the program. 28 | 29 | - The granularity of the output clustering can be controlled 30 | using the '-c' option. This option specifies how small the 31 | graph can get before the coarsening in MLR-MCL stops. For 32 | example, if mlrmcl is run with '-c 1000', the graph is 33 | coarsened until it has no more than 1000 vertices. 34 | The default value for this option is 1000. If c is the same as 35 | the number of vertices in the graph, then no coarsening will 36 | take place at all and this is the same as R-MCL. The smaller 37 | the value of this option, the fewer clusters are output by the 38 | program. Note that if the cluster structure is especially 39 | clear (such as for synthetic graphs), 40 | the program will simply output the same clustering 41 | regardless of the parameter value. 42 | 43 | - The balance (i.e. the variance in output cluster sizes) can be 44 | controlled using the '-b' option. The default value of 0.5 45 | should be good enough in most cases. If you find that the 46 | output clustering is too balanced, you can try lower values for 47 | b (until 0), or if it is too imbalanced, you can try higher 48 | values for b such as 0.75 or 1. 49 | 50 | - The inflation parameter is specified using the '-i option' and 51 | can also be used to control the granularity of the clustering. 52 | (In the case when no coarsening is performed, i.e. for R-MCL, 53 | it is the only way to control the number of clusters.) Higher 54 | values of 'i' lead to more clusters, and the clustering also 55 | converges faster. The default is 2.0. 56 | 57 | Usage and Options for mergeClusters: 58 | ----------------------------------- 59 | 60 | - The mergeClusters program performs hierarchical agglomerative 61 | clustering. 
It may be used in situations where the user needs 62 | to exactly control the number of output clusters. First, one 63 | can run MLR-MCL, setting the options such that more clusters 64 | than required are output by the program. Subsequently one may 65 | run the mergeClusters program, specifying the number of merges 66 | to be the same as the number of additional clusters that were 67 | output by MLR-MCL. 68 | 69 | - The exact usage for mergeClusters can be seen by executing the 70 | program without any arguments. 71 | 72 | Input format: 73 | ------------ 74 | 75 | - The input format is the same as that for Metis and Graclus. A 76 | pdf document explaining this format is available at 77 | http://glaros.dtc.umn.edu/gkhome/fetch/sw/metis/manual.pdf 78 | This pdf is also present inside the Metis distribution. For 79 | convenience, we have also included a copy of this manual under 80 | the name metis.4.0.manual.pdf. 81 | 82 | Output format: 83 | ------------- 84 | 85 | - The output format is also the same as that for Metis and 86 | Graclus. Each line contains the cluster index to which the 87 | node of the corresponding line number has been assigned. (For 88 | example, if line 20 is '4', that means that the node 20 has 89 | been assigned to the cluster 4.) 90 | 91 | Examples: 92 | --------- 93 | - The 'examples' folder has two graphs: synthetic.graph, which 94 | is a synthetic graph of 1000 nodes, generated with 25 clusters 95 | and astro.graph, which is a collaboration network of astro physics 96 | researchers (see http://snap.stanford.edu). 
97 | The following are some example usages 98 | (assuming the software is compiled, and we are in the example 99 | directory): 100 | 101 | - ../mlrmcl -o synthetic.graph.out synthetic.graph 102 | - ../mlrmcl -o astro.graph.out astro.graph 103 | - ../mlrmcl -c 500 -o astro.graph.c500.out astro.graph 104 | - ../mlrmcl -c 2000 -o astro.graph.c2000.out astro.graph 105 | - ../mlrmcl -c 2000 -b 0.25 -o astro.graph.c2000.b0.25.out astro.graph 106 | - ../mlrmcl -c 20000 -i 1.8 -b 0.25 -o astro.graph.c20000.i1.8.b0.25.out astro.graph 107 | (The last example above does not perform any coarsening, since 108 | the c value 20000 is more than the number of vertices in the 109 | graph, which is 17903.) 110 | 111 | - ../mergeClusters -e astro.graph.c500.out -n 10 -o astro.graph.c500.10merges.out astro.graph 112 | (If astro.graph.c500.out represents the clustering of 113 | astro.graph into x clusters, then 114 | astro.graph.c500.10.merges.out is a clustering of x-10 115 | clusters.) 116 | 117 | 118 | References: 119 | ---------- 120 | - Venu Satuluri and Srinivasan Parthasarathy. "Scalable Graph 121 | Clustering using Stochastic Flows: Applications to Community 122 | Discovery." Proceedings of ACM SIGKDD 2009, Paris. 123 | 124 | - Venu Satuluri, Srinivasan Parthasarathy and Dugyu Ucar. "Markov 125 | Clustering of Protein Interaction Networks with Improved 126 | Balance and Scalablity". Proceedings of ACM BCB 2010, Niagara 127 | Falls. 128 | 129 | Acknowledgments: 130 | --------------- 131 | I am very grateful to the authors of Metis and Graclus for releasing 132 | the source of their softwares, as this has enabled me to 133 | implement my own software much faster than would have been 134 | possible otherwise. 
'''
Created on May 28, 2020

@author: Vlad
'''

import re
import os
import math
from tools import external_tools
from configuration import Configs
from helpers import sequenceutils

def buildHmmScores(hmmPaths, queriesPath, scoreFileHmmFileMap):
    """Prepare hmmsearch tasks scoring every query chunk against every HMM.

    The queries are split into chunks of at most 1000 sequences; each chunk
    is written under a per-query "chunks_<name>" directory, and one search
    task is produced per (HMM, chunk) pair.  scoreFileHmmFileMap is filled
    in place, mapping each score-file path back to its HMM path.
    Returns the list of (unsubmitted) search tasks.
    """
    queries = sequenceutils.readFromFasta(queriesPath, removeDashes = True)
    stem = os.path.basename(queriesPath).split('.')[0]
    chunkDir = os.path.join(os.path.dirname(queriesPath), "chunks_{}".format(stem))
    if not os.path.exists(chunkDir):
        os.makedirs(chunkDir)

    chunkSize = 1000

    names = list(queries.keys())
    triples = []
    for idx in range(math.ceil(len(names) / chunkSize)):
        block = names[idx*chunkSize : min(len(names), (idx+1)*chunkSize)]
        chunkPath = os.path.join(chunkDir, "{}_chunk_{}.txt".format(stem, idx+1))
        sequenceutils.writeFasta(queries, chunkPath, block)
        for hmmPath in hmmPaths:
            scorePath = os.path.join(
                os.path.dirname(hmmPath),
                "{}_chunk_{}_score.txt".format(stem, idx+1))
            triples.append((hmmPath, chunkPath, scorePath))
            scoreFileHmmFileMap[scorePath] = hmmPath

    return [getHmmScores(h, q, s) for h, q, s in triples]

def getHmmScores(hmmPath, queriesPath, scorePath):
    """Create a single hmmsearch task; its results land in scorePath."""
    return external_tools.runHmmSearch(
        hmmPath, queriesPath, os.path.dirname(hmmPath), scorePath)

def readHmmScores(searchFiles):
    """Map each search file to {taxon : bit score} parsed from that file."""
    return {
        path : {taxon : scores[1]
                for taxon, scores in readSearchFile(path).items()}
        for path in searchFiles
    }

def buildHmms(sequencesHmmsPathsMap):
    """One hmmbuild task per (alignment path -> HMM path) entry."""
    return [buildHmmOverAlignment(seqPath, hmmPath)
            for seqPath, hmmPath in sequencesHmmsPathsMap.items()]

def buildHmmOverAlignment(sequencePath, hmmPath):
    """Create a task that builds an HMM from the alignment at sequencePath."""
    return external_tools.runHmmBuild(
        sequencePath, os.path.dirname(hmmPath), hmmPath)

def combineHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Union all Stockholm alignments in memory, then write one FASTA file."""
    merged = {}
    for path in alignFiles:
        merged.update(sequenceutils.readFromStockholm(path, includeInsertions))
    sequenceutils.writeFasta(merged, outputAlignmentPath, None)

def mergeHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Append each Stockholm alignment to the output FASTA, file by file."""
    for path in alignFiles:
        block = sequenceutils.readFromStockholm(path, includeInsertions)
        sequenceutils.writeFasta(block, outputAlignmentPath, None, True)
def buildHmmAlignment(hmmPath, queriesPath, outputAlignmentPath):
    """Create (but do not run) an hmmalign task that aligns the queries in
    queriesPath against the HMM at hmmPath, writing outputAlignmentPath."""
    return external_tools.runHmmAlign(
        hmmPath, queriesPath, os.path.dirname(hmmPath), outputAlignmentPath)

#from PASTA repo
def readSearchFile(searchFilePath):
    """Parse an hmmsearch per-sequence score table.

    Only the first score block (between the "E-value ..." column header and
    the next blank line) is read.  Returns a dict mapping sequence name to
    a (full-sequence E-value, bit score) tuple; separator rows containing
    "--" are skipped.
    """
    hit_pattern = re.compile(r"(\S+)\s+" * 8 + r"(\S+)")
    results = {}
    in_table = False
    with open(searchFilePath, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not in_table:
                # skip everything until the column-header row
                if line.startswith("E-value"):
                    in_table = True
                continue
            if line == "":
                # a blank line terminates the score block
                break
            match = hit_pattern.search(line)
            if match is None or "--" in match.group(0):
                continue
            # columns: full-seq E-value (1), bit score (2), ..., name (9)
            results[match.group(9)] = (
                float(match.group(1)), float(match.group(2)))
    return results
def buildHeap(graph, nodeClusters, weightMap, lowerBound, upperBound):
    # Build a max-heap (negated weights, since heapq is a min-heap) of all
    # support edges between nodes of *different* subalignments lying inside
    # the [lowerBound, upperBound) window of each subalignment.
    # Side effect: weightMap[i][j] is primed with the edge weight between
    # the two endpoints' singleton clusters.
    heap = []
    for s in range(len(lowerBound)):
        for a in range(lowerBound[s], upperBound[s]):
            asub, apos = graph.matSubPosMap[a]
            i = nodeClusters[a]
            for b, value in graph.matrix[a].items():
                bsub, bpos = graph.matSubPosMap[b]
                # keep each undirected edge once (b > a), never join columns
                # of the same subalignment, and stay inside the window
                if b <= a or asub == bsub or b < lowerBound[bsub] or b >= upperBound[bsub]:
                    continue
                j = nodeClusters[b]
                weightMap[j][i] = value
                weightMap[i][j] = value
                heapq.heappush(heap, (-1 * value, a, b))

    return heap
    #baseIdx = max(range(k), key = lambda x : upperBound[x] - lowerBound[x])
    #baseLength = upperBound[baseIdx] - lowerBound[baseIdx]

def crunchHeap(graph, heap, clusters, nodeClusters, clusterPos, clusterPointers, weightMap, cantConnects, absorbed, enforceTrace):
    # Kruskal-like merging: repeatedly pop the heaviest inter-cluster edge
    # and merge its endpoint clusters (j absorbed into i) when the merge is
    # legal per checkConnect.  Pairs proven unmergeable are memoized in
    # cantConnects.  Heap entries can be stale (a fresh entry is pushed
    # whenever an accumulated weight changes), which is why endpoints are
    # re-resolved through nodeClusters on every pop.
    while len(heap) > 0:
        value, a, b = heapq.heappop(heap)
        i, j = nodeClusters[a], nodeClusters[b]
        if i == j or orderPair(i,j) in cantConnects:
            continue

        if not checkConnect(graph, i, j, clusters, clusterPos, enforceTrace):
            cantConnects.add(orderPair(i,j))
            continue

        # absorb cluster j into cluster i: move members, update each
        # member's cluster id and i's per-subalignment position map
        absorbed.add(j)
        for e in clusters[j]:
            nodeClusters[e] = i
            clusters[i].append(e)
            asub, apos = graph.matSubPosMap[e]
            clusterPos[i][asub] = e
        clusters[j] = []

        if enforceTrace:
            # splice i into the doubly-linked per-subalignment pointer
            # chains in place of j
            for s in clusterPointers[j]:
                prev, nxt = clusterPointers[j][s]
                if prev is not None:
                    clusterPointers[prev][s] = (clusterPointers[prev][s][0], i)
                if nxt is not None:
                    clusterPointers[nxt][s] = (i, clusterPointers[nxt][s][1])
                clusterPointers[i][s] = (prev, nxt)

            updateMergePointers(graph, i, clusterPointers, clusters, clusterPos)

        #print("Clusters left: {}".format(len(clusters) - len(absorbed)))
        # fold j's accumulated edge weights into i and requeue the merged
        # totals (one representative node per cluster identifies the edge)
        for n in weightMap[j]:
            if n in absorbed:
                continue
            weightMap[i][n] = weightMap[i].get(n, 0) + weightMap[j][n]
            weightMap[n][i] = weightMap[i][n]
            heapq.heappush(heap, (-1 * weightMap[i][n], clusters[i][0], clusters[n][0]))

def updateMergePointers(graph, i, clusterPointers, clusters, clusterPos):
    # After a merge, walk backwards through the predecessor pointers from
    # cluster i and, for each subalignment s that i now spans, pull down any
    # recorded position bound larger than i's — this appears to keep the
    # ordering bounds used by checkConnect consistent (NOTE(review):
    # confirm intent against the tracer's ordering invariant).
    subsets = [graph.matSubPosMap[a][0] for a in clusters[i]]

    for s in subsets:
        queue = deque([i])
        visited = set([i])

        while len(queue) > 0:
            curNode = queue.popleft()

            if clusterPos[curNode].get(s, float('inf')) > clusterPos[i][s] or curNode == i:
                clusterPos[curNode][s] = clusterPos[i][s]

            for p in clusterPointers[curNode]:
                prv, nxt = clusterPointers[curNode][p]
                if prv not in visited and prv is not None:
                    queue.append(prv)
                    visited.add(prv)

def checkConnect(graph, i, j, clusters, clusterPos, enforceTrace):
    # A legal cluster may contain at most one column per subalignment, so
    # two clusters sharing any subalignment can never merge.
    ci , cj = set([graph.matSubPosMap[a][0] for a in clusters[i]]), set([graph.matSubPosMap[a][0] for a in clusters[j]])
    for s in ci:
        if s in cj:
            return False

    if not enforceTrace:
        return True

    # trace mode: the merge must also respect column ordering — neither
    # cluster may sit at-or-before the other's recorded position in any
    # subalignment either of them touches
    for s in ci:
        if clusterPos[j].get(s, float('inf')) <= clusterPos[i][s]:
            return False
    for s in cj:
        if clusterPos[i].get(s, float('inf')) <= clusterPos[j][s]:
            return False
    return True
163 | i = nodeClusters[idx] 164 | for b in clusters[i]: 165 | bsub, bpos = graph.matSubPosMap[b] 166 | if b > frontier[bsub]: 167 | #print(bsub, b, frontier[bsub]) 168 | good = False 169 | break 170 | 171 | if good: 172 | orderedClusters.append(clusters[i]) 173 | for b in clusters[i]: 174 | bsub, bpos = graph.matSubPosMap[b] 175 | frontier[bsub] = b + 1 176 | foundGood = True 177 | break 178 | if not foundGood: 179 | break 180 | return orderedClusters 181 | 182 | def orderPair(a, b): 183 | return (min(a, b), max(a, b)) -------------------------------------------------------------------------------- /witch_msa/gcmm/weighting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 10.28.2021 by Chengze Shen 3 | 4 | Bitscore to weight calculation. 5 | ''' 6 | 7 | import os 8 | import time 9 | import numpy as np 10 | import concurrent.futures 11 | from witch_msa.configs import Configs, tqdm_styles 12 | from tqdm import tqdm 13 | 14 | class Weights(object): 15 | weights = dict() 16 | weights_map = dict() 17 | ranked_bitscores = dict() 18 | def __init__(self): 19 | pass 20 | 21 | ''' 22 | Function to read in weights from local given the taxon name 23 | ''' 24 | def readWeights(taxon): 25 | infile = Configs.outdir + '/weights/w_{}.txt'.format(taxon) 26 | if not os.path.isfile(infile): 27 | return None, None 28 | else: 29 | weights, weights_map = [], [] 30 | with open(infile, 'r') as f: 31 | line = f.read() 32 | taxon, raw = line.split(':') 33 | weights = [eval(x) for x in raw.split(';')] 34 | weights_map = {ind: w for (ind, w) in weights} 35 | return weights, weights_map 36 | 37 | ''' 38 | Function to read in bitscores from local given the taxon name 39 | ''' 40 | def readBitscores(taxon): 41 | infile = Configs.outdir + '/bitscores/b_{}.txt'.format(taxon) 42 | if not os.path.isfile(infile): 43 | return None, None 44 | else: 45 | bitscores = [] 46 | with open(infile, 'r') as f: 47 | line = f.read() 48 | taxon, raw = 
'''
Function to calculate the HMM weighting, given the bitscores and sizes
of the HMMs (for a given query taxon)
inputs: ensemble of HMMs H (with their bitscores and sizes)
outputs: weights for HMMs H
'''
def calculateWeights(packed_data):
    # packed_data: (taxon, indexes, bitscores, sizes) — parallel lists over
    # the HMMs that hit this query taxon.
    taxon, indexes, bitscores, sizes = packed_data
    weights = {}

    assert len(indexes) == len(bitscores) == len(sizes)
    for i in range(len(bitscores)):
        score_i, size_i = bitscores[i], sizes[i]
        # weight_i = 2^{b_i} / s_i normalized over all HMMs; computed with
        # exponents relative to score_i so the ratio is numerically stable.
        exponents = np.array(bitscores) - score_i \
                + np.log2(np.array(sizes) / size_i)
        denominator = np.sum(np.power(2, exponents))
        weights[indexes[i]] = 1. / denominator

    # retain only the top `num_hmms` weighted HMMs per query
    num_to_retain = min(Configs.num_hmms, len(weights))
    sorted_weights = sorted(weights.items(),
            key = lambda x: x[1], reverse=True)[:num_to_retain]
    return {taxon: tuple(sorted_weights)}

'''
Function to write a single taxon with its ranked bitscore to local
'''
def writeQueryBitscores(packed_data):
    # packed_data: (taxon, sorted_scores); emits "taxon:score;score;...".
    taxon, sorted_scores = packed_data
    str_sorted_scores = [str(x) for x in sorted_scores]

    with open(Configs.outdir + '/bitscores/b_{}.txt'.format(taxon), 'w') as f:
        f.write(taxon + ':' + ';'.join(str_sorted_scores) + '\n')
    return None

'''
Write bitscores to local (the same way as we write weights)
'''
def writeBitscores(ranked_bitscores, pool):
    # NOTE: the actual write-to-disk step is disabled; this now only trims
    # each taxon's ranked scores to the top `num_hmms` entries in memory.
    s2 = time.time()
    Configs.warning('Starting to load bitscores...')

    taxon_to_bitscores = {}
    for taxon, sorted_scores in ranked_bitscores.items():
        num_to_retain = min(Configs.num_hmms, len(sorted_scores))
        taxon_to_bitscores[taxon] = tuple(sorted_scores[:num_to_retain])

    time_write_scores = time.time() - s2
    Configs.warning('Finished loading bitscores in memory.')
    Configs.runtime(' '.join(['(writeBitscores) Time to write ranked bitscores',
            'to local (s):', str(time_write_scores)]))
    return taxon_to_bitscores

'''
Obtain and write weights to local based on bitscores
'''
def writeWeights(index_to_hmm, ranked_bitscores, pool):
    # Fan calculateWeights out over `pool` (one task per query taxon) and
    # gather the per-taxon results into a single {taxon: weights} dict.
    s2 = time.time()
    Configs.warning('Starting to calculate weights...')

    # sizes (number of taxa) of each HMM, keyed by HMM index
    all_sizes = {}
    for index, subset in index_to_hmm.items():
        all_sizes[index] = subset.num_taxa

    args = []
    for taxon, sorted_scores in ranked_bitscores.items():
        indexes = [x[0] for x in sorted_scores]
        bitscores = [x[1] for x in sorted_scores]
        sizes = [all_sizes[x] for x in indexes]
        args.append((taxon, indexes, bitscores, sizes))

    all_taxon_to_weights, futures = [], []
    for arg in args:
        futures.append(pool.submit(calculateWeights, arg))
    # progress bar over completed futures
    for future in tqdm(
            concurrent.futures.as_completed(futures),
            total=len(args), **tqdm_styles):
        res = future.result()
        if res:
            all_taxon_to_weights.append(res)

    taxon_to_weights = {}
    for item in all_taxon_to_weights:
        taxon_to_weights.update(item)

    time_obtain_weights = time.time() - s2
    Configs.warning('Finished calculating weights!')
    Configs.runtime(' '.join(['(writeWeights) Time to obtain weights',
            'given bitscores (s):', str(time_obtain_weights)]))
    return taxon_to_weights

'''
Write weights to local as [outdir]/weights.txt
'''
def writeWeightsToLocal(taxon_to_weights, path):
    # One line per taxon: "<taxon>:<tuple of (hmm index, weight) pairs>".
    # The tuple repr is exactly what readWeightsFromLocal parses back.
    Configs.log('Writing weights to {}'.format(path))
    with open(path, 'w') as f:
        for taxon, weights in taxon_to_weights.items():
            f.write('{}:{}\n'.format(taxon, weights))

'''
Function to read weights from a given weights path (e.g., ./weights.txt)
Return a dictionary of taxon to weights
'''
def readWeightsFromLocal(path):
    import ast

    Configs.log('Reading weights from {}'.format(path))
    taxon_to_weights = {}
    with open(path, 'r') as f:
        for line in f:
            # tolerate blank/trailing lines in hand-edited files
            if not line.strip():
                continue
            # split on the first ':' only, in case a taxon name contains ':'
            taxon, taxon_weight = line.split(':', 1)
            # literal_eval instead of eval: parses the stored tuple literal
            # without executing arbitrary code from a tampered weights file
            taxon_to_weights[taxon] = ast.literal_eval(taxon_weight)
    return taxon_to_weights

# ===== file: witch_msa/tools/magus/align/merge/alignment_graph.py =====
'''
Created on Apr 14, 2020

@author: Vlad
'''

import os
from helpers import sequenceutils
from configuration import Configs
import threading


'''
Data structure for dealing with alignment graphs.
Subalignment columns are mapped to graph nodes, represented by integers.
Integer nodes can be converted back to corresponding subalignment columns.
Reads/writes graph and cluster files.
'''

class AlignmentGraph:

    def __init__(self, context):
        # context: owns the subalignments and the run's working directory
        self.context = context
        self.workingDir = os.path.join(self.context.workingDir, "graph")
        self.graphPath = os.path.join(self.workingDir, "graph.txt")
        self.clusterPath = os.path.join(self.workingDir, "clusters.txt")
        self.tracePath = os.path.join(self.workingDir, "trace.txt")
        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

        # per-subalignment column counts and node-index bookkeeping
        self.subalignmentLengths = []
        self.subsetMatrixIdx = []   # first node index of each subalignment
        self.matSubPosMap = []      # node -> (subalignment, column) pair

        self.matrixSize = 0
        self.matrix = None          # adjacency: node -> {node: edge weight}
        self.matrixLock = threading.Lock()
        self.nodeEdges = None

        self.clusters = []
        self.insertions = set()

    def initializeMatrix(self):
        # Number of nodes per subalignment: alignment lengths when
        # constrained, otherwise raw (unaligned) sequence lengths.
        if Configs.constrain:
            self.subalignmentLengths = [sequenceutils.readSequenceLengthFromFasta(file) for file in self.context.subalignmentPaths]
        else:
            self.subalignmentLengths = [len(self.context.unalignedSequences[s[0]].seq) for s in self.context.subalignments]

        self.matrixSize = sum(self.subalignmentLengths)
        # prefix sums: node index where each subalignment's columns begin
        self.subsetMatrixIdx = [0] * len(self.subalignmentLengths)
        for k in range(1, len(self.subalignmentLengths)):
            self.subsetMatrixIdx[k] = self.subsetMatrixIdx[k-1] + self.subalignmentLengths[k-1]

        # inverse map: flat node index back to (subalignment, column)
        self.matSubPosMap = [0] * self.matrixSize
        i = 0
        for k in range(len(self.subalignmentLengths)):
            for j in range(self.subalignmentLengths[k]):
                self.matSubPosMap[i] = (k, j)
                i = i + 1

        self.matrix = [{} for i in range(self.matrixSize)]

    def writeGraphToFile(self, filePath):
        # One edge per line: "<node a> <node b> <weight>"
        with open(filePath, 'w') as textFile:
            for i in range(len(self.matrix)):
                for k in self.matrix[i]:
                    textFile.write("{} {} {}\n".format(i, k, self.matrix[i][k]))
        Configs.log("Wrote matrix to {}".format(filePath))

    def readGraphFromFile(self, filePath):
        # Inverse of writeGraphToFile; matrixSize must already be set.
        self.matrix = [{} for i in range(self.matrixSize)]
        with open(filePath) as f:
            for line in f:
                tokens = [int(token) for token in line.strip().split()]
                self.matrix[tokens[0]][tokens[1]] = tokens[2]
        Configs.log("Read matrix from {}".format(filePath))

    def writeClustersToFile(self, filePath):
        # One cluster per line, member nodes space-separated.
        with open(filePath, 'w') as textFile:
            for cluster in self.clusters:
                textFile.write("{}\n".format(" ".join([str(c) for c in cluster])))

    def readClustersFromFile(self, filePath):
        # Singleton lines are dropped: one-node clusters carry no merge
        # information.
        self.clusters = []
        with open(filePath) as f:
            for line in f:
                tokens = [int(token) for token in line.strip().split()]
                if len(tokens) > 1:
                    self.clusters.append(tokens)
        # NOTE(review): uses print() where the rest of the class logs via
        # Configs.log — confirm whether this should be Configs.log too
        print("Found {} clusters..".format(len(self.clusters)))

    def buildNodeEdgeDataStructure(self):
        # For each node, bucket its cross-subalignment edges by the
        # neighbor's subalignment and sort each bucket by neighbor index.
        Configs.log("Preparing node edge data structure..")
        k = len(self.subalignmentLengths)
        self.nodeEdges = {}

        for a in range(self.matrixSize):
            asub, apos = self.matSubPosMap[a]
            self.nodeEdges[a] = [[] for i in range(k)]
            for b, value in self.matrix[a].items():
                bsub, bpos = self.matSubPosMap[b]
                if asub == bsub:
                    continue
                self.nodeEdges[a][bsub].append((b, value))
            for i in range(k):
                self.nodeEdges[a][i].sort(key = lambda pair: pair[0])
        Configs.log("Prepared node edge data structure..")

    def buildNodeEdgeDataStructureFromClusters(self):
        # Same layout as buildNodeEdgeDataStructure, but only edges whose
        # endpoints share a pre-existing cluster are kept.
        Configs.log("Preparing node edge data structure..")
        k = len(self.subalignmentLengths)
        self.nodeEdges = {}

        Configs.log("Using {} pre-existing clusters to simplify alignment graph..".format(len(self.clusters)))
        for a in range(self.matrixSize):
            self.nodeEdges[a] = [[] for i in range(k)]

        for cluster in self.clusters:
            for a in cluster:
                asub, apos = self.matSubPosMap[a]
                for b in cluster:
                    bsub, bpos = self.matSubPosMap[b]
                    if asub == bsub or b not in self.matrix[a]:
                        continue
                    value = self.matrix[a][b]
                    self.nodeEdges[a][bsub].append((b, value))
                for i in range(k):
                    self.nodeEdges[a][i].sort(key = lambda pair: pair[0])
        Configs.log("Prepared node edge data structure..")

    def cutString(self, cut):
        # Convert absolute node indices of a cut into per-subalignment
        # column positions.
        stringCut = list(cut)
        for i, value in enumerate(stringCut):
            stringCut[i] = value - self.subsetMatrixIdx[i]
        return stringCut

    def computeClusteringCost(self, clusters):
        # Total weight of cross-subalignment edges whose endpoints land in
        # different clusters; each edge is visited twice, hence the final /2.
        cutCost = 0
        nodeClusters = {}

        for n, cluster in enumerate(clusters):
            for a in cluster:
                nodeClusters[a] = n

        # unassigned nodes each count as their own singleton cluster
        clusterCounter = len(clusters)
        for a in range(self.matrixSize):
            if a not in nodeClusters:
                nodeClusters[a] = clusterCounter
                clusterCounter = clusterCounter + 1

        for a in range(self.matrixSize):
            asub, apos = self.matSubPosMap[a]
            for b, value in self.matrix[a].items():
                bsub, bpos = self.matSubPosMap[b]
                if asub != bsub and nodeClusters[a] != nodeClusters[b]:
                    cutCost = cutCost + value

        return int(cutCost/2)

    def addSingletonClusters(self):
        # Pad self.clusters with singleton clusters so that every column of
        # every subalignment appears exactly once, preserving column order.
        newClusters = []

        lastIdx = list(self.subsetMatrixIdx)
        for cluster in self.clusters:
            for a in cluster:
                asub, apos = self.matSubPosMap[a]
                # emit singletons for any columns skipped before this one
                for node in range(lastIdx[asub], a):
                    newClusters.append([node])
                lastIdx[asub] = a+1
            newClusters.append(cluster)
        # trailing columns after the last clustered node of each subalignment
        for i in range(len(lastIdx)):
            for node in range(lastIdx[i], self.subsetMatrixIdx[i] + self.subalignmentLengths[i]):
                newClusters.append([node])
        self.clusters = newClusters
        return newClusters
# ===== file: witch_msa/gcmm/decompose_tree.py =====
# uym2 added
# June 2017
# utils for tree decomposition

# 1.22.2022 - Copied over and modified to accomodate GCMM from SEPP
# by Chengze Shen


from dendropy import Tree
try:
    from queue import Queue  # python 3
except ImportError:
    from Queue import Queue  # python 2
from witch_msa.configs import Configs


def decompose_by_diameter(a_tree, strategy, max_size=None, min_size=None,
                          max_diam=None):
    """
    Recursively bisect `a_tree` until every resulting subtree satisfies the
    size and diameter constraints.

    Parameters
    ----------
    a_tree : dendropy.Tree
        Tree to decompose; it is annotated and modified in place.
    strategy : str
        'centroid' (centroid-edge splits) or 'midpoint' (midpoint splits
        with centroid fallback).  Anything else raises Exception.
    max_size : int, optional
        Maximum leaves per subtree (default: whole tree, i.e. no limit).
    min_size : int, optional
        Minimum leaves per subtree after a split (default: 0).
    max_diam : float, optional
        Maximum subtree diameter (default: the input tree's diameter).

    Returns
    -------
    list of dendropy.Tree — the final subtrees, with the temporary
    decomposition attributes stripped.
    """
    def __ini_record__():
        # annotate every node with nleaf/maxdepth/diameter/anchor/bestLCA
        for node in a_tree.postorder_node_iter():
            __update_node__(node)

    def __find_midpoint_edge__(tre):
        # walk up from the anchor leaf of the diameter path until half the
        # diameter has been covered
        u = tre.seed_node.bestLCA.anchor
        uel = u.edge_length if u.edge_length else 0
        d = 0
        while d + uel < tre.seed_node.diameter / 2:
            d += uel
            u = u.parent_node
            uel = u.edge_length if u.edge_length else 0
        return u.edge

    def __find_centroid_edge__(tre):
        # descend toward the heaviest child while the balance product
        # (leaves below) * (leaves elsewhere) keeps improving
        u = tre.seed_node
        product = 0
        acc_nleaf = 0

        while not u.is_leaf():
            max_child = None
            max_child_nleaf = 0
            for ch in u.child_node_iter():
                if ch.nleaf > max_child_nleaf:
                    max_child_nleaf = ch.nleaf
                    max_child = ch
            acc_nleaf += (u.nleaf - max_child.nleaf)
            new_product = max_child.nleaf * acc_nleaf
            if new_product <= product:
                break
            product = new_product
            u = max_child

        return u.edge

    def __bisect__(tre, edg):
        # Detach the subtree below `edg` into its own tree, suppress the
        # resulting degree-2 node, and refresh annotations up to the root.
        u = edg.tail_node
        v = edg.head_node

        u.remove_child(v)
        tr1 = Tree(seed_node=v)

        if u.num_child_nodes() == 1:
            p = u.parent_node
            v = u.child_nodes()[0]
            l_v = v.edge_length if v.edge_length else 0
            u.remove_child(v)
            # u is the seed_node; this means the tree runs out of all but
            # one side
            if p is None:
                tre.seed_node = v
                return tre, tr1
            l_u = u.edge_length if u.edge_length else 0
            p.remove_child(u)
            p.add_child(v)
            v.edge_length = l_u + l_v
            u = p

        while u is not None:
            __update_node__(u)
            u = u.parent_node

        return tre, tr1

    def __clean_up__(tre):
        # strip the temporary decomposition attributes before returning
        for node in tre.postorder_node_iter():
            delattr(node, "nleaf")
            delattr(node, "anchor")
            delattr(node, "maxdepth")
            delattr(node, "diameter")
            delattr(node, "bestLCA")

    def __update_node__(node):
        # Recompute nleaf / maxdepth / diameter / anchor / bestLCA from the
        # node's children (post-order invariant).
        if node.is_leaf():
            node.anchor = node
            node.maxdepth = 0
            node.diameter = 0
            node.bestLCA = node
            node.nleaf = 1
            return

        d1 = -1          # deepest child path
        d2 = -1          # second deepest child path
        anchor1 = None   # leaf anchoring the deepest path
        node.diameter = 0
        node.bestLCA = None
        node.nleaf = 0

        for ch in node.child_node_iter():
            node.nleaf += ch.nleaf
            # BUGFIX: parenthesize the conditional expression.  The previous
            # form `ch.maxdepth + ch.edge_length if ch.edge_length else 0`
            # parsed as `(maxdepth + edge_length) if edge_length else 0`, so
            # a child with a missing/zero branch length zeroed out its whole
            # subtree depth (the sibling helpers above parenthesize the same
            # None-guard correctly).
            d = ch.maxdepth + (ch.edge_length if ch.edge_length else 0)
            if d > d1:
                d2 = d1
                d1 = d
                anchor1 = ch.anchor
            elif d > d2:
                d2 = d
            if ch.diameter > node.diameter:
                node.diameter = ch.diameter
                node.bestLCA = ch.bestLCA

        node.maxdepth = d1
        node.anchor = anchor1
        # the two deepest child paths may form a longer path through here
        if d1 + d2 > node.diameter:
            node.diameter = d1 + d2
            node.bestLCA = node

    def __get_breaking_edge__(tre, edge_type):
        # Return an edge to break on, or None when the tree already meets
        # the constraints or no split keeps both halves >= min_size.
        if tre.seed_node.nleaf <= max_size and \
                tre.seed_node.diameter <= max_diam:
            return None
        if edge_type == 'midpoint':
            ed = __find_midpoint_edge__(tre)
        elif edge_type == 'centroid':
            ed = __find_centroid_edge__(tre)
        else:
            Configs.warning(("Invalid decomposition type! Please use either "
                             "'midpoint' or 'centroid'"))
            return None

        n = ed.head_node.nleaf
        if (n < min_size) or (tre.seed_node.nleaf - n) < min_size:
            return None
        return ed

    def __check_stop__(tre):
        # NOTE: currently unused; kept for reference
        return ((tre.seed_node.nleaf <= max_size and
                 tre.seed_node.diameter <= max_diam) or
                (tre.seed_node.nleaf // 2 < min_size))

    def __break_by_MP_centroid__(tre):
        # try a midpoint split first, fall back to a centroid split
        ed = __get_breaking_edge__(tre, 'midpoint')
        if ed is None:
            ed = __get_breaking_edge__(tre, 'centroid')
        return ed

    def __break(tre):
        if strategy == "centroid":
            return __get_breaking_edge__(tre, 'centroid')
        elif strategy == "midpoint":
            return __break_by_MP_centroid__(tre)
        else:
            raise Exception("strategy not valid: %s" % strategy)

    tqueue = Queue()

    Configs.debug("Starting brlen decomposition ...")
    __ini_record__()
    # resolve defaults from the (now annotated) input tree
    min_size = min_size if min_size else 0
    max_size = max_size if max_size else a_tree.seed_node.nleaf
    max_diam = max_diam if max_diam else a_tree.seed_node.diameter

    Configs.debug(
        "Now breaking by %s with min %d and max %d sizes and diameter %f ..." %
        (strategy, min_size, max_size, max_diam))
    e = __break(a_tree)

    if e is None:
        __clean_up__(a_tree)
        return [a_tree]

    # BFS over (tree, breaking edge) pairs until no tree needs splitting
    tree_map = []
    tqueue.put((a_tree, e))
    while not tqueue.empty():
        t, e = tqueue.get()
        t1, t2 = __bisect__(t, e)
        e1 = __break(t1)
        if e1 is None:
            __clean_up__(t1)
            tree_map.append(t1)
        else:
            tqueue.put((t1, e1))
        e2 = __break(t2)
        if e2 is None:
            __clean_up__(t2)
            tree_map.append(t2)
        else:
            tqueue.put((t2, e2))

    return tree_map

# ===== file: witch_msa/init_configs.py =====
import os, sys, shutil
try:
    import configparser
except ImportError:
    import ConfigParser as configparser
from argparse import ArgumentParser, Namespace
from platform import platform

def find_main_config(homepath):
    """Read the root dir recorded in `homepath`; return
    (root_dir, main_config_path) if main.config exists there, else
    (None, None)."""
    with open(homepath, 'r') as f:
        _root_dir = f.read().strip()
    main_config_path = os.path.join(_root_dir, 'main.config')
    if os.path.exists(main_config_path):
        return _root_dir, main_config_path
    else:
        return None, None

'''
first time run, need user to initialize the main.config
if it is not installed through github (i.e., python setup.py config)
will be needed if installed through pip/pypi
'''
def init_config_file(homepath, prioritize_user_software=True):
    """Create (or locate) the user's main.config and return
    (root_dir, main_config_path)."""
    # read from sys.argv to find if "-y" or "--bypass-setup" exists
    args = sys.argv[1:]
    # NOTE: the interactive prompt is currently always bypassed; the
    # commented check below would re-enable opting in via -y/--bypass-setup
    bypass_setup = True
    #if '-y' in args or '--bypass-setup' in args:
    #    bypass_setup = True

    # initialize a home.path that points to local user main.config
    # if it exists then pass on
    if os.path.exists(homepath):
        # home.path older than this module means a stale install: regenerate
        if os.stat(homepath).st_mtime >= os.stat(__file__).st_mtime:
            _root_dir, main_config_path = find_main_config(homepath)
            if _root_dir is None:
                print('home.path exists but main.config missing, regenerating...')
            else:
                return _root_dir, main_config_path
        else:
            print('Found old home.path and regenerating...')
            os.remove(homepath)
    else:
        print('Cannot find home.path: {}'.format(homepath))

    # install to user local directory
    # bypassing the setup step to directly use the default path
    _root_dir = ''
    if not bypass_setup:
        _root_dir = input('Create main.config file at [default: ~/.witch_msa/]: ')

    if _root_dir == '':
        _root_dir = os.path.expanduser('~/.witch_msa')
    else:
        _root_dir = os.path.abspath(_root_dir)
    main_config_path = os.path.join(_root_dir, 'main.config')
    print('Initializing main configuration file: {}...'.format(main_config_path))

    # write to local for installation to system
    # will read in during runs to find the main.config file
    # makedirs(exist_ok=True): unlike the previous os.mkdir, this also
    # creates missing parents for a user-supplied nested path
    os.makedirs(_root_dir, exist_ok=True)
    with open(homepath, 'w') as f:
        f.write(_root_dir)

    # create main.config file at configfile using default.config
    _config_path = os.path.join(os.path.dirname(__file__), 'default.config')
    cparser = configparser.ConfigParser()
    cparser.optionxform = str
    assert os.path.exists('{}'.format(_config_path)), \
        "default config file {} missing! Please redownload from Github\n".format(
            _config_path)

    if os.path.exists(main_config_path):
        print('Main configuration file {} exists...'.format(main_config_path))
        print('Overwriting existing configuration file...')

    print('\n')
    # initialize main config file using default config file
    default_config = configparser.ConfigParser()
    with open(_config_path, 'r') as f:
        default_config.read_file(f)
    for section in default_config.sections():
        cparser.add_section(section)
        for k, v in default_config[section].items():
            cparser.set(section, k, v)

    # if platform is linux then we just copy the default config file
    # as main.config
    platform_name = platform()
    tools_dir = os.path.join(os.path.dirname(__file__), 'tools')
    set_sections = ['Basic', 'MAGUS']

    # copy magus directory to tools/
    magus_dir = os.path.join(tools_dir, 'magus')
    cparser.set('Basic', 'magus_path', magus_dir + '/magus.py')

    if 'macos' not in platform_name.lower():
        print('System is {}, using default config as main.config...'.format(
            platform_name))
        # use existing binaries from MAGUS subfolder (reduce redundancy of
        # duplicated binaries)
        for _section in set_sections:
            # mafftpath
            cparser.set(_section, 'mafftpath',
                    os.path.join(magus_dir, 'tools', 'mafft', 'mafft'))
            # mclpath
            cparser.set(_section, 'mclpath',
                    os.path.join(magus_dir, 'tools', 'mcl', 'bin', 'mcl'))
            # fasttreepath
            cparser.set(_section, 'fasttreepath',
                    os.path.join(magus_dir, 'tools', 'fasttree', 'FastTreeMP'))
            # hmmer packages
            for hmmer_pkg in ['hmmsearch', 'hmmalign', 'hmmbuild']:
                cparser.set(_section, '{}path'.format(hmmer_pkg),
                        os.path.join(magus_dir, 'tools', 'hmmer', hmmer_pkg))
    else:
        if 'x86' not in platform_name:
            print('Warning: system is not using x86 architecture.',
                  'Some softwares such as FastTreeMP need to be',
                  'self-provided. See {} [Basic] '.format(_config_path),
                  'section for more information.')
        print("System is {}, reconfiguring main.config...".format(platform_name))

        # configure MAGUS to use macOS compatible executables
        binaries = os.listdir(tools_dir + '/macOS')
        for binary in binaries:
            path = os.path.join(tools_dir, 'macOS', binary)
            for _section in set_sections:
                if 'FastTreeMP' in path:
                    cparser.set(_section, 'fasttreepath', path)
                else:
                    cparser.set(_section, '{}path'.format(binary), path)

    # binaries from the user's environment will be used in priority
    # if they exist
    if prioritize_user_software:
        print('Detecting existing software from the user\'s environment...')
        software = ['mafft', 'mcl',
                    'hmmsearch', 'hmmalign', 'hmmbuild', 'FastTreeMP']
        print('\tDetected:\n')
        for soft in software:
            if shutil.which(soft):
                print('\t{}: {}'.format(soft, shutil.which(soft)))
                for _section in set_sections:
                    if soft == 'FastTreeMP':
                        cparser.set(_section, 'fasttreepath',
                                shutil.which(soft))
                    elif soft == 'magus':
                        # unreachable with the current `software` list;
                        # kept for when 'magus' is added back to the scan
                        cparser.set('Basic', 'magus_path',
                                shutil.which(soft))
                    else:
                        cparser.set(_section, '{}path'.format(soft),
                                shutil.which(soft))

    with open(main_config_path, 'w') as f:
        cparser.write(f)
    print('\n(Done) main.config written to {}'.format(main_config_path))
    print('If you would like to make manual changes, please directly edit {}'.format(
        main_config_path))
    # DO NOT EXIT; can start running WITCH with any given commands now
    return _root_dir, main_config_path
#!/usr/bin/env python3
# ===== file: witch_msa/tools/magus/magus.py =====
'''
Created on Apr 14, 2020

@author: Vlad
'''

import time
import argparse
import sys
import traceback

from align.aligner import mainAlignmentTask
from configuration import buildConfigs, Configs
from tasks import manager

def main():
    '''
    Resolve the args/configs, spin up the task manager (which deals with worker threads and handles parallelism),
    and get started on the main alignment task.
    '''

    startTime = time.time()
    args = parseArgs()
    buildConfigs(args)
    Configs.log("MAGUS was run with: {}".format(" ".join(sys.argv)))

    try:
        manager.startTaskManager()
        mainAlignmentTask()
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
    # the error is logged but not re-raised, so execution continues to the
    # final log line below — confirm this is intended
    except:
        Configs.error("MAGUS aborted with an exception..")
        Configs.error(traceback.format_exc())
    finally:
        # always shut the task manager down, even on failure
        manager.stopTaskManager()

    endTime = time.time()
    Configs.log("MAGUS finished in {} seconds..".format(endTime-startTime))

def parseArgs():
    # Build the MAGUS command-line interface and return the parsed Namespace.
    # Note: boolean-ish options are declared as type=str ("true"/"false")
    # and resolved later by buildConfigs.
    parser = argparse.ArgumentParser()

    parser.add_argument("-d", "--directory", type=str,
                        help="Path to working directory", required=False, default=None)

    parser.add_argument("-i", "--sequences", type=str,
                        help="Path to input unaligned sequences", required=False, default=None)

    parser.add_argument("-s", "--subalignments", type=str, nargs="+",
                        help="Paths to input subalignment files", required=False, default=[])

    parser.add_argument("-b", "--backbones", type=str, nargs="+",
                        help="Paths to input backbone alignment files", required=False, default=[])

    parser.add_argument("-o", "--output", type=str,
                        help="Output alignment path", required=True)

    parser.add_argument("-t", "--guidetree", type=str,
                        help="Guide tree for subset decomposition. fasttree (default), fasttree-noml, clustal, parttree, or path to user guide tree",
                        required=False, default="fasttree")

    parser.add_argument("-np", "--numprocs", type=int,
                        help="Number of processors to use (default: # cpus available)",
                        required=False, default=-1)

    parser.add_argument("--maxsubsetsize", type=int,
                        help="Maximum subset size for divide-and-conquer",
                        required=False, default=50)

    parser.add_argument("--maxnumsubsets", type=int,
                        help="Maximum number of subsets for divide-and-conquer",
                        required=False, default=25)

    parser.add_argument("--decompstrategy", type=str,
                        help="Initial decomposition strategy (pastastyle or kmh)",
                        required=False, default="pastastyle")

    parser.add_argument("--decompskeletonsize", type=int,
                        help="Number of skeleton sequences for the initial decomposition strategy",
                        required=False, default=300)

    parser.add_argument("--datatype", type=str,
                        help="Data type (dna, rna, or protein). Will be inferred if not provided",
                        required=False, default=None)

    parser.add_argument("--graphbuildmethod", type=str,
                        help="Method for building the alignment graph (mafft, mafftmerge, or initial)",
                        required=False, default="mafft")

    parser.add_argument("--graphbuildrestrict", type=str,
                        help="Prevent the alignment graph from adding edges that violate subalignments (true or false)",
                        required=False, default="False")

    parser.add_argument("--graphbuildhmmextend", type=str,
                        help="Extend the alignment graph MAFFT backbones with hmmer (true or false)",
                        required=False, default="False")

    parser.add_argument("--graphclustermethod", type=str,
                        help="Method for initial clustering of the alignment graph (mcl or none)",
                        required=False, default="mcl")

    parser.add_argument("--graphtracemethod", type=str,
                        help="Method for finding a trace from the alignment graph (minclusters, fm, mwtgreedy, or mwtsearch)",
                        required=False, default="minclusters")

    parser.add_argument("--graphtraceoptimize", type=str,
                        help="Run an optimization step on the graph trace (true or false)",
                        required=False, default="False")

    parser.add_argument("-r", "--mafftruns", type=int,
                        help="Number of MAFFT runs", required=False, default=10)

    parser.add_argument("-m", "--mafftsize", type=int,
                        help="Maximum size of MAFFT alignments", required=False, default=200)

    parser.add_argument("-f", "--inflationfactor", type=float,
                        help="MCL inflation factor", required=False, default=4)

    parser.add_argument("-c", "--constrain", type=str,
                        help="Constrain MAGUS to respect subalignments (true or false)", required=False, default="true")

    parser.add_argument("--onlyguidetree", type=str,
                        help="Only output the guide tree (true or false)", required=False, default="false")

    # (parseArgs continues beyond this excerpt)
    parser.add_argument("--recurse", type=str,
help="Allow MAGUS to recurse on large subsets (true or false)", required=False, default="true") 127 | 128 | parser.add_argument("--recurseguidetree", type=str, 129 | help="If recursing, passes this argument as the guide tree option to the lower levels. (Default fasttree)", required=False, default="fasttree") 130 | 131 | parser.add_argument("--recursethreshold", type=int, 132 | help="MAGUS will recursively align subsets above this threshold size", required=False, default=200) 133 | 134 | parser.add_argument("--alignsizelimit", type=float, 135 | help="Size threshold for alignment compression (in GB)", required=False, default=100) 136 | 137 | ''' 138 | 6.1.2021 - added by Chengze Shen 139 | a new argument defining the weights of the backbone alignments 140 | ''' 141 | parser.add_argument("-w", "--backboneWeightsPath", type=str, 142 | required=False, default=None, 143 | help="Weights of the backbone alignments (a file path)") 144 | ''' 145 | 6.8.2022 - added by Chengze Shen 146 | new arguments for different versions of mcl/mafft/HMMER (e.g., MacOS) 147 | ''' 148 | parser.add_argument("--mclpath", type=str, default=None, required=False, 149 | help="custom MCL path") 150 | parser.add_argument("--mafftpath", type=str, default=None, required=False, 151 | help="custom MAFFT path") 152 | parser.add_argument("--hmmalignpath", type=str, default=None, required=False, 153 | help="custom hmmalign path") 154 | parser.add_argument("--hmmbuildpath", type=str, default=None, required=False, 155 | help="custom hmmbuild path") 156 | parser.add_argument("--hmmsearchpath", type=str, default=None, required=False, 157 | help="custom hmmsearch path") 158 | parser.add_argument("--fasttreepath", type=str, default=None, required=False, 159 | help="custom FastTree path") 160 | return parser.parse_args() 161 | 162 | if __name__ == '__main__': 163 | main() 164 | -------------------------------------------------------------------------------- /witch_msa/tools/magus/helpers/sequenceutils.py: 
'''
Created on Sep 22, 2018

@author: Vlad
'''


class Sequence:
    """A named sequence record: `tag` is the identifier, `seq` the residue string."""

    def __init__(self, tag, seq):
        self.tag = tag
        self.seq = seq


def readFromFasta(filePath, removeDashes = False):
    """Read a FASTA file into a dict mapping tag -> Sequence (insertion order).

    When removeDashes is True, gap characters ('-') are stripped from the data.
    """
    sequences = {}
    current = None

    with open(filePath) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                # Header line: start a fresh record keyed by everything after '>'.
                current = Sequence(rawLine[1:], "")
                sequences[current.tag] = current
            else:
                fragment = rawLine.replace("-", "") if removeDashes else rawLine
                current.seq = current.seq + fragment

    print("Read {} sequences from {} ..".format(len(sequences), filePath))
    return sequences


def readFromFastaOrdered(filePath, removeDashes = False):
    """Read a FASTA file into a list of Sequence records, preserving file order."""
    sequences = []
    current = None

    with open(filePath) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                current = Sequence(rawLine[1:], "")
                sequences.append(current)
            else:
                fragment = rawLine.replace("-", "") if removeDashes else rawLine
                current.seq = current.seq + fragment

    print("Read {} sequences from {} ..".format(len(sequences), filePath))
    return sequences


def readFromPhylip(filePath, removeDashes = False):
    """Read a (possibly interleaved) PHYLIP file into a dict of tag -> Sequence.

    Repeated occurrences of a tag have their fragments concatenated.
    """
    sequences = {}

    with open(filePath) as handle:
        handle.readline()  # skip the "<num_taxa> <length>" header line
        for rawLine in handle:
            parts = rawLine.split()
            if len(parts) != 2:
                # Only "<tag> <fragment>" lines carry data; skip everything else.
                continue
            tag, fragment = parts
            if removeDashes:
                fragment = fragment.replace("-", "")
            if tag in sequences:
                sequences[tag].seq = sequences[tag].seq + fragment
            else:
                sequences[tag] = Sequence(tag, fragment)

    print("Read {} sequences from {} ..".format(len(sequences), filePath))
    return sequences


# reads match columns only
def readFromStockholm(filePath, includeInsertions = False):
    """Read a Stockholm alignment into a dict of tag -> Sequence.

    By default only match columns are kept (uppercase, non-'.'); pass
    includeInsertions=True to keep every character.
    """
    sequences = {}

    with open(filePath, 'r') as stockFile:
        for rawLine in stockFile:
            rawLine = rawLine.strip()
            if rawLine == "//":
                # End-of-alignment marker.
                break
            if rawLine == "" or rawLine[0] == "#":
                # Blank lines and '#=GF'-style annotation lines carry no residues.
                continue
            key, seq = rawLine.split()
            if key not in sequences:
                sequences[key] = Sequence(key, "")
            record = sequences[key]
            for c in seq:
                # Match columns are uppercase and not '.' (insertions are
                # lowercase or '.').
                if includeInsertions or (c == c.upper() and c != '.'):
                    record.seq = record.seq + c
    return sequences


def writeFasta(alignment, filePath, taxa = None, append = False):
    """Write alignment (tag -> sequence object) as FASTA.

    If taxa is given, only those tags are written, in that order.
    """
    mode = 'a' if append else 'w'
    with open(filePath, mode) as out:
        if taxa is not None:
            for tag in taxa:
                if tag in alignment:
                    out.write('>' + tag + '\n' + alignment[tag].seq + '\n')
        else:
            for tag in alignment:
                out.write('>' + tag + '\n' + alignment[tag].seq + '\n')


def writePhylip(alignment, filePath, taxa = None):
    """Write alignment as sequential PHYLIP ("<count> <maxlen>" header line)."""
    longest = 0
    rows = []
    for tag in alignment:
        if taxa is not None and tag not in taxa:
            continue
        seq = alignment[tag].seq
        rows.append("{} {}\n".format(tag, seq))
        longest = max(longest, len(seq))

    with open(filePath, 'w') as out:
        out.write("{} {}\n".format(len(rows), longest))
        out.writelines(rows)


def cleanGapColumns(filePath, cleanFile = None):
    """Drop all-gap columns from a FASTA alignment; write to cleanFile (or in place)."""
    alignment = readFromFasta(filePath, False)
    records = list(alignment.values())
    # A column is kept as soon as any sequence has a non-gap character in it.
    keepCols = [col for col in range(len(records[0].seq))
                if any(rec.seq[col] != '-' for rec in records)]

    print("Removing gap columns.. Kept {} out of {}..".format(len(keepCols), len(records[0].seq)))
    for rec in records:
        rec.seq = ''.join(rec.seq[idx] for idx in keepCols)

    writeFasta(alignment, cleanFile if cleanFile is not None else filePath)


def convertRnaToDna(filePath, destFile = None):
    """Replace 'U' with 'T' in every sequence; write to destFile (or in place)."""
    alignment = readFromFasta(filePath, False)
    for taxon in alignment:
        alignment[taxon].seq = alignment[taxon].seq.replace('U', 'T')
    writeFasta(alignment, destFile if destFile is not None else filePath)


def inferDataType(filePath):
    """Guess 'dna', 'rna', or 'protein' from the letter composition of a FASTA file."""
    sequences = readFromFasta(filePath, removeDashes=True)
    acg = t = u = total = 0
    for taxon in sequences:
        for letter in sequences[taxon].seq.upper():
            total = total + 1
            if letter in ('A', 'C', 'G', 'N'):
                acg = acg + 1
            elif letter == 'T':
                t = t + 1
            elif letter == 'U':
                u = u + 1

    # >90% nucleotide-like letters (and no letters of the other alphabet)
    # means DNA/RNA; otherwise assume protein.
    if u == 0 and (acg + t)/total > 0.9:
        print("Found {}% ACGT-N, assuming DNA..".format(int(100*(acg + t)/total)))
        return "dna"
    if t == 0 and (acg + u)/total > 0.9:
        print("Found {}% ACGU-N, assuming RNA..".format(int(100*(acg + u)/total)))
        return "rna"
    print("Assuming protein..")
    return "protein"


def readSequenceLengthFromFasta(filePath):
    """Return the character count of the first sequence in a FASTA file."""
    started = False
    length = 0
    with open(filePath) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                if started:
                    # Second header reached: the first sequence is complete.
                    return length
                started = True
            else:
                length = length + len(rawLine)
    return length if started else None


def countGaps(alignFile):
    """Count, per alignment column, how many sequences have a gap ('-') there."""
    columnGapCounts = []

    def _accumulate(seq):
        # Lazily size the counts from the first non-empty flushed sequence
        # (all sequences in an alignment share the same length).
        if len(columnGapCounts) == 0:
            columnGapCounts.extend([0] * len(seq))
        for col in range(len(columnGapCounts)):
            if seq[col] == '-':
                columnGapCounts[col] = columnGapCounts[col] + 1

    buffer = ""
    with open(alignFile) as handle:
        for rawLine in handle:
            rawLine = rawLine.strip()
            if rawLine.startswith('>'):
                _accumulate(buffer)
                buffer = ""
            else:
                buffer = buffer + rawLine
    _accumulate(buffer)

    return columnGapCounts
os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/mlrmcl/mlrmcl")
    # Bundled external tool binaries, resolved relative to this file; each can
    # be overridden from the command line (see buildConfigs below).
    hmmalignPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/hmmer/hmmalign")
    hmmbuildPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/hmmer/hmmbuild")
    hmmsearchPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/hmmer/hmmsearch")
    fasttreePath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/fasttree/FastTreeMP")
    raxmlPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/raxmlng/raxml-ng")

    # Log file locations; populated by buildConfigs() once the working dir is known.
    logPath = None
    errorPath = None
    debugPath = None

    numCores = 1
    searchHeapLimit = 5000
    alignmentSizeLimit = 100

    @staticmethod
    def log(msg, path = None):
        """Print msg and append it (timestamped) to the main log file, or to path."""
        print(msg)
        path = Configs.logPath if path is None else path
        Configs.writeMsg(msg, path)

    @staticmethod
    def error(msg, path = None):
        """Log msg normally, then also append it to the error log (or to path)."""
        Configs.log(msg)
        path = Configs.errorPath if path is None else path
        Configs.writeMsg(msg, path)

    @staticmethod
    def debug(msg, path = None):
        # Debug messages go to the debug log only; they are not printed.
        path = Configs.debugPath if path is None else path
        Configs.writeMsg(msg, path)

    @staticmethod
    def writeMsg(msg, path):
        """Append a timestamped line to path; silently a no-op when path is None."""
        if path is not None:
            with open(path, 'a') as logFile:
                logFile.write("{} {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S"), msg))

    @staticmethod
    def inferDataType(sequencesFile):
        """Lazily infer and cache the data type (dna/rna/protein) from sequencesFile."""
        if Configs.dataType is None:
            Configs.dataType = sequenceutils.inferDataType(sequencesFile)
            Configs.log("Data type wasn't specified. Inferred data type {} from {}".format(Configs.dataType.upper(), sequencesFile))
        return Configs.dataType

def buildConfigs(args):
    """Populate the global Configs from the parsed command-line arguments.

    Also creates the working directory if needed and derives the log file paths.
    """
    Configs.outputPath = os.path.abspath(args.output)

    if args.directory is not None:
        Configs.workingDir = os.path.abspath(args.directory)
    else:
        # Default working dir sits next to the requested output file.
        Configs.workingDir = os.path.join(os.path.dirname(Configs.outputPath), "magus_working_dir")
    if not os.path.exists(Configs.workingDir):
        os.makedirs(Configs.workingDir)

    Configs.sequencesPath = os.path.abspath(args.sequences) if args.sequences is not None else Configs.sequencesPath

    '''
    11.16.2023 - modified by Chengze Shen
    Making sure that if default guide tree styles are used, then we do not
    attempt to search for the guide tree in path
    '''
    guideTree_styles = ['fasttree', 'fasttree-noml', 'parttree', 'clustal']
    if args.guidetree is not None:
        # using existing styles
        if args.guidetree.lower() in guideTree_styles:
            Configs.guideTree = args.guidetree.lower()
        # supplementing with a working path to a file (presumably a tree file)
        elif os.path.exists(os.path.abspath(args.guidetree)):
            Configs.guideTree = os.path.abspath(args.guidetree)
        # otherwise use the default Configs.guideTree value
        #Configs.guideTree = os.path.abspath(args.guidetree) if args.guidetree is not None else Configs.guideTree
        #if args.guidetree is not None:
        #    Configs.guideTree = os.path.abspath(args.guidetree) if os.path.exists(os.path.abspath(args.guidetree)) else args.guidetree

    # Subalignment arguments may be files or directories; directories are expanded.
    Configs.subalignmentPaths = []
    for p in args.subalignments:
        path = os.path.abspath(p)
        if os.path.isdir(path):
            for filename in os.listdir(path):
                Configs.subalignmentPaths.append(os.path.join(path, filename))
        else:
            Configs.subalignmentPaths.append(path)

    # Same expansion for backbone alignment arguments.
    Configs.backbonePaths = []
    for p in args.backbones:
        path = os.path.abspath(p)
        if os.path.isdir(path):
            for filename in os.listdir(path):
                Configs.backbonePaths.append(os.path.join(path, filename))
        else:
            Configs.backbonePaths.append(path)

    # numprocs <= 0 means "use every available CPU".
    if args.numprocs > 0:
        Configs.numCores = args.numprocs
    else:
        Configs.numCores = os.cpu_count()

    Configs.decompositionMaxSubsetSize = args.maxsubsetsize
    Configs.decompositionMaxNumSubsets = args.maxnumsubsets
    Configs.decompositionStrategy = args.decompstrategy
    Configs.decompositionSkeletonSize = args.decompskeletonsize
    Configs.dataType = args.datatype

    Configs.graphBuildMethod = args.graphbuildmethod
    # String flags arrive as "true"/"false" text; normalize to booleans.
    Configs.graphBuildHmmExtend = args.graphbuildhmmextend.lower() == "true"
    Configs.graphBuildRestrict = args.graphbuildrestrict.lower() == "true"
    Configs.graphClusterMethod = args.graphclustermethod
    Configs.graphTraceMethod = args.graphtracemethod
    Configs.graphTraceOptimize = args.graphtraceoptimize.lower() == "true"

    Configs.mafftRuns = args.mafftruns
    Configs.mafftSize = args.mafftsize
    Configs.mclInflationFactor = args.inflationfactor

    Configs.constrain = args.constrain.lower() == "true"
    Configs.onlyGuideTree = args.onlyguidetree.lower() == "true"
    Configs.recurse = args.recurse.lower() == "true"
    Configs.recurseGuideTree = args.recurseguidetree
    Configs.recurseThreshold = args.recursethreshold

    Configs.logPath = os.path.join(Configs.workingDir, "log.txt")
    Configs.errorPath = os.path.join(Configs.workingDir, "log_errors.txt")
    Configs.debugPath = os.path.join(Configs.workingDir, "log_debug.txt")

    Configs.alignmentSizeLimit = args.alignsizelimit

    '''
    6.1.2021 - added by Chengze Shen
    new config for taking in the backbone weights
    '''
    Configs.backboneWeightsPath = args.backboneWeightsPath
    '''
    6.8.2022 - added by Chengze Shen
    new config for customized MCL/MAFFT path
    '''
    if args.mclpath:
        Configs.mclPath = os.path.abspath(args.mclpath)
    if args.mafftpath:
        Configs.mafftPath = os.path.abspath(args.mafftpath)
    if args.hmmalignpath:
        Configs.hmmalignPath = os.path.abspath(args.hmmalignpath)
    if args.hmmbuildpath:
        Configs.hmmbuildPath = os.path.abspath(args.hmmbuildpath)
    if args.hmmsearchpath:
        Configs.hmmsearchPath = os.path.abspath(args.hmmsearchpath)
    if args.fasttreepath:
        Configs.fasttreePath = os.path.abspath(args.fasttreepath)
--------------------------------------------------------------------------------
/witch_msa/tools/magus/tools/external_tools.py:
--------------------------------------------------------------------------------
'''
Created on Apr 14, 2020

@author: Vlad
'''

import subprocess
import os
import random
import shutil
from configuration import Configs
from tasks.task import Task

def runCommand(**kwargs):
    """Run an external shell command synchronously, then move any produced files.

    kwargs: "command" (shell string), "workingDir", optional "fileCopyMap"
    ({src: dest} moved after a successful run). Raises on a nonzero exit code.
    """
    command = kwargs["command"]
    Configs.log("Running an external tool, command: {}".format(command))
    runner = subprocess.run(command, shell = True, cwd = kwargs["workingDir"], universal_newlines = True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    try:
        runner.check_returncode()
    except:
        # Log the failing command with its exit code and combined output,
        # then re-raise so the caller/task machinery sees the failure.
        Configs.error("Command encountered error: {}".format(command))
        Configs.error("Exit code: {}".format(runner.returncode))
        Configs.error("Output: {}".format(runner.stdout))
        raise
    for srcPath, destPath in kwargs.get("fileCopyMap", {}).items():
        shutil.move(srcPath, destPath)

def runClustalOmegaGuideTree(fastaPath, workingDir, outputPath, threads = 1):
    """Build a Task that runs Clustal Omega only to produce a guide tree file."""
    # Write to a temp file first; runCommand moves it to outputPath on success.
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.clustalPath]
    args.extend(["-i", fastaPath, "--max-hmm-iterations=-1", "--guidetree-out={}".format(tempPath)])
    args.extend(["--threads={}".format(threads)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def generateMafftFilePathMap(inputPaths, outputDir):
    """Map each input path to an output path "mafft_<basename>" in outputDir."""
    mafftMap = {inputPath : os.path.join(outputDir, "mafft_{}".format(os.path.basename(inputPath))) for inputPath in inputPaths}
    return mafftMap

def buildMafftAlignments(inputOutputPathMap):
    """Create one MAFFT alignment Task per (input, output) pair."""
    tasks = [buildMafftAlignment(inputPath, outputPath) for inputPath, outputPath in inputOutputPathMap.items()]
    return tasks

def buildMafftAlignment(inputPath, outputPath, subtablePath = None):
    # Uses the global working dir and core count from Configs.
    return runMafft(inputPath, subtablePath, Configs.workingDir, outputPath, Configs.numCores)

def runMafft(fastaPath, subtablePath, workingDir, outputPath, threads = 1):
    """Build a Task running MAFFT L-INS-i; optional --merge subtable constrains it."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.mafftPath, "--localpair", "--maxiterate", "1000", "--ep", "0.123",
            "--quiet", "--thread", str(threads), "--anysymbol"]
    if subtablePath is not None:
        args.extend(["--merge", subtablePath])
    # The ">" redirection works because runCommand executes with shell=True.
    args.extend([fastaPath, ">", tempPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runMafftGuideTree(fastaPath, workingDir, outputPath, threads = 1):
    """Build a Task that runs MAFFT --parttree just to emit a guide tree.

    MAFFT writes the tree next to the input as "<input>.tree"; that file (not
    the alignment redirected to tempPath) is moved to outputPath.
    """
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    treeFile = os.path.join(os.path.dirname(fastaPath), "{}.tree".format(os.path.basename(fastaPath)))
    args = [Configs.mafftPath, "--retree", "0", "--treeout", "--parttree",
            "--quiet", "--thread", str(threads), "--anysymbol"]
    args.extend(["--partsize", "1000"])
    args.extend([fastaPath, ">", tempPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {treeFile : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runMcl(matrixPath, inflation, workingDir, outputPath):
    """Build a Task that clusters an --abc format matrix file with MCL."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.mclPath, matrixPath, "--abc", "-o", tempPath]
    if inflation is not None:
        args.extend(["-I", str(inflation)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runMlrMcl(matrixPath, granularity, balance, inflation, workingDir, outputPath):
    """Build a Task that clusters with MLR-MCL (optional -c/-b/-i parameters)."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.mlrmclPath, matrixPath, "-o", tempPath]
    if granularity is not None:
        args.extend(["-c", str(granularity)])
    if balance is not None:
        args.extend(["-b", str(balance)])
    if inflation is not None:
        args.extend(["-i", str(inflation)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runFastTree(fastaFilePath, workingDir, outputPath, mode = "normal", intree = None):
    """Build a Task that infers a tree with FastTree.

    Model is chosen from the inferred data type (-lg for protein, -nt -gtr
    otherwise); mode selects speed/accuracy trade-offs.
    """
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))

    args = [Configs.fasttreePath]
    if Configs.inferDataType(fastaFilePath) == "protein":
        args.extend(["-lg"])
    else:
        args.extend(["-nt", "-gtr"])

    if intree is not None:
        args.extend(["-intree", intree])

    if mode == "fast":
        args.extend(["-fastest", "-nosupport"])
    elif mode == "faster":
        args.extend(["-fastest", "-nosupport", "-mlnni", "4" ])
    elif mode == "noml":
        args.extend(["-fastest", "-nosupport", "-noml"])

    args.extend([fastaFilePath, ">", tempPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runRaxmlNg(fastaFilePath, workingDir, outputPath, threads = 8):
    """Build a Task that infers a tree with RAxML-NG (single parsimony start tree)."""
    # raxml-ng --msa prim.phy --model GTR+G --prefix T4 --threads 2 --seed 2 --tree pars{25},rand{25}
    baseName = os.path.basename(outputPath).replace(".","")
    raxmlFile = os.path.join(workingDir, "{}.raxml.bestTree".format(baseName))
    seed = random.randint(1, 1000000)
    args = [Configs.raxmlPath,
            "--msa", fastaFilePath,
            "--prefix", baseName,
            "--threads", str(threads),
            "--seed", str(seed)]

    if Configs.inferDataType(fastaFilePath) == "protein":
        args.extend(["--model", "LG+G"])
    else:
        args.extend(["--model", "GTR+G"])

    args.extend(["--tree", "pars{{{}}}".format(1)])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {raxmlFile : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runHmmBuild(alignmentPath, workingDir, outputPath):
    """Build a Task that builds an HMM from an aligned FASTA with hmmbuild."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.hmmbuildPath,'--ere', '0.59', "--cpu", "1"]
    args.extend(["--symfrac", "0.0", "--informat", "afa", tempPath, alignmentPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runHmmAlign(hmmModelPath, fragPath, workingDir, outputPath):
    """Build a Task that aligns fragments against an HMM with hmmalign."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.hmmalignPath, "-o", tempPath]
    args.extend([hmmModelPath, fragPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)

def runHmmSearch(hmmModelPath, fragPath, workingDir, outputPath):
    """Build a Task that scores fragments against an HMM with hmmsearch."""
    tempPath = os.path.join(os.path.dirname(outputPath), "temp_{}".format(os.path.basename(outputPath)))
    args = [Configs.hmmsearchPath,"--noali", "--cpu", "1", "-o", tempPath, "-E", "99999999", "--max"]
    args.extend([hmmModelPath, fragPath])
    taskArgs = {"command" : subprocess.list2cmdline(args), "fileCopyMap" : {tempPath : outputPath}, "workingDir" : workingDir}
    return Task(taskType = "runCommand", outputFile = outputPath, taskArgs = taskArgs)
--------------------------------------------------------------------------------
/witch_msa/tools/magus/align/merge/graph_trace/rg_fast_search.py:
--------------------------------------------------------------------------------
'''
Created on Aug 23, 2020

@author: Vlad
'''

import heapq
from collections import deque

from configuration import Configs

def rgFastSearch(graph):
    """Find a graph trace via fast recursive region-growing; sets graph.clusters."""
    Configs.log("Finding graph trace with fast region-growing search..")

    k = len(graph.context.subalignments)
    # Per-subalignment node index ranges [lowerBound[i], upperBound[i]).
    lowerBound = [graph.subsetMatrixIdx[i] for i in range(k)]
    upperBound = [graph.subsetMatrixIdx[i] + graph.subalignmentLengths[i] for i in range(k)]
    cuts = rgFastCluster(graph, lowerBound, upperBound, True)
    graph.clusters = cutsToClusters(graph, cuts)

def rgFastCluster(graph, lowerBound, upperBound, enforceTrace = True):
    """Recursively split the bounded region into cuts until no finer split exists."""
    initialCuts = initialSplit(graph, lowerBound, upperBound, enforceTrace)
    if len(initialCuts)
== 2: 24 | return initialCuts 25 | #Configs.log("Starting with {} coarse cuts..".format(len(initialCuts))) 26 | cuts = [] 27 | for i in range(len(initialCuts)-1): 28 | intervalCuts = rgFastCluster(graph, initialCuts[i], initialCuts[i+1], enforceTrace) 29 | cuts.extend(intervalCuts[:-1]) 30 | 31 | cuts.append(list(upperBound)) 32 | #Configs.log("Returning {} fine cuts..".format(len(cuts))) 33 | return cuts 34 | 35 | def initialSplit(graph, lowerBound, upperBound, enforceTrace = True): 36 | k = len(graph.context.subalignments) 37 | baseIdx = max(range(k), key = lambda x : upperBound[x] - lowerBound[x]) 38 | #baseIdx = min(range(k), key = lambda x : upperBound[x] - lowerBound[x] if upperBound[x] - lowerBound[x] >= 2 else float('inf')) 39 | baseLength = upperBound[baseIdx] - lowerBound[baseIdx] 40 | if baseLength < 2: 41 | return [list(lowerBound), list(upperBound)] 42 | 43 | clusters = initialSplitExpansion(graph, lowerBound, upperBound, baseIdx, baseLength) 44 | #clusters = initialSplitExpansionSimple(graph, lowerBound, upperBound, baseIdx, baseLength) 45 | 46 | cuts = clustersToCuts(graph, lowerBound, upperBound, clusters) 47 | 48 | return cuts 49 | 50 | def initialSplitExpansion(graph, lowerBound, upperBound, baseIdx, baseLength): 51 | k = len(graph.context.subalignments) 52 | clusters = [[lowerBound[baseIdx] + i] for i in range(baseLength)] 53 | #idxSets = [set([baseIdx]) for i in range(baseLength)] 54 | idxSets = {(i, baseIdx) : lowerBound[baseIdx] + i for i in range(baseLength)} 55 | usedNodes = set() 56 | weightMap = {} 57 | 58 | boundsMap = {} 59 | for i in range(k): 60 | boundsMap[0, i] = (lowerBound[i]-1, upperBound[i]) 61 | boundsMap[baseLength-1, i] = (lowerBound[i]-1, upperBound[i]) 62 | 63 | heap = [] 64 | for node in range(lowerBound[baseIdx], upperBound[baseIdx]): 65 | for nbr, value in graph.matrix[node].items(): 66 | i, pos = graph.matSubPosMap[nbr] 67 | #for i in range(k): 68 | # for nbr, value in graph.nodeEdges[node][i]: 69 | 70 | if nbr < 
lowerBound[i]: 71 | continue 72 | if nbr >= upperBound[i]: 73 | continue 74 | #break 75 | idx = node - lowerBound[baseIdx] 76 | if (idx, i) in idxSets: 77 | continue 78 | 79 | heapq.heappush(heap, (-1*value, node, nbr, idx)) 80 | weightMap[idx, nbr] = value 81 | 82 | while len(heap) > 0: 83 | value, a, b, idx = heapq.heappop(heap) 84 | if b in usedNodes: 85 | continue 86 | #asub, apos = graph.matSubPosMap[a] 87 | bsub, bpos = graph.matSubPosMap[b] 88 | 89 | if (idx, bsub) in idxSets: 90 | continue 91 | lower, upper = getBounds(boundsMap, baseLength, idx, bsub) 92 | if not (b > lower and b < upper): 93 | continue 94 | 95 | addBounds(graph, boundsMap, baseLength, idx, b) 96 | clusters[idx].append(b) 97 | idxSets[idx, bsub] = b 98 | usedNodes.add(b) 99 | 100 | for nbr, value in graph.matrix[b].items(): 101 | i, pos = graph.matSubPosMap[nbr] 102 | #for i in range(k): 103 | if (idx, i) in idxSets: 104 | continue 105 | lower, upper = getBounds(boundsMap, baseLength, idx, i) 106 | #for nbr, value in graph.nodeEdges[b][i]: 107 | if nbr in usedNodes: 108 | continue 109 | 110 | if nbr <= lower: 111 | continue 112 | if nbr >= upper: 113 | #break 114 | continue 115 | 116 | #print(weightMap.get((idx, nbr), 0)) 117 | weight = value + weightMap.get((idx, nbr), 0) 118 | weightMap[idx, nbr] = weight 119 | heapq.heappush(heap, (-1*weight, b, nbr, idx)) 120 | 121 | return clusters 122 | 123 | 124 | def getBounds(boundsMap, baseLength, idx, asub): 125 | a, b = 0, baseLength - 1 126 | if idx == a: 127 | return boundsMap[a, asub] 128 | if idx == b: 129 | return boundsMap[b, asub] 130 | 131 | midpoint = int((a+b)*0.5) 132 | while (midpoint, asub) in boundsMap: 133 | if idx == midpoint: 134 | return boundsMap[midpoint, asub] 135 | elif idx > midpoint: 136 | a = midpoint 137 | elif idx < midpoint: 138 | b = midpoint 139 | midpoint = int((a+b)*0.5) 140 | la, ua = boundsMap[a, asub] 141 | lb, ub = boundsMap[b, asub] 142 | return (la, ub) 143 | 144 | def addBounds(graph, boundsMap, 
baseLength, idx, node): 145 | asub, apos = graph.matSubPosMap[node] 146 | a, b = 0, baseLength - 1 147 | 148 | while True: 149 | la, ua = boundsMap[a, asub] 150 | lb, ub = boundsMap[b, asub] 151 | if idx == a: 152 | boundsMap[a, asub] = (node, node) 153 | return 154 | elif node < ua: 155 | boundsMap[a, asub] = (la, node) 156 | 157 | if idx == b: 158 | boundsMap[b, asub] = (node, node) 159 | return 160 | elif node > lb: 161 | boundsMap[b, asub] = (node, ub) 162 | 163 | midpoint = int((a+b)*0.5) 164 | if idx == midpoint: 165 | boundsMap[midpoint, asub] = (node, node) 166 | return 167 | elif (midpoint, asub) not in boundsMap: 168 | boundsMap[midpoint, asub] = (la, ub) 169 | 170 | if idx > midpoint: 171 | a = midpoint 172 | elif idx < midpoint: 173 | b = midpoint 174 | 175 | 176 | def initialSplitExpansionSimple(graph, lowerBound, upperBound, baseIdx, baseLength): 177 | k = len(graph.context.subalignments) 178 | clusters = [[lowerBound[baseIdx] + i] for i in range(baseLength)] 179 | #idxSets = [set([baseIdx]) for i in range(baseLength)] 180 | idxSets = {(i, baseIdx) : lowerBound[baseIdx] + i for i in range(baseLength)} 181 | usedNodes = set() 182 | weightMap = {} 183 | 184 | heap = [] 185 | for node in range(lowerBound[baseIdx], upperBound[baseIdx]): 186 | for i in range(k): 187 | for nbr, value in graph.nodeEdges[node][i]: 188 | if nbr < lowerBound[i]: 189 | continue 190 | if nbr >= upperBound[i]: 191 | break 192 | idx = node - lowerBound[baseIdx] 193 | heapq.heappush(heap, (-1*value, node, nbr, idx)) 194 | weightMap[idx, nbr] = value 195 | 196 | while len(heap) > 0: 197 | value, a, b, idx = heapq.heappop(heap) 198 | if b in usedNodes: 199 | continue 200 | #asub, apos = graph.matSubPosMap[a] 201 | bsub, bpos = graph.matSubPosMap[b] 202 | #if bsub in idxSets[idx]: 203 | # continue 204 | if (idx, bsub) in idxSets or idxSets.get((idx-1, bsub), 0) > b or idxSets.get((idx+1, bsub), upperBound[bsub]) < b: 205 | continue 206 | 207 | 208 | clusters[idx].append(b) 209 | 
def clustersToCuts(graph, lowerBound, upperBound, clusters):
    """
    Convert an ordered list of clusters into a list of cut vectors
    (one position per subalignment). The first cut is the lower bound, the
    last the upper bound; each intermediate cut advances every subalignment
    position to the furthest node seen in the preceding clusters.
    """
    cuts = [list(lowerBound)]
    frontier = list(lowerBound)
    for pos, cluster in enumerate(clusters):
        if pos == 0:
            # the first cluster is already covered by the lower-bound cut
            continue
        for node in cluster:
            sub, _ = graph.matSubPosMap[node]
            if node > frontier[sub]:
                frontier[sub] = node
        cuts.append(frontier)
        frontier = list(frontier)
    cuts.append(list(upperBound))
    return cuts

def cutsToClusters(graph, cuts):
    """
    Inverse of clustersToCuts: between each pair of adjacent cuts, collect
    every node of every subalignment into one cluster. *graph* is accepted
    for signature symmetry but not used.
    """
    clusters = []
    for left, right in zip(cuts, cuts[1:]):
        members = []
        for lo, hi in zip(left, right):
            members.extend(range(lo, hi))
        clusters.append(members)
    return clusters
def buildGraph(context):
    """
    Build the MAGUS alignment graph for this merge context.

    Creates the AlignmentGraph, requests backbone alignment tasks (unless a
    graph file already exists on disk), waits for the subalignments, and then
    either reads the existing graph from file or builds the sparse matrix
    from the completed backbones and writes it out.
    """
    time1 = time.time()

    context.graph = AlignmentGraph(context)
    context.initializeSequences()

    if os.path.exists(context.graph.graphPath):
        Configs.log("Found existing graph file {}".format(context.graph.graphPath))
    else:
        requestBackboneTasks(context)

    context.awaitSubalignments()
    context.graph.initializeMatrix()

    if os.path.exists(context.graph.graphPath):
        context.graph.readGraphFromFile(context.graph.graphPath)
    else:
        context.initializeBackboneSequenceMapping()
        # 6.1.2021 - added by Chengze Shen: initialize the user-defined
        # backbone weights before the matrix is built
        context.initializeBackboneWeights()

        buildMatrix(context)
        context.graph.writeGraphToFile(context.graph.graphPath)

    time2 = time.time()
    Configs.log("Built the alignment graph in {} sec..".format(time2-time1))

def requestBackboneTasks(context):
    """
    Populate context.backbonePaths / backboneTasks according to the
    configured graph-build method: user-supplied backbone files, MAFFT runs,
    HMM-extended subalignments, or the initial decomposition alignment.
    """
    if len(context.backbonePaths) > 0:
        Configs.log("Using {} user-defined backbone files..".format(len(context.backbonePaths)))
        # (a redundant self-assignment of context.backbonePaths was removed
        # here -- it had no effect)
        for path in context.backbonePaths:
            context.backboneTaxa.update(sequenceutils.readFromFasta(path))

    elif Configs.graphBuildMethod == "mafft":
        Configs.log("Using {} MAFFT backbones..".format(Configs.mafftRuns))
        requestMafftBackbones(context)

    elif Configs.graphBuildMethod == "subsethmm":
        Configs.log("Using {} HMM-extended subalignments as backbone files..".format(len(context.subalignmentPaths)))
        context.backbonePaths = context.subalignmentPaths
        context.backboneExtend.update(context.backbonePaths)

    elif Configs.graphBuildMethod == "initial":
        Configs.log("Using the initial decomposition alignment as the single backbone..")
        initialAlignPath = os.path.join(context.workingDir, "decomposition", "initial_tree", "initial_insert_align.txt")
        context.backbonePaths = [initialAlignPath]

    # unconstrained merges also feed the subalignments themselves as backbones
    if not Configs.constrain and Configs.graphBuildMethod != "subsethmm":
        context.backbonePaths.extend(context.subalignmentPaths)

def requestMafftBackbones(context):
    """
    Request one MAFFT alignment task per configured run, reusing any backbone
    file that already exists on disk. Each backbone samples roughly
    Configs.mafftSize taxa, spread evenly across the subsets.
    """
    numTaxa = max(1, int(Configs.mafftSize/len(context.subsetPaths)))

    for n in range(Configs.mafftRuns):
        unalignedFile = os.path.join(context.graph.workingDir, "backbone_{}_unalign.txt".format(n+1))
        alignedFile = os.path.join(context.graph.workingDir, "backbone_{}_mafft.txt".format(n+1))
        if os.path.exists(alignedFile):
            Configs.log("Existing backbone file found: {}".format(alignedFile))
            backbone = sequenceutils.readFromFasta(alignedFile)
            context.backbonePaths.append(alignedFile)
        else:
            backbone = assignBackboneTaxa(context, numTaxa, unalignedFile)
            backboneTask = external_tools.buildMafftAlignment(unalignedFile, alignedFile)
            context.backboneTasks.append(backboneTask)

        if Configs.graphBuildHmmExtend:
            context.backboneExtend.add(alignedFile)
        else:
            context.backboneTaxa.update(backbone)
    task.submitTasks(context.backboneTasks)

def buildMatrix(context):
    """
    Feed every backbone alignment into the graph matrix: first the MAFFT
    tasks as they complete, then any remaining user/subset backbone files
    that were not produced by a task.
    """
    addedBackbones = set()
    for backboneTask in task.asCompleted(context.backboneTasks):
        addAlignmentFileToGraph(context, backboneTask.outputFile)
        addedBackbones.add(backboneTask.outputFile)

    for backboneFile in context.backbonePaths:
        if backboneFile not in addedBackbones:
            addAlignmentFileToGraph(context, backboneFile)
def assignBackboneTaxa(context, numTaxa, unalignedFile):
    """
    Randomly sample up to *numTaxa* taxa from every subset, write their
    unaligned sequences to *unalignedFile*, and return the sampled
    {taxon: sequence} mapping.
    """
    chosen = {}
    for subset in context.subsets:
        # shuffle in place so each backbone draws a fresh random sample
        random.shuffle(subset)
        chosen.update((taxon, context.unalignedSequences[taxon]) for taxon in subset[:numTaxa])
    sequenceutils.writeFasta(chosen, unalignedFile)
    return chosen
def addAlignmentFileToGraph(context, alignedFile):
    """
    Fold one backbone alignment file into the shared graph matrix: read the
    alignment, optionally HMM-extend it with missing taxa, map its columns to
    global matrix positions, and accumulate weighted co-occurrence counts.
    """
    Configs.log("Feeding backbone {} to the graph..".format(alignedFile))
    backboneAlign = sequenceutils.readFromFasta(alignedFile)
    # every row of the alignment has equal length; read it off any entry
    alignmentLength = len(next(iter(backboneAlign.values())).seq)

    # backbones flagged for extension get taxa missing from the backbone
    # aligned in via hmmalign before being fed to the graph
    if alignedFile in context.backboneExtend:
        extensionTasks = requestHmmExtensionTasks(context, backboneAlign, alignedFile)
        task.submitTasks(extensionTasks)
        for extensionTask in task.asCompleted(extensionTasks):
            backboneAlign.update(sequenceutils.readFromStockholm(extensionTask.outputFile, includeInsertions=True))

    alignmap = backboneToAlignMap(context, backboneAlign, alignmentLength)
    Configs.log("Constructed backbone alignment map from {}".format(alignedFile))

    # 6.1.2021 - added by Chengze Shen:
    # read in the weight for the alignedFile if the weight exists
    weight = 1
    if context.backboneWeightsPath:
        # the weight has to be defined for this to work
        weight = context.backboneWeights.get(alignedFile)

    graph = context.graph
    with graph.matrixLock:
        for l in range(alignmentLength):
            for a, avalue in alignmap[l].items():
                for b, bvalue in alignmap[l].items():

                    if Configs.graphBuildRestrict:
                        asub, apos = graph.matSubPosMap[a]
                        bsub, bpos = graph.matSubPosMap[b]
                        # skip edges between different columns of the
                        # same subalignment
                        if asub == bsub and apos != bpos:
                            continue

                    # 6.1.2021 - modified by Chengze Shen:
                    # instead of a plain co-occurrence count, scale by the
                    # user-defined backbone weight (assumes the weighting is
                    # tied to a single query sequence per backbone)
                    graph.matrix[a][b] = graph.matrix[a].get(b,0) \
                            + avalue * bvalue * weight
    # 7.8.2021 - added by Chengze Shen:
    # log the weight information for the backbone too
    Configs.log("Fed backbone {} to the graph - weight = {}.".format(
        alignedFile, weight))

def backboneToAlignMap(context, backboneAlign, alignmentLength):
    """
    Map each backbone column to the global matrix positions it contains.
    Returns a list with one dict per backbone column:
    {matrix position: occurrence count}.
    """
    alignmap = [{} for i in range(alignmentLength)]
    t = 0

    for taxon in backboneAlign:
        subsetIdx = context.taxonSubalignmentMap[taxon]
        subsetseq = context.backboneSubalignment[taxon].seq
        unalignedseq = context.unalignedSequences[taxon].seq
        backboneseq = backboneAlign[taxon].seq

        # posarray[i] = subalignment column holding unaligned letter i
        i = 0
        posarray = [0] * len(unalignedseq)
        for n in range(len(subsetseq)):
            if subsetseq[n] == unalignedseq[i]:
                posarray[i] = n
                i = i + 1
                if i == len(unalignedseq):
                    break

        # walk the backbone row: n counts backbone match columns
        # (upper-case, non-'.'), i tracks the unaligned letter index
        i = 0
        n = 0
        for c in backboneseq:
            if i == len(unalignedseq):
                break
            if c == unalignedseq[i]:
                position = int(context.graph.subsetMatrixIdx[subsetIdx] + posarray[i])
                alignmap[n][position] = alignmap[n].get(position, 0) + 1
            if c.upper() == unalignedseq[i]:
                i = i + 1
            if c == c.upper() and c != '.':
                n = n + 1

        t = t + 1

    return alignmap
def requestHmmExtensionTasks(context, backbone, alignedFile):
    """
    Build an HMM over *alignedFile* and request hmmalign tasks that align
    every taxon missing from *backbone* against it. Returns the (not yet
    submitted) list of alignment tasks.
    """
    baseName = os.path.basename(alignedFile)
    hmmDir = os.path.join(context.graph.workingDir, "hmm_{}".format(baseName))
    extensionUnalignedFile = os.path.join(hmmDir, "queries.txt")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    if not os.path.exists(hmmDir):
        os.makedirs(hmmDir)

    # the query set is every sequence not already in the backbone
    backboneExtension = {}
    for taxon in context.unalignedSequences:
        if not taxon in backbone:
            backboneExtension[taxon] = context.unalignedSequences[taxon]

    sequenceutils.writeFasta(backboneExtension, extensionUnalignedFile)
    # the HMM build runs synchronously here; only the hmmalign jobs are
    # returned as tasks for the caller to submit
    buildTask = hmmutils.buildHmmOverAlignment(alignedFile, hmmPath)
    buildTask.run()
    alignTasks = hmmutils.hmmAlignQueries(hmmPath, extensionUnalignedFile)
    return alignTasks
class TaskManager():
    """
    Process-wide bookkeeping for task execution. Only the main thread may
    submit tasks; a single manager thread launches and tracks them, and a
    worker pool runs the non-serial task types (e.g. MAFFT).
    """
    # on-disk coordination files, shared between concurrent MAGUS processes
    runningTasksFile = None
    lockTasksFile = None

    # the single-threaded manager loop and its wake-up machinery
    managerPool = None
    managerFuture = None
    managerSignal = threading.Event()
    managerLock = threading.Lock()
    managerStopSignal = False

    # used to hand serial tasks to the waiting observer (main) thread
    observerSignal = threading.Event()
    observerWaiting = False
    observerTask = None

    # task bookkeeping; mutated under managerLock
    waitingTasks = {}
    submittedTasks = set()
    runningTasks = set()
    finishedTasks = set()
    failedTasks = set()

    # worker pool state
    taskPool = None
    threadsUsed = 0
    lastFilesCheckTime = 0
    lastDebugTime = 0
    # task types that must run serially on the main thread, not in the pool
    serialTaskTypes = {"runAlignmentTask", "buildInducedSubalignment", "compressSubalignment"}
    contextStack = []

def startTaskManager():
    """Create the task directories and spin up the manager loop and worker pool."""
    Configs.debug("Starting up the task manager..")

    tasksDir = os.path.join(Configs.workingDir, "tasks")
    TaskManager.pendingTasksDir = os.path.join(tasksDir, "tasks_pending")
    TaskManager.runningTasksFile = os.path.join(tasksDir, "tasks_running.txt")
    TaskManager.lockTasksFile = os.path.join(tasksDir, "tasks.lock")
    if not os.path.exists(TaskManager.pendingTasksDir):
        os.makedirs(TaskManager.pendingTasksDir)

    # one dedicated thread runs the manager loop; the pool runs worker tasks
    TaskManager.managerPool = concurrent.futures.ThreadPoolExecutor(max_workers = 1)
    TaskManager.managerFuture = TaskManager.managerPool.submit(runTaskManager)
    TaskManager.taskPool = concurrent.futures.ThreadPoolExecutor(max_workers = Configs.numCores)
    Configs.debug("Task manager is up..")

def stopTaskManager():
    """Signal the manager loop to stop, then drain running tasks and shut down."""
    TaskManager.managerStopSignal = True
    with TaskManager.managerLock:
        TaskManager.managerSignal.set()
    try:
        Configs.debug("Winding down the task manager..")
        # result() propagates any exception raised inside the manager loop
        TaskManager.managerFuture.result()
    finally:
        Configs.log("Waiting for {} tasks to finish..".format(len(TaskManager.runningTasks)))
        TaskManager.taskPool.shutdown()
        dealWithFinishedTasks()
        TaskManager.managerPool.shutdown()
        Configs.debug("Task manager stopped..")
def runTaskManager():
    """
    Manager loop: each cycle (or early wake-up via managerSignal) processes
    errors, finished tasks, pending tasks and waiting tasks under the
    manager lock. Always wakes the observer thread on exit so a crashed
    manager cannot leave the main thread blocked.
    """
    try:
        while not TaskManager.managerStopSignal:
            with TaskManager.managerLock:
                dealWithErrors()
                dealWithFinishedTasks()
                dealWithPendingTasks()
                dealWithWaitingTasks()
                # clear under the lock so a set() racing with the cycle
                # above is not lost
                TaskManager.managerSignal.clear()
            TaskManager.managerSignal.wait(5)
    finally:
        TaskManager.observerSignal.set()

def dealWithErrors():
    """Surface failed tasks: log each one and re-raise its stored exception."""
    for task in TaskManager.failedTasks:
        Configs.error("Task manager found a failed task: {}".format(task.outputFile))
        if task.future is not None:
            # result() re-raises the exception that failed the task
            task.future.result()

def dealWithFinishedTasks():
    """
    Remove finished/failed alignment tasks from the shared running-tasks
    file, requeue failed tasks as pending, and reset the per-cycle sets.
    """
    stoppedRunning = set(t for t in TaskManager.finishedTasks | TaskManager.failedTasks if t.taskType == "runAlignmentTask")
    if len(stoppedRunning) > 0:
        with files.FileLock(TaskManager.lockTasksFile):
            runningTasks = files.readTasksFromFile(TaskManager.runningTasksFile)
            stillRunningTasks = [t for t in runningTasks if t not in stoppedRunning]
            if len(stillRunningTasks) < len(runningTasks):
                files.writeTasksToFile(stillRunningTasks, TaskManager.runningTasksFile, append = False)

    if len(TaskManager.failedTasks) > 0:
        # write failed tasks back to a pending file; 0 => launch none now
        processPendingTasks(TaskManager.failedTasks, 0, None)

    TaskManager.finishedTasks = set()
    TaskManager.failedTasks = set()

def dealWithPendingTasks():
    """
    Launch newly submitted tasks, then pull more work from the pending task
    files if launch capacity remains.
    """
    # at most one launch per manager cycle, and none when no threads are free
    numToLaunch = min(1, Configs.numCores - TaskManager.threadsUsed)
    newTasks = []
    for t in TaskManager.submittedTasks:
        if os.path.exists(t.outputFile):
            # output already produced (e.g. by a previous or concurrent run)
            Configs.log("File already exists: {}".format(t.outputFile))
            t.isFinished = True
            TaskManager.observerSignal.set()
        else:
            newTasks.append(t)
            TaskManager.waitingTasks[t.outputFile] = t

    if len(newTasks) > 0:
        launchedTasks, remainingTasks = processPendingTasks(newTasks, numToLaunch, None)
        numToLaunch = numToLaunch - len(launchedTasks)
    if numToLaunch > 0:
        # shuffle so concurrent processes don't all contend on the same file
        pendingFiles = [os.path.join(TaskManager.pendingTasksDir, file) for file in os.listdir(TaskManager.pendingTasksDir) if file.endswith(".txt")]
        random.shuffle(pendingFiles)
        for taskFile in pendingFiles:
            if numToLaunch <= 0:
                break
            launchedTasks, remainingTasks = processPendingTasks(None, numToLaunch, taskFile)
            numToLaunch = numToLaunch - len(launchedTasks)

    TaskManager.submittedTasks = set()
def dealWithWaitingTasks():
    """
    Poll (at most every 5s) for output files of tasks being run elsewhere,
    marking them finished when the file appears; every 60s, log what is
    still running and still being waited on.
    """
    timeSinceFileCheck = time.time() - TaskManager.lastFilesCheckTime
    if timeSinceFileCheck >= 5:
        # a waiting task is complete once its output file appears on disk
        for file, task in list(TaskManager.waitingTasks.items()):
            if os.path.exists(file):
                Configs.debug("Detected task completion: {}".format(file))
                TaskManager.waitingTasks.pop(file)
                task.isFinished = True
                TaskManager.observerSignal.set()
        TaskManager.lastFilesCheckTime = time.time()

    timeSinceDebug = time.time() - TaskManager.lastDebugTime
    if timeSinceDebug >= 60:
        TaskManager.lastDebugTime = time.time()
        for task in TaskManager.runningTasks:
            # serial tasks may have no future, hence the N/A fallback
            Configs.debug("Still running task {}, status {}".format(task.outputFile, task.future._state if task.future is not None else "N/A"))
        for file in TaskManager.waitingTasks:
            Configs.debug("Still waiting on task {}".format(file))

def processPendingTasks(tasks, numTasksToLaunch, taskFile):
    """
    Launch up to *numTasksToLaunch* of *tasks* (or of the tasks read from
    *taskFile* when tasks is None) and write the remainder back to the
    pending file. Alignment tasks are cross-checked against the shared
    running-tasks file so concurrent MAGUS processes don't duplicate work.
    Returns (launchedTasks, remainingTasks).
    """
    if taskFile is None:
        # spread new tasks across up to 1000 randomly chosen pending files
        taskFile = os.path.join(TaskManager.pendingTasksDir, "task_file_{:03d}.txt".format(random.randint(0,999)))

    newTasks = True
    with files.FileLock(taskFile.replace(".txt", ".lock")):
        if tasks is None:
            tasks = files.readTasksFromFile(taskFile)
            newTasks = False

        alignTasks = [t for t in tasks if t.taskType == "runAlignmentTask"]
        if len(alignTasks) > 0:
            with files.FileLock(TaskManager.lockTasksFile):
                runningTasks = set(files.readTasksFromFile(TaskManager.runningTasksFile))
                if len(runningTasks) > 0:
                    # drop tasks another process is already running
                    tasks = [t for t in tasks if t not in runningTasks]
                launchedTasks, remainingTasks = launchTasks(tasks, numTasksToLaunch)
                writeRunningTasks = [t for t in launchedTasks if t.taskType == "runAlignmentTask"]
                files.writeTasksToFile(writeRunningTasks, TaskManager.runningTasksFile, append = True)
        else:
            launchedTasks, remainingTasks = launchTasks(tasks, numTasksToLaunch)

        # append when the tasks are new; overwrite when rewriting a task file
        files.writeTasksToFile(remainingTasks, taskFile, append = newTasks)
        if len(launchedTasks) > 0:
            Configs.debug("Launched {} tasks and deferred {} tasks..".format(len(launchedTasks), len(remainingTasks)))

    return launchedTasks, remainingTasks
def launchTasks(tasks, numTasksToLaunch):
    """
    Attempt to launch up to *numTasksToLaunch* of *tasks*, capped by the
    number of free worker threads. Returns (launchedTasks, remainingTasks);
    tasks that could not be launched are deferred.
    """
    launchedTasks = []
    remainingTasks = []
    freeThreads = Configs.numCores - TaskManager.threadsUsed
    budget = min(numTasksToLaunch, freeThreads)
    for task in tasks:
        # checkLaunchTask is only attempted while budget remains
        started = budget > 0 and checkLaunchTask(task)
        if started:
            Configs.debug("Launched a new task.. {}/{} threads used, type: {}, output file: {}".format(TaskManager.threadsUsed, Configs.numCores, task.taskType, task.outputFile))
            budget = budget - 1
            launchedTasks.append(task)
        else:
            remainingTasks.append(task)
    return launchedTasks, remainingTasks
def checkLaunchTask(task):
    """
    Try to start *task*. Non-serial task types are submitted to the worker
    pool; serial types are handed to the waiting observer (main) thread.
    Returns True if the task was launched or handed off.
    """
    if task.taskType not in TaskManager.serialTaskTypes:
        task.future = TaskManager.taskPool.submit(runTask, task)
        return True
    elif TaskManager.observerWaiting and TaskManager.observerTask is None:
        stack = TaskManager.contextStack
        # alignment tasks only go to the observer when they belong to the
        # alignment context currently on top of the stack
        if task.taskType != "runAlignmentTask" or len(stack) == 0 or task in stack[-1].subalignmentTasks:
            TaskManager.observerTask = task
            TaskManager.observerSignal.set()
            return True
    return False

def runTask(task):
    """
    Execute *task*, maintaining thread-count and task-set bookkeeping under
    managerLock, and signalling the manager and observer on completion.
    Exceptions from task.run() are recorded as failures and re-raised.
    """
    with TaskManager.managerLock:
        # serial tasks run on the observer thread, so they don't consume a
        # worker-pool thread
        if task.taskType not in TaskManager.serialTaskTypes:
            TaskManager.threadsUsed = TaskManager.threadsUsed + 1
        TaskManager.runningTasks.add(task)

    failed = False
    try:
        task.run()
    except:
        # remember the failure for dealWithErrors(), but let it propagate
        failed = True
        raise
    finally:
        with TaskManager.managerLock:
            if task.taskType not in TaskManager.serialTaskTypes:
                TaskManager.threadsUsed = TaskManager.threadsUsed - 1
            TaskManager.runningTasks.remove(task)
            TaskManager.failedTasks.add(task) if failed else TaskManager.finishedTasks.add(task)
            # a duplicate task waiting on this same output file is done too
            if task.outputFile in TaskManager.waitingTasks:
                t = TaskManager.waitingTasks.pop(task.outputFile)
                t.future = task.future
                t.isFinished = True
            TaskManager.managerSignal.set()
            TaskManager.observerSignal.set()