├── .dockerignore
├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── docker-image.yml
    │   └── publish-pypi-release.yml
├── .gitignore
├── .idea
    └── remote-mappings.xml
├── Dockerfile
├── FastOMA.nf
├── FastOMA
    ├── __init__.py
    ├── _hog_class.py
    ├── _infer_subhog.py
    ├── _utils_frag_SO_detection.py
    ├── _utils_roothog.py
    ├── _utils_subhog.py
    ├── _wrappers.py
    ├── batch_roothogs.py
    ├── check_input.py
    ├── collect_subhogs.py
    ├── fastoma_notebook_stat.ipynb
    ├── helper_scripts.py
    ├── infer_roothogs.py
    ├── infer_subhogs.py
    ├── transformer.py
    └── zoo
    │   ├── README.md
    │   ├── __init__.py
    │   ├── familyanalyzer
    │       ├── __init__.py
    │       ├── genetree.py
    │       ├── newick.py
    │       ├── orthoxmlquery.py
    │       ├── taxonomy.py
    │       └── tools.py
    │   ├── file_utils
    │       ├── __init__.py
    │       ├── context_managers.py
    │       └── extractors.py
    │   ├── hog
    │       ├── __init__.py
    │       ├── convert.py
    │       ├── extract_groups.py
    │       ├── extract_hog_info.py
    │       ├── filter_orthoxml.py
    │       ├── orthoxml_merge.py
    │       └── transform.py
    │   ├── seq_utils
    │       ├── __init__.py
    │       └── utils.py
    │   ├── unionfind.py
    │   ├── utils.py
    │   └── wrappers
    │       ├── __init__.py
    │       ├── abstract_cli.py
    │       ├── aligners
    │           ├── __init__.py
    │           ├── base_aligner.py
    │           ├── mafft.py
    │           ├── muscle.py
    │           ├── probcons.py
    │           └── prographmsa.py
    │       ├── modeltesters
    │           ├── __init__.py
    │           ├── base_modeltester.py
    │           ├── parsers.py
    │           └── prottest.py
    │       ├── options.py
    │       ├── treebuilders
    │           ├── __init__.py
    │           ├── base_treebuilder.py
    │           ├── fasttree.py
    │           ├── guenomu.py
    │           ├── iqtree.py
    │           ├── parsers.py
    │           ├── phyml.py
    │           └── raxml.py
    │       └── trimmers
    │           ├── __init__.py
    │           ├── base_trimmer.py
    │           └── trimal.py
├── README.md
├── archive
    ├── analysis
    │   ├── edit_orthxml_file.py
    │   ├── find_unfinished_rhog.py
    │   ├── preprocess_qfo_files.py
    │   ├── write_gene_id_pickle_old_code.py
    │   └── xml_.py
    ├── fastOMA_logo.png
    └── test_curn.py
├── conf
    └── base.config
├── environment-conda.yml
├── license
├── nextflow.config
├── nextflow_slurm.config
├── pyproject.toml
├── testdata
    ├── README.md
    ├── expected_output
    │   ├── .DS_Store
    │   ├── FastOMA_HOGs.orthoxml
    │   ├── OrthologousGroups.tsv
    │   ├── OrthologousGroupsFasta
    │   │   ├── OG_0000001.fa
    │   │   ├── OG_0000001.fa.gz
    │   │   ├── OG_0000002.fa.gz
    │   │   ├── OG_0000003.fa.gz
    │   │   ├── OG_0000004.fa.gz
    │   │   ├── OG_0000005.fa.gz
    │   │   ├── OG_0000006.fa.gz
    │   │   ├── OG_0000007.fa.gz
    │   │   ├── OG_0000008.fa.gz
    │   │   ├── OG_0000009.fa.gz
    │   │   ├── OG_0000010.fa.gz
    │   │   ├── OG_0000011.fa.gz
    │   │   └── OG_0000012.fa.gz
    │   ├── RootHOGs.tsv
    │   ├── RootHOGsFasta
    │   │   ├── HOG0000001.fa
    │   │   ├── HOG0000001.fa.gz
    │   │   ├── HOG0000002.fa.gz
    │   │   ├── HOG0000003.fa.gz
    │   │   ├── HOG0000004.fa.gz
    │   │   ├── HOG0000005.fa.gz
    │   │   ├── HOG0000006.fa.gz
    │   │   ├── HOG0000007.fa.gz
    │   │   ├── HOG0000008.fa.gz
    │   │   ├── HOG0000009.fa.gz
    │   │   ├── HOG0000010.fa.gz
    │   │   ├── HOG0000011.fa.gz
    │   │   └── HOG0000012.fa.gz
    │   ├── hogmap
    │   │   ├── AQUAE.fa.hogmap
    │   │   ├── CHLTR.fa.hogmap
    │   │   └── MYCGE.fa.hogmap
    │   ├── orthologs.tsv
    │   ├── phylostratigraphy.html
    │   ├── report.html
    │   ├── report.ipynb
    │   ├── species_tree_checked.nwk
    │   └── stats
    │   │   └── report_2024-10-18_02-43-20.html
    └── in_folder
    │   ├── proteome
    │       ├── AQUAE.fa
    │       ├── CHLTR.fa
    │       └── MYCGE.fa
    │   └── species_tree.nwk
├── tests
    ├── data
    │   ├── HOG_0890520.fa
    │   ├── correct-msa.fa
    │   └── failing-msa.fa
    ├── test_fasttree_wrapper.py
    ├── test_infer_subhog.py
    └── test_roothog_example.py
└── utils
    ├── OrthoXMLSplitter.py
    ├── filter_orthoxml_completeness.py
    ├── find_unfinished_rhogs.py
    ├── orthoxml2OG.py
    ├── orthoxml2family.py
    ├── orthoxml2newick.py
    ├── orthoxml2pairs.py
    ├── orthoxml2perrhog.py
    ├── orthoxml2phylostratigraphy.py
    ├── pickle2orthoxml.py
    └── write_orthoxml_per_rHOG.py


/.dockerignore:
--------------------------------------------------------------------------------
1 | work
2 | .nextflow*
3 | .idea
4 | .git
5 | output
6 | testdata
7 | dist
8 | archive/
9 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   # Maintain dependencies for GitHub Actions
4 |   - package-ecosystem: "github-actions"
5 |     directory: "/"
6 |     schedule:
7 |       interval: "daily"
8 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image CI
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 |   release:
 7 |     type: [published]
 8 | 
 9 | env:
10 |   TEST_TAG: dessimozlab/fastoma:test
11 | 
12 | jobs:
13 | 
14 |   build:
15 | 
16 |     runs-on: ubuntu-latest
17 | 
18 |     steps:
19 |       - name: Checkout
20 |         uses: actions/checkout@v4
21 |         with:
22 |           submodules: recursive
23 | 
24 |       - name: Docker meta
25 |         id: meta
26 |         uses: docker/metadata-action@v5
27 |         with:
28 |           # list of Docker images to use as base name for tags
29 |           images: |
30 |             dessimozlab/fastoma
31 |           # generate Docker tags based on the following events/attributes
32 |           tags: |
33 |             type=schedule
34 |             type=ref,event=branch
35 |             type=ref,event=pr
36 |             type=semver,pattern={{version}}
37 |             type=semver,pattern={{major}}.{{minor}}
38 |             type=semver,pattern={{major}}
39 |             type=sha
40 | 
41 |       - name: Set up QEMU
42 |         uses: docker/setup-qemu-action@v3
43 | 
44 |       - name: Set up Docker Buildx
45 |         uses: docker/setup-buildx-action@v3
46 | 
47 |       - name: Build and export to docker for testing
48 |         uses: docker/build-push-action@v6
49 |         with:
50 |           context: .
51 |           load: true
52 |           tags: ${{ env.TEST_TAG }}
53 | 
54 |       #- name: Test
55 |       #  run: |
56 |       #    docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out
57 |       #    if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
58 | 
59 |       - name: Login to DockerHub
60 |         uses: docker/login-action@v3
61 |         with:
62 |           username: ${{ secrets.DOCKER_HUB_USERNAME }}
63 |           password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
64 | 
65 |       - name: Build and push
66 |         uses: docker/build-push-action@v6
67 |         with:
68 |           context: .
69 |           platforms: linux/amd64,linux/arm64
70 |           push: true
71 |           #${{ github.event_name != 'push' && github.event_name != 'pull_request' }}
72 |           tags: ${{ steps.meta.outputs.tags }}
73 |           labels: ${{ steps.meta.outputs.labels }}
74 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-pypi-release.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | name: Upload FastOMA to pypi
 3 | 
 4 | on:
 5 |   push:
 6 |     tags:
 7 |       - v*
 8 | 
 9 | jobs:
10 |   deploy:
11 | 
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v4
16 |     - name: Set up Python
17 |       uses: actions/setup-python@v5
18 |       with:
19 |         python-version: '3.x'
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         pip install hatch
24 |     - name: Build package
25 |       run: hatch build
26 |     - name: Publish package
27 |       uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc
28 |       with:
29 |         user: __token__
30 |         password: ${{ secrets.PYPI_API_TOKEN }}
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .nextflow*
 2 | work/
 3 | .idea/
 4 | dist/
 5 | archive
 6 | .git
 7 | .gitignore
 8 | __pycache__
 9 | *.orig
10 | 


--------------------------------------------------------------------------------
/.idea/remote-mappings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="RemoteMappingsManager">
 4 |     <list>
 5 |       <list>
 6 |         <remote-mappings server-id="python@5ffbd9de-c0f1-433f-b7a1-e5e8993716c9" />
 7 |       </list>
 8 |     </list>
 9 |   </component>
10 | </project>


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11-slim as basis
 2 | 
 3 | # set environment variables
 4 | ENV PYTHONDONTWRITEBYTECODE 1
 5 | ENV PYTHONUNBUFFERED 1
 6 | 
 7 | 
 8 | FROM basis as builder
 9 | RUN apt-get update \
10 |     && apt-get install -y --no-install-recommends \
11 |        build-essential \
12 |        fasttree \
13 |        libxml2 \
14 |        mafft \
15 |     && rm -rf /var/lib/apt/lists/*
16 | 
17 | WORKDIR /src
18 | RUN pip install --upgrade hatch pip
19 | COPY pyproject.toml .
20 | RUN python -m venv /app \
21 |     && hatch dep show requirements --all > requirements.txt \
22 |     && /app/bin/pip install wheel setuptools \
23 |     && /app/bin/pip install -r requirements.txt
24 | 
25 | COPY . .
26 | RUN ls -la \
27 |     && hatch build \
28 |     && ls -la dist/ \
29 |     && /app/bin/pip install dist/*.whl
30 | 
31 | 
32 | FROM basis as runtime
33 | RUN apt-get update \
34 |     && apt-get install -y --no-install-recommends \
35 |        fasttree \
36 |        libxml2 \
37 |        mafft \
38 |        mmseqs2 \
39 |        procps \
40 |     && apt-get -y autoremove \
41 |     && apt-get -y autoclean \
42 |     && rm -rf /var/lib/apt/lists/*
43 | 
44 | COPY --from=builder /app /app
45 | ENV PATH="/app/bin:$PATH"
46 | 


--------------------------------------------------------------------------------
/FastOMA/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | __packagename__ = "FastOMA"
3 | __version__ = "0.3.5"
4 | 


--------------------------------------------------------------------------------
/FastOMA/batch_roothogs.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import shutil
 3 | from pathlib import Path
 4 | from ._wrappers import logger
 5 | from . import __version__ as fastoma_version
 6 | 
 7 | big_rhog_filesize_thresh = 400 * 1000
 8 | sum_list_rhogs_filesize_thresh = 2 * 1e6
 9 | 
10 | 
11 | """
12 | 
13 | fastoma-batch-roothogs --input-roothogs omamer_rhogs --out-big rhogs_big  --out-rest rhogs_rest -vv
14 | 
15 | """
16 | 
class BatchBuilder:
    """Collect rootHOG fasta files into numbered batch sub-folders of ``outdir``.

    Files are queued with :meth:`add_hog`. As soon as the summed size of the
    queued files exceeds ``max_size`` bytes, they are copied into
    ``outdir/<counter>`` and a new batch is started. Used as a context
    manager, the last (partially filled) batch is written on exit.
    """

    def __init__(self, outdir: Path, max_size: int):
        """Remember the target folder and the per-batch size limit in bytes."""
        self.outdir = outdir
        self.max_size = max_size
        # Bookkeeping is set up here as well, so that the attributes exist
        # even if the builder is inspected before entering the ``with`` block.
        self.cur_batch = []
        self.cur_size = 0
        self.counter = 0

    def __enter__(self):
        """Reset the bookkeeping, create ``outdir`` if needed and return self."""
        self.cur_batch = []
        self.cur_size = 0
        self.counter = 0
        self.outdir.mkdir(parents=True, exist_ok=True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Write the pending batch, if any. Exceptions are not suppressed."""
        if self.cur_batch:
            self._flush()

    def add_hog(self, hog_file: Path):
        """Queue ``hog_file``; write the batch once its size exceeds ``max_size``."""
        # stat once, and before touching the batch: a missing file then leaves
        # the pending batch untouched, and the logged size is the counted size.
        file_size = hog_file.stat().st_size
        self.cur_batch.append(hog_file)
        self.cur_size += file_size
        logger.debug("adding %s with size %d to batch %d", hog_file, file_size, self.counter)
        if self.cur_size > self.max_size:
            self._flush()
            self.counter += 1

    def _flush(self):
        """Copy the queued files into ``outdir/<counter>`` and clear the queue."""
        batch_dir = self.outdir / str(self.counter)
        # no exist_ok: an already existing batch folder raises FileExistsError
        batch_dir.mkdir()
        for fn in self.cur_batch:
            shutil.copy(fn, batch_dir)
        logger.debug("creating batch %s with %d families; total size of files is %d",
                     batch_dir, len(self.cur_batch), self.cur_size)
        self.cur_size = 0
        self.cur_batch = []
50 | 
51 | 
def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_rest: Path):
    """Distribute the ``*.fa`` files found under ``roothog_path`` into batches.

    Files are processed from largest to smallest. A file bigger than
    ``big_rhog_filesize_thresh`` goes to ``output_folder_big`` (size limit 1,
    so every such file ends up in a batch of its own); all others are grouped
    in ``output_folder_rest`` with ``sum_list_rhogs_filesize_thresh`` as the
    batch size limit.
    """
    sized_files = [(fa, fa.stat().st_size) for fa in roothog_path.rglob("*.fa")]
    # a stable descending sort keeps files of equal size in discovery order
    sized_files.sort(key=lambda pair: pair[1], reverse=True)
    with BatchBuilder(output_folder_big, 1) as big_hogs, \
            BatchBuilder(output_folder_rest, sum_list_rhogs_filesize_thresh) as rest_hogs:
        for fa_file, n_bytes in sized_files:
            target = big_hogs if n_bytes > big_rhog_filesize_thresh else rest_hogs
            target.add_hog(fa_file)
62 | 
63 | 
def fastoma_batch_roothogs():
    """Command-line entry point that splits a rootHOG folder into batches.

    Parses the command line, sets the logger level from the number of ``-v``
    flags (0: warning, 1: info, 2 or more: debug) and calls
    :func:`folder_1h_rhog` with the three folders given by the user.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis")
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored")
    parser.add_argument('--out-big', required=True, help="folder where the big single family hogs should be stored")
    # note the trailing space: adjacent string literals are joined verbatim
    parser.add_argument('--out-rest', required=True, help="folder where the remaining families should be stored in "
                                                          "batch subfolder structure.")
    parser.add_argument('-v', default=0, action="count", help="increase verbosity")
    conf_batch_roothogs = parser.parse_args()
    # 30 = WARNING; each -v lowers the level by 10, capped at DEBUG (10)
    logger.setLevel(level=30 - 10 * min(conf_batch_roothogs.v, 2))
    logger.debug("Arguments: %s", conf_batch_roothogs)

    folder_1h_rhog(Path(conf_batch_roothogs.input_roothogs), Path(conf_batch_roothogs.out_big), Path(conf_batch_roothogs.out_rest))
78 | 
79 | 


--------------------------------------------------------------------------------
/FastOMA/helper_scripts.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from ._wrappers import logger
 3 | from .zoo.utils import auto_open
 4 | 
 5 | 
 6 | def extract_pw_rels(args):
 7 |     from lxml import etree
 8 |     from .zoo.hog import transform
 9 |     xml = etree.parse(args.orthoxml)
10 |     with auto_open(args.out, 'wt') as fout:
11 |         for p1, p2 in transform.iter_pairwise_relations(xml, rel_type=args.type, id_attribute="protId"):
12 |             fout.write(f"{p1}\t{p2}\n")
13 | 
14 | 
def main():
    """Command-line entry point for the FastOMA helper scripts.

    Builds the parser with its sub-commands (currently only ``pw-rel``), sets
    the logger level from the number of ``-v`` flags and dispatches to the
    function registered by the chosen sub-command.
    """
    top_parser = argparse.ArgumentParser(description="FastOMA helper scripts")
    top_parser.add_argument('-v', default=0, action="count", help="increase verbosity")
    commands = top_parser.add_subparsers(required=True)

    # pw-rel: dump pairwise ortholog / paralog relations of an orthoxml file
    pw_rel = commands.add_parser('pw-rel')
    pw_rel.add_argument("--type", choices=("ortholog", "paralog"), default="ortholog",
                        help="Type of relations to extract. either 'ortholog' or 'paralog'")
    pw_rel.add_argument("--out", required=True, help="Path to output file")
    pw_rel.add_argument("--orthoxml", required=True, help="Path to input orthoxml file")
    pw_rel.set_defaults(func=extract_pw_rels)

    conf = top_parser.parse_args()
    # 30 = WARNING; each -v lowers the level by 10, capped at DEBUG (10)
    verbosity = min(conf.v, 2)
    logger.setLevel(level=30 - 10 * verbosity)
    logger.debug(conf)
    conf.func(conf)
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     main()


--------------------------------------------------------------------------------
/FastOMA/infer_roothogs.py:
--------------------------------------------------------------------------------
 1 | import os.path
 2 | from shutil import which
 3 | 
 4 | from . import _utils_roothog
 5 | from ._wrappers import logger
 6 | from . import __version__ as fastoma_version
 7 | 
 8 | 
 9 | 
10 | """
11 | 
12 | fastoma-infer-roothogs --proteomes proteome --hogmap hogmap --out-rhog-folder omamer_rhogs -vv
13 | 
14 | """
15 | 
16 | 
def fastoma_infer_roothogs():
    """Command-line entry point that infers rootHOGs and writes them as fasta files.

    Steps, in order: parse the command line, read the proteomes and the
    hogmap files, optionally apply the splice (isoform) information, group
    the proteins into rootHOGs, post-process them (singletons, merging,
    filtering of big rootHOGs) and write them to ``--out-rhog-folder``.
    If the ``mmseqs`` executable is on the PATH, unmapped / singleton
    proteins are additionally clustered with linclust.
    """
    import argparse
    parser = argparse.ArgumentParser(description="checking parameters for FastOMA")
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes")
    parser.add_argument("--splice", help="Path to the folder containing the splice information files")
    parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files")
    parser.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written") #out_rhog_folder
    parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug")
    parser.add_argument('--min-sequence-length', required=False, default=50, type=int,
                        help="minimum sequence length. Shorter sequences will be ignored. (Default=50)")

    parser.add_argument("--mergHOG-ratioMax-thresh", required=False, type=float, default=0.8, help="For merging rootHOGs, threshold of ratioMax ") # mergHOG_ratioMax_thresh
    parser.add_argument("--mergHOG-ratioMin-thresh", required=False, type=float, default=0.9, help="For merging rootHOGs, threshold of ratioMin ") # mergHOG_ratioMin_thresh
    parser.add_argument("--mergHOG-shared-thresh", required=False, type=float, default=10, help="For merging rootHOGs, threshold of number shared proteins ") # mergHOG_shared_thresh
    parser.add_argument("--mergHOG-fscore-thresh", required=False, type=float, default=70, help="For merging rootHOGs, threshold of famlut score shared proteins ") # mergHOG_fscore_thresh
    parser.add_argument("--big-rhog-size", required=False, type=int, default=50*1000, help="For big rootHOGs, we have different heuristics") # big_rhog_size
    parser.add_argument("--big-fscore-thresh", required=False, type=int, default=95, help="For huge rootHOGs, we have different heuristics, like filtering low family score proteins") # big_fscore_thresh

    conf = parser.parse_args()
    # 30 = WARNING; each -v lowers the level by 10, capped at DEBUG (10)
    logger.setLevel(level=30 - 10 * min(conf.v, 2))
    logger.debug("Arguments: %s", conf)

    species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes, conf.min_sequence_length)  # optional input folder
    prot_recs_all = _utils_roothog.add_species_name_prot_id(prot_recs_lists)

    hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(prot_recs_lists, fasta_format_keep, folder=conf.hogmap)  # optional input folder

    splice_files = conf.splice is not None and os.path.exists(conf.splice)
    if conf.splice is not None and not splice_files:
        # make it visible that a given --splice path is skipped, not silently dropped
        logger.info("splice folder %s does not exist; continuing without splice information", conf.splice)
    if splice_files:
        isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, folder=conf.splice)
        isoform_selected,  isoform_not_selected = _utils_roothog.find_nonbest_isoform(
            species_names, isoform_by_gene_all, hogmaps
        )
        _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists)
        # for each isoform file, there will be a file ending with _selected_isoforms.tsv
        hogmaps = _utils_roothog.handle_splice(hogmaps, isoform_not_selected)

    rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps)
    rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots, hogmaps, conf)
    rhogs_prots = _utils_roothog.merge_rhogs2(hogmaps, rhogs_prots, conf)
    rhogs_prots = _utils_roothog.filter_big_roothogs(hogmaps, rhogs_prots, conf)

    min_rhog_size = 2
    rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size)
    linclust_available = which("mmseqs")
    # if mmseqs is not installed, which() returns None and the clustering step is skipped
    if linclust_available:
        num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all,  "singleton_unmapped.fa")
        if num_unmapped_singleton:
            result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa")
            logger.debug(" linclust is done %s", result_linclust)
            num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size)
            logger.debug("we wrote %d new clusters with linclust ", num_clusters)
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     fastoma_infer_roothogs()


--------------------------------------------------------------------------------
/FastOMA/infer_subhogs.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from . import _utils_subhog
 4 | from . import _infer_subhog
 5 | from ._wrappers import logger
 6 | from . import __version__ as fastoma_version
 7 | 
 8 | """
 9 | 
10 | fastoma-infer-subhogs  --input-rhog-folder rhogs_rest/0  --output-pickles "pickle_hogs"  \
11 |     --species-tree  species_tree_checked.nwk -vv --parallel # --msa-write --gene-trees-write
12 |     
13 | """
14 | 
def fastoma_infer_subhogs():
    """Command-line entry point that infers subHOGs for one folder of rootHOGs.

    Parses the command line, sets the logger level from the number of ``-v``
    flags, makes sure the ``--output-pickles`` folder exists, lists the
    rootHOG fasta files of ``--input-rhog-folder`` and hands them, together
    with the parsed configuration, to
    ``_infer_subhog.read_infer_xml_rhogs_batch``.
    """

    import argparse
    parser = argparse.ArgumentParser(description="checking parameters for FastOMA",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument("--input-rhog-folder", required=True, help="Path to the input rootHOG folder.")
    parser.add_argument("--parallel", action='store_true', help="use concurrent parallel per rootHOG")
    parser.add_argument("--species-tree", required=True,
                        help="Path to the input species tree file in newick format")
    parser.add_argument("--output-pickles", required=False, default="pickle_hogs",
                        help="Path to the output folder")

    parser.add_argument("--threshold-dubious-sd", required=False, type=float, default=1/10,
                        help="Threshold to remove proteins in a gene tree due to low species overlap score, not enough evidence for duplication event.") # threshold_dubious_sd
    # the default value is appended to the help text by ArgumentDefaultsHelpFormatter
    parser.add_argument("--number-of-samples-per-hog", type=int, default=5,
                        help="Number of representatives (sequences) per HOG.")
    parser.add_argument("--overlap-fragments", required=False, type=float, default=0.15,
                        help="Threshold overlap between two sequences (rows) in MSA to decide whether they are fragments of a gene.")  # overlap_fragments
    parser.add_argument("--gene-rooting-method", required=False, default="midpoint", # gene_rooting_method
                        help="The method used for rooting of gene tree :    midpoint    mad     Nevers_rooting .")
    parser.add_argument("--gene-trees-write", action='store_true',
                        help="writing the all gene trees .")  # the order seems to be nwk_SD_labeled.nwk, dubious_sd0.nwk_SD_labeled.nwk, dubious_sd1.nwk_SD_labeled.nwk
    parser.add_argument("--msa-write", action='store_true',
                        help="writing the raw MSAs (might have more genes that the final gene tree).")
    parser.add_argument("--msa-filter-method",
                        choices=("col-row-threshold", "col-elbow-row-threshold", "trimal"),
                        default="col-row-threshold",
                        help="The method used for filtering MSAs.")
    parser.add_argument("--gap-ratio-row", required=False, type=float, default=0.3,
                        help="For trimming the MSA, the threshold of ratio of gaps for each row.")
    parser.add_argument("--gap-ratio-col", required=False, type=float, default=0.5,
                        help="For trimming the MSA, the threshold of ratio of gaps for each column.")
    parser.add_argument("--min-col-trim", required=False, type=int, default=50,  # todo min rows trim
                        help="min no. columns in msa to consider for filtering")
    parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug")
    conf_infer_subhhogs = parser.parse_args()
    # 30 = WARNING; each -v lowers the level by 10, capped at DEBUG (10)
    logger.setLevel(level=30 - 10 * min(conf_infer_subhhogs.v, 2))
    logger.debug("Arguments: %s", conf_infer_subhhogs)

    address_rhogs_folder = conf_infer_subhhogs.input_rhog_folder
    inferhog_concurrent_on = conf_infer_subhhogs.parallel
    if inferhog_concurrent_on:
        print("parallelization for subhog inference is on.")

    # exist_ok avoids the check-then-create race when several jobs start at once
    os.makedirs(conf_infer_subhhogs.output_pickles, exist_ok=True)

    pickles_subhog_folder_all = "./" # pickle per taxonomic level

    list_rhog_fastas_files = _utils_subhog.list_rhog_fastas(address_rhogs_folder)
    print("there are ", len(list_rhog_fastas_files), "rhogs in the input folder")

    rhogs_fa_folder = address_rhogs_folder

    # NOTE(review): second listing of the same folder, kept as in the original
    # call sequence -- confirm whether list_rhog_fastas can simply be reused.
    list_rhog_fastas_files_rem = _utils_subhog.list_rhog_fastas(address_rhogs_folder)
    print("there are ", len(list_rhog_fastas_files_rem), "rhogs remained in the input folder", list_rhog_fastas_files_rem[:5] )

    # the return value is not used here
    _infer_subhog.read_infer_xml_rhogs_batch(list_rhog_fastas_files_rem, inferhog_concurrent_on, conf_infer_subhhogs.output_pickles, pickles_subhog_folder_all, rhogs_fa_folder, conf_infer_subhhogs)

    print("finished ", address_rhogs_folder)
79 | 
80 | 
81 | if __name__ == "__main__":
82 |     fastoma_infer_subhogs()


--------------------------------------------------------------------------------
/FastOMA/transformer.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import re
 3 | from ._wrappers import logger
 4 | 
 5 | 
 6 | class FastaHeaderTransformer(metaclass=abc.ABCMeta):
 7 |     @abc.abstractmethod
 8 |     def transform(self, header):
 9 |         return header
10 | 
11 | 
class NoOpFastaHeaderTransformer(FastaHeaderTransformer):
    """Transformer that leaves every fasta header as it is."""

    def transform(self, header):
        """Return ``header`` unchanged."""
        return header
15 | 
16 | 
class ExtractUniProtAccessionFastaHeaderTransformer(FastaHeaderTransformer):
    """Transformer that reduces a UniProt style header to its accession.

    A header of the form ``sp|<accession>|...`` or ``tr|<accession>|...`` is
    replaced by ``<accession>``. Any other header is returned unchanged and
    a warning is logged.
    """

    def __init__(self):
        # Database prefix must be exactly "sp" or "tr". A character class such
        # as [sptr]{2} would also accept e.g. "ss|" or "rt|".
        # The accession part is unchanged.
        self._up_re = re.compile(r"(?:sp|tr)\|(?P<acc>[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})\|.*")

    def transform(self, header):
        """Return the accession found at the start of ``header``, else ``header`` itself."""
        m = self._up_re.match(header)
        if m:
            return m.group('acc')
        logger.warning("cannot extract uniprot accession from header: %s", header)
        return header
27 | 
28 | 
def header_transformer(name):
    """Return the fasta header transformer registered under ``name``.

    ``name`` is matched case-insensitively; known names are ``"noop"`` and
    ``"uniprot"``.

    Raises:
        ValueError: if ``name`` is not a known transformer. Previously the
            function fell off its end and returned ``None``, which only
            surfaced later as an AttributeError on ``.transform``.
    """
    key = name.lower()
    if key == "noop":
        return NoOpFastaHeaderTransformer()
    if key == 'uniprot':
        return ExtractUniProtAccessionFastaHeaderTransformer()
    raise ValueError(f"unknown fasta header transformer: {name!r} (expected 'noop' or 'uniprot')")
34 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/README.md:
--------------------------------------------------------------------------------
1 | zoo 
2 | ===
3 | 
4 | 
5 | 
6 | This is part of the [zoo](https://zoo.cs.ucl.ac.uk/doc/zoo/wrappers.html) 


--------------------------------------------------------------------------------
/FastOMA/zoo/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | __version__ = "0.0.5"


--------------------------------------------------------------------------------
/FastOMA/zoo/familyanalyzer/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | from __future__ import print_function
3 | from __future__ import division
4 | from __future__ import absolute_import
5 | from future import standard_library
6 | standard_library.install_hooks()
7 | from .genetree import *
8 | from .taxonomy import *
9 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/familyanalyzer/orthoxmlquery.py:
--------------------------------------------------------------------------------
  1 | from __future__ import unicode_literals
  2 | from __future__ import print_function
  3 | from __future__ import division
  4 | from __future__ import absolute_import
  5 | from future.builtins import str
  6 | from future import standard_library
  7 | standard_library.install_hooks()
  8 | 
  9 | 
 10 | class ElementError(Exception):
 11 |     def __init__(self, msg):
 12 |         self.msg = msg
 13 | 
 14 |     def __str__(self):
 15 |         return str(self.msg)
 16 | 
 17 | 
 18 | class OrthoXMLQuery(object):
 19 |     """Helper class with predefined queries on an orthoxml tree."""
 20 | 
 21 |     ns = {"ns0": "http://orthoXML.org/2011/"}   # xml namespace
 22 | 
 23 |     @classmethod
 24 |     def getToplevelOrthologGroups(cls, root):
 25 |         """returns a list with the toplevel orthologGroup elements
 26 |         of the given root element."""
 27 |         xquery = ".//{{{ns0}}}groups/{{{ns0}}}orthologGroup".format(**cls.ns)
 28 |         return root.findall(xquery)
 29 | 
 30 |     @classmethod
 31 |     def getTaxRangeNodes(cls, root, recursively=True):
 32 |         xPrefix = ".//" if recursively else "./"
 33 |         xquery = '{}{{{}}}property[@name="TaxRange"]'.format(xPrefix,
 34 |                                                              cls.ns['ns0'])
 35 |         return root.findall(xquery)
 36 | 
 37 |     @classmethod
 38 |     def getTaxidNodes(cls, root, recursively=True):
 39 |         xPrefix = ".//" if recursively else "./"
 40 |         xquery = '{}{{{}}}property[@name="taxid"]'.format(xPrefix, cls.ns['ns0'])
 41 |         return root.findall(xquery)
 42 | 
 43 |     @classmethod
 44 |     def getGeneRefNodes(cls, root, recursively=True):
 45 |         iterfn = root.iter if recursively else root.iterchildren
 46 |         iterator = iterfn('{{{}}}geneRef'.format(cls.ns['ns0']))
 47 |         return list(iterator)
 48 | 
 49 |     @classmethod
 50 |     def getGeneFromId(cls, id_, root):
 51 |         xquery = ".*//{{{}}}gene[@id='{}']".format(cls.ns['ns0'], id_)
 52 |         genes = root.findall(xquery)
 53 |         if len(genes) > 1:
 54 |             raise ElementError('several gene nodes with id {} '
 55 |                                'exist'.format(id_))
 56 |         gene = genes[0] if len(genes)>0 else None
 57 |         return gene
 58 | 
 59 |     @classmethod
 60 |     def getGroupsAtLevel(cls, level, root):
 61 |         """returns a list with the orthologGroup elements which have a
 62 |         TaxRange property equals to the requested level."""
 63 |         xquery = (".//{{{0}}}property[@name='TaxRange'][@value='{1}']/..".
 64 |                   format(cls.ns['ns0'], level))
 65 |         return root.findall(xquery)
 66 | 
 67 |     @classmethod
 68 |     def getSubNodes(cls, targetNode, root, recursively=True):
 69 |         """method which returns a list of all (if recursively
 70 |         is set to true) or only the direct children nodes
 71 |         having 'targetNode' as their tagname.
 72 |         The namespace is automatically added to the tagname."""
 73 |         xPrefix = ".//" if recursively else "./"
 74 |         xquery = "{}{{{}}}{}".format(xPrefix, cls.ns['ns0'], targetNode)
 75 |         return root.findall(xquery)
 76 | 
 77 |     @classmethod
 78 |     def is_geneRef_node(cls, element):
 79 |         """check whether a given element is an instance of a geneRef
 80 |         element."""
 81 |         return element.tag == '{{{ns0}}}geneRef'.format(**cls.ns)
 82 | 
 83 |     @classmethod
 84 |     def getLevels(cls, element):
 85 |         """returns a list of the TaxRange levels associated to the
 86 |         passed orthologGroup element. If the element does not have
 87 |         any TaxRange property tags associated, an empty list is
 88 |         returned."""
 89 |         propTags = cls.getSubNodes("property", element, recursively=False)
 90 |         res = [t.get('value') for t in propTags if t.get('name') == 'TaxRange']
 91 |         return res
 92 | 
 93 |     @classmethod
 94 |     def getInputGenes(cls, root, species=None):
 95 |         """returns a list of all gene elements in the orthoxml inside
 96 |         <species><database> tags, i.e. the list of genes prior to running
 97 |         OMA-HOGS. Optionally filtered by species."""
 98 |         filter_ = ('[@name="{}"]'.format(species)
 99 |                    if species is not None else '')
100 |         if filter_ > '':
101 |             xquery = ('/ns:orthoXML/ns:species{}/ns:database/'
102 |                       'ns:genes//ns:gene'.format(filter_))
103 |         else:
104 |             xquery = '//ns:gene'
105 |         return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']})
106 | 
107 |     @classmethod
108 |     def getGroupedGenes(cls, root, species=None):
109 |         """ returns a list of all geneRef elements inside <group> tags, i.e.
110 |         the list of genes clustered into families after running OMA-HOGS.
111 |         Optionally filtered by species."""
112 |         filter_ = ('[@name="TaxRange"and@value="{}"]'.format(species)
113 |                    if species is not None else '')
114 |         if filter_ > '':
115 |             xquery = ('/ns:orthoXML/ns:groups/ns:orthologGroup//ns:property{}/'
116 |                       'following-sibling::ns:geneRef'.format(filter_))
117 |         else:
118 |             xquery = '//ns:geneRef'
119 |         return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']})
120 | 
121 |     @classmethod
122 |     def getScoreNodes(cls, root, score_id=None):
123 |         """returns the associated score nodes for a certain (orthologGroup) node.
124 |         If score_id is not specified, all scores will be returned"""
125 |         xquery = './ns:score'
126 |         if score_id is not None:
127 |             xquery += "[@id='{}']".format(score_id)
128 |         return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']})
129 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/familyanalyzer/tools.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | from __future__ import absolute_import
  4 | from future.builtins import dict
  5 | from future.builtins import zip
  6 | from future.builtins import range
  7 | from future import standard_library
  8 | standard_library.install_hooks()
  9 | 
 10 | try:
 11 |     from progressbar import ProgressBar, Percentage, Timer, ETA, Bar
 12 |     PROGRESSBAR = True
 13 | except ImportError:
 14 |     PROGRESSBAR = False
 15 | 
 16 | from collections import deque
 17 | 
 18 | def setup_progressbar(msg, size):
 19 |     if not msg.endswith(': '):
 20 |         msg += ': '
 21 | 
 22 |     widgets = [msg,
 23 |                Percentage(), ' ',
 24 |                Bar(), ' ',
 25 |                Timer(), ' ',
 26 |                ETA()]
 27 | 
 28 |     pbar = ProgressBar(widgets=widgets, maxval=size)
 29 |     return pbar
 30 | 
 31 | def enum(*sequential, **named):
 32 |     """creates an Enum type with given values"""
 33 |     enums = dict(zip(sequential, range(len(sequential))), **named)
 34 |     enums['reverse'] = dict((value, key) for key, value in enums.items())
 35 |     return type('Enum', (object, ), enums)
 36 | 
 37 | 
 38 | class IterableClassException(Exception):
 39 |     pass
 40 | 
 41 | def py2_iterable(Class):
 42 |     """
 43 |     Use as a class decorator to make a class that has a python 3 next method --
 44 |     __next__() -- also iterable with python 2, which uses next(). Also checks
 45 |     for an __iter__ method -- if this is missing the class won't be iterable anyway.
 46 | 
 47 | 
 48 |     e.g.
 49 |     @py2_iterable
 50 |     class Py2and3Iterator(object):
 51 |         def __init__(self):
 52 |             self.data = list('somestuff')
 53 |             self._pos = 0
 54 | 
 55 |         def __iter__(self):
 56 |             return self
 57 | 
 58 |         def __next__(self):
 59 |             if self._pos == len(self.data):
 60 |                 self._pos = 0
 61 |                 raise StopIteration
 62 |             char = self.data[self._pos]
 63 |             self._pos += 1
 64 |             return char
 65 | 
 66 | 
 67 |     :param Class: the class being decorated
 68 |     :return: Class: the decorated class, which is iterable in py2 and py3
 69 |     """
 70 |     if not hasattr(Class, '__iter__'):
 71 |         raise IterableClassException('Class "{}" has no __iter__ method and will not be iterable'
 72 |                                      .format(Class.__class__.__name__))
 73 | 
 74 |     if hasattr(Class, '__next__'):
 75 |         next_method = getattr(Class, '__next__')
 76 |         setattr(Class, 'next', next_method)
 77 | 
 78 |     return Class
 79 | 
 80 | 
 81 | @py2_iterable
 82 | class Queue(object):
 83 | 
 84 |     def __init__(self):
 85 |         self.__queue = deque()
 86 | 
 87 |     def __iter__(self):
 88 |         return self
 89 | 
 90 |     def __len__(self):
 91 |         return len(self.__queue)
 92 | 
 93 |     def __next__(self):
 94 |         if self.isempty():
 95 |             raise StopIteration
 96 |         return self.dequeue()
 97 | 
 98 |     def enqueue(self, item):
 99 |         self.__queue.append(item)
100 | 
101 |     def dequeue(self):
102 |         if self.isempty():
103 |             raise Exception('empty queue')
104 |         return self.__queue.popleft()
105 | 
106 |     def isempty(self):
107 |         return len(self.__queue) == 0
108 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/file_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .context_managers import *
2 | from .extractors import *
3 | 
4 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/file_utils/context_managers.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import tempfile
  4 | 
  5 | 
  6 | 
  7 | __all__ = ['TempFile', 'TempDir', 'ChDir', 'MkDir', 'NonDeletingTempDir']
  8 | 
  9 | class TempFile(object):
 10 |     """ 
 11 |     Context manager for working with a temporary file
 12 |     that automatically cleans up.
 13 | 
 14 |     Usage:
 15 | 
 16 |     with TempFile() as tmp:
 17 |         # In scope, tmp exists on the disk
 18 |         # Do some work with tmp, e.g. tmp.write('something')
 19 | 
 20 |     # Out of scope, tmp is deleted
 21 | 
 22 |     with TempFile('local_temp_space') as tmp:
 23 |         # tmp is created in the directory 'local_temp_space'
 24 |         # The specified directory must exist, or an error is thrown
 25 | 
 26 |     """
 27 | 
 28 |     def __init__(self, dir_=None):
 29 |         if dir_ is not None and not os.path.exists(dir_):
 30 |             raise IOError('Directory "{}"" does not exist'.format(dir_))
 31 |         self.dir = dir_
 32 | 
 33 |     def __enter__(self):
 34 |         self._fd, self._wrapped_tmp = tempfile.mkstemp(dir=self.dir)
 35 |         return os.path.abspath(self._wrapped_tmp)
 36 | 
 37 |     def __exit__(self, type, value, tb):
 38 |         os.close(self._fd)
 39 |         os.remove(self._wrapped_tmp)
 40 | 
 41 | 
 42 | class TempDir(object):
 43 |     """
 44 |     Context manager for working with a temporary file
 45 |     that automatically cleans up.
 46 | 
 47 |     Usage:
 48 | 
 49 |     with TempDir() as tmpd:
 50 |         # In scope, tmpd exists on the disk
 51 |         # Do some work with tmpd ...
 52 | 
 53 |     # Out of scope, tmpd is deleted along with all its content
 54 | 
 55 |     Can be nested with TempFile, e.g.
 56 | 
 57 |     with TempDir() as tmpd, TempFile(tmpd) as tmpf:
 58 |         # tempfile tmpf is created inside temporary directory tmpd
 59 |     # On exit, everything is deleted
 60 | 
 61 |     """
 62 | 
 63 |     def __enter__(self):
 64 |         self._wrapped_tmpdir = tempfile.mkdtemp()
 65 |         return os.path.abspath(self._wrapped_tmpdir)
 66 | 
 67 |     def __exit__(self, type, value, tb):
 68 |         shutil.rmtree(self._wrapped_tmpdir)
 69 | 
 70 | 
class NonDeletingTempDir(TempDir):
    """Variant of TempDir whose ``__exit__`` does nothing, so the
    directory created on entry is left on disk when the ``with`` block
    ends."""
    def __exit__(self, tpye, value, tb):
        pass
 74 | 
 75 | 
 76 | class ChDir(object):
 77 |     """
 78 |     Context manager to switch to a working directory,
 79 |     and return to the current directory (like 'Dir.chdir do' block in Ruby)
 80 | 
 81 |     Usage:
 82 | 
 83 |     with TempDir() as dir, ChDir(dir):
 84 |         # Do some work in the working temp directory 'dir'
 85 | 
 86 |     # Exit 'dir'
 87 |     """
 88 | 
 89 |     def __init__(self, working_dir):
 90 |         if not os.path.exists(working_dir):
 91 |             raise IOError('Directory "{}"" does not exist'.format(working_dir))
 92 |         self._cdir = os.getcwd()
 93 |         self._wdir = working_dir
 94 | 
 95 |     def __enter__(self):
 96 |         os.chdir(self._wdir)
 97 | 
 98 |     def __exit__(self, type, value, tb):
 99 |         os.chdir(self._cdir)
100 | 
101 | 
class MkDir(ChDir):
    """
    Context manager that makes sure a directory exists.

    The directory (including missing parents) is created when the MkDir
    object is constructed. Although the class derives from ChDir, it
    overrides ``__init__``, ``__enter__`` and ``__exit__`` without
    calling the base implementations, so entering or leaving the
    context does NOT change the current working directory.

    Usage:

    with TempDir() as dir, MkDir(dir):
        # 'dir' is guaranteed to exist here; the working directory
        # is unchanged

    # Leaving the block does nothing
    """

    def __init__(self, working_dir):
        if not os.path.exists(working_dir):
            try:
                os.makedirs(working_dir)
            except OSError as e:
                # 17 is presumably errno.EEXIST ("file exists") -- the
                # value is platform-defined; any other error is re-raised
                if e.errno != 17:
                    raise
                pass  # path was created by another thread / process
                # this is a race condition, but probably benign

    def __enter__(self):
        # deliberately a no-op: no directory switch takes place
        pass

    def __exit__(self, type, value, tb):
        # nothing to undo, see __enter__
        pass
130 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/file_utils/extractors.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import collections
  3 | import re
  4 | 
  5 | 
  6 | 
  7 | __all__ = ['tail', 'fall_back_tail', 'grep']
  8 | 
  9 | 
 10 | def tail(fh, lines=20, block_size=1024):
 11 |     """Returns the last n lines from a file
 12 | 
 13 |     This function returns the last n lines from an file-like
 14 |     object. It does this efficiently without reading the whole
 15 |     file, but rather by loading blocks from the end of the file.
 16 | 
 17 |     .. note::
 18 | 
 19 |         If the file is opened in text mode, i.e. open('/path', 'rt'),
 20 |         python3 cannot efficiently move in the file. In this case,
 21 |         the function fall back to a slow method that goes through
 22 |         the whole file.
 23 | 
 24 |     Example:
 25 | 
 26 |     >>> with open("/etc/passwd", 'rb') as f:
 27 |     ...     last_lines = tail(f, 2)
 28 |     ...
 29 |     >>> print(last_lines)
 30 | 
 31 |     :param fh: file-like object to read from
 32 |     :param int lines: number of lines to be returned
 33 |     :param int block_size: size of block to be read at once.
 34 |         intended for optimisation.
 35 |     :returns: The last lines as a list of bytes/str object"""
 36 | 
 37 |     if lines <= 0:
 38 |         raise ValueError('invalid lines value %r' % lines)
 39 | 
 40 |     encoded = getattr(fh, 'encoding', False)
 41 |     if encoded:
 42 |         return fall_back_tail(fh, lines)
 43 |     CR = '\n' if encoded else b'\n'
 44 |     data = '' if encoded else b''
 45 |     fh.seek(0, os.SEEK_END)
 46 |     fsize = fh.tell()
 47 |     block = -1
 48 |     loaded_enough_data = False
 49 |     while not loaded_enough_data:
 50 |         step = (block * block_size)
 51 |         if abs(step) >= fsize:
 52 |             fh.seek(0)
 53 |             newdata = fh.read(block_size - (abs(step) - fsize))
 54 |             loaded_enough_data = True
 55 |         else:
 56 |             fh.seek(step, os.SEEK_END)
 57 |             newdata = fh.read(block_size)
 58 |         data = newdata + data
 59 |         if data.count(CR) > lines:
 60 |             break
 61 |         else:
 62 |             block -= 1
 63 |     return data.splitlines()[-lines:]
 64 | 
 65 | 
 66 | def fall_back_tail(fh, lines):
 67 |     fh.seek(0)
 68 |     data = collections.deque(fh, maxlen=lines)
 69 |     return [e.rstrip('\n') for e in data]
 70 | 
 71 | 
 72 | def grep(fh, pat):
 73 |     """Yields lines matching a pattern
 74 | 
 75 |     This function yields all the lines that match a given pattern.
 76 |     The pattern can be either a simple str/bytes, or a compiled
 77 |     regex expression. The newline character is not removed.
 78 | 
 79 |     Example:
 80 |         >>> with open('/etc/hosts', 'rb') as fh:
 81 |         ...    for line in grep(fh, b'127.0.0.1'):
 82 |         ...        print(line)
 83 |         127.0.0.1       localhost
 84 | 
 85 |     :param fh: file-like object
 86 |     :param pat: search pattern, either str, bytes or compiled regex
 87 |     :returns: generator yielding lines matching pattern.
 88 | 
 89 |     """
 90 |     if isinstance(pat, (str, bytes)):
 91 |         encoded = getattr(fh, 'encoding', False)
 92 |         if encoded and isinstance(pat, bytes):
 93 |             pat = re.compile(pat.decode())
 94 |         elif not encoded and isinstance(pat, str):
 95 |             pat = re.compile(pat.encode('utf-8'))
 96 |         else:
 97 |             pat = re.compile(pat)
 98 |     fh.seek(0)
 99 |     for line in fh:
100 |         if pat.search(line):
101 |             yield line
102 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/hog/__init__.py:
--------------------------------------------------------------------------------
1 | from .filter_orthoxml import *
2 | from .convert import orthoxml_to_newick
3 | from .orthoxml_merge import merge_orthoxml_files
4 | from .extract_groups import TaxLevel, extract_flat_groups_at_level, extract_marker_groups_at_level
5 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/hog/convert.py:
--------------------------------------------------------------------------------
  1 | from xml.etree.ElementTree import XMLParser
  2 | __all__ = ["orthoxml_to_newick"]
  3 | 
  4 | 
  5 | class TaxonNHXMixin:
  6 |     def get_tax_nhx(self):
  7 |         tags = []
  8 |         if self.level:
  9 |             tags.append(":S={}".format(self.level))
 10 |         if self.taxid:
 11 |             tags.append(":T={}".format(self.taxid))
 12 |         return tags
 13 | 
 14 | 
 15 | class Speciation:
 16 |     type = None
 17 | 
 18 |     def __init__(self, parent=None):
 19 |         self.level = ""
 20 |         self.taxid = None
 21 |         self.children = []
 22 |         self.parent = parent
 23 |         if parent is not None:
 24 |             parent.add_child(self)
 25 | 
 26 |     def add_child(self, e):
 27 |         self.children.append(e)
 28 | 
 29 |     def set_level(self, level):
 30 |         self.level = level
 31 | 
 32 |     def set_taxid(self, taxid):
 33 |         self.taxid = taxid
 34 | 
 35 |     def get_newick_node_name(self):
 36 |         if not hasattr(self, 'get_tax_nhx'):
 37 |             return self.level.replace(' ', '_')
 38 |         return ""
 39 | 
 40 |     def as_nhx(self):
 41 |         nhx = "[&&NHX"
 42 |         t = ",".join([c.as_nhx() for c in self.children])
 43 |         if t != "":
 44 |             t = "({})".format(t)
 45 |         tags = self.get_newick_node_name()
 46 | 
 47 |         if self.type:
 48 |             nhx += ":Ev={}".format(self.type)
 49 |         if hasattr(self, "get_tax_nhx"):
 50 |             nhx += "".join(self.get_tax_nhx())
 51 |         nhx += "]"
 52 |         if len(nhx) > 7:
 53 |             tags += nhx
 54 |         return "{}{}".format(t, tags)
 55 | 
 56 | 
class Duplication(Speciation):
    """Tree node for a duplication event; identical to Speciation
    except that ``type`` is set, which makes ``as_nhx`` emit an
    ":Ev=duplication" tag."""
    type = "duplication"
 59 | 
 60 | 
class Leaf(Speciation):
    """Tree node for a single gene.

    :param xref: label of the gene, used as the newick node name
    :param species: stored as the ``level`` of the node
    :param parent: optional parent node the leaf is attached to
    """
    def __init__(self, xref, species, parent=None):
        super().__init__(parent=parent)
        self.name = xref
        # must come after super().__init__(), which resets level to ""
        self.level = species

    def get_newick_node_name(self):
        # leaves are always labelled with their gene name
        return self.name
 69 | 
 70 | 
class NHXSpeciation(Speciation, TaxonNHXMixin):
    """Speciation node that reports level/taxid through NHX tags
    (``get_tax_nhx`` comes from TaxonNHXMixin)."""
    pass

class NHXDuplication(Duplication, TaxonNHXMixin):
    """Duplication node that reports level/taxid through NHX tags."""
    pass

class NHXLeaf(Leaf, TaxonNHXMixin):
    """Leaf node that reports level/taxid through NHX tags in addition
    to its gene name."""
    pass
 79 | 
 80 | 
 81 | class OrthoxmlToNewick:
 82 | 
 83 |     def __init__(self, xref_tag="protId", encode_levels_as_nhx=True, return_gene_to_species=False):
 84 |         self.xref_tag = xref_tag
 85 |         self.gene2xref = {}
 86 |         self.trees = {}
 87 |         self.depth = 0
 88 |         self.famid = None
 89 |         self.cur_event = None
 90 |         self.cur_species = None
 91 |         self._use_nhx = encode_levels_as_nhx
 92 |         self._return_gene_to_species= return_gene_to_species
 93 | 
 94 |     def start(self, tag, attrib):
 95 |         if tag == "{http://orthoXML.org/2011/}species":
 96 |             self.cur_species = attrib['name']
 97 |         if tag == "{http://orthoXML.org/2011/}gene":
 98 |             self.gene2xref[attrib['id']] = (attrib[self.xref_tag], self.cur_species)
 99 |         elif tag == "{http://orthoXML.org/2011/}geneRef":
100 |             leaf_cls = NHXLeaf if self._use_nhx else Leaf
101 |             self.cur_event.add_child(leaf_cls(*self.gene2xref[attrib['id']]))
102 |         elif tag == "{http://orthoXML.org/2011/}orthologGroup":
103 |             if self.depth == 0:
104 |                 self.famid = attrib['id']
105 |             speciation_cls = NHXSpeciation if self._use_nhx else Speciation
106 |             self.cur_event = speciation_cls(self.cur_event)
107 |             self.depth += 1
108 |         elif tag == "{http://orthoXML.org/2011/}paralogGroup":
109 |             dupl_cls = NHXDuplication if self._use_nhx else Duplication
110 |             self.cur_event = dupl_cls(self.cur_event)
111 |         elif tag == "{http://orthoXML.org/2011/}property":
112 |             if attrib['name'] == "TaxRange":
113 |                 self.cur_event.set_level(attrib['value'])
114 |             elif attrib['name'].lower() in ("taxid", "taxonid", "taxon_id", "ncbi_taxon_id"):
115 |                 self.cur_event.set_taxid(attrib['value'])
116 | 
117 |     def end(self, tag):
118 |         if tag == "{http://orthoXML.org/2011/}paralogGroup":
119 |             self.cur_event = self.cur_event.parent
120 |         elif tag == "{http://orthoXML.org/2011/}orthologGroup":
121 |             self.depth -= 1
122 |             if self.depth == 0:
123 |                 assert(self.cur_event.parent is None)
124 |                 self.trees[self.famid] = self.cur_event.as_nhx() + ";"
125 |             self.cur_event = self.cur_event.parent
126 | 
127 |     def close(self):
128 |         if self._return_gene_to_species:
129 |             gene2species = {k[0]: k[1] for k in self.gene2xref.values()}
130 |             return self.trees, gene2species
131 |         return self.trees
132 | 
133 | 
def orthoxml_to_newick(filename, xref_tag="protId", encode_levels_as_nhx=False, return_gene_to_species=False):
    """Convert all HOGs of an orthoxml file into newick trees.

    Every top-level orthologGroup becomes one entry of the returned
    dictionary. Duplication nodes are labelled with an nhx tag, i.e. a
    paralogGroup element becomes an internal node carrying
    [&&NHX:Ev=duplication].

    :param filename: path of the input orthoxml file

    :param xref_tag: attribute of the <gene> elements that is used as
                     label of the leaves.

    :param encode_levels_as_nhx: if true, level and taxid of the nodes
                                 are written as NHX tags (:S=<...> and
                                 :T=<...>); otherwise the TaxRange value
                                 is used as newick label of the internal
                                 nodes.

    :param return_gene_to_species: if true, additionally return a
                                   mapping from gene label to species.

    :returns either a dict {roothogid: tree}, or a tuple of that dict
             and a {gene: species} dict.
    """
    settings = {
        'xref_tag': xref_tag,
        'encode_levels_as_nhx': encode_levels_as_nhx,
        'return_gene_to_species': return_gene_to_species,
    }
    xml_parser = XMLParser(target=OrthoxmlToNewick(**settings))
    with open(filename, 'rb') as stream:
        # feed the file piecewise so it never has to be held in memory
        for piece in stream:
            xml_parser.feed(piece)
    # closing the parser returns whatever the target's close() returns
    return xml_parser.close()
168 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/hog/extract_hog_info.py:
--------------------------------------------------------------------------------
 1 | from ..utils import auto_open
 2 | import collections
 3 | from time import time
 4 | import xml.etree.ElementTree as etree
 5 | from pathlib import Path
 6 | import logging
 7 | logger = logging.getLogger(__name__)
 8 | 
 9 | Gene = collections.namedtuple("Gene", "xref species internal_id")
10 | 
11 | 
class SpeciesAnalyser:
    """Keeps track of which genes of each genome are not in any group.

    ``genes`` maps the internal gene id to a ``Gene`` record for every
    gene registered and not yet reported as grouped;
    ``nr_genes_per_species`` counts all registered genes per genome.

    :param gene_attr: attribute of the gene elements used as xref
    """

    def __init__(self, gene_attr="protId"):
        self.gene_attr = gene_attr
        self.genes = {}
        self.nr_genes_per_species = collections.defaultdict(int)

    def add_genome_genes(self, genome_node):
        """Register all gene elements found below ``genome_node``. The
        genome is identified by its ``name`` attribute, or by
        ``NCBITaxId`` when there is no name."""
        genome_name = genome_node.get('name')
        if genome_name is None:
            genome_name = genome_node.get("NCBITaxId")

        found = {}
        for gene_elem in genome_node.findall('.//{http://orthoXML.org/2011/}gene'):
            internal_id = gene_elem.get('id')
            xref = gene_elem.get(self.gene_attr)
            found[internal_id] = Gene(xref, genome_name, internal_id)
            self.nr_genes_per_species[genome_name] += 1
        self.genes.update(found)

    def gene_in_group(self, gene_id):
        """Mark ``gene_id`` as grouped by dropping it from ``genes``;
        an unknown id raises KeyError."""
        self.genes.pop(gene_id)

    def get_singletons(self):
        """Return the genes that have not been reported as grouped."""
        return self.genes

    def summary(self):
        """Return one dict per genome with its name, its total number
        of genes and the number of genes not in any group."""
        ungrouped = collections.Counter(gene.species for gene in self.genes.values())
        report = []
        for species in self.nr_genes_per_species:
            report.append({'species': species,
                           'genes': self.nr_genes_per_species[species],
                           'not_in_group': ungrouped[species]})
        return report
43 | 
44 | 
def parse_orthoxml(fh, genome_watcher: SpeciesAnalyser):
    """Stream through an orthoxml document and yield one record per
    orthologGroup element.

    The document is read incrementally with ``iterparse``; elements are
    cleared once processed.

    :param fh: source handed to ``etree.iterparse``
    :param genome_watcher: object that is told about the genes of every
        species element (``add_genome_genes``) and about every geneRef
        found in a group (``gene_in_group``); may be None to skip this.
    :returns: generator of dicts with the keys ``id``, ``level`` (name
        of the taxon referenced by the group's ``taxonId``), one key per
        score element that is a direct child (its id, value as float),
        ``nr_members`` and ``is_roothog``.
    :raises RuntimeError: if the orthoXML version attribute is not "0.5"
    """
    # taxon id -> taxon name, filled whenever a taxon element closes; a
    # group's taxonId must already be in here when the group closes
    taxonomy = {}
    # nesting depth of the orthologGroup elements currently open
    og_level = 0

    def collect_genes(elem):
        # Count the genes below elem and empty it. Nested orthologGroups
        # have been handled (and cleared) before their parent closes;
        # their gene count was parked in their ``text`` attribute.
        genes = 0
        for child in elem.iter():
            if child == elem:
                # iter() also yields the element itself
                continue
            if child.tag == "{http://orthoXML.org/2011/}geneRef":
                genes += 1
                if genome_watcher is not None:
                    genome_watcher.gene_in_group(child.get('id'))
            elif child.tag == "{http://orthoXML.org/2011/}orthologGroup":
                genes += child.text
        elem.clear()
        # keep the count on the (now empty) element for the parent group
        elem.text = genes
        return genes

    logger.info("start mapping of orthoxml formatted input file")
    for event, elem in etree.iterparse(fh, events=('start', 'end')):
        if event == "start":
            if elem.tag == "{http://orthoXML.org/2011/}orthoXML":
                if elem.get('version') != "0.5":
                    raise RuntimeError(f"Expecting orthoXML version 0.5, but is {elem.get('version')}")
            elif elem.tag == '{http://orthoXML.org/2011/}orthologGroup':
                og_level += 1
        elif event == 'end':
            if elem.tag == "{http://orthoXML.org/2011/}orthologGroup":
                og_level -= 1
                # attributes and scores must be read before collect_genes
                # clears the element
                data = {'id': elem.get('id'), 'level': taxonomy[elem.get('taxonId')]}
                for child in elem.findall('./{http://orthoXML.org/2011/}score'):
                    data[child.get('id')] = float(child.get('value'))
                data['nr_members'] = collect_genes(elem)
                # depth 0 after closing means this was a top-level group
                data['is_roothog'] = og_level == 0
                yield data
                if og_level == 0:
                    elem.clear()
            elif elem.tag == "{http://orthoXML.org/2011/}species":
                if genome_watcher is not None:
                    genome_watcher.add_genome_genes(elem)
                elem.clear()
            elif elem.tag == "{http://orthoXML.org/2011/}taxon":
                taxonomy[elem.get('id')] = elem.get('name')
89 | 
90 | 
if __name__ == "__main__":
    import argparse

    # Command line use: print the record of every orthologGroup found
    # in the file given with --orthoxml.
    cli = argparse.ArgumentParser()
    cli.add_argument("--orthoxml", required=True)
    options = cli.parse_args()
    coverage = SpeciesAnalyser()
    with open(options.orthoxml, 'rt') as handle:
        for hog_record in parse_orthoxml(handle, coverage):
            print(hog_record)


--------------------------------------------------------------------------------
/FastOMA/zoo/hog/filter_orthoxml.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from ..utils import auto_open
 3 | # import collections
 4 | # from time import time
 5 | from lxml import etree as ET
 6 | # import Bio.Phylo
 7 | from typing import Iterable
 8 | from pathlib import Path
 9 | import logging
10 | logger = logging.getLogger(__name__)
11 | 
class HOGFilter:
    """Threshold on one named score.

    :param score: id of the score the filter applies to
    :param value: minimum value; anything strictly below is removed
    """

    def __init__(self, score:str, value:float):
        self.score, self.value = score, value

    def remove(self, score_id, value):
        """Return True if ``score_id`` is the filtered score and
        ``value`` (converted with ``float``) is below the threshold.
        ``value`` is not converted when the score id does not match."""
        if score_id == self.score:
            return self.value > float(value)
        return False
19 | 
20 | 
class OrthoXMLFilterProcesser:
    """Removes orthologGroup elements from an orthoxml document
    according to a list of score filters.

    :param filters: initial filters (objects with a
        ``remove(score_id, value)`` method); may be omitted
    """

    def __init__(self, filters:Iterable["HOGFilter"]=None):
        # list(None) would raise, so treat a missing argument as "no filters"
        self.filters = [] if filters is None else list(filters)

    def add_filter(self, filter:"HOGFilter"):
        """Append another filter to the list."""
        self.filters.append(filter)

    def process(self, fh):
        """Parse the document from ``fh`` and drop the filtered groups.

        A group is dropped if any filter asks to remove it based on the
        first score element among its direct children; groups without a
        score are kept. A parent left without any orthologGroup child
        is dropped as well. The resulting tree is kept in ``self.doc``.
        """
        NS = "http://orthoXML.org/2011/"
        hog_tag = "{{{0}}}orthologGroup".format(NS)
        self.doc = ET.parse(fh)
        root = self.doc.getroot()
        to_rem = []
        for hog in root.iterfind('.//{{{0}}}orthologGroup'.format(NS)):
            score = hog.find('./{{{0}}}score'.format(NS))
            if score is None:
                continue
            if any(filt.remove(score.get('id'), score.get('value')) for filt in self.filters):
                to_rem.append(hog)
        logger.info(f"will remove {len(to_rem)} hogs")
        # to_rem grows while it is iterated: emptied parents are appended
        # and handled by later iterations of the same loop
        for h in to_rem:
            parent = h.getparent()
            if 'id' in h.attrib:
                logger.info("removing hog " + str(h) + " line " + str(h.sourceline) + " " +str(h.attrib['id']))
            else:
                logger.info("removing hog " + str(h) + " line " + str(h.sourceline))
            # explicit None test: the truth value of an element depends
            # on whether it has children, not on whether it exists
            if parent is not None:
                parent.remove(h)
                if not any(c.tag == hog_tag for c in parent):
                    if 'id' in parent.attrib:
                        logger.info("consider deleting the empty parent hog "+str(parent)+" line "+str(parent.sourceline)+" "+str(parent.attrib['id']))
                    else:
                        logger.info("consider deleting the empty parent hog " + str(parent) + " line "+str(parent.sourceline))
                    to_rem.append(parent)

    def write(self, fh):
        """Write the processed document to ``fh`` as UTF-8 with an xml
        declaration; ``process`` must have been called before."""
        self.doc.write(fh, xml_declaration=True, encoding="UTF-8")
60 | 
61 | 
62 | 
def filter_orthoxml_file(source_orthoxml, out, filter: HOGFilter):
    """Apply a single filter to an orthoxml document and write the result.

    :param source_orthoxml: path (str, bytes or Path), opened with
        ``auto_open`` in text mode, or anything else, which is handed
        to the processor unchanged
    :param out: target passed to the processor's ``write`` method
    :param filter: the filter deciding which groups are removed
    """
    processor = OrthoXMLFilterProcesser([filter])
    given_as_path = isinstance(source_orthoxml, (str, bytes, Path))
    if not given_as_path:
        processor.process(source_orthoxml)
    else:
        with auto_open(source_orthoxml, 'rt') as handle:
            processor.process(handle)
    processor.write(out)
71 | 
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/hog/orthoxml_merge.py:
--------------------------------------------------------------------------------
  1 | from xml.etree import ElementTree as ET
  2 | from typing import List, Iterable
  3 | from random import randint
  4 | 
  5 | 
  6 | 
  7 | class GeneRefManager:
  8 |     def __init__(self):
  9 |         self.xrefs = {}
 10 |         self.ids = set([])
 11 | 
 12 |     def _random_unused_id(self):
 13 |         while True:
 14 |             cand = randint(100000, 1000000000)
 15 |             if str(cand) not in self.ids:
 16 |                 return str(cand)
 17 | 
 18 |     def register_and_reassign(self, gene_nodes:Iterable[ET.Element]):
 19 |         update_ids = {}
 20 |         to_rem = []
 21 |         for gene in gene_nodes:
 22 |             if gene.attrib['id'] in self.ids:
 23 |                 if gene.attrib['protId'] in self.xrefs:
 24 |                     # protId already in set. is it unique? if yes, no action, otherwise error
 25 |                     if self.xrefs[gene.attrib['protId']] != gene.attrib['id']:
 26 |                         raise ValueError("protId '{}' is used several times with different gene id :'{},'{}'"
 27 |                                          .format(gene.attrib['protId'], self.xrefs[gene.attrib['protId']], gene.attrib['id']))
 28 |                     else:
 29 |                         to_rem.append(gene.attrib['id'])
 30 |                         continue
 31 |                 else:
 32 |                     # reassign internal gene id.
 33 |                     new_id = self._random_unused_id()
 34 |                     update_ids[gene.attrib['id']] = new_id
 35 |                     gene.attrib['id'] = new_id
 36 | 
 37 |             self.xrefs[gene.attrib['protId']] = gene.attrib['id']
 38 |             self.ids.add(gene.attrib['id'])
 39 |         return update_ids, to_rem
 40 | 
 41 | 
 42 | class Merger:
 43 |     def __init__(self, first):
 44 |         self.NS = "http://orthoXML.org/2011/"
 45 |         ET.register_namespace("", self.NS)
 46 |         self.doc = ET.parse(first)
 47 |         self.root = self.doc.getroot()
 48 | 
 49 |         self.all_species = set(z.attrib['name'] for z in self.doc.findall('./{{{}}}species'.format(self.NS)))
 50 |         self.all_genes = GeneRefManager()
 51 |         self.all_genes.register_and_reassign(
 52 |             self.doc.findall("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene".format(self.NS))
 53 |         )
 54 | 
 55 |     def merge_file(self, other):
 56 |         gene_id_updates, to_rem = self.all_genes.register_and_reassign(
 57 |             other.findall("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene".format(self.NS)))
 58 |         self._remove_unnecessary_genes(other, to_rem)
 59 |         self._update_geneRef_ids(other.find('./{{{}}}groups'.format(self.NS)), gene_id_updates)
 60 | 
 61 |         for sp in other.findall("./{{{}}}species".format(self.NS)):
 62 |             if sp.attrib['name'] not in self.all_species:
 63 |                 species_seen = False
 64 |                 for i, el in enumerate(self.root):
 65 |                     if el.tag == "{{{}}}species".format(self.NS):
 66 |                         species_seen = True
 67 |                     elif species_seen:
 68 |                         break
 69 |                 self.root.insert(i, sp)
 70 |                 self.all_species.add(sp.attrib['name'])
 71 |             else:
 72 |                 db = self.root.find("./{{{0}}}species[@name='{1}']/{{{0}}}database/{{{0}}}genes".format(self.NS, sp.attrib['name']))
 73 |                 for g in sp.iterfind(".//{{{}}}gene".format(self.NS)):
 74 |                     db.append(g)
 75 |         grps = self.root.find("./{{{}}}groups".format(self.NS))
 76 |         for g in other.find("./{{{}}}groups".format(self.NS)):
 77 |             grps.append(g)
 78 | 
 79 |     def _update_geneRef_ids(self, root, gene_id_updates):
 80 |         for old_id, new_id in gene_id_updates.items():
 81 |             for g in root.iterfind(".//{{{0}}}geneRef[@id='{1}']".format(self.NS, old_id)):
 82 |                 g.attrib['id'] = new_id
 83 | 
 84 |     def _remove_unnecessary_genes(self, root, to_rem):
 85 |         for e in to_rem:
 86 |             parent = root.find("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene[@id='{1}']/.."
 87 |                                .format(self.NS, e))
 88 |             child = parent.find("./{{{0}}}gene[@id='{1}']".format(self.NS, e))
 89 |             parent.remove(child)
 90 | 
 91 | 
 92 | 
 93 | 
 94 |     def write(self, fh):
 95 |         self.doc.write(fh, xml_declaration=True, encoding="UTF-8", default_namespace=None)
 96 | 
 97 | 
 98 | def merge_orthoxml_files(out, files):
 99 |     """function to merge several orthoxml files into a single orthoxml file that contains all groups.
100 | 
101 |     This function combines several orthoxml files into a single orthoxml file that
102 |     contains all the groups and maintains a valid definition block of the species
103 |     and their genes. The protId attributes among all the orthoxml files need to be
104 |     either unique or being at least assigned to the same internal gene id; in that
105 |     case it is assumed that it is the same gene across the different files and it
106 |     can be merged.
107 |     if the gene id attribute is the same two or more orthoxml files, but their
108 |     protId value is different, a new gene id value is generated and the geneRef
109 |     values are updated accordingly.
110 | 
111 |     :param out: a path or a filehandle object where the combined orthoxml data should
112 |                 be written to.
113 | 
114 |     :param files: a list of paths or filehandle objects (of valid orthoxml format) that
115 |                 should be merged.
116 | 
117 |     """
118 | 
119 |     first = files.pop()
120 |     merger = Merger(first)
121 |     for f in files:
122 |         merger.merge_file(ET.parse(f).getroot())
123 | 
124 |     return merger.write(out)
125 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/seq_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/unionfind.py:
--------------------------------------------------------------------------------
  1 | import collections
  2 | 
  3 | """UnionFind.py
  4 | 
  5 | Union-find data structure. Based on Josiah Carlson's code,
  6 | http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912
  7 | with significant additional changes by D. Eppstein and
  8 | Adrian Altenhoff.
  9 | """
 10 | 
 11 | 
 12 | class UnionFind(object):
 13 |     """Union-find data structure.
 14 | 
 15 |     Each unionFind instance X maintains a family of disjoint sets of
 16 |     hashable objects, supporting the following two methods:
 17 | 
 18 |     - X[item] returns a name for the set containing the given item.
 19 |       Each set is named by an arbitrarily-chosen one of its members; as
 20 |       long as the set remains unchanged it will keep the same name. If
 21 |       the item is not yet part of a set in X, a new singleton set is
 22 |       created for it.
 23 | 
 24 |     - X.union(item1, item2, ...) merges the sets containing each item
 25 |       into a single larger set.  If any item is not yet part of a set
 26 |       in X, it is added to X as one of the members of the merged set.
 27 |     """
 28 | 
 29 |     def __init__(self, elements=None):
 30 |         """Create a new union-find structure.
 31 | 
 32 |         If elements is not None, the structure gets initialized
 33 |         with each element as a singleton component.
 34 | 
 35 |         :param elements: an iterable to initialize the structure.
 36 |         """
 37 | 
 38 |         self.weights = {}
 39 |         self.parents = {}
 40 |         if elements is not None:
 41 |             for elem in iter(elements):
 42 |                 self.parents[elem] = elem
 43 |                 self.weights[elem] = 1
 44 | 
 45 |     def __getitem__(self, obj):
 46 |         """return the name of set which contains obj.
 47 | 
 48 |         :param obj: the query object
 49 | 
 50 |         :SeeAlso: :meth:`find`"""
 51 |         return self.find(obj)
 52 | 
 53 |     def find(self, obj):
 54 |         """Find and return the name of the set containing the obj.
 55 | 
 56 |         If the object is not found in any set, a new singleton set
 57 |         is created that holds only this object until it is further merged."""
 58 | 
 59 |         # check for previously unknown obj. If unknown, add it
 60 |         # as a new cluster
 61 |         if obj not in self.parents:
 62 |             self.parents[obj] = obj
 63 |             self.weights[obj] = 1
 64 |             return obj
 65 | 
 66 |         # find path of objects leading to the root
 67 |         path = [obj]
 68 |         root = self.parents[obj]
 69 |         while root != path[-1]:
 70 |             path.append(root)
 71 |             root = self.parents[root]
 72 | 
 73 |         # compress the path and return
 74 |         for ancestor in path:
 75 |             self.parents[ancestor] = root
 76 |         return root
 77 | 
 78 |     def remove(self, obj):
 79 |         """Remove an object from the sets.
 80 | 
 81 |         Removes an object entirly from the datastructure. The
 82 |         containing set will shrink by this one element.
 83 | 
 84 |         :Note: If one tries to accessed it afterwards using
 85 |             :meth:`find`, it will be created newly and put as a
 86 |             singleton.
 87 |         """
 88 |         if obj not in self.parents:
 89 |             return
 90 |         comp = self.find(obj)
 91 |         self.weights[comp] -= 1
 92 |         self.parents.pop(obj)
 93 | 
 94 |     def __iter__(self):
 95 |         """Iterate through all items ever found or unioned by this structure."""
 96 |         return iter(self.parents)
 97 | 
 98 |     def union(self, *objects):
 99 |         """Find the sets containing the objects and merge them.
100 | 
101 |         any number of objects can be passed to this method and
102 |         all of them will be merged into one set containing at
103 |         least these objects.
104 | 
105 |         :param objects: the objects to be merged. they have to be all
106 |             hashable. If they haven't been initialy added to the UnionFind
107 |             datastructre at instantiation time, they are added at this point
108 |             in time.
109 |         """
110 |         roots = [self[x] for x in objects]
111 |         heaviest = max([(self.weights[r], r) for r in roots], key=lambda x: x[0])[1]
112 |         for r in roots:
113 |             if r != heaviest:
114 |                 self.weights[heaviest] += self.weights[r]
115 |                 self.parents[r] = heaviest
116 | 
117 |     def get_components(self):
118 |         """return a list of sets corresponding to the connected
119 |         components of the structure."""
120 |         comp_dict = collections.defaultdict(set)
121 |         for elem in iter(self):
122 |             comp_dict[self[elem]].add(elem)
123 |         comp = list(comp_dict.values())
124 |         return comp
125 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 |     Utilities for zoo files.
  4 | '''
  5 | from io import BytesIO
  6 | import bz2
  7 | import gzip
  8 | import os
  9 | import logging
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | # File opening. This is based on the example on SO here:
 14 | # http://stackoverflow.com/a/26986344
 15 | fmagic = {b'\x1f\x8b\x08': gzip.open,
 16 |           b'\x42\x5a\x68': bz2.BZ2File}
 17 | 
 18 | 
 19 | def auto_open(fn, *args, **kwargs):
 20 |     """function to open regular or compressed files for read / write.
 21 | 
 22 |     This function opens files based on their "magic bytes". Supports bz2
 23 |     and gzip. If it finds neither of these, presumption is it is a
 24 |     standard, uncompressed file.
 25 | 
 26 |     Example::
 27 | 
 28 |         with auto_open("/path/to/file/maybe/compressed", mode="rb") as fh:
 29 |             fh.read()
 30 | 
 31 |         with auto_open("/tmp/test.txt.gz", mode="wb") as fh:
 32 |             fh.write("my big testfile")
 33 | 
 34 |     :param fn: either a string of an existing or new file path, or
 35 |         a BytesIO handle
 36 |     :param \*\*kwargs: additional arguments that are understood by the
 37 |         underlying open handler
 38 |     :returns: a file handler
 39 |     """
 40 |     if isinstance(fn, BytesIO):
 41 |         return fn
 42 | 
 43 |     if os.path.isfile(fn) and os.stat(fn).st_size > 0:
 44 |         with open(fn, 'rb') as fp:
 45 |             fs = fp.read(max([len(x) for x in fmagic]))
 46 |         for (magic, _open) in fmagic.items():
 47 |             if fs.startswith(magic):
 48 |                 return _open(fn, *args, **kwargs)
 49 |     else:
 50 |         if fn.endswith('gz'):
 51 |             return gzip.open(fn, *args, **kwargs)
 52 |         elif fn.endswith('bz2'):
 53 |             return bz2.BZ2File(fn, *args, **kwargs)
 54 | 
 55 |     return open(fn, *args, **kwargs)
 56 | 
 57 | 
 58 | class LazyProperty(object):
 59 |     """Decorator to evaluate a property only on access.
 60 | 
 61 |     Compute the attribute value and caches it in the instance.
 62 |     Python Cookbook (Denis Otkidach) http://stackoverflow.com/users/168352/denis-otkidach
 63 |     This decorator allows you to create a property which can be computed once and
 64 |     accessed many times.
 65 | 
 66 |     Example::
 67 | 
 68 |         class Circle:
 69 |             def __init__(self, radius):
 70 |                 self.radius = radius
 71 | 
 72 |             @LazyProperty
 73 |             def area(self):
 74 |                 print("computing area")
 75 |                 return 3.14 * self.radius ** 2
 76 | 
 77 |         >>> c = Circle(4)
 78 |         >>> c.area
 79 |         computing area
 80 |         50.24
 81 |         >>> c.area
 82 |         50.24
 83 | 
 84 |     You can see that the property method is only executed once.
 85 |     """
 86 | 
 87 |     def __init__(self, method, name=None):
 88 |         # record the unbound-method and the name
 89 |         self.method = method
 90 |         self.name = name or method.__name__
 91 |         self.__doc__ = method.__doc__
 92 | 
 93 |     def __get__(self, inst, cls):
 94 |         if inst is None:
 95 |             return self
 96 |         # compute, cache and return the instance's attribute value
 97 |         result = self.method(inst)
 98 |         # setattr redefines the instance's attribute so this doesn't get called again
 99 |         setattr(inst, self.name, result)
100 |         return result
101 | 
102 | 
def unique(seq):
    """Drop repeated elements from a sequence, keeping the original order.

    :param list seq: a list of hashable elements
    :returns: new list holding the first occurrence of every element of seq"""
    already_seen = set()
    firsts = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        firsts.append(item)
    return firsts
110 | 
111 | 
112 | 
113 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/__init__.py:
--------------------------------------------------------------------------------
class WrapperError(Exception):
    """Error raised by the tool wrappers in this package.

    For instance, the aligner base class raises it when looking up the
    wrapped program's binary fails.
    """
    pass
3 | 
4 | 
5 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/__init__.py:
--------------------------------------------------------------------------------
1 | from .mafft import Mafft
2 | from .muscle import Muscle
3 | from .prographmsa import ProGraphMSA
4 | from .probcons import ProbCons
5 | from .base_aligner import AlignmentInput, DataType, WrapperError
6 | 
7 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/base_aligner.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | from abc import ABCMeta, abstractmethod
  3 | from enum import Enum
  4 | from Bio import AlignIO, SeqIO
  5 | 
  6 | 
  7 | from ...seq_utils import is_dna, identify_input, AlignmentInput
  8 | from .. import WrapperError
  9 | 
 10 | 
 11 | 
 12 | 
# Kind of sequence data an aligner is given. UNKNOWN makes Aligner.__init__
# guess the type from the input.
DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
 14 | 
 15 | 
 16 | class Aligner(object):
 17 |     """
 18 |     Base class for wrappers of Multiple Sequence Aligner software
 19 | 
 20 |     The wrapper is written as a callable class.
 21 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 22 |     execution times and other metadata, as well as perform the task.
 23 | 
 24 |     This is a base implementation to be extended. The important parts are
 25 |     __init__ (does the setup) and __call__ (does the work). All
 26 |     else are helper methods.
 27 | 
 28 |     :Example:
 29 | 
 30 |     ::
 31 | 
 32 |         callable_wrapper = ConcreteAligner(aln)
 33 |         result = callable_wrapper()
 34 |         time_taken = callable_wrapper.elapsed_time
 35 |         result_again = callable_wrapper.result
 36 | 
 37 |     """
 38 |     __metaclass__ = ABCMeta
 39 | 
 40 |     def __init__(self, input_, datatype=DataType.UNKNOWN, binary=None):
 41 |         """
 42 |         Should work the same whether you're working with a Biopython object or a file
 43 |         but the implementation differs, e.g. a Biopython object will need
 44 |         to be written temporarily to disk for the Aligner to work on it.
 45 | 
 46 |         :param input_: can be either a filename or a biopython multiple
 47 |             sequence alignment (a collection of :class:`Bio.SeqRecord.SeqRecord`)
 48 | 
 49 |         :param binary: is the alignment's executable file, or None. If set to
 50 |             None, it is assumed to be found in the PATH.
 51 | 
 52 |         :param datatype: means is it DNA or protein?
 53 |         """
 54 |         self.input_type = identify_input(input_)  # Figure out what it is - file or object
 55 | 
 56 |         if isinstance(datatype, str):
 57 |             try:
 58 |                 datatype = getattr(DataType, datatype.upper())
 59 |             except AttributeError:
 60 |                 raise ValueError("\"{}\" is an invalid datatype for an Aligner".format(datatype))
 61 |         if datatype == DataType.UNKNOWN:
 62 |             self.datatype = guess_datatype(input_, from_filename=self.input_type == AlignmentInput.FILENAME)
 63 |             if self.input_type == AlignmentInput.OBJECT:
 64 |                 dup, input_ = itertools.tee(input_)
 65 |                 self.datatype = guess_datatype(dup, False)
 66 |             else:
 67 |                 self.datatype = guess_datatype(input_, True)
 68 |         else:
 69 |             self.datatype = datatype
 70 | 
 71 |         self.input = input_  # store it
 72 |         self.elapsed_time = None
 73 |         self.stdout = None
 74 |         self.stderr = None
 75 |         try:
 76 |             self.cli = self._init_cli(binary)
 77 |         except IOError as err:
 78 |             raise WrapperError('Error searching for binary: {}'.format(err))
 79 |         # End setup
 80 | 
 81 |     @abstractmethod
 82 |     def __call__(self, *args, **kwargs):
 83 |         """
 84 |         How to call the underlying aligner
 85 |         """
 86 |         pass
 87 | 
 88 |     @abstractmethod
 89 |     def _init_cli(self, binary):
 90 |         pass
 91 | 
import logging
# NOTE: getLogger() without a name returns the root logger, not a logger
# specific to this module.
logger = logging.getLogger()
 94 | 
 95 | 
 96 | def guess_datatype(alignment, from_filename=False):
 97 |     logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
 98 |     if from_filename:
 99 |         try:
100 |             alignment = SeqIO.parse(alignment, 'fasta')
101 |         except:
102 |             alignment = SeqIO.parse(alignment, 'phylip-relaxed')
103 |     return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
104 | 
105 | 
106 | # TODO: Break the identify_input function into two parts - one to work out the datatype, one to work out whether
107 | # this is a file or an object
108 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/muscle.py:
--------------------------------------------------------------------------------
  1 | import tempfile
  2 | import time
  3 | from Bio import AlignIO, SeqIO
  4 | from six import StringIO
  5 | from ..abstract_cli import AbstractCLI
  6 | from .base_aligner import Aligner, AlignmentInput, DataType
  7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, TreeInputOption, OptionSet
  8 | 
  9 | 
 10 | 
 11 | 
class MuscleCLI(AbstractCLI):
    """
    Muscle low-level command line interface

    The only thing defined here is the default executable name; everything
    else comes from :class:`AbstractCLI`.

    example:
    muscle_cli = MuscleCLI()
    process = muscle_cli(cmd='muscle args...')
    stdout = muscle_cli.get_stdout()
    """
    @property
    def _default_exe(self):
        # name of the program to run when no explicit executable is given
        return 'muscle'

    # def _set_help(self):
    #     self(help=True, wait=True)
    #     self._help = self.get_stdout()
 28 | 
def set_default_dna_options(aligner):
    """
    Install the default Muscle option set on ``aligner``.

    No DNA-specific settings are applied: ``aligner.options`` is simply
    replaced by a fresh result of ``get_default_options()``.
    """
    aligner.options = get_default_options()
 34 | 
 35 | 
def set_default_protein_options(aligner):
    """
    Install the default Muscle option set on ``aligner``.

    No protein-specific settings are applied: ``aligner.options`` is simply
    replaced by a fresh result of ``get_default_options()``.
    """
    aligner.options = get_default_options()
 41 | 
 42 | class Muscle(Aligner):
 43 |     """
 44 |     Convenient wrapper for Muscle multiple sequence aligner
 45 | 
 46 |     The wrapper is written as a callable class.
 47 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 48 |     execution times and other metadata, as well as perform the task.
 49 | 
 50 |     This is a basic implementation that can be extended. The important parts are
 51 |     __init__ (does the setup) and __call__ (does the work). All
 52 |     else are helper methods.
 53 | 
 54 |     :Example:
 55 | 
 56 |     ::
 57 | 
 58 |         callable_wrapper = Muscle(aln)
 59 |         result = callable_wrapper()
 60 |         time_taken = callable_wrapper.elapsed_time
 61 |         result_again = callable_wrapper.result
 62 |     """
 63 | 
 64 |     def __init__(self, input_, *args, **kwargs):
 65 |         super(Muscle, self).__init__(input_, *args, **kwargs)
 66 |         self.options = get_default_options()
 67 | 
 68 |         if self.datatype == DataType.DNA:
 69 |             set_default_dna_options(self)
 70 |         else:
 71 |             set_default_protein_options(self)
 72 | 
 73 |     def __call__(self, *args, **kwargs):
 74 |         """
 75 |         Anything to do with calling Muscle should go here.
 76 |         If any extra arguments need to be passed they can
 77 |         be specified (listed as *args and **kwargs for now).
 78 |         """
 79 |         start = time.time() # time the execution
 80 | 
 81 |         if self.input_type == AlignmentInput.OBJECT: # different operation depending on what it is
 82 |             with tempfile.NamedTemporaryFile(mode="wt") as filehandle:
 83 |                 SeqIO.write(self.input, filehandle, 'fasta')
 84 |                 filehandle.seek(0)
 85 |                 output, error = self._call(filehandle.name, *args, **kwargs)
 86 |         else:
 87 |             output, error = self._call(self.input, *args, **kwargs)
 88 | 
 89 |         self.result = self._read_result(output) # store result
 90 |         self.stdout = output
 91 |         self.stderr = error
 92 | 
 93 |         end = time.time()
 94 |         self.elapsed_time = end - start
 95 |         return self.result
 96 |         # End call
 97 | 
 98 |     # Any other accessory methods 
 99 |     def _call(self, filename, *args, **kwargs):
100 |         """
101 |         Call underlying low level _MuscleCLI wrapper. 
102 |         Options are passed via *args and **kwargs
103 |         [This only covers the simplest automatic
104 |          case]
105 |         """
106 |         self.cli('{} -in {}'.format(self.command(), filename),
107 |                 wait=True)
108 |         return self.cli.get_stdout(), self.cli.get_stderr()
109 | 
110 |     def command(self):
111 |         return str(self.options)
112 | 
113 |     def _read_result(self, output):
114 |         """
115 |         Read back the result.
116 |         """
117 |         fileobj = StringIO(output)
118 |         return AlignIO.read(fileobj, 'fasta')
119 | 
120 |     def _init_cli(self, binary):
121 |         return MuscleCLI(executable=binary)
122 | 
123 | 
def get_default_options():
    """Return a fresh OptionSet with the Muscle options known to the wrapper.

    All options are created inactive.
    """
    defaults = [
        # Algorithm

        # Find diagonals (faster for similar sequences)
        FlagOption('-diags', False, active=False),

        # Maximum number of iterations (integer, default 16)
        IntegerOption('-maxiters', 16, active=False),

        # Maximum time to iterate in hours (default no limit)
        FloatOption('-maxhours', 0.0, active=False),
    ]
    return OptionSet(defaults)
139 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/probcons.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from Bio import AlignIO, SeqIO
  3 | from six import StringIO
  4 | from ..abstract_cli import AbstractCLI
  5 | from .base_aligner import Aligner, AlignmentInput, DataType
  6 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
  7 | import tempfile
  8 | 
  9 | 
 10 | 
 11 | 
class ProbConsCLI(AbstractCLI):
    """
    ProbCons low-level command line interface

    The only thing defined here is the default executable name; everything
    else comes from :class:`AbstractCLI`.

    :Example:

    ::

        probcons_cli = ProbConsCLI()
        process = probcons_cli(cmd='probcons args...')
        stdout = probcons_cli.get_stdout()
    """
    @property
    def _default_exe(self):
        # name of the program to run when no explicit executable is given
        return 'probcons'

    # def _set_help(self):
    #     self(help=True, wait=True)
    #     self._help = self.get_stdout()
 31 | 
 32 | 
def set_default_dna_options(aligner):
    """
    Install the default ProbCons option set on ``aligner``.

    No DNA-specific settings are applied: ``aligner.options`` is simply
    replaced by a fresh result of ``get_default_options()``.
    """
    aligner.options = get_default_options()
 38 | 
 39 | 
def set_default_protein_options(aligner):
    """
    Install the default ProbCons option set on ``aligner``.

    No protein-specific settings are applied: ``aligner.options`` is simply
    replaced by a fresh result of ``get_default_options()``.
    """
    aligner.options = get_default_options()
 45 | 
 46 | 
 47 | class ProbCons(Aligner):
 48 |     """
 49 |     Convenient wrapper for ProbCons multiple sequence aligner
 50 | 
 51 |     The wrapper is written as a callable class.
 52 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 53 |     execution times and other metadata, as well as perform the task.
 54 | 
 55 |     This is a basic implementation that can be extended. The important parts are
 56 |     __init__ (does the setup) and __call__ (does the work). All
 57 |     else are helper methods.
 58 | 
 59 |     :Example:
 60 | 
 61 |     ::
 62 | 
 63 |         callable_wrapper = ProbCons(aln)
 64 |         result = callable_wrapper()
 65 |         time_taken = callable_wrapper.elapsed_time
 66 |         result_again = callable_wrapper.result
 67 | 
 68 | 
 69 |     .. note:: There exists an ipython notebook on how to work with wrappers,
 70 |          including dealing with non-default parameters.
 71 |     """
 72 | 
 73 |     def __init__(self, input_, *args, **kwargs):
 74 |         super(ProbCons, self).__init__(input_, *args, **kwargs)
 75 |         self.options = get_default_options()
 76 |         if self.datatype == DataType.DNA:
 77 |             set_default_dna_options(self)
 78 |         else:
 79 |             set_default_protein_options(self)
 80 | 
 81 |     def __call__(self, *args, **kwargs):
 82 |         """
 83 |         Anything to do with calling Mafft should go here.
 84 |         If any extra arguments need to be passed they can
 85 |         be specified (listed as *args and **kwargs for now).
 86 |         """
 87 |         start = time.time()  # time the execution
 88 |         
 89 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 90 |             with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
 91 |                 SeqIO.write(self.input, filehandle, 'fasta')
 92 |                 filehandle.seek(0)
 93 |                 output, error = self._call(filehandle.name, *args, **kwargs)
 94 |                 
 95 |         else:
 96 |             output, error = self._call(self.input, *args, **kwargs)
 97 | 
 98 |         self.result = self._read_result(output) # store result
 99 |         self.stdout = output
100 |         self.stderr = error
101 | 
102 |         end = time.time()
103 |         self.elapsed_time = end - start
104 |         return self.result
105 |         # End call
106 | 
107 |     # Any other accessory methods 
108 |     def _call(self, filename, *args, **kwargs):
109 |         """
110 |         Call underlying low level _Mafft wrapper. 
111 |         Options are passed via *args and **kwargs
112 |         [This only covers the simplest automatic
113 |          case]
114 |         """
115 |         self.cli('{} {}'.format(self.command(), filename),
116 |                  wait=True)
117 |         return self.cli.get_stdout(), self.cli.get_stderr()
118 | 
119 |     def command(self):
120 |         return str(self.options)
121 | 
122 |     def _read_result(self, output):
123 |         """
124 |         Read back the result.
125 |         """
126 |         fileobj = StringIO(output)
127 |         return AlignIO.read(fileobj, 'fasta')
128 | 
129 |     def _init_cli(self, binary):
130 |         return ProbConsCLI(executable=binary)
131 | 
132 | 
def get_default_options():
    """Return a fresh OptionSet with the ProbCons options known to the wrapper.

    All options are created inactive.
    """
    defaults = [
        # Algorithm

        # use CLUSTALW output format instead of MFA
        FlagOption('-clustalw', False, active=False),

        # use 0 <= REPS <= 5 (default: 2) passes of consistency transformation
        # NOTE(review): the value stored here is 0 although the program
        # default quoted above is 2 -- confirm which one is intended.
        IntegerOption('-c', 0, active=False),

        # use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement
        IntegerOption('-ir', 100, active=False),

        # use 0 <= REPS <= 20 (default: 0) rounds of pretraining
        IntegerOption('-pre', 0, active=False),

        # generate all-pairs pairwise alignments
        FlagOption('-pairs', False, active=False),

        # use Viterbi algorithm to generate all pairs (automatically enables -pairs)
        FlagOption('-viterbi', False, active=False),

        # write annotation for multiple alignment to FILENAME
        StringOption('-annot', '', active=False),

        # print sequences in alignment order rather than input order (default: off)
        FlagOption('-a', False, active=False),
    ]
    return OptionSet(defaults)
162 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/prographmsa.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from Bio import AlignIO, SeqIO
  3 | import tempfile
  4 | from six import StringIO
  5 | from ..abstract_cli import AbstractCLI
  6 | from .base_aligner import Aligner, AlignmentInput, DataType
  7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
  8 | 
  9 | 
 10 | 
 11 | 
class ProGraphMSACLI(AbstractCLI):
    """
    ProGraphMSA low-level command line interface

    The only thing defined here is the default executable name; everything
    else comes from :class:`AbstractCLI`.

    :Example:

    ::

        prograph_cli = ProGraphMSACLI()
        process = prograph_cli(cmd='ProGraphMSA args...')
        stdout = prograph_cli.get_stdout()
    """

    @property
    def _default_exe(self):
        # name of the program to run when no explicit executable is given
        return 'ProGraphMSA'
 28 | 
 29 | 
def set_default_dna_options(aligner):
    """
    Install the default ProGraphMSA option set on ``aligner``.

    No DNA-specific settings are applied: ``aligner.options`` is simply
    replaced by a fresh result of ``get_default_options()``.
    """
    aligner.options = get_default_options()
 35 | 
 36 | 
 37 | def set_default_protein_options(aligner):
 38 |     """
 39 |     Dummy function as sensible default already provided by mafft --auto
 40 |     """
 41 |     aligner.options = get_default_options()
 42 | 
 43 | 
 44 | class ProGraphMSA(Aligner):
 45 |     """
 46 |     Convenient wrapper for ProGraphMSA multiple sequence aligner
 47 | 
 48 |     The wrapper is written as a callable class.
 49 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 50 |     execution times and other metadata, as well as perform the task.
 51 | 
 52 |     This is a basic implementation that can be extended. The important parts are
 53 |     __init__ (does the setup) and __call__ (does the work). All
 54 |     else are helper methods.
 55 | 
 56 |     :Example:
 57 | 
 58 |     ::
 59 | 
 60 |         callable_wrapper = Mafft(aln)
 61 |         result = callable_wrapper()
 62 |         time_taken = callable_wrapper.elapsed_time
 63 |         result_again = callable_wrapper.result
 64 |     """
 65 | 
 66 |     def __init__(self, input_, *args, **kwargs):
 67 |         super(ProGraphMSA, self).__init__(input_, *args, **kwargs)
 68 |         self.options = get_default_options()
 69 |         if self.datatype == DataType.DNA:
 70 |             set_default_dna_options(self)
 71 |         else:
 72 |             set_default_protein_options(self)
 73 | 
 74 |     def __call__(self, *args, **kwargs):
 75 |         """
 76 |         Anything to do with calling ProGraphMSA should go here.
 77 |         If any extra arguments need to be passed they can
 78 |         be specified (listed as *args and **kwargs for now).
 79 |         """
 80 |         start = time.time()  # time the execution
 81 | 
 82 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 83 |             with tempfile.NamedTemporaryFile(mode="wt") as fh:
 84 |                 SeqIO.write(self.input, fh, 'fasta')
 85 |                 fh.seek(0)
 86 |                 output, error = self._call(fh.name, *args, **kwargs)
 87 | 
 88 |         else:
 89 |             output, error = self._call(self.input, *args, **kwargs)
 90 | 
 91 |         self.result = self._read_result(output)  # store result
 92 |         self.stdout = output
 93 |         self.stderr = error
 94 | 
 95 |         end = time.time()
 96 |         self.elapsed_time = end - start
 97 |         return self.result
 98 |         # End call
 99 | 
100 |     # Any other accessory methods
101 |     def _call(self, filename, *args, **kwargs):
102 |         """
103 |         Call underlying low level ProGraphMSA wrapper.
104 |         Options are passed via *args and **kwargs
105 |         [This only covers the simplest automatic
106 |          case]
107 |         """
108 |         self.cli('{} {}'.format(self.command(), filename),
109 |                  wait=True)
110 |         return self.cli.get_stdout(), self.cli.get_stderr()
111 | 
112 |     def command(self):
113 |         return str(self.options)
114 | 
115 |     def _read_result(self, output):
116 |         """
117 |         Read back the result.
118 |         """
119 |         fileobj = StringIO(output)
120 |         return AlignIO.read(fileobj, 'fasta')
121 | 
122 |     def _init_cli(self, binary):
123 |         return ProGraphMSACLI(executable=binary)
124 | 
125 | 
def get_default_options():
    """
    Build a fresh ``OptionSet`` with the ProGraphMSA defaults.

    Only ``--fasta`` is switched on; every other option is present but
    inactive.
    """
    defaults = [
        # output fasta format (instead of stockholm), better because no tree output is produced
        FlagOption('--fasta', True, active=True),
        # output all ancestral sequences
        FlagOption('--ancestral_seqs', False, active=False),
        # output sequences in input order (default: tree order)
        FlagOption('--input_order', False, active=False),
        # output all intermediate guide trees
        FlagOption('--all_trees', False, active=False),
        # use ML distances with gap
        FlagOption('--mldist_gap', False, active=False),
        # use ML distances
        FlagOption('--mldist', False, active=False),
        # use of guide tree
        StringOption('--tree', '', active=False),
    ]
    return OptionSet(defaults)
152 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/__init__.py:
--------------------------------------------------------------------------------
1 | from .prottest import ProtTest
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/base_modeltester.py:
--------------------------------------------------------------------------------
  1 | import os, types, itertools
  2 | from abc import ABCMeta, abstractmethod
  3 | from enum import Enum
  4 | from Bio import AlignIO, SeqIO
  5 | from Bio.Align import MultipleSeqAlignment
  6 | from ...seq_utils import is_dna
  7 | 
  8 | 
  9 | 
 10 | from zoo.wrappers import WrapperError
 11 | from zoo.wrappers.aligners.base_aligner import identify_input
 12 | 
 13 | import logging
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
 17 | DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
 18 | 
 19 | 
 20 | class ModelTester(object):
 21 |     """
 22 |     Base class for wrappers of model testers for phylogeny inference
 23 | 
 24 |     The wrapper is written as a callable class.
 25 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 26 |     execution times and other metadata, as well as perform the task.
 27 | 
 28 |     This is a base implementation to be extended. The important parts are
 29 |     __init__ (does the setup) and __call__ (does the work). All
 30 |     else are helper methods.
 31 | 
 32 |     :Example:
 33 | 
 34 |     ::
 35 | 
 36 |         callable_wrapper = ConcreteModelTester(aln)
 37 |         result = callable_wrapper()
 38 |         time_taken = callable_wrapper.elapsed_time
 39 |         result_again = callable_wrapper.result
 40 |     """
 41 |     __metaclass__ = ABCMeta
 42 | 
 43 |     def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
 44 |         """
 45 |         ..note::  TODO: this documentation is not correct. it needs to be updateted.
 46 | 
 47 |         Should work the same whether you're working with a Biopython object or a file
 48 |             but the implementation differs, e.g. a Biopython object will need
 49 |             to be written temporarily to disk for the Aligner to work on it.
 50 | 
 51 |         alignment is one of 4 things:
 52 |             a filename
 53 |             a Biopython MSA
 54 |             a list of Seq objects
 55 |             anything else (throw an exception)
 56 | 
 57 |         binary is the alignment's executable file, or None
 58 |         """
 59 | 
 60 |         if alignment is not None:
 61 |             self.input_type = identify_input(alignment)  # Figure out what it is - file or object
 62 |             if datatype == DataType.UNKNOWN:
 63 |                 # dup, input_ = itertools.tee(input_)
 64 |                 self.datatype = guess_datatype(alignment, from_filename=self.input_type == AlignmentInput.FILENAME)
 65 |             else:
 66 |                 self.datatype = datatype
 67 | 
 68 |             self.input = alignment  # store it
 69 |         else:
 70 |             self.input_type = None
 71 |             self.input = None
 72 | 
 73 | 
 74 |         self.elapsed_time = None
 75 |         self.stdout = None
 76 |         self.stderr = None
 77 |         self.cli = self._init_cli(binary)
 78 |         #TODO: the wrapper error is not compatible with calling a function with java!
 79 |         #try:
 80 |         #    self.cli = self._init_cli(binary)
 81 |         #except IOError as err:
 82 |         #     raise WrapperError('Error searching for binary: {}'.format(err))
 83 |             # End setup
 84 | 
 85 |     @abstractmethod
 86 |     def __call__(self, *args, **kwargs):
 87 |         """
 88 |         How to call the underlying aligner
 89 |         """
 90 |         pass
 91 | 
 92 |     @abstractmethod
 93 |     def _init_cli(self, binary):
 94 |         """
 95 |         Set up the command-line interface to the wrapped software
 96 |         :param binary: filename of executable binary file
 97 |         :return: concrete CLI type inheriting from AbstractCLI
 98 |         """
 99 |         pass
100 | 
101 | 
def guess_datatype(alignment, from_filename=False):
    """
    Guess whether ``alignment`` holds DNA or protein sequences.

    :param alignment: sequence records, or a filename when
        ``from_filename`` is true.
    :param from_filename: if true, ``alignment`` is first parsed as FASTA
        and, should that raise, as relaxed PHYLIP.
    :return: ``DataType.DNA`` if ``is_dna`` accepts the records,
        otherwise ``DataType.PROTEIN``.
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        try:
            alignment = list(SeqIO.parse(alignment, 'fasta'))
        except Exception:
            # Only real parse/IO failures trigger the fallback; a bare
            # ``except`` would also swallow KeyboardInterrupt/SystemExit.
            alignment = list(SeqIO.parse(alignment, 'phylip-relaxed'))
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
110 | 
111 | 
def identify_input(alignment):
    """
    Classify ``alignment`` as an in-memory object or a file on disk.

    :param alignment: a Biopython MultipleSeqAlignment, a generator or
        list of records, or a path (``str`` or ``os.PathLike``) pointing
        to an existing file.
    :return: ``AlignmentInput.OBJECT`` for in-memory input,
        ``AlignmentInput.FILENAME`` for an existing path.
    :raises ValueError: if ``alignment`` is neither of the above.
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory collection of sequences
        return AlignmentInput.OBJECT

    if isinstance(alignment, (str, os.PathLike)) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # Neither isinstance() nor os.path.exists() raises for odd input, so the
    # former try/except never fired and None was returned silently.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
132 | 
133 | 
134 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/parsers.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import dendropy as dpy
 3 | from pyparsing import Suppress, SkipTo, Word, Regex, Literal, OneOrMore, Group, LineEnd, CharsNotIn, nums, alphanums, \
 4 |     ParseException
 5 | 
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | logger.addHandler(logging.StreamHandler())
 9 | 
10 | 
11 | FLOAT = Word(nums + '.-').setParseAction(lambda x: float(x[0]))
12 | INT = Word(nums).setParseAction(lambda x: int(x[0]))
13 | WORD = Word(alphanums + '_')
14 | SPACEDWORD = Word(alphanums+' _')
15 | 
16 | 
class ProtTestParser(object):
    """
    Simple prottest result parser.

    Extracts every "Best model according to <criterion>: <model>" statement
    from ProtTest's textual output.
    """

    def __init__(self):
        # Marker text that precedes each criterion/model pair.
        self.MODEL = Regex(r'Best model according to\s+')
        # One or more [criterion, model] groups; text before each marker is skipped.
        self.model = OneOrMore(Group(Suppress(SkipTo(self.MODEL)) + Suppress(self.MODEL) + WORD + Suppress(":") + WORD))

    def parse(self, s):
        """
        Parse the ProtTest output text ``s``.

        :return: list of ``[criterion, model]`` pairs, or None when the
            text does not match (the ParseException is logged).
        """
        try:
            return self.model.parseString(s).asList()
        except ParseException as err:
            logger.error(err)
            return None

    def to_dict(self, stats_filename):
        """
        Parse ProtTest output into a ``{criterion: model}`` dictionary.

        :param stats_filename: despite its name, this is the output text
            itself; it is handed directly to ``parse``.
        :return: the dictionary, or None when parsing failed.
        """
        model = self.parse(stats_filename)
        if model is None:
            # parse() already logged the failure; iterating None here used
            # to raise an uncaught TypeError.
            return None
        return {mg[0]: mg[1] for mg in model}
47 | 
48 | 
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/prottest.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import tempfile
  4 | import logging
  5 | 
  6 | 
  7 | from pyparsing import ParseException
  8 | from Bio import AlignIO, SeqIO
  9 | 
 10 | from .parsers import ProtTestParser
 11 | from .base_modeltester import ModelTester, AlignmentInput, DataType
 12 | 
 13 | from ..abstract_cli import AbstractCLI
 14 | from ..options import StringOption, FlagOption, OptionSet
 15 | 
 16 | logger = logging.getLogger(__name__)
 17 | logger.addHandler(logging.StreamHandler())
 18 | logger.setLevel(logging.INFO)
 19 | 
 20 | 
 21 | class ProtTestCLI(AbstractCLI):
 22 |     """
 23 |     Especially in this case it is important that the $PROTTEST_HOME environmental variable is set to the installation directory of the prottest tool
 24 |     """
 25 |     @property
 26 |     def _default_exe(self):
 27 |         return 'java -jar ' + os.environ['PROTTEST_HOME'] + '/prottest-3.4.2.jar'
 28 | 
 29 | 
 30 | def set_default_dna_options(modeltester):
 31 |     """
 32 |     Dummy function as sensible default
 33 |     """
 34 |     modeltester.options = get_default_options()
 35 | 
 36 | 
 37 | def set_default_protein_options(modeltester):
 38 |     """
 39 |     Dummy function as sensible default
 40 |     """
 41 |     modeltester.options = get_default_options()
 42 | 
 43 | 
 44 | class ProtTest(ModelTester):
 45 |     """ ProtTest to determine the best model for a specific alignment
 46 |     This wrapper can be called to test various models for phylogeny inference.
 47 |     """
 48 | 
 49 |     def __init__(self, alignment, *args, **kwargs):
 50 |         """
 51 |         :param alignment: input multiple sequence alignment. This can be either
 52 |             a filename or an biopython SeqRecord collection.
 53 |         """
 54 |         self.options = get_default_options()
 55 |         super(ProtTest, self).__init__(alignment=alignment, *args, **kwargs)
 56 |         if self.datatype == DataType.DNA:
 57 |             set_default_dna_options(self)
 58 |         else:
 59 |             set_default_protein_options(self)
 60 | 
 61 |     def __call__(self, *args, **kwargs):
 62 |         """
 63 |         Anything to do with calling ProtTest should go here.
 64 |         If any extra arguments need to be passed they can
 65 |         be specified (listed as *args and **kwargs for now).
 66 |         """
 67 |         start = time.time()  # time the execution
 68 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 69 |             with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
 70 |                 SeqIO.write(self.input, filehandle, 'fasta')
 71 |                 filehandle.seek(0)
 72 |                 output, error = self._call(filehandle.name, *args, **kwargs)
 73 |         else:
 74 |             output, error = self._call(self.input, *args, **kwargs)
 75 | 
 76 |         self.result = self._read_result(output)  # store result
 77 |         self.stdout = output
 78 |         self.stderr = error
 79 | 
 80 |         end = time.time()
 81 |         self.elapsed_time = end - start
 82 |         return self.result
 83 |         # End call
 84 | 
 85 |     # Any other accessory methods
 86 |     def _call(self, filename, *args, **kwargs):
 87 |         """
 88 |         Call underlying low level _ProtTest wrapper.
 89 |         Options are passed via *args and **kwargs
 90 |         [This only covers the simplest automatic
 91 |          case]
 92 |         """
 93 |         self.cli('{} -i {}'.format(self.command(), filename),
 94 |                  wait=True)
 95 |         return self.cli.get_stdout(), self.cli.get_stderr()
 96 | 
 97 |     def command(self):
 98 |         return str(self.options)
 99 | 
100 |     def _read_result(self, output):
101 | 
102 |         parser = ProtTestParser()
103 | 
104 |         try:
105 |             result = parser.to_dict(output)
106 | 
107 |         except IOError as ioerr:
108 |             logger.error('Error reading results')
109 |             result = None
110 |         except ParseException as parseerr:
111 |             logger.error('Other parse error', parseerr)
112 |             result = None
113 | 
114 |         return result
115 | 
116 | 
117 |     def _init_cli(self, binary):
118 |         return ProtTestCLI(executable=binary)
119 | 
120 | 
def get_default_options():
    """
    Build a fresh ``OptionSet`` with the ProtTest defaults.

    Every option is present but inactive.
    """
    defaults = [
        # Display models sorted by Akaike Information Criterion (AIC)
        FlagOption('-AIC', False, active=False),
        # Display models sorted by Decision Theory Criterion
        FlagOption('-DT', False, active=False),
        # Tree file (optional) [default: NJ tree]
        StringOption('-t', '', active=False),
        # Display models sorted by Corrected Akaike Information Criterion (AICc)
        FlagOption('-AICC', False, active=False),
        # Enables / Disables PhyML logging into log directory (see prottest.properties)
        FlagOption('-log', False, active=False),
    ]
    return OptionSet(defaults)
140 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/options.py:
--------------------------------------------------------------------------------
  1 | from numbers import Integral, Real
  2 | from six import string_types
  3 | from abc import ABCMeta, abstractproperty
  4 | from dendropy import Tree
  5 | 
  6 | 
  7 | class Option(object):
  8 |     """Abstract base class for an option.
  9 | 
 10 |     Options provide an interface between the wrapper and the
 11 |     concrete command line option of the wrapped program."""
 12 |     __metaclass__ = ABCMeta
 13 | 
 14 |     def __init__(self, name, default=None, active=False):
 15 |         self._name = name
 16 |         self.set_value(default)
 17 |         self.active = active
 18 | 
 19 |     def __repr__(self):
 20 |         return '{}({}={}) <{}>'.format(self.__class__.__name__, self.name, self.get_value(), 'on' if self.active else 'off')
 21 | 
 22 |     def __str__(self):
 23 |         return (' '.join([self._name, str(self.get_value())]) if self.active else '')
 24 | 
 25 |     @property
 26 |     def active(self):
 27 |         return self._active
 28 | 
 29 |     @active.setter
 30 |     def active(self, val):
 31 |         self._active = True if val else False
 32 | 
 33 |     @property
 34 |     def name(self):
 35 |         return self._name
 36 | 
 37 |     def set_value(self, value):
 38 |         self._value = value
 39 |         if value is not None:
 40 |             self.active = True
 41 | 
 42 |     def get_value(self):
 43 |         return self._value
 44 | 
 45 |     def set_and_activate(self, value):
 46 |         self.set_value(value)
 47 |         self.active = True
 48 | 
 49 |     def status(self):
 50 |         return 'Name: {}\nValue: {}\nActive: {}\nStr: {}'.format(self.name,
 51 |                                                                  self.get_value(),
 52 |                                                                  self.active,
 53 |                                                                  str(self) or "''")
 54 | 
 55 | 
class ValueOption(Option):
    """Intermediate base class for options that carry a value; adds no behaviour of its own."""
    __metaclass__ = ABCMeta
 58 | 
 59 | 
 60 | class TypedValueOption(ValueOption):
 61 |     """A TypedValueOption is an option that only accepts options of a given type.
 62 | 
 63 |     This abstract class provides the functionality to check the type
 64 |     of a passed value and raises an ValueError if it doesn't match
 65 |     the expected type.
 66 | 
 67 |     A TypedValueOption must overwrite the abstract property _type.
 68 |     """
 69 | 
 70 |     __metaclass__ = ABCMeta
 71 | 
 72 |     @abstractproperty
 73 |     def _type(self):
 74 |         pass
 75 | 
 76 |     def set_value(self, value):
 77 |         if isinstance(value, self._type):
 78 |             self._value = value
 79 |             self.active = True
 80 | 
 81 |         else:
 82 |             raise ValueError('Value should be of type {}'.format(self.type))
 83 | 
 84 | 
 85 | ### Concrete classes from here on
 86 | 
class IntegerOption(TypedValueOption):
    """Option to hold an integer value (any ``numbers.Integral`` instance)."""
    @property
    def _type(self):
        # Type that TypedValueOption.set_value checks values against.
        return Integral
 92 | 
 93 | 
 94 | class FloatOption(TypedValueOption):
 95 |     """Option to hold a real number value"""
 96 | 
 97 |     @property
 98 |     def _type(self):
 99 |         return Real
100 | 
101 |     def get_value(self):
102 |         return float(self._value)
103 | 
104 | 
class StringOption(TypedValueOption):
    """Option to hold a string value"""

    def __init__(self, name, value=None, active=False):
        # A missing value becomes the empty string so the type check passes.
        initial = str() if value is None else value
        super(StringOption, self).__init__(name, initial, active)

    @property
    def _type(self):
        return string_types
116 | 
117 | 
class FlagOption(TypedValueOption):
    """Option holding a boolean flag; renders as its bare name, without a value."""
    @property
    def _type(self):
        return bool

    def __str__(self):
        # The flag appears only when it is both active and set to True.
        if self.active and self.get_value():
            return self._name
        return ''
126 | 
127 | 
class TreeInputOption(TypedValueOption):
    """Option to hold a phylogenetic tree argument.

    As of now, Trees are represented as :class:`dendropy.Tree` objects."""

    @property
    def _type(self):
        # Type that TypedValueOption.set_value checks values against.
        return Tree
136 | 
137 | 
class MultiOption(Option):
    """Option holding a list of values; the option name is repeated before each one."""

    @property
    def _type(self):
        return list

    def __str__(self):
        values = self.get_value()
        if values is None:
            return ''
        # One "<name> <item>" fragment per item; an inactive option yields
        # no fragments at all.
        fragments = [' '.join([self._name, str(entry)])
                     for entry in values
                     if self.active]
        return ' '.join(fragments)
155 | 
156 | 
class OptionSet(object):
    """Collection of options, addressable by option name."""
    def __init__(self, options):
        if isinstance(options, dict):
            self.options = options
        elif isinstance(options, (list, tuple)):
            # Index the options by their name.
            self.options = {opt.name: opt for opt in options}
        else:
            raise ValueError('Expected a list, tuple or dict of options, not {}'.format(type(options)))

    def __str__(self):
        # Options that render to an empty string are left out.
        rendered = (str(option) for option in self.options.values())
        return ' '.join(text for text in rendered if text > '')

    def __getitem__(self, item):
        return self.options[item]

    def list(self):
        """Return the ``(name, option)`` pairs as a list."""
        pairs = []
        for name, option in self.options.items():
            pairs.append((name, option))
        return pairs
181 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/__init__.py:
--------------------------------------------------------------------------------
1 | from .phyml import Phyml
2 | from .raxml import Raxml
3 | from .iqtree import Iqtree
4 | from .fasttree import Fasttree
5 | from .guenomu import Guenomu
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/base_treebuilder.py:
--------------------------------------------------------------------------------
  1 | import os, types, itertools
  2 | from abc import ABCMeta, abstractmethod
  3 | from enum import Enum
  4 | from Bio import AlignIO, SeqIO
  5 | from Bio.Align import MultipleSeqAlignment
  6 | from ...seq_utils import is_dna
  7 | 
  8 | from FastOMA.zoo.wrappers import WrapperError
  9 | from FastOMA.zoo.wrappers.aligners.base_aligner import identify_input
 10 | 
 11 | import logging
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
 15 | DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
 16 | 
 17 | 
 18 | class TreeBuilder(object):
 19 |     """
 20 |     Base class for wrappers of tree building software
 21 | 
 22 |     The wrapper is written as a callable class.
 23 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 24 |     execution times and other metadata, as well as perform the task.
 25 | 
 26 |     This is a base implementation to be extended. The important parts are
 27 |     __init__ (does the setup) and __call__ (does the work). All
 28 |     else are helper methods.
 29 | 
 30 |     :Example:
 31 | 
 32 |     ::
 33 | 
 34 |         callable_wrapper = ConcreteAligner(aln)
 35 |         result = callable_wrapper()
 36 |         time_taken = callable_wrapper.elapsed_time
 37 |         result_again = callable_wrapper.result
 38 |     """
 39 |     __metaclass__ = ABCMeta
 40 | 
 41 |     def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
 42 |         """
 43 |         ..note::  TODO: this documentation is not correct. it needs to be updateted.
 44 | 
 45 |         Should work the same whether you're working with a Biopython object or a file
 46 |             but the implementation differs, e.g. a Biopython object will need
 47 |             to be written temporarily to disk for the Aligner to work on it.
 48 | 
 49 |         alignment is one of 4 things:
 50 |             a filename
 51 |             a Biopython MSA
 52 |             a list of Seq objects
 53 |             anything else (throw an exception)
 54 | 
 55 |         binary is the alignment's executable file, or None
 56 |         """
 57 | 
 58 |         if alignment is not None:
 59 |             self.input_type = identify_input(alignment)  # Figure out what it is - file or object
 60 |             if datatype == DataType.UNKNOWN:
 61 |                 # dup, input_ = itertools.tee(input_)
 62 |                 self.datatype = guess_datatype(alignment, from_filename=self.input_type == AlignmentInput.FILENAME)
 63 |             else:
 64 |                 self.datatype = datatype
 65 | 
 66 |             self.input = alignment  # store it
 67 |         else:
 68 |             self.input_type = None
 69 |             self.input = None
 70 | 
 71 |         self.elapsed_time = None
 72 |         self.stdout = None
 73 |         self.stderr = None
 74 |         try:
 75 |             self.cli = self._init_cli(binary)
 76 |         except IOError as err:
 77 |             raise WrapperError('Error searching for binary: {}'.format(err))
 78 |             # End setup
 79 | 
 80 |     @abstractmethod
 81 |     def __call__(self, *args, **kwargs):
 82 |         """
 83 |         How to call the underlying aligner
 84 |         """
 85 |         pass
 86 | 
 87 |     @abstractmethod
 88 |     def _init_cli(self, binary):
 89 |         """
 90 |         Set up the command-line interface to the wrapped software
 91 |         :param binary: filename of executable binary file
 92 |         :return: concrete CLI type inheriting from AbstractCLI
 93 |         """
 94 |         pass
 95 | 
 96 | 
 97 | def guess_datatype(alignment, from_filename=False):
 98 |     if from_filename:
 99 |         try:
100 |             alignment = list(SeqIO.parse(alignment, 'fasta'))
101 |         except:
102 |             alignment = list(SeqIO.parse(alignment, 'phylip-relaxed'))
103 |     if is_dna(alignment):
104 |         logger.warning("Guessed datatype=DNA. But better explicitly specify the sequence type with option datatype={DNA, PROTEIN}.")
105 |         return DataType.DNA 
106 |     else:
107 |         logger.warning("Guessed datatype=PROTEIN. But better explicitly specify the sequence type with option datatype={DNA, PROTEIN}.")
108 |         return DataType.PROTEIN
109 | 
110 | 
def identify_input(alignment):
    """
    Classify ``alignment`` as an in-memory object or a file on disk.

    :param alignment: a Biopython MultipleSeqAlignment, a generator or
        list of records, or a path (``str`` or ``os.PathLike``) pointing
        to an existing file.
    :return: ``AlignmentInput.OBJECT`` for in-memory input,
        ``AlignmentInput.FILENAME`` for an existing path.
    :raises ValueError: if ``alignment`` is neither of the above.
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory collection of sequences
        return AlignmentInput.OBJECT

    if isinstance(alignment, (str, os.PathLike)) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # Neither isinstance() nor os.path.exists() raises for odd input, so the
    # former try/except never fired and None was returned silently.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/fasttree.py:
--------------------------------------------------------------------------------
  1 | # Author: Ivana Pilizota
  2 | # Date: 1 November 2016
  3 | 
  4 | import logging
  5 | import os
  6 | import time
  7 | 
  8 | from Bio import SeqIO
  9 | from pyparsing import ParseException
 10 | import tempfile
 11 | 
 12 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType, WrapperError
 13 | from .parsers import FasttreeParser
 14 | 
 15 | from ..abstract_cli import AbstractCLI
 16 | from ..options import OptionSet, StringOption, IntegerOption
 17 | from ...file_utils import TempFile, TempDir
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | logger.addHandler(logging.StreamHandler())
 21 | logger.setLevel(logging.INFO)
 22 | 
 23 | 
 24 | 
 25 | 
 26 | class FasttreeCLI(AbstractCLI):
 27 |     @property
 28 |     def _default_exe(self):
 29 |         return ['fasttree', 'FastTree']
 30 | 
 31 | 
 32 | def set_default_dna_options(treebuilder):
 33 |     """
 34 |     Dummy function as sensible default
 35 |     """
 36 |     treebuilder.options = get_default_options()
 37 | 
 38 | 
 39 | def set_default_protein_options(treebuilder):
 40 |     """
 41 |     Dummy function as sensible default
 42 |     """
 43 |     treebuilder.options = get_default_options()
 44 | 
 45 | 
 46 | class Fasttree(TreeBuilder):
 47 | 
 48 |     def __init__(self, alignment, *args, **kwargs):
 49 |         self.options = get_default_options()
 50 |         super(Fasttree, self).__init__(alignment=alignment, *args, **kwargs)
 51 |         if self.input is not None:
 52 |             if self.datatype == DataType.DNA:
 53 |                 set_default_dna_options(self)
 54 |             else:
 55 |                 set_default_protein_options(self)
 56 | 
 57 |     def __call__(self, *args, **kwargs):
 58 |         """
 59 |         Sets up temporary output file location and calls FastTree using _call() function.
 60 |         Writes temporary input file if we're working with SeqIO object
 61 |         Saves the stdout and stderr and returns
 62 |         """
 63 |         start = time.time()  # time the execution
 64 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 65 |             with tempfile.NamedTemporaryFile(mode='wt') as fh:
 66 |                 SeqIO.write(self.input, fh, 'fasta') # default interleaved # 'phylip-relaxed'
 67 |                 fh.seek(0)
 68 |                 output, error = self._call(fh.name, *args, **kwargs)
 69 |                 self.result = self._read_result(output, error)  # store result
 70 |         else:
 71 |             filename = os.path.abspath(self.input)
 72 |             output, error = self._call(filename, *args, **kwargs)
 73 |             self.result = self._read_result(output, error)  # store result
 74 | 
 75 |         end = time.time()
 76 |         self.elapsed_time = end - start
 77 |         return self.result["tree"]
 78 |         # End call
 79 | 
 80 |     # Any other accessory methods
 81 |     def _call(self, filename, *args, **kwargs):
 82 |         """
 83 |         Call underlying low level FastTree wrapper.
 84 |         Options are passed via *args and **kwargs
 85 |         [This only covers the simplest automatic
 86 |          case]
 87 |         """
 88 |         #hard code tmp_output as the output name since we don't save it anyway
 89 |         #self.cli('{} -log {log_output} {seqfile} > {tmp_path}'.format(self.command(), tmp_path=os.path.join(tmpd,'tmp_output'), log_output=logfile, seqfile=filename), wait=True)
 90 |         self.cli('{} {seq_file}'.format(self.command(), seq_file=filename), wait=True)
 91 |         self.returncode = self.cli.process.returncode
 92 | 
 93 |         if self.returncode != 0:
 94 |             self.stderr = self.cli.get_stderr()
 95 |             last_error_line = self.stderr.split('\n')[-1].strip()
 96 |             msg = f"Fasttree failed on {filename}: {last_error_line}"
 97 |             logger.error(msg)
 98 |             raise WrapperError(msg, self.stderr)
 99 | 
100 |         return (self.cli.get_stdout(), self.cli.get_stderr())
101 | 
102 |     def command(self):
103 |         return str(self.options)
104 | 
105 |     def _read_result(self, stdout, stderr):
106 |         """
107 |         Read back the result.
108 |         """
109 |         parser = FasttreeParser()
110 | 
111 |         try:
112 |             parser.parse(tree=stdout, other=stderr)
113 |             result = parser.to_dict()
114 |         except IOError as ioerr:
115 |             logger.error('Error reading results')
116 |             result = None
117 |         except ParseException as parseerr:
118 |             logger.error('Other parse error', parseerr)
119 |             result = None
120 | 
121 |         return result
122 | 
123 |     def _init_cli(self, binary):
124 |         return FasttreeCLI(executable=binary)
125 | 
126 | 
def get_default_options():
    """
    Build the default FastTree option set.

    NOTE(review): '-quote' is declared twice below, first active and then
    inactive. Which declaration wins depends on how ``OptionSet`` stores
    options with the same name -- confirm and remove one of them.
    """
    defaults = [
        # Treat the alignment as nucleotide (nt) data instead of amino acids.
        StringOption('-nt', active=False),

        # Use the WAG model for amino-acid alignments
        # (the default is Jones-Taylor-Thornton).
        StringOption('-wag', active=False),

        # Use the GTR model for nucleotide alignments.
        StringOption('-gtr', active=False),

        # Use the gamma model.
        StringOption('-gamma', active=False),

        # Number of rate categories of sites. Default 20.
        IntegerOption('-cat', 20, active=False),

        IntegerOption('-seed', 1234, active=False),

        # Starting tree
        StringOption('-intree', '', active=False),

        # Speed up the neighbor joining phase & reduce memory usage
        # (recommended for >50,000 sequences)
        StringOption('-fastest', active=False),

        # Allow spaces and other restricted characters (but not ') in sequence
        # names and quote names in the output tree (fasta input only; FastTree
        # will not be able to read these trees back in)
        StringOption('-quote', active=True),

        # Second declaration of '-quote' (see the note in the docstring).
        StringOption('-quote', active=False),

        # Number of rounds of maximum-likelihood NNIs.
        # Default 4*log2(N), N = the number of unique sequences
        IntegerOption('-mlnni', 0, active=False),
    ]
    return OptionSet(defaults)
164 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/iqtree.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import logging
  4 | import random
  5 | from pyparsing import ParseException
  6 | import shutil
  7 | from Bio import SeqIO
  8 | 
  9 | 
 10 | from .parsers import IqtreeParser
 11 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
 12 | 
 13 | 
 14 | from ..abstract_cli import AbstractCLI
 15 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
 16 | 
 17 | from ...file_utils import TempFile, TempDir
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | logger.addHandler(logging.StreamHandler())
 21 | logger.setLevel(logging.INFO)
 22 | 
 23 | 
 24 | class IqtreeCLI(AbstractCLI):
 25 |     @property
 26 |     def _default_exe(self):
 27 |         return ['iqtree-omp', 'iqtree']
 28 | 
 29 | 
 30 | def set_default_dna_options(treebuilder):
 31 |     """
 32 |     Dummy function as sensible default
 33 |     """
 34 |     treebuilder.options = get_default_options()
 35 |     treebuilder.options['-st'].set_value('DNA')
 36 | 
 37 | 
 38 | def set_default_protein_options(treebuilder):
 39 |     """
 40 |     Dummy function as sensible default
 41 |     """
 42 |     treebuilder.options = get_default_options()
 43 |     treebuilder.options['-st'].set_value('AA')
 44 | 
 45 | 
 46 | class Iqtree(TreeBuilder):
 47 | 
 48 |     def __init__(self, input_, *args, **kwargs):
 49 |         super(Iqtree, self).__init__(alignment=input_, *args, **kwargs)
 50 |         self.options = get_default_options()
 51 |         if self.datatype == DataType.DNA:
 52 |             set_default_dna_options(self)
 53 |         elif self.datatype == DataType.PROTEIN:
 54 |             set_default_protein_options(self)
 55 | 
 56 |     def __call__(self, *args, **kwargs):
 57 |         """
 58 |         Sets up temporary output file location and calls iqtree using _call() function.
 59 |         Writes temporary input file if we're working with SeqIO object
 60 |         Saves the stdout and stderr and returns
 61 |         """
 62 |         start = time.time()  # time the execution
 63 | 
 64 |         #Need to create temp directory to put raxml output here
 65 |         with TempDir() as tmpd:
 66 |             if self.input_type is AlignmentInput.OBJECT:  # different operation depending on what it is
 67 |                 with TempFile() as filename:
 68 |                     SeqIO.write(self.input, filename, 'phylip-relaxed') # default interleaved
 69 |                     output, error = self._call(filename,tmpd, *args, **kwargs)
 70 |             elif self.input_type is AlignmentInput.FILENAME:
 71 |                 filename = self.input
 72 |                 output, error = self._call(filename, tmpd, *args, **kwargs)
 73 |             else:
 74 |                 output, error = self._call(None,tmpd, *args, **kwargs)
 75 |             self.result = self._read_result(tmpd)  # store result
 76 |         self.stdout = output
 77 |         self.stderr = error
 78 | 
 79 |         end = time.time()
 80 |         self.elapsed_time = end - start
 81 |         return self.result
 82 |         # End call
 83 | 
 84 |     # Any other accessory methods
 85 |     def _call(self, filename, tmpd, *args, **kwargs):
 86 |         """
 87 |         Call underlying low level _iqtree wrapper.
 88 |         Options are passed via *args and **kwargs
 89 |         [This only covers the simplest automatic
 90 |          case]
 91 |         """
 92 |         self.cli('{} -pre {tmp_path} -s {seqfile}'.format(self.command(),
 93 |                                                           tmp_path=os.path.join(tmpd, 'tmp_output'),
 94 |                                                           seqfile=filename),
 95 |                  wait=True)
 96 |         return self.cli.get_stdout(), self.cli.get_stderr()
 97 | 
 98 |     def command(self):
 99 |         return str(self.options)
100 | 
101 |     def _read_result(self, tmpd):
102 |         """
103 |         Read back the result.
104 |         """
105 | 
106 |         expected_outfiles = [os.path.join(tmpd, 'tmp_output.iqtree'),
107 |                              os.path.join(tmpd, 'tmp_output.treefile')]
108 |         parser = IqtreeParser()
109 |         try:
110 |             result = parser.to_dict(*expected_outfiles)
111 |         except IOError as ioerr:
112 |             logger.error('Error reading results')
113 |             result = None
114 |         except ParseException as parseerr:
115 |             logger.error('Other parse error', parseerr)
116 |             result = None
117 |         return result
118 | 
119 |     def _init_cli(self, binary):
120 |         return IqtreeCLI(executable=binary)
121 | 
122 | 
def get_default_options():
    """
    Build the default IQ-TREE option set.

    Only the thread count ('-nt') is active by default. The sequence type
    option ('-st') is declared (inactive) so that the datatype helpers in
    this module, which look up ``options['-st']``, find it.
    """
    return OptionSet([
        # Number of threads
        IntegerOption('-nt', 2, active=True),

        # Sequence type, e.g. 'DNA' or 'AA'; set by the set_default_*_options
        # helpers of this module.
        StringOption('-st', 'AA', active=False),

        # Set the model for either DNA or AA alignment
        StringOption('-m', '', active=False),

        # Ultrafast bootstrap (>=1000)
        IntegerOption('-bb', 0, active=False),

        # SH-like approximate likelihood ratio test (SH-aLRT)
        IntegerOption('-alrt', 0, active=False),

        # Bootstrap + ML tree + consensus tree (>=100)
        IntegerOption('-b', 0, active=False)
    ])
140 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/phyml.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import tempfile
  4 | import logging
  5 | from pyparsing import ParseException
  6 | from Bio import AlignIO, SeqIO
  7 | 
  8 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
  9 | from .parsers import PhymlParser
 10 | 
 11 | from ..abstract_cli import AbstractCLI
 12 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
 13 | 
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | logger.addHandler(logging.StreamHandler())
 17 | logger.setLevel(logging.INFO)
 18 | 
 19 | 
 20 | class PhymlCLI(AbstractCLI):
 21 |     @property
 22 |     def _default_exe(self):
 23 |         return 'phyml'
 24 | 
 25 | 
 26 | def set_default_dna_options(treebuilder):
 27 |     """
 28 |     Dummy function as sensible default
 29 |     """
 30 |     treebuilder.options = get_default_options()
 31 |     treebuilder.options['-d'].set_value('nt')
 32 | 
 33 | 
 34 | def set_default_protein_options(treebuilder):
 35 |     """
 36 |     Dummy function as sensible default
 37 |     """
 38 |     treebuilder.options = get_default_options()
 39 | 
 40 | 
 41 | class Phyml(TreeBuilder):
 42 |     """ Phyml tree reconstruction
 43 | 
 44 |     This wrapper can be called to reconstruct a phylogenetic tree
 45 |     using PhyML.
 46 |     """
 47 | 
 48 |     def __init__(self, alignment, *args, **kwargs):
 49 |         """
 50 |         :param alignment: input multiple sequence alignment. This can be either
 51 |             a filename or an biopython SeqRecord collection.
 52 |         """
 53 |         super(Phyml, self).__init__(alignment, *args, **kwargs)
 54 |         self.options = get_default_options()
 55 |         if self.datatype == DataType.DNA:
 56 |             set_default_dna_options(self)
 57 |         else:
 58 |             set_default_protein_options(self)
 59 | 
 60 |     def __call__(self, *args, **kwargs):
 61 |         """
 62 |         Anything to do with calling Mafft should go here.
 63 |         If any extra arguments need to be passed they can
 64 |         be specified (listed as *args and **kwargs for now).
 65 |         """
 66 |         start = time.time()  # time the execution
 67 | 
 68 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 69 |             with tempfile.NamedTemporaryFile(mode='wt') as fh:
 70 |                 SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
 71 |                 fh.seek(0)
 72 |                 output, error = self._call(fh.name, *args, **kwargs)
 73 |                 self.result = self._read_result(fh.name)  # store result
 74 |         else:
 75 |             path = os.path.dirname(self.input)
 76 |             filename = os.path.basename(self.input)
 77 |             # some operations done because phyml can not deal with large filenames that are caused due to a large path
 78 |             with os.chdir(path):
 79 |                 output, error = self._call(filename, *args, **kwargs)
 80 |                 self.result = self._read_result(filename)  # store result
 81 | 
 82 |         self.stdout = output
 83 |         self.stderr = error
 84 | 
 85 |         end = time.time()
 86 |         self.elapsed_time = end - start
 87 |         return self.result["tree"]
 88 |         # End call
 89 | 
 90 |     # Any other accessory methods
 91 |     def _call(self, filename, *args, **kwargs):
 92 |         """
 93 |         Call underlying low level _Phyml wrapper.
 94 |         Options are passed via *args and **kwargs
 95 |         [This only covers the simplest automatic
 96 |          case]
 97 |         """
 98 |         self.cli('{} -i {}'.format(self.command(), filename),
 99 |                  wait=True)
100 |         return self.cli.get_stdout(), self.cli.get_stderr()
101 | 
102 |     def command(self):
103 |         return str(self.options)
104 | 
105 |     def _read_result(self, output):
106 |         """
107 |         Read back the result.
108 |         """
109 | 
110 |         #TODO: change the output dictionary into a better format
111 |         expected_outfiles = ['{}_phyml_stats'.format(output), '{}_phyml_tree'.format(output)]
112 |         parser = PhymlParser()
113 | 
114 |         # Phyml outputs two outfiles, a stats file and a tree file.
115 |         # Sometimes it appends .txt, sometimes not. Seems to be platform-specific.
116 |         # Here we assume they are without .txt, but if we can't find them, try
117 |         # looking for the .txt onees instead
118 |         try:
119 |             # Check if these are the .txt style outfiles
120 |             if not os.path.exists(expected_outfiles[0]):
121 |                 expected_outfiles = [x + '.txt' for x in expected_outfiles]
122 |             result = parser.to_dict(*expected_outfiles)
123 | 
124 |         except IOError as ioerr:
125 |             logger.error('Error reading results')
126 |             result = None
127 |         except ParseException as parseerr:
128 |             logger.error('Other parse error', parseerr)
129 |             result = None
130 | 
131 |         return result
132 | 
133 |     def _init_cli(self, binary):
134 |         return PhymlCLI(executable=binary)
135 | 
136 | 
def get_default_options():
    """
    Build the default PhyML option set.

    Only the datatype option ('-d', value 'aa') is active by default.
    """
    # Datatype: nt or aa
    datatype = StringOption('-d', 'aa', active=True)
    # Model for either DNA or AA alignment
    model = StringOption('-m', '', active=False)
    # If set to true will assume sequential format
    sequential = FlagOption('-q', False, active=False)
    # Bootstrap value
    bootstrap = IntegerOption('-b', 0, active=False)
    # Tree topology search operation
    topology_search = StringOption('-s', 'NNI', active=False)

    return OptionSet([datatype, model, sequential, bootstrap, topology_search])
156 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/raxml.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import logging
  4 | import random
  5 | from pyparsing import ParseException
  6 | import shutil
  7 | from Bio import AlignIO, SeqIO
  8 | 
  9 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
 10 | from .parsers import RaxmlParser
 11 | 
 12 | from ..abstract_cli import AbstractCLI
 13 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
 14 | 
 15 | from ...file_utils import TempFile,TempDir
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | logger.addHandler(logging.StreamHandler())
 19 | logger.setLevel(logging.INFO)
 20 | 
 21 | 
 22 | class RaxmlCLI(AbstractCLI):
 23 |     @property
 24 |     def _default_exe(self):
 25 |         return ['raxmlHPC','raxmlHPC-PTHREADS']
 26 | 
 27 | 
 28 | def set_default_dna_options(treebuilder):
 29 |     """
 30 |     Dummy function as sensible default
 31 |     """
 32 |     treebuilder.options = get_default_options()
 33 | 
 34 | 
 35 | def set_default_protein_options(treebuilder):
 36 |     """
 37 |     Dummy function as sensible default
 38 |     """
 39 |     treebuilder.options = get_default_options()
 40 | 
 41 | 
 42 | class Raxml(TreeBuilder):
 43 | 
 44 |     def __init__(self, alignment, *args, **kwargs):
 45 |         self.options = get_default_options()
 46 |         super(Raxml, self).__init__(alignment=alignment, *args, **kwargs)
 47 |         if self.input is not None:
 48 |             if self.datatype == DataType.DNA:
 49 |                 set_default_dna_options(self)
 50 |             else:
 51 |                 set_default_protein_options(self)
 52 | 
 53 | 
 54 | 
 55 |     def __call__(self, *args, **kwargs):
 56 |         """
 57 |         Sets up temporary output files and calls raxml using _call() function.
 58 |         Writes temporary input file if we're working with SeqIO object
 59 |         Saves the stdout and stderr and returns
 60 |         """
 61 |         start = time.time()  # time the execution
 62 | 
 63 |         #Need to create temp directory to put raxml output here
 64 |         with TempDir() as tmpd:
 65 |             if self.input_type is AlignmentInput.OBJECT:  # different operation depending on what it is
 66 |                 with TempFile() as filename:
 67 |                     SeqIO.write(self.input, filename, 'phylip-relaxed') # default interleaved
 68 |                     output, error = self._call(filename,tmpd, *args, **kwargs)
 69 |             elif self.input_type is AlignmentInput.FILENAME:
 70 |                 filename = self.input
 71 |                 output, error = self._call(filename, tmpd, *args, **kwargs)
 72 |             else:
 73 |                 output, error = self._call(None,tmpd, *args, **kwargs)
 74 |             self.result = self._read_result(tmpd)  # store result
 75 |         self.stdout = output
 76 |         self.stderr = error
 77 | 
 78 |         end = time.time()
 79 |         self.elapsed_time = end - start
 80 |         return self.result
 81 |         # End call
 82 | 
 83 |     # Any other accessory methods
 84 |     def _call(self, filename, tmpd, *args, **kwargs):
 85 |         """
 86 |         Call underlying low level _Raxml wrapper.
 87 |         Options are passed via *args and **kwargs
 88 |         [This only covers the simplest automatic
 89 |          case]
 90 |         """
 91 |         #hard code tmp_output as the output name since we don't save it anyway
 92 |         self.cli('{} -n tmp_output -w {tmp_path} -s {seqfile}'.format(self.command(), tmp_path=tmpd, seqfile=filename),
 93 |                 wait=True)
 94 |         return self.cli.get_stdout(), self.cli.get_stderr()
 95 | 
 96 |     def command(self):
 97 |         return str(self.options)
 98 | 
 99 |     def _read_result(self, tmpd):
100 |         """
101 |         Read back the result.
102 |         """
103 | 
104 |         expected_outfiles = [os.path.join(tmpd, 'RAxML_info.tmp_output'), os.path.join(tmpd, 'RAxML_bestTree.tmp_output')]
105 | 
106 | 
107 |         parser = RaxmlParser()
108 | 
109 |         try:
110 |             if self.options['-f'].get_value() is not '':
111 |                 f_value = os.path.splitext(os.path.basename(self.options['-f'].get_value()))[0]
112 | 
113 |                 result = parser.to_dict(*expected_outfiles, dash_f=f_value)
114 |             else:
115 |                 result = parser.to_dict(*expected_outfiles, dash_f=None)
116 | 
117 |         except IOError as ioerr:
118 |             logger.error('Error reading results')
119 |             result = None
120 |         except ParseException as parseerr:
121 |             logger.error('Other parse error', parseerr)
122 |             result = None
123 | 
124 |         return result
125 | 
126 |     def _init_cli(self, binary):
127 |         return RaxmlCLI(executable=binary)
128 | 
129 | 
def get_default_options():
    """
    Build the default RAxML option set.

    Only the model ('-m', PROTGAMMAGTR) and '-p' (12345) are active by default.
    """
    defaults = [
        # Model for either DNA or AA alignment
        StringOption('-m', 'PROTGAMMAGTR', active=True),

        # NOTE(review): originally described as "number of replicates"; in
        # RAxML '-p' looks like the parsimony random seed -- confirm.
        IntegerOption('-p', 12345, active=True),

        # NOTE(review): originally described as "assume sequential format";
        # confirm the meaning of '-q' for RAxML.
        FlagOption('-q', False, active=False),

        # Turn on bootstrapping - set seed
        IntegerOption('-b', 0, active=False),

        # Number of replicates
        IntegerOption('-#', 0, active=False),

        # Turn on rapid bootstrap - specify seed
        IntegerOption('-x', 0, active=False),

        # Set number of bootstrap replicates
        IntegerOption('-N', 0, active=False),

        # Set number of threads
        IntegerOption('-T', 0, active=False),

        # NOTE(review): originally described as a tree topology search
        # option; Raxml._call also passes '-s <sequence file>' -- confirm
        # this entry is intended.
        StringOption('-s', 'NNI', active=False),

        # Select algorithm
        StringOption('-f', '', active=False),

        # Specify starting tree
        StringOption('-t', '', active=False),

        # Specify filename of file containing multiple trees
        StringOption('-z', '', active=False),
    ]
    return OptionSet(defaults)
171 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/trimmers/__init__.py:
--------------------------------------------------------------------------------
1 | from .trimal import TrimAl
2 | 
3 | 


--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/trimmers/base_trimmer.py:
--------------------------------------------------------------------------------
 1 | import os, types, itertools
 2 | from abc import ABCMeta, abstractmethod
 3 | from Bio import AlignIO, SeqIO
 4 | from Bio.Align import MultipleSeqAlignment
 5 | from ...seq_utils import identify_input
 6 | from ...wrappers import WrapperError
 7 | 
 8 | import logging
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | 
class MSATrimmer(metaclass=ABCMeta):
    """
    Base class for wrappers of msa trimming software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    Subclasses must implement ``__call__`` and ``_init_cli``; the base class
    itself cannot be instantiated.

    :Example:

    ::

        callable_wrapper = ConcreteTrimmer(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, alignment=None, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
            but the implementation differs, e.g. a Biopython object will need
            to be written temporarily to disk for the Trimmer to work on it.

        alignment is one of 4 things:
            a filename
            a Biopython MSA
            a list of Seq objects
            anything else (throw an exception)

        binary is the trimmer's executable file, or None

        :raises WrapperError: if setting up the command-line interface fails
            with an IOError
        """

        if alignment is not None:
            self.input_type = identify_input(alignment)  # Figure out what it is - file or object
            self.input = alignment  # store it
        else:
            self.input_type = None
            self.input = None

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            # Chain the original error so the traceback shows the root cause.
            raise WrapperError('Error searching for binary: {}'.format(err)) from err

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying trimmer
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/archive/analysis/edit_orthxml_file.py:
--------------------------------------------------------------------------------
 1 | 
 2 | """
 3 | 
 4 | I accidanetly comment <property na in hog class
 5 | 
 6 | so with this code I can edit the file
 7 | 
 8 | 
 9 | add       <orthologGroup id="HOG:B0811125_sub1201">
10 |          <property name="TaxRange" value="CHLTR_MYCGE"/>
11 | 
12 | 
13 | """
14 | 
15 | 
16 | file_in = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/archive/xml_output/out_27aug_6pm.xml_no_property"
17 | file_out = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/archive/xml_output/out_27aug_6pm_property.xml"
18 | 
19 | file_in_handle = open(file_in, 'r')
20 | file_out_handle = open(file_out, 'w')
21 | property_str ="<property name=\"TaxRange\" value=\"test\"/>"
22 | print("started")
23 | for line in file_in_handle:
24 |     if not "<orthologGroup" in line:
25 |         file_out_handle.write(line)
26 |     else:
27 |         file_out_handle.write(line)
28 | 
29 |         for i, st in enumerate(line):
30 |             if st == "<":
31 |                 needed_num = i
32 | 
33 |         file_out_handle.write(" "*(needed_num+2) + property_str+"\n")
34 | 
35 | print("finished")
36 | 
37 | 
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/archive/analysis/find_unfinished_rhog.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
 6 | import os
 7 | from os import listdir
 8 | 
 9 | 
# Report which root HOGs (FASTA files in rhogs_all/) have no non-empty
# pickle file in pickle_rhogs/ yet.
folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_bird_/run_1may/out_folder/"

# The rhog id is the integer after the first "_" of the name before the first ".".
project_files = listdir(folder + "/rhogs_all/")
rhogs = [
    int(name.split(".")[0].split("_")[1])
    for name in project_files
    if name.split(".")[-1] == "fa"
]

print("number of rhogs is ", len(rhogs))

folder_pickle = folder + "/pickle_rhogs/"
project_files = listdir(folder_pickle)
pickles = []
for name in project_files:
    # Files of at most 2 bytes are reported as empty and ignored.
    if os.path.getsize(folder_pickle + name) <= 2:
        print("this file is empty", name)
        continue
    name_parts = name.split(".")
    if name_parts[-1] == "pickle":
        pickles.append(int(name_parts[0].split("_")[1]))

print("number of pickles is ", len(pickles))

no_pickle_list = set(rhogs) - set(pickles)

print("number of rhogs not finished is ", len(no_pickle_list))

print("\n \n ", no_pickle_list)
43 | 


--------------------------------------------------------------------------------
/archive/analysis/preprocess_qfo_files.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # Proteins in each file belong to the same species.
 4 | 
 5 | # change the name of each file based on the species name inside each prot id
 6 | 
 7 | 
 8 | from os import listdir
 9 | from Bio import SeqIO
10 | import os
11 | 
12 | working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/"
13 | prot_folder = working_folder + "/omamer_search_old/proteome/"
14 | project_files = listdir(prot_folder)
15 | query_species_names_old = []
16 | query_species_names_new = []
17 | for file in project_files:
18 |     if file.split(".")[-1] == "fa":
19 |         file_name_split = file.split(".")[:-1]
20 |         query_species_name_old = '.'.join(file_name_split)
21 |         prot_address = prot_folder + query_species_name_old + ".fa"
22 |         prots_record = list(SeqIO.parse(prot_address, "fasta"))
23 |         prot_record = prots_record[0]
24 |         prot_name = prot_record.name  # 'tr|E3JPS4|E3JPS4_PUCGT
25 |         query_species_name_new = prot_name.split("|")[-1].split("_")[-1].strip()
26 |         # if query_species_name_new == 'RAT': query_species_name_new = "RATNO"
27 |         query_species_names_old.append(query_species_name_old)
28 |         query_species_names_new.append(query_species_name_new)
29 | 
30 | os.mkdir(working_folder+"/omamer_search")
31 | os.mkdir(working_folder+"/omamer_search/proteome/")
32 | os.mkdir(working_folder+"/omamer_search/hogmap")
33 | 
34 | 
35 | for idx, query_species_name_old in enumerate(query_species_names_old):
36 |     query_species_name_new = query_species_names_new[idx]
37 | 
38 |     prot_address_old = working_folder + "omamer_search_old/proteome/" + query_species_name_old + ".fa"
39 |     prot_address_new = working_folder + "omamer_search/proteome/" + query_species_name_new + "_.fa"
40 |     os.system('cp ' + prot_address_old + ' ' + prot_address_new)
41 | 
42 |     hogmap_address_old = working_folder + "omamer_search_old/hogmap/" + query_species_name_old + ".hogmap"
43 |     hogmap_address_new = working_folder + "omamer_search/hogmap/" + query_species_name_new + "_.hogmap"
44 |     os.system('cp ' + hogmap_address_old + ' ' + hogmap_address_new)
45 | 
46 | 
47 | # 13:54:16 - the species DANRE  already exists in the oma database, remove them first
48 | 
49 | 
50 | 
51 | print("done")


--------------------------------------------------------------------------------
/archive/analysis/write_gene_id_pickle_old_code.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | import xml.etree.ElementTree as ET
 4 | import dill as dill_pickle
 5 | from os import listdir
 6 | from xml.dom import minidom
 7 | import os
 8 | from Bio import SeqIO
 9 | #import dill as dill_pickle
10 | import dill as pickle
11 | #import pickle
12 | 
13 | 
14 | 
15 | address_working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/ali_code_31aug/"
16 | 
17 | address_rhogs_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/rhog_all_v3_g2_s500/"
18 | address_group_xml_ortho = address_working_folder+"group_xml_ortho_adjusted_family_40_2sep5pm_dill.pickle"
19 | 
20 | 
21 | rhog_files = listdir(address_rhogs_folder)[:]
22 | 
23 | rhog_files = listdir(address_rhogs_folder)
24 | rhogid_num_list = []
25 | for rhog_file in rhog_files:
26 |     if rhog_file.split(".")[-1] == "fa":
27 |         rhogid_num = int(rhog_file.split(".")[0].split("_")[1][1:])
28 |         rhogid_num_list.append(rhogid_num)
29 | 
30 | rhogid_num_list_temp = rhogid_num_list
31 | 
32 | species_prot_dic = {}
33 | # all_prot_temp_list= []
34 | for rhogid_num in rhogid_num_list_temp:
35 |     prot_address = address_rhogs_folder + "HOG_B" + str(rhogid_num).zfill(7) + ".fa"
36 |     rhog_i = list(SeqIO.parse(prot_address, "fasta"))
37 |     for prot_i in rhog_i:
38 |         prot_i_name = prot_i.id  # .split("||")[0] # .split("|")[1]  # tr|E3JPS4|E3JPS4_PUCGT or new || ||
39 |         species_i = prot_i.id.split("||")[1][:-1]  # prot_i.id.split("|")[-1].split("_")[-1]
40 |         if species_i in species_prot_dic:
41 |             species_prot_dic[species_i].append(prot_i_name)
42 |         else:
43 |             species_prot_dic[species_i] = [prot_i_name]
44 |         # all_prot_temp_list.append(prot_i.id)
45 | 
46 | print("there are species ", len(species_prot_dic))
47 | orthoxml_file = ET.Element("orthoXML",
48 |                            attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021",
49 |                                    "version": "0.3"})  #
50 | 
51 | gene_counter = 100000
52 | gene_id_name = {}
53 | query_species_names_rHOGs = list(species_prot_dic.keys())
54 | for species_name in query_species_names_rHOGs:
55 |     no_gene_species = True  # for code develop ment
56 |     species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": species_name, "NCBITaxId": "1"})
57 |     database_xml = ET.SubElement(species_xml, "database", attrib={"name": "QFO database ", "version": "2020"})
58 |     genes_xml = ET.SubElement(database_xml, "genes")
59 | 
60 |     prot_list = species_prot_dic[species_name]
61 |     for prot_itr in range(len(prot_list)):  # [12:15]
62 |         prot_i_name = prot_list[prot_itr]
63 |         gene_id_name[prot_i_name] = gene_counter
64 |         prot_i_name_short = prot_i_name.split("||")[0].split("|")[1].strip()  # tr|E3JPS4|E3JPS4_PUCGT
65 |         gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_counter), "protId": prot_i_name_short})
66 |         gene_counter += 1
67 | 
68 | groups_xml = ET.SubElement(orthoxml_file, "groups")
69 | 
70 | 
71 | 
72 | with open(address_group_xml_ortho, 'wb') as handle:
73 |     # dill_pickle.dump(gene_id_name, handle, protocol=dill_pickle.HIGHEST_PROTOCOL)
74 |     pickle.dump((groups_xml, gene_id_name, orthoxml_file), handle, protocol=pickle.HIGHEST_PROTOCOL)
75 | 
76 | print("saved as ", address_group_xml_ortho)
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 


--------------------------------------------------------------------------------
/archive/analysis/xml_.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | import xml.etree.ElementTree as ET
 4 | import dill as dill_pickle
 5 | from os import listdir
 6 | from xml.dom import minidom
 7 | 
 8 | working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/"
 9 | # gene_trees_folder = ""  # in_folder + "/gene_trees_/"
10 | # check gene_trees_folder exist otherwise mkdir this
11 | 
12 | #address_rhogs_folder = in_folder + "/rhog_g501_done/"  # old3/rhog_all/ /rhog_size_g2_s500/" sample_rootHOG
13 | #species_tree_address = in_folder + "/archive/lineage_tree_qfo.phyloxml"
14 | pickle_folder = working_folder + "/pickle_folder_all_collect/"
15 | # add warning when pickle folder is not empty
16 | output_xml_name = "out_27aug_6pm.xml"
17 | 
18 | 
19 | orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA",
20 |                                                "originVersion": "Nov 2021", "version": "0.3"})  #
21 | 
22 | with open(working_folder + '/file_gene_id_name.pickle', 'rb') as handle:
23 |     gene_id_name = dill_pickle.load(handle)
24 |     # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
25 | 
26 | for query_species_name, list_prots in gene_id_name.items():
27 | 
28 |     species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "NCBITaxId": "1"})
29 |     database_xml = ET.SubElement(species_xml, "database", attrib={"name": "QFO database ", "version": "2020"})
30 |     genes_xml = ET.SubElement(database_xml, "genes")
31 | 
32 |     for (gene_idx_integer, query_prot_name) in list_prots:
33 |         query_prot_name_pure = query_prot_name.split("||")[0].strip().split("|")[1]
34 |         gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": query_prot_name_pure})
35 | 
36 | pickle_files_adress = listdir(pickle_folder)
37 | 
38 | hogs_a_rhog_xml_all = []
39 | for pickle_file_adress in pickle_files_adress:
40 |     with open(pickle_folder + pickle_file_adress, 'rb') as handle:
41 |         hogs_a_rhog_xml_batch = dill_pickle.load(handle)  # hogs_a_rhog_xml_batch is orthoxml_to_newick.py list of hog object.
42 |         hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch)
43 |         # hogs_rhogs_xml_all is orthoxml_to_newick.py list of hog object.
44 | 
45 | print("number of hogs in all batches is ", len(hogs_a_rhog_xml_all))
46 | 
47 | groups_xml = ET.SubElement(orthoxml_file, "groups")
48 | 
49 | for hogs_a_rhog_xml in hogs_a_rhog_xml_all:
50 |     groups_xml.append(hogs_a_rhog_xml)
51 | 
52 | xml_str = minidom.parseString(ET.tostring(orthoxml_file)).toprettyxml(indent="   ")
53 | # print(xml_str[:-1000])
54 | 
55 | with open(working_folder +output_xml_name, "w") as file_xml:
56 |     file_xml.write(xml_str)
57 | file_xml.close()
58 | 
59 | print("orthoxml is written in  "+ working_folder +output_xml_name)
60 | 
61 | 


--------------------------------------------------------------------------------
/archive/fastOMA_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/archive/fastOMA_logo.png


--------------------------------------------------------------------------------
/archive/test_curn.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from FastOMA.infer_roothogs import fastoma_infer_roothogs
  3 | from FastOMA._wrappers import logger
  4 | from FastOMA.infer_subhogs import fastoma_infer_subhogs
  5 | 
  6 | 
  7 | # --low-so-detection --fragment-detection
  8 | 
  9 | # --input-rhog-folder ./bb/ --parrallel True  --species-tree species_tree.nwk
 10 | 
 11 | #a=2
 12 | #fastoma_infer_subhogs()
 13 | #  proteome    --hogmap hogmaps   --splice splice  --out-rhog-folder out
 14 | import sys
 15 | 
 16 | folder="pycharm_projects/fastoma_test/"
 17 | sys.argv.extend(['--proteomes', folder+"proteome"])
 18 | sys.argv.extend(['--hogmap', folder+"hogmaps"])
 19 | sys.argv.extend(['--splice', folder+"splice"])
 20 | sys.argv.extend(['--out-rhog-folder', folder+"out"])
 21 | sys.argv.extend(['-vv'])
 22 | fastoma_infer_roothogs()
 23 | 
 24 | a=2 # a
 25 | #
 26 | # from FastOMA.zoo.hog import transform
 27 | #
 28 | # #from zoo.tree_utils import collapse, gene_species, transform, HOG_coverages
 29 | #
 30 | # import io
 31 | # import lxml.etree
 32 | # orthoxml_file = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/benchmark-webservice3/orthoxml/euk_omamer200.dev8_13oct.orthoxml"
 33 | #
 34 | #
 35 | # orthxml_str = []
 36 | # with open(orthoxml_file, "r") as f:
 37 | #     for i in f:
 38 | #         orthxml_str.append(i)
 39 | # print(len(orthxml_str))
 40 | # dic_gene_integer={}
 41 | # for line in orthxml_str:
 42 | #     if "gene id" in line:
 43 | #         found=False
 44 | #         gene_int= line.split("\"")[1]
 45 | #         gene_name = line.split("\"")[3]
 46 | #         dic_gene_integer[gene_int] = gene_name
 47 | #
 48 | #
 49 | #
 50 | # orthoxml_etree=lxml.etree.parse(orthoxml_file)
 51 | #
 52 | # pw_orthologs_integer = sorted(list(transform.iter_pairwise_relations(orthoxml_etree)))
 53 | # # iter_pairwise_relations(obj, rel_type=None    (def:'ortholog' , but possible to use 'paralog')
 54 | # print(len(pw_orthologs_integer))
 55 | # print(pw_orthologs_integer[:2])
 56 | # pw_orthologs_gene =[]
 57 | # for pair in pw_orthologs_integer:
 58 | #     pw_orthologs_gene.append((dic_gene_integer[pair[0]],dic_gene_integer[pair[1]]))
 59 | #
 60 | #
 61 | #
 62 | # print(len(pw_orthologs_gene))
 63 | #
 64 | # output_file = open(orthoxml_file+"_pairs.tsv","w")
 65 | # for  pair in pw_orthologs_gene:
 66 | #     output_file.write(pair[0]+"\t"+pair[1]+"\n")
 67 | #
 68 | # output_file.close()
 69 | 
 70 | 
 71 | #
 72 | #
 73 | # # orthoxml_handle= open(orthoxml_file,"r")
 74 | # # orthoxml =""
 75 | # # for line in orthoxml_handle:
 76 | # #     orthoxml+=line
 77 | #
 78 | #
 79 | # from xml.etree.ElementTree import XMLParser
 80 | #
 81 | # parser = XMLParser()
 82 | # with open(orthoxml_file, 'rb') as xml:
 83 | #     for chunk in xml:
 84 | #         parser.feed(chunk)
 85 | # parser.close()
 86 | #
 87 | #
 88 | # lxml.etree.parse(oxml)
 89 | #
 90 | # orthoxm= lxml.etree.parse(orthoxml)
 91 | #
 92 | # # expected = [("1", "2"), ("1", "3"), ("1", "4"), ("1", "5"), ("1", "6"),
 93 | # #             ("2", "5"), ("2", "6"), ("3", "4"), ("3", "5"), ("3", "6"),
 94 | # #             ("4", "5"), ("4", "6"), ("5", "6")]
 95 | # #    self.assertEqual(expected, pw_orthologs)
 96 | #
 97 | # from xml.etree import ElementTree
 98 | # tree = ElementTree.parse(orthoxml_file)
 99 | # root = tree.getroot()
100 | 


--------------------------------------------------------------------------------
/conf/base.config:
--------------------------------------------------------------------------------
 1 | /*
 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 3 |     dessimozlab/FastOMA Nextflow base config file
 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 5 |     A 'blank slate' config file, appropriate for general use on most high performance
 6 |     compute environments. Assumes that all software is installed and available on
 7 |     the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
 8 | ----------------------------------------------------------------------------------------
 9 | */
10 | 
11 | process {
12 |     // Defaults for every process: requests grow with task.attempt and are passed through check_max().
13 |     cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
14 |     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
15 |     time   = { check_max( 4.h  * task.attempt, 'time'   ) }
16 |     shell  = ['/bin/bash', '-euo', 'pipefail']
17 |     // Failed tasks are retried up to maxRetries times unless a label below overrides the strategy.
18 |     //errorStrategy = { task.exitStatus in (130..145) ? 'retry' : 'finish' }
19 |     errorStrategy = 'retry'
20 |     maxRetries    = 3
21 |     // Per-label overrides of the defaults above.
22 |     withLabel:process_single {
23 |         cpus   = { check_max( 1                   , 'cpus'    ) }
24 |         memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
25 |         time   = { check_max( 4.h   * task.attempt, 'time'    ) }
26 |     }
27 |     withLabel:process_low {
28 |         cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
29 |         memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
30 |         time   = { check_max( 4.h   * task.attempt, 'time'    ) }
31 |     }
32 |     withLabel:process_medium {
33 |         cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
34 |         memory = { check_max( 36.GB * task.attempt, 'memory'  ) }
35 |         time   = { check_max( 8.h   * task.attempt, 'time'    ) }
36 |     }
37 |     withLabel:process_high {
38 |         cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
39 |         memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
40 |         time   = { check_max( 16.h  * task.attempt, 'time'    ) }
41 |     }
42 |     withLabel:process_long {
43 |         time   = { check_max( 20.h  * task.attempt, 'time'    ) }
44 |     }
45 |     withLabel:process_high_memory {
46 |         memory = { check_max( 200.GB * task.attempt, 'memory' ) }
47 |     }
48 |     withLabel:error_ignore {
49 |         errorStrategy = 'ignore'
50 |     }
51 |     withLabel:error_retry {
52 |         errorStrategy = 'retry'
53 |         maxRetries    = 2
54 |     }
55 | }


--------------------------------------------------------------------------------
/environment-conda.yml:
--------------------------------------------------------------------------------
 1 | name: fastoma-env
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - omamer
 8 |   - mafft
 9 |   - fasttree
10 |   - nextflow
11 |   - papermill
12 |   - seaborn
13 |   - matplotlib
14 |   - pyparsing
15 |   - networkx
16 |   - jupyter
17 |   - mmseqs2
18 |   - pip
19 |   - pip:
20 |     - .[report]
21 | 


--------------------------------------------------------------------------------
/nextflow.config:
--------------------------------------------------------------------------------
  1 | // General configuration used in all profiles
  2 | manifest {
  3 |   name = "dessimozlab/FastOMA"
  4 |   description = """FastOMA computes Hierarchical Orthologous Groups from proteomes."""
  5 |   author = "Sina Majidian, Adrian Altenhoff"
  6 |   homePage = "https://omabrowser.org"
  7 |   mainScript = "FastOMA.nf"
  8 |   nextflowVersion = ">=22.10.4"
  9 |   defaultBranch = "main"
 10 |   doi = "10.1101/2024.01.29.577392"
 11 |   version = "0.3.5"
 12 | }
 13 | 
 14 | params {
 15 |   container_name = "dessimozlab/fastoma"
 16 |   container_version = "0.3.5"
 17 |   omamer_db = "https://omabrowser.org/All/LUCA.h5"
 18 |   debug_enabled = false
 19 |   help = false
 20 |   report = false
 21 |   write_msas = false
 22 |   write_genetrees = false
 23 |   filter_method = "col-row-threshold"
 24 |   filter_gap_ratio_row = 0.3
 25 |   filter_gap_ratio_col = 0.5
 26 |   nr_repr_per_hog = 5
 27 |   min_sequence_length = 40
 28 |   force_pairwise_ortholog_generation = false
 29 | 
 30 |   output_folder = "Output"
 31 |   statsdir = "${params.output_folder}/stats"
 32 | 
 33 |   // Max resource options
 34 |   // Defaults only, expecting to be overwritten
 35 |   max_memory                 = '128.GB'
 36 |   max_cpus                   = 24
 37 |   max_time                   = '120.h'
 38 | }
 39 | 
 40 | // Profiles configure nextflow depending on the environment (local, docker, singularity)
 41 | profiles {
 42 | 
 43 |   docker {
 44 |     process {
 45 |       container = "$params.container_name:$params.container_version"
 46 |     }
 47 |     docker.enabled = true
 48 |   }
 49 | 
 50 |   singularity {
 51 |     process {
 52 |       container = "$params.container_name:$params.container_version"
 53 |     }
 54 |     singularity.enabled = true
 55 |     singularity.autoMounts = true
 56 |   }
 57 | 
 58 |   standard {
 59 |     process.executor = 'local'
 60 |   }
 61 | 
 62 |   slurm {
 63 |     process.executor = "slurm"
 64 |     process.time = 4.h  // `time` is a process directive; set at profile level it has no effect
 65 |   }
 66 | 
 67 |   conda {
 68 |     process.conda = "${projectDir}/environment-conda.yml"
 69 |     conda.enabled = true
 70 |     conda.createTimeout = '3 h'
 71 |   }
 72 | 
 73 |   slurm_singularity {
 74 |     process {
 75 |       container = "$params.container_name:$params.container_version"
 76 |       executor = "slurm"
 77 |       time = 4.h
 78 |       memory = 20.GB
 79 |     }
 80 |     singularity.enabled = true
 81 |     singularity.autoMounts = true
 82 |   }
 83 | 
 84 |   slurm_conda {
 85 |     process {
 86 |       conda = "${projectDir}/environment-conda.yml"
 87 |       executor = "slurm"
 88 |       time = 4.h
 89 |       memory = 20.GB
 90 |     }
 91 |     conda.enabled = true
 92 |     conda.createTimeout = '3 h'
 93 |   }
 94 | }
 95 | 
 96 | def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
 97 | timeline {
 98 |   enabled = params.report
 99 |   file = "${params.statsdir}/timeline_${trace_timestamp}.html"
100 | }
101 | report {
102 |   enabled = params.report
103 |   file = "${params.statsdir}/report_${trace_timestamp}.html"
104 | }
105 | trace {
106 |     enabled = params.report
107 |     file    = "${params.statsdir}/trace_${trace_timestamp}.txt"
108 | }
109 | dag {
110 |     enabled = params.report
111 |     file    = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html"
112 | }
113 | 
114 | includeConfig "conf/base.config"
115 | 
116 | // Cap a requested resource at the configured maximum (params.max_memory / max_time / max_cpus).
117 | //   obj  - the requested value (memory, time or number of cpus)
118 | //   type - 'memory', 'time' or 'cpus'; for any other type obj is returned unchanged
119 | def check_max(obj, type) {
120 |     if (type == 'memory') {
121 |         try {
122 |             def limit = params.max_memory as nextflow.util.MemoryUnit
123 |             // compareTo only guarantees the sign of its result, so test > 0 rather than == 1
124 |             return obj.compareTo(limit) > 0 ? limit : obj
125 |         } catch (all) {
126 |             println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
127 |             return obj
128 |         }
129 |     } else if (type == 'time') {
130 |         try {
131 |             def limit = params.max_time as nextflow.util.Duration
132 |             return obj.compareTo(limit) > 0 ? limit : obj
133 |         } catch (all) {
134 |             println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
135 |             return obj
136 |         }
137 |     } else if (type == 'cpus') {
138 |         try {
139 |             return Math.min( obj, params.max_cpus as int )
140 |         } catch (all) {
141 |             println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
142 |             return obj
143 |         }
144 |     }
145 |     // Unknown resource type: pass the request through instead of implicitly returning null.
146 |     return obj
147 | }
148 | 


--------------------------------------------------------------------------------
/nextflow_slurm.config:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | process.executor = "slurm"
 6 | process.queue = "cpu"
 7 | process.time = 10.h
 8 | process.memory = 95.GB
 9 | executor {
10 |     name = 'slurm'
11 |     queueSize = 550
12 | }
13 | // errorStrategy is a process directive; outside the process scope it is ignored.
14 | process.errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'terminate' }
15 | 
16 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "FastOMA"
 7 | dynamic = ["version"]
 8 | description = "FastOMA - a package to infer orthology information among proteomes"
 9 | readme = "README.md"
10 | license = "MIT"
11 | requires-python = ">=3.8"
12 | authors = [
13 |     { name = "Sina Majidian", email = "sina.majidian@gmail.com" },
14 |     { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" }
15 | ]
16 | dependencies = [
17 |     "biopython ~=1.81",
18 |     "DendroPy >=4.5,<=4.6.1",
19 |     "ete3 ~=3.1",
20 |     "lxml >=4.6,<6",
21 |     "omamer ~=2.0",
22 |     "pyham ~=1.1",
23 |     "numpy <2",   # temporary fix as pytables does not yet work with numpy 2.0
24 |     "pyparsing",
25 |     "networkx",
26 | ]
27 | 
28 | [project.optional-dependencies]
29 | nextflow = [
30 |     "nextflow"
31 | ]
32 | report = [
33 |     "papermill",
34 |     "jupyter",
35 |     "matplotlib",
36 |     "seaborn",
37 | ]
38 | 
39 | 
40 | [project.scripts]
41 | fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs"
42 | fastoma-check-input = "FastOMA.check_input:fastoma_check_input"
43 | fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs"
44 | fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs"
45 | fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs"
46 | fastoma-helper = "FastOMA.helper_scripts:main"
47 | 
48 | [project.urls]
49 | Homepage = "https://github.com/DessimozLab/FastOMA"
50 | 
51 | [tool.hatch.version]
52 | path = "FastOMA/__init__.py"
53 | 
54 | [tool.hatch.build.targets.sdist]
55 | include = [
56 |     "/FastOMA",
57 | ]
58 | 
59 | [tool.hatch.envs.default]
60 | features = [
61 |     "report",
62 | ]
63 | 


--------------------------------------------------------------------------------
/testdata/README.md:
--------------------------------------------------------------------------------
 1 | FastOMA test data
 2 | ======
 3 | 
 4 | 
 5 | This repo contains a small dataset that serves as a test example.
 6 | 
 7 | 1- The proteome folder, which contains three FASTA files (`AQUAE.fa`, `CHLTR.fa` and `MYCGE.fa`), one for each of three species.
 8 | 
 9 | 2- A dummy species tree in Newick format. 
10 | 
11 | 3- You can download the omamer database as follows
12 | ```
13 | cd gethog3/testdata
14 | wget https://omabrowser.org/All/Primates-v2.0.0.h5     # 105MB
15 | mv Primates-v2.0.0.h5    in_folder/omamerdb.h5 
16 | ```
17 | 


--------------------------------------------------------------------------------
/testdata/expected_output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/.DS_Store


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroups.tsv:
--------------------------------------------------------------------------------
 1 | Group	Protein
 2 | OG_0000001	sp|P0CE13|G3P_CHLTR
 3 | OG_0000001	sp|O67161|G3P_AQUAE
 4 | OG_0000001	sp|P47543|G3P_MYCGE
 5 | OG_0000002	sp|O67118|DNAK_AQUAE
 6 | OG_0000002	sp|P47547|DNAK_MYCGE
 7 | OG_0000002	sp|P17821|DNAK_CHLTR
 8 | OG_0000003	sp|O67618|LEPA_AQUAE
 9 | OG_0000003	sp|O84067|LEPA_CHLTR
10 | OG_0000004	sp|P0CD71|EFTU_CHLTR
11 | OG_0000004	sp|P13927|EFTU_MYCGE
12 | OG_0000004	sp|O66429|EFTU_AQUAE
13 | OG_0000005	sp|O84081|FOLD_CHLTR
14 | OG_0000005	sp|O67736|FOLD_AQUAE
15 | OG_0000006	sp|O84332|TPIS_CHLTR
16 | OG_0000006	sp|O66686|TPIS_AQUAE
17 | OG_0000007	sp|P0C0Z7|CH60_CHLTR
18 | OG_0000007	sp|O67943|CH60_AQUAE
19 | OG_0000008	sp|P47639|ATPB_MYCGE
20 | OG_0000008	sp|O67828|ATPB_AQUAE
21 | OG_0000009	sp|P47641|ATPA_MYCGE
22 | OG_0000009	sp|O66907|ATPA_AQUAE
23 | OG_0000010	sp|O66778|ENO_AQUAE
24 | OG_0000010	sp|O84591|ENO_CHLTR
25 | OG_0000011	sp|O84026|RF1_CHLTR
26 | OG_0000011	sp|O67032|RF1_AQUAE
27 | OG_0000011	sp|P47500|RF1_MYCGE
28 | OG_0000012	tr|O84829|O84829_CHLTR
29 | OG_0000012	sp|O67547|SUCD_AQUAE
30 | 


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa:
--------------------------------------------------------------------------------
 1 | >sp|P47543|G3P_MYCGE sp|P47543|G3P_MYCGE||MYCGE||1000000005 sp|P47543|G3P_MYCGE [MYCGE]
 2 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK
 3 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK
 4 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM
 5 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP
 6 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT
 7 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL
 8 | >sp|P0CE13|G3P_CHLTR sp|P0CE13|G3P_CHLTR||CHLTR||1001000009 sp|P0CE13|G3P_CHLTR [CHLTR]
 9 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS
10 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA
11 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT
12 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV
13 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA
14 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK
15 | >sp|O67161|G3P_AQUAE sp|O67161|G3P_AQUAE||AQUAE||1002000010 sp|O67161|G3P_AQUAE [AQUAE]
16 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA
17 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT
18 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA
19 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS
20 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA
21 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL
22 | 


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000002.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000002.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000003.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000003.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000004.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000004.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000005.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000005.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000006.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000006.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000007.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000007.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000008.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000008.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000009.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000009.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000010.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000010.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000011.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000011.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000012.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000012.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGs.tsv:
--------------------------------------------------------------------------------
 1 | RootHOG	Protein	OMAmerRootHOG
 2 | HOG:0000001	sp|P0CE13|G3P_CHLTR	HOG:E1027400
 3 | HOG:0000001	sp|O67161|G3P_AQUAE	HOG:E1027400
 4 | HOG:0000001	sp|P47543|G3P_MYCGE	HOG:E1027400
 5 | HOG:0000002	sp|O67118|DNAK_AQUAE	HOG:E0990770
 6 | HOG:0000002	sp|P47547|DNAK_MYCGE	HOG:E0990770
 7 | HOG:0000002	sp|P17821|DNAK_CHLTR	HOG:E0990770
 8 | HOG:0000003	sp|O67618|LEPA_AQUAE	HOG:E0990677
 9 | HOG:0000003	sp|O84067|LEPA_CHLTR	HOG:E0990677
10 | HOG:0000004	sp|P0CD71|EFTU_CHLTR	HOG:E0990677
11 | HOG:0000004	sp|P13927|EFTU_MYCGE	HOG:E0990677
12 | HOG:0000004	sp|O66429|EFTU_AQUAE	HOG:E0990677
13 | HOG:0000005	sp|O84081|FOLD_CHLTR	HOG:E1027325
14 | HOG:0000005	sp|O67736|FOLD_AQUAE	HOG:E1027325
15 | HOG:0000006	sp|O84332|TPIS_CHLTR	HOG:E1027829
16 | HOG:0000006	sp|O66686|TPIS_AQUAE	HOG:E1027829
17 | HOG:0000007	sp|P0C0Z7|CH60_CHLTR	HOG:E1027301
18 | HOG:0000007	sp|O67943|CH60_AQUAE	HOG:E1027301
19 | HOG:0000008	sp|P47639|ATPB_MYCGE	HOG:E0990823
20 | HOG:0000008	sp|O67828|ATPB_AQUAE	HOG:E0990823
21 | HOG:0000009	sp|P47641|ATPA_MYCGE	HOG:E0990823
22 | HOG:0000009	sp|O66907|ATPA_AQUAE	HOG:E0990823
23 | HOG:0000010	sp|O66778|ENO_AQUAE	HOG:E1027309
24 | HOG:0000010	sp|O84591|ENO_CHLTR	HOG:E1027309
25 | HOG:0000011	sp|O84026|RF1_CHLTR	HOG:E0990790
26 | HOG:0000011	sp|O67032|RF1_AQUAE	HOG:E0990790
27 | HOG:0000011	sp|P47500|RF1_MYCGE	HOG:E0990790
28 | HOG:0000012	tr|O84829|O84829_CHLTR	HOG:E1027626
29 | HOG:0000012	sp|O67547|SUCD_AQUAE	HOG:E1027626
30 | 


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000001.fa:
--------------------------------------------------------------------------------
 1 | >sp|P47543|G3P_MYCGE sp|P47543|G3P_MYCGE||MYCGE||1000000005 sp|P47543|G3P_MYCGE [MYCGE]
 2 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK
 3 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK
 4 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM
 5 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP
 6 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT
 7 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL
 8 | >sp|P0CE13|G3P_CHLTR sp|P0CE13|G3P_CHLTR||CHLTR||1001000009 sp|P0CE13|G3P_CHLTR [CHLTR]
 9 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS
10 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA
11 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT
12 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV
13 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA
14 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK
15 | >sp|O67161|G3P_AQUAE sp|O67161|G3P_AQUAE||AQUAE||1002000010 sp|O67161|G3P_AQUAE [AQUAE]
16 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA
17 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT
18 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA
19 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS
20 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA
21 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL
22 | 


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000001.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000001.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000002.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000002.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000003.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000003.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000004.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000004.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000005.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000005.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000006.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000006.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000007.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000007.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000008.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000008.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000009.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000009.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000010.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000010.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000011.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000011.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000012.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000012.fa.gz


--------------------------------------------------------------------------------
/testdata/expected_output/orthologs.tsv:
--------------------------------------------------------------------------------
 1 | sp|O67161|G3P_AQUAE	sp|P0CE13|G3P_CHLTR
 2 | sp|P47543|G3P_MYCGE	sp|O67161|G3P_AQUAE
 3 | sp|P47543|G3P_MYCGE	sp|P0CE13|G3P_CHLTR
 4 | sp|O67118|DNAK_AQUAE	sp|P17821|DNAK_CHLTR
 5 | sp|P47547|DNAK_MYCGE	sp|P17821|DNAK_CHLTR
 6 | sp|P47547|DNAK_MYCGE	sp|O67118|DNAK_AQUAE
 7 | sp|O67618|LEPA_AQUAE	sp|O84067|LEPA_CHLTR
 8 | sp|O66429|EFTU_AQUAE	sp|P0CD71|EFTU_CHLTR
 9 | sp|P13927|EFTU_MYCGE	sp|P0CD71|EFTU_CHLTR
10 | sp|P13927|EFTU_MYCGE	sp|O66429|EFTU_AQUAE
11 | sp|O67736|FOLD_AQUAE	sp|O84081|FOLD_CHLTR
12 | sp|O66686|TPIS_AQUAE	sp|O84332|TPIS_CHLTR
13 | sp|O67943|CH60_AQUAE	sp|P0C0Z7|CH60_CHLTR
14 | sp|O67828|ATPB_AQUAE	sp|P47639|ATPB_MYCGE
15 | sp|O66907|ATPA_AQUAE	sp|P47641|ATPA_MYCGE
16 | sp|O66778|ENO_AQUAE	sp|O84591|ENO_CHLTR
17 | sp|O67032|RF1_AQUAE	sp|O84026|RF1_CHLTR
18 | sp|P47500|RF1_MYCGE	sp|O84026|RF1_CHLTR
19 | sp|P47500|RF1_MYCGE	sp|O67032|RF1_AQUAE
20 | sp|O67547|SUCD_AQUAE	tr|O84829|O84829_CHLTR
21 | 


--------------------------------------------------------------------------------
/testdata/expected_output/phylostratigraphy.html:
--------------------------------------------------------------------------------
 1 | 
 2 |         <!DOCTYPE html>
 3 |         <html>
 4 |         <head>
 5 |             <title>Phylo.io</title>
 6 |             <meta charset="UTF-8">
 7 |             <script src="https://peterolson.github.com/BigInteger.js/BigInteger.min.js"></script>
 8 |             <script type="text/javascript" src="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/js/jquery-2.1.4.min.js"></script>
 9 |             <script type="text/javascript" src="https://cdn.rawgit.com/DessimozLab/phylo-io/9eae3d2af33f1f75bff78e393909dccfdd400650/www/js/treecompare.js"></script>
10 |             <script type="text/javascript" src="https://underscorejs.org/underscore-min.js"></script>
11 |             <script type="text/javascript" src="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/js/d3.min.js"></script>
12 |             <script type="text/javascript" src="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/js/bootstrap.min.js"></script>
13 |             <script type="text/javascript" src="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/js/FileSaver.min.js"></script>
14 |             <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.0.8/css/solid.css" integrity="sha384-v2Tw72dyUXeU3y4aM2Y0tBJQkGfplr39mxZqlTBDUZAb9BGoC40+rdFCG0m10lXk" crossorigin="anonymous">
15 |             <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.0.8/css/fontawesome.css" integrity="sha384-q3jl8XQu1OpdLgGFvNRnPdj5VIlCvgsDQTQB6owSOHWlAurxul7f+JpUOVdAiJ5P" crossorigin="anonymous">
16 |             <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/css/bootstrap.min.css">
17 |             <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/css/bootstrap-theme.min.css">
18 |             <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/DessimozLab/phylo-io/5e89fafc3b1746b22da33c20b2af621d5807b6fb/www/css/style.css">
19 |             <style>
20 | 
21 |             text {stroke: none;}
22 | 
23 |             #help_modal_button {
24 |                 position: fixed;
25 |                 right: 10px;
26 |                 margin-right: 10px; /*magic number */
27 |                 bottom: 10px;
28 |                 z-index: 99;
29 |             }
30 | 
31 |             </style>
32 |         </head>
33 |         <body id="phylo">
34 | 
35 |         <!-- Modal -->
36 |         <div class="modal  fade bs-example-modal-lg" id="myModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
37 |             <div class="modal-dialog modal-lg" role="document">
38 |                 <div class="modal-content">
39 |                     <div class="modal-header">
40 |                         <button type="button" class="close" data-dismiss="modal" aria-label="Close"><span aria-hidden="true">&times;</span></button>
41 |                         <h4 class="modal-title" id="myModalLabel">Tree profile help</h4>
42 |                     </div>
43 |                     <div class="modal-body">
44 | 
45 |                         <embed src="https://cdn.rawgit.com/DessimozLab/pyham/fc01fb94/help.pdf" frameborder="0" width="100%" height="400px">
46 |                     </div>
47 |                     <div class="modal-footer">
48 |                         <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
49 |                     </div>
50 |                 </div>
51 |             </div>
52 |         </div>
53 | 
54 | 
55 |         <!-- Button trigger modal -->
56 |         <button id="help_modal_button" type="button" class="btn btn-sm" data-toggle="modal" data-target="#myModal">
57 |             Help
58 |         </button>
59 | 
60 | 
61 | 
62 |         <div id="vis-container1" style="width: 100%; height: 100%;">
63 |         <div id="scale-1"> </div>
64 |         </div>
65 |         <script type="text/javascript">
66 |             var treecomp = TreeCompare().init({
67 |                 enableScale: true,
68 |                 scaleColor: "black",
69 |                 showHistogramValues: true,
70 |                 showHistogramSummaryValue: true
71 |             });
72 |             treeData = '{"name": "inter2", "numberGenes": 6, "numberEvents": null, "length": 0.01, "collapsed": "false", "evolutionaryEvents": false, "children": [{"name": "inter1", "numberGenes": 12, "numberEvents": 6, "length": 0.01, "collapsed": "false", "evolutionaryEvents": {"retained": 6, "duplicated": 0, "gained": 6, "lost": 0, "duplication": 0}, "children": [{"name": "AQUAE", "numberGenes": 12, "numberEvents": 0, "length": 0.01, "collapsed": "false", "evolutionaryEvents": {"retained": 12, "duplicated": 0, "gained": 0, "lost": 0, "duplication": 0}}, {"name": "CHLTR", "numberGenes": 10, "numberEvents": 2, "length": 0.01, "collapsed": "false", "evolutionaryEvents": {"retained": 10, "duplicated": 0, "gained": 0, "lost": 2, "duplication": 0}}]}, {"name": "MYCGE", "numberGenes": 6, "numberEvents": 0, "length": 0.01, "collapsed": "false", "evolutionaryEvents": {"retained": 6, "duplicated": 0, "gained": 0, "lost": 0, "duplication": 0}}]}';
73 |             var tree1 = treecomp.addTree(treeData, undefined, "single");
74 |             treecomp.viewTree(tree1.name, "vis-container1", "scale-1");
75 |             treecomp.addMainLegend(tree1.name);
76 |         </script>
77 |         </body>
78 |         </html>
79 |         


--------------------------------------------------------------------------------
/testdata/expected_output/species_tree_checked.nwk:
--------------------------------------------------------------------------------
1 | ((AQUAE:1,CHLTR:1)inter1:1,MYCGE:1)inter2:0;


--------------------------------------------------------------------------------
/testdata/in_folder/proteome/AQUAE.fa:
--------------------------------------------------------------------------------
  1 | >sp|O67618|LEPA_AQUAE
  2 | MEQKNVRNFCIIAHVDHGKSTLADRLLEYTGAISEREKREQLLDTLDVERERGITVKMQA
  3 | VRMFYKAKDGNTYKLHLIDTPGHVDFSYEVSRALAACEGALLLIDASQGIEAQTVANFWK
  4 | AVEQDLVIIPVINKIDLPSADVDRVKKQIEEVLGLDPEEAILASAKEGIGIEEILEAIVN
  5 | RIPPPKGDPQKPLKALIFDSYYDPYRGAVAFVRIFDGEVKPGDKIMLMSTGKEYEVTEVG
  6 | AQTPKMTKFDKLSAGDVGYIAASIKDVRDIRIGDTITHAKNPTKEPVPGFQPAKPMVYAG
  7 | IYPAEDTTYEELRDALEKYAINDAAIVYEPESSPALGMGFRVGFLGLLHMEIVQERLERE
  8 | YGVKIITTAPNVIYRVKKKFTDEVIEVRNPMDFPDNAGLIEYVEEPFVLVTIITPKEYVG
  9 | PIIQLCQEKRGIQKNMTYLDPNTVYLEYEMPLSEIIVDFHDKIKSISRGFASYDYEFIGY
 10 | RPSDLIKLTVLINKKPVDALSFIVHADRAQKFARRVAEKLRETIPRQLFEVHIQVAKGGK
 11 | VIASERIKPLRANVTAKCYGGDVTRKKKLLENQKEGKKRMKQFGKVQLPQEAFLSVLKVE
 12 | >sp|O67032|RF1_AQUAE
 13 | MLKEAYISRLDKLQEKYRKLQEELSKPEVIQDVEKYKKLSKELKELQEINELYERYKKAQ
 14 | KELKEAKELLKSSDKDLRELAEEEVNRLTEEMKKLEEELKVHLVPKDPNDTKNVILEIRA
 15 | GAGGEEAALFAADLFRMYQKYAEEKGWKVSILSSNKTGLGGYKEVIALIEGEGAYSRLKY
 16 | ESGVHRVQRVPVTESSGRIHTSTATVAVLPEVDETDIKIKPEELKIETFRASGAGGQYVN
 17 | TTETAVRITHIPTGIVVQCQDERSQFQNKQKALKILYAKLKDYYERKKQEEIAKERKEQV
 18 | GTGERSEKIRTYNFPQNRVTDHRINLTLYKLQDVLEGKLDEIIDALRAKEIEKKLELVEK
 19 | EG
 20 | >sp|O66778|ENO_AQUAE
 21 | MSRIKRVHGREVLDSRGNPTVEVEVELESGALGRAIVPSGASTGEREALELRDGDPKRYL
 22 | GKGVLKAVDNVNGVIAKALVGLEPYNQREIDQILIELDGTENKSKLGANAILGTSMAVAR
 23 | AAANELGIPLYEYLGGKFGYRLPVPLMNVINGGAHADNNLDIQEFMIVPVCGGAFREALR
 24 | AGVETFHHLKKILKEKGYSTNVGDEGGFAPNLNSSEEALDILMQAIEKAGYKPGEDILLA
 25 | LDVASSEFYENGVYKFEGKERSAEEMIEFYEKLIQKYPIISIEDPMSENDWEGWKEITKR
 26 | LGDKVQLVGDDLFTTNPKILRKGIEEGVANAILVKLNQIGTVSETLDTVMLAKERNYSAI
 27 | ISHRSGETEDTFISHLAVATNAGQIKTGSASRTDRIAKYNELLRIEERLGNGAVFWGREE
 28 | FYRFTS
 29 | >sp|O66429|EFTU_AQUAE
 30 | MAKEKFERTKEHVNVGTIGHVDHGKSTLTSAITCVLAAGLVEGGKAKCFKYEEIDKAPEE
 31 | KERGITINITHVEYETAKRHYAHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQT
 32 | REHVLLARQVNVPYIVVFMNKCDMVDDEELLELVELEVRELLSKYEYPGDEVPVIRGSAL
 33 | GALQELEQNSPGKWVESIKELLNAMDEYIPTPQREVDKPFLMPIEDVFSISGRGTVVTGR
 34 | VERGVLRPGDEVEIVGLREEPLKTVATSIEMFRKVLDEALPGDNIGVLLRGVGKDDVERG
 35 | QVLAQPGSVKAHKRFRAQVYVLSKEEGGRHTPFFVNYRPQFYFRTADVTGTVVKLPEGVE
 36 | MVMPGDNVELEVELIAPVALEEGLRFAIREGGRTVGAGVVTKILD
 37 | >sp|O67547|SUCD_AQUAE
 38 | MAILVNKDTKVVVQGITGKEGSFHAKQCKEYGTQVVAGVTPGKGGMEVEGIPVFNTVKEA
 39 | VKETGANCSLIFVPAPFAADAIVEALDAGIELVVCITEGIPVKDMMMVKDYMLKNYPNAK
 40 | LVGPNCPGVITPGEAKVGIMPGHIFKRGKIGIVSRSGTLTYEAAYQLTKYGLGQSTAVGI
 41 | GGDPVHGLTHRDVIEMFNKDPETEAILMIGEIGGTEEEEAAEYIEKEVDKPVFAYIAGIT
 42 | APPGKRMGHAGAIIMGGKGTAKAKMEALEKAGAYVIENPAKIGETVAKILKVIELEEEER
 43 | TSDAE
 44 | >sp|O66686|TPIS_AQUAE
 45 | MRRLIAANWKMNKTVKETEEYINTFLKFVEHPESREILICPPFTSLYVAGKMLQGTGVKL
 46 | GAQNCHYEKRGAFTGEISIPMLQEVGCEYVIVGHSERRHIFGESDELIHKKIVACLEMGI
 47 | RPILCVGEKKEEREAGMTFKVIETQIKLALTGVEEHTDKIDIAYEPVWAIGTGTPATPED
 48 | AVEVHTFIRNLINQLNPKNEGKTRILYGGSVNPQNAKEFMKHEEINGLLVGTASLDPESF
 49 | AKIVYSF
 50 | >sp|O67828|ATPB_AQUAE
 51 | MAEVIKGKVVQVIGPVVDVEFEGVKELPKIKDGLKTIRRAIDDRGNWFEEVLFMEVAQHI
 52 | GEHRVRAIAMGPTDGLVRGQEVEYLGGPIKIPVGKEVLGRIFNVAGQPIDEQGPVEAKEY
 53 | WPMFRNPPELVEQSTKVEILETGIKVIDLLQPIIKGGKVGLFGGAGVGKTVLMQELIHNI
 54 | ARFHEGYSVVVGVGERTREGNDLWLEMKESGVLPYTVMVYGQMNEPPGVRFRVAHTGLTM
 55 | AEYFRDVEGQDVLIFIDNIFRFVQAGAEVSTLLGRLPSAVGYQPTLNTDVGEVQERITST
 56 | KKGSITAIQAVYVPADDITDPAPWSIFAHLDATTVLTRRLAELGIYPAIDPLESTSKYLA
 57 | PEYVGEEHYEVAMEVKRILQRYKELQEIIAILGMEELSDEDKAIVNRARRIQKFLSQPFH
 58 | VAEQFTGMPGKYVKLEDTIRSFKEVLTGKYDHLPENAFYMVGTIEDVIEKAKQMGAKV
 59 | >sp|O67118|DNAK_AQUAE
 60 | MAEKKEKIIGIDLGTTNSVVSVMMGDEAVVIQNQEGSRLTPSVVSWTKEKEILVGEPAKR
 61 | RAILDPENTVYESKRFIGRKFEEVKEEAKRVSYKVVPDEKGDAAFDIPNAGKLVRPEEVG
 62 | AHVLRKLKEAAEAFLGEPVKKAVITVPAYFNERQRQATKDAGKIAGLEVVRILNEPTAAA
 63 | MAYGLHKKDNVRILVYDFGGGTFDVSILEGGEGVIEVKVTAGDTHLGGANIDERIMDWLI
 64 | EEFKKETGIDLRKDRTALQRLKEASEQAKKELSFKMETEINLPFITIDPNTNQPLHLQKK
 65 | LTRARLEEMIKDIVDRTIDIVKQALEDAKLKPSDIDEVVLVGGSTRIPLVQQRIKEFFGK
 66 | EPHKGLNPDEVVAMGAAIQAGVLAGEVKEIVLVDVTPLSLGVETYGGVMTVLIPRNTPIP
 67 | VRKCEIFTTAHDYQTEVEIHVLQGERPLAKDNKSLAKFYLTGIPPAPRGVPKIEVCFDID
 68 | ADGILHVTAKDLGTGKEQSVRVEISSGLTPEEIERIIKEAEEHAEEDRKKKELIEAKNQL
 69 | DHLVYQLEKALKEAGDKVPADVKSEAEKVIEEAKKTIETATEIEQVKQVTEKVLQVSSKM
 70 | GTTLYGEAGKQAGGGEKKDEGGEGEVEAKPVD
 71 | >sp|O67736|FOLD_AQUAE
 72 | MALILDGKSLSKKIREEIKKEVENFTSKGFRPPALAVILVGNDPASEIYVNNKRKACEKV
 73 | GIKSLFYHLPQDVSEEKLLGLIYELNMNEEVDGILVQLPLPKHIDQTRVILSISPEKDVD
 74 | GFHPENMGKLVAQIEDGFIPCTPLGIDILLKHYGIDVKGKDVTIVGAGFIVGRPLSLLML
 75 | WRNATVSVCHIHTKDVKKFTKEADILISATGVPHLIKEDMIKEGAVVVDVGISRLNGKIV
 76 | GDVDFERVKEKASAITPVPGGVGPMTVTALLLNTLKSYKRKFAHLISTTNP
 77 | >sp|O67161|G3P_AQUAE
 78 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA
 79 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT
 80 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA
 81 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS
 82 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA
 83 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL
 84 | >sp|O67943|CH60_AQUAE
 85 | MAAKAIIYNEEARAKLKAGVDKLANAVKVTLGPKGREVILGKNWGTPVVTKDGVTVAKEI
 86 | ELKDKFENIGAQLVKEVASKTADVAGDGTTTATVLAQAIFHEGLRVAASGANVMEVKRGI
 87 | DKAVKKIVEELKKLSKDVKERKEIEQVATISANNDPEIGKIIADAMEEVGKDGVITVEES
 88 | KSAETTLEVVKGMQFDRGYLSPYFVTDPEKMECVLENPYILIYEKKITNVKELLPILEQV
 89 | VRSGRPLLVIAEDVEGEALATLVVNHIKGVLKACAVKAPGFGQRRKDYLGDIAVLTGGQA
 90 | ITEDLGIKLESVTLDMLGQAEKVVVDKEHTTIIGGKGDPEQIKARIEQIKRQIQETTSDY
 91 | DREKLQERLAKLSGGVAIIRVGAATEAELKEKKYRVEDAVHATKAAVEEGIVPGGGVALV
 92 | RASEALEDLKGDNHDQQLGIDIIKKAVRTPLKQIAYNAGYDGSVVLEKVIELGKEKGVSW
 93 | GFNAATGEYVDMYEAGIIDPTKVVRTAIENAASVAGTMLTAEALIADLPEEKKKDITPTD
 94 | MPELD
 95 | >sp|O66907|ATPA_AQUAE
 96 | MATLTYEEALEILRQQIKDFEPEAKMEEVGVVYYVGDGVARAYGLENVMAMEIVEFQGGQ
 97 | QGIAFNLEEDNVGIIILGSETGIEEGHIVKRTGRILDAPVGEGLVGRVIDPLGNPLDGKG
 98 | PIQFEYRSPVEKIAPGVVKRKPVHEPLQTGIKAIDAMIPIGRGQRELIIGDRATGKTTVA
 99 | IDTILAQKNSDVYCIYVAVGQKRAAIARLIELLEREGAMEYTTVVVASASDPASLQYLAP
100 | FVGCTIGEYFRDNGKHALIIYDDLSKHAEAYRQLSLLMRRPPGREAYPGDVFYLHSRLLE
101 | RAAKLNDDLGAGSLTALPIIETKAGDVAAYIPTNVISITDGQIYLEADLFNKGIRPAINV
102 | GLSVSRVGGAAQIKAMKQVAGTLRLELAQFRELEAFVQFASELDKATQQQINRGLRLVEL
103 | LKQEPYNPIPVEKQIVLIYAGTHGYLDDIPVESVRKFEKELYAYLDNERPDILKEISEKK
104 | KLDEELEKKIKEALDAFKQKFVP
105 | 


--------------------------------------------------------------------------------
/testdata/in_folder/proteome/CHLTR.fa:
--------------------------------------------------------------------------------
 1 | >sp|O84067|LEPA_CHLTR
 2 | MKPYKIENIRNFSIIAHIDHGKSTIADRLLESTSTIEQREMREQLLDSMDLERERGITIK
 3 | AHPVTMTYEYEGETYELNLIDTPGHVDFSYEVSRSLAACEGALLIVDAAQGVQAQSLANV
 4 | YLALERDLEIIPVLNKIDLPAAQPEAIKKQIEEFIGLDTSNTIACSAKTGQGIPEILESI
 5 | IRLVPPPKPPQETELKALIFDSHYDPYVGIMVYVRVISGEIKKGDRITFMATKGSSFEVL
 6 | GIGAFLPEATLMEGSLRAGQVGYFIANLKKVKDVKIGDTVTTVKHPAKEPLEGFKEIKPV
 7 | VFAGIYPIDSSDFDTLKDALGRLQLNDSALTIEQENSHSLGFGFRCGFLGLLHLEIIFER
 8 | ISREFDLDIIATAPSVIYKVVLKNGKTLFIDNPTAYPDPALIEHMEEPWVHVNIITPQEY
 9 | LSNIMSLCMDKRGICLKTDMLDQHRLVLSYELPLNEIVSDFNDKLKSVTKGYGSFDYRLG
10 | DYKKGAIIKLEILINDEAVDAFSCLVHRDKAESKGRSICEKLVDVIPPQLFKIPIQAAIN
11 | KKIIARETIRALAKNVTAKCYGGDITRKRKLWDKQKKGKKRMKEFGKVSIPNTAFVEVLK
12 | ME
13 | >sp|O84026|RF1_CHLTR
14 | MEIKVLECLKRLEEVEKQISDPNIFSNPKEYSSLSKEHARLSEIKNAHESLVATKKILQD
15 | DKLALSTEKDPEIVAMLEEGVLVGEEAVERLSKQLENLLIPPDPDDDLSVIMELRAGTGG
16 | DEAALFVGDCVRMYHLYAASKGWQCEVLSTSESDLGGYKEYVMGISGASVKRFLQYEAGT
17 | HRVQRVPETETQGRVHTSAVTVAVLPEPAEDDEEVFIDEKDLRIDTFRSSGAGGQHVNVT
18 | DSAVRITHIPSGVVVTCQDERSQHKNKAKAMRVLKARIRDAEVQKRAQEASAMRSAQVGS
19 | GDRSERIRTYNFPQNRVTDHRIGLTLYNLDRVMEGELDMITTALVTHVHRQLFGHEETA
20 | >sp|O84591|ENO_CHLTR
21 | MFDVVISDIEAREILDSRGYPTLCVKVITNTGTFGEACVPSGASTGIKEALELRDKDPKR
22 | YQGKGVLQAISNVEKVLMPALQGFSVFDQITADAIMIDADGTPNKEKLGANAILGVSLAL
23 | AKAAANTLQRPLYRYLGGSFSHVLPCPMMNLINGGMHATNGLQFQEFMIRPISAPSLTEA
24 | VRMGAEVFNALKKILQNRQLATGVGDEGGFAPNLASNAEALDLLLTAIETAGFTPREDIS
25 | LALDCAASSFYNTQDKTYDGKSYADQVGILAELCEHYPIDSIEDGLAEEDFEGWKLLSET
26 | LGDRVQLVGDDLFVTNSALIAEGIAQGLANAVLIKPNQIGTLTETAEAIRLATIQGYATI
27 | LSHRSGETEDTTIADLAVAFNTGQIKTGSLSRSERIAKYNRLMAIEEEMGPEALFQDSNP
28 | FSKA
29 | >sp|P0CD71|EFTU_CHLTR
30 | MSKETFQRNKPHINIGTIGHVDHGKTTLTAAITRALSGDGLADFRDYSSIDNTPEEKARG
31 | ITINASHVEYETANRHYAHVDCPGHADYVKNMITGAAQMDGAILVVSATDGAMPQTKEHI
32 | LLARQVGVPYIVVFLNKIDMISEEDAELVDLVEMELVELLEEKGYKGCPIIRGSALKALE
33 | GDAAYIEKVRELMQAVDDNIPTPEREIDKPFLMPIEDVFSISGRGTVVTGRIERGIVKVS
34 | DKVQLVGLRDTKETIVTGVEMFRKELPEGRAGENVGLLLRGIGKNDVERGMVVCLPNSVK
35 | PHTQFKCAVYVLQKEEGGRHKPFFTGYRPQFFFRTTDVTGVVTLPEGIEMVMPGDNVEFE
36 | VQLISPVALEEGMRFAIREGGRTIGAGTISKIIA
37 | >tr|O84829|O84829_CHLTR
38 | MLELLSKDLPIITQGITGKAGSFHTTQCVAYGSNFVGGVTPGKGGSQFLDLPIFDSVLEA
39 | KQATGCRASMIFVPPPFAAEAIFEAEDAGIELIVCITEGIPIKDMLEVASLMEKSASSLI
40 | GPNCPGVIKPGVCKIGIMPGYIHLPGKVGVVSRSGTLTYEAVWQLTQRKIGQSVCIGIGG
41 | DPLNGTSFIDALQEFEKDSQTEAVLMIGEIGGSAEEEAADWTRQHSSKPVIAFIAGATAP
42 | KGKRMGHAGAIISGKSGDAFSKQEALRQAGVTVVESLALIGEAVASVLKPR
43 | >sp|O84332|TPIS_CHLTR
44 | MFTDKETHRKPFPTWAHLLHSEPSKQFVFGNWKMNKTLTEAQTFLKSFISSDILSNPQII
45 | TGIIPPFTLLSACQQAVSDSPIFLGAQTTHEADSGAFTGEISAPMLKDIGVDFVLIGHSE
46 | RRHIFHEQNPVLAEKAAAAIHSGMIPVLCIGETLEEQESGATQDILLNQLTTGLSKLPEQ
47 | ASFILAYEPVWAIGTGKVAHPDLVQETHAFCRKTIASLFSKDIAERTPILYGGSVKADNA
48 | RSLSLCPDVNGLLVGGASLSSENFLSIIQQIDIP
49 | >sp|P17821|DNAK_CHLTR
50 | MSEKRKSNKIIGIDLGTTNSCVSVMEGGQPKVIASSEGTRTTPSIVAFKGGETLVGIPAK
51 | RQAVTNPEKTLASTKRFIGRKFSEVESEIKTVPYKVAPNSKGDAVFDVEQKLYTPEEIGA
52 | QILMKMKETAEAYLGETVTEAVITVPAYFNDSQRASTKDAGRIAGLDVKRIIPEPTAAAL
53 | AYGIDKEGDKKIAVFDLGGGTFDISILEIGDGVFEVLSTNGDTHLGGDDFDGVIINWMLD
54 | EFKKQEGIDLSKDNMALQRLKDAAEKAKIELSGVSSTEINQPFITIDANGPKHLALTLTR
55 | AQFEHLASSLIERTKQPCAQALKDAKLSASDIDDVLLVGGMSRMPAVQAVVKEIFGKEPN
56 | KGVNPDEVVAIGAAIQGGVLGGEVKDVLLLDVIPLSLGIETLGGVMTPLVERNTTIPTQK
57 | KQIFSTAADNQPAVTIVVLQGERPMAKDNKEIGRFDLTDIPPAPRGHPQIEVTFDIDANG
58 | ILHVSAKDAASGREQKIRIEASSGLKEDEIQQMIRDAELHKEEDKQRKEASDVKNEADGM
59 | IFRAEKAVKDYHDKIPAELVKEIEEHIEKVRQAIKEDASTTAIKAASDELSTHMQKIGEA
60 | MQAQSASAAASSAANAQGGPNINSEDLKKHSFSTRPPAGGSASSTDNIEDADVEIVDKPE
61 | >sp|O84081|FOLD_CHLTR
62 | MLLKGAPAADHILATIKENIRACSKAPGLAVVLIGNNPASEIYVNMKIKRATDLGMVSKS
63 | YRKPSDATLSDILALIHQLNNDENIHGILVQLPLPKHLDAQAILSTITPDKDVDGLHPVN
64 | VGKLLLGETDGFIPCTPAGIVELCKYYEIPLHGKHVVILGRSNIVGKPLAALLMQRHADT
65 | NASVTLLHSQSEHLTEITRTADILISAIGVPLFVNKEMIAEKTVIMDVGTSRIPAANPKG
66 | YILVGDVDFNNVVPVCRAITPVPGGVGPMTVAMLMRNTWESFLRHTS
67 | >sp|P0CE13|G3P_CHLTR
68 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS
69 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA
70 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT
71 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV
72 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA
73 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK
74 | >sp|P0C0Z7|CH60_CHLTR
75 | MVAKNIKYNEEARKKIQKGVKTLAEAVKVTLGPKGRHVVIDKSFGSPQVTKDGVTVAKEV
76 | ELADKHENMGAQMVKEVASKTADKAGDGTTTATVLAEAIYTEGLRNVTAGANPMDLKRGI
77 | DKAVKVVVDQIRKISKPVQHHKEIAQVATISANNDAEIGNLIAEAMEKVGKNGSITVEEA
78 | KGFETVLDIVEGMNFNRGYLSSYFATNPETQECVLEDALVLIYDKKISGIKDFLPVLQQV
79 | AESGRPLLIIAEDIEGEALATLVVNRIRGGFRVCAVKAPGFGDRRKAMLEDIAILTGGQL
80 | ISEELGMKLENANLAMLGKAKKVIVSKEDTTIVEGMGEKEALEARCESIKKQIEDSSSDY
81 | DKEKLQERLAKLSGGVAVIRVGAATEIEMKEKKDRVDDAQHATIAAVEEGILPGGGTALI
82 | RCIPTLEAFLPMLTNEDEQIGARIVLKALSAPLKQIAANAGKEGAIIFQQVMSRSANEGY
83 | DALRDAYTDMLEAGILDPAKVTRSALESAASVAGLLLTTEALIAEIPEEKPAAAPAMPGA
84 | GMDY
85 | 


--------------------------------------------------------------------------------
/testdata/in_folder/proteome/MYCGE.fa:
--------------------------------------------------------------------------------
 1 | >sp|P47500|RF1_MYCGE
 2 | MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSFEQIKVINKELKHKQPLIVKFKELQKLVE
 3 | NANEAEQILNNSSLKELHEEAKKELEKIKASLPSLEEEIKFLLLPVDENNQKNVIVEIRP
 4 | AAGGDESCIFLSDLFNMYKNYCTSKNWTVELNEIIPASVGINFVSFAVNGTDVFAKLKFE
 5 | SGVHRVQRVPLTEAKGRVHTSTVTVAVLPQLEEVEITINPSDLRIDTYRASGAGGQHVNR
 6 | TESAVRITHLPTGIVVACQEGKSQFSNRDKAMKMLRAKLWENAQNKQLSTQADLRKSQVG
 7 | SGERAEKIRTYNYPQNRITDHRIKLTINKLNTVILGDLDEIIEALQADEKKQQLEKFIS
 8 | >sp|P13927|EFTU_MYCGE
 9 | MAREKFDRSKPHVNVGTIGHIDHGKTTLTAAICTVLAKEGKSAATRYDEIDKAPEEKARG
10 | ITINSAHVEYSSDKRHYAHVDCPGHADYIKNMITGAAQMDGAILVVSATDSVMPQTREHI
11 | LLARQVGVPKMVVFLNKCDIASDEEVQELVAEEVRDLLTSYGFDGKNTPIIYGSALKALE
12 | GDPKWEAKIHDLIKAVDEWIPTPTREVDKPFLLAIEDTMTITGRGTVVTGRVERGELKVG
13 | QEVEIVGLKPIRKAVVTGIEMFKKELDSAMAGDNAGVLLRGVERKEVERGQVLAKPGSIK
14 | PHKKFKAEIYALKKEEGGRHTGFLNGYRPQFYFRTTDVTGSIALAENTEMVLPGDNASIT
15 | VELIAPIACEKGSKFSIREGGRTVGAGTVTEVLE
16 | >sp|P47639|ATPB_MYCGE
17 | MIKKENLTYGKVHQVIGPVVDVIFSESKQLPRVYDCLSVQLKKSELFLEATQLIGDDIVR
18 | CIALGPTEGLARNVKVTNYNHPIEVPVGKNVLGRMFNVLGEPIDGKEPLPKKPKLSIHRN
19 | PPAFDEQPNTVDIFETGIKVIDLLTPYVRGGKIGLFGGAGVGKTVLVQELIHNIAKEHSG
20 | LSVFAGVGERTREGNDLYYEMIQGGVIDKTVLVFGQMNEPPGARMRVALTALTMAEYFRD
21 | HDNQNVLLFIDNIFRFTQAGSEVSALLGRMPSAVGYQPTLAIEMGKLQERIASTKTGSIT
22 | SVQAIYVPADDLTDPAPATTFTHLDAKTVLDRNIAALGIFPAINPLESTSRLLDPSVVGI
23 | NHYKVALGVQNILQRFAELQDIIAILGIDELSDEDKIIVERARRIRNFLSQPFFVAEKFS
24 | GIAGKYVSLNDTVQSFKEILEGKHDHLPEQAFFYVGTIQEAVEKAKRLNQEFDKTK
25 | >sp|P47547|DNAK_MYCGE
26 | MSADNGLIIGIDLGTTNSCVSVMEGGRPVVLENPEGKRTTPSIVSYKNNEIIVGDAAKRQ
27 | MVTNPNTIVSIKRLMGTSNKVKVQNADGTTKELSPEQVSAQILSYLKDFAEKKIGKKISR
28 | AVITVPAYFNDAERNATKTAGKIAGLNVERIINEPTAAALAYGIDKASREMKVLVYDLGG
29 | GTFDVSLLDIAEGTFEVLATAGDNRLGGDDWDNKIIEYISAYIAKEHQGLNLSKDKMAMQ
30 | RLKEAAERAKIELSAQLETIISLPFLTVTQKGPVNVELKLTRAKFEELTKPLLERTRNPI
31 | SDVIKEAKIKPEEINEILLVGGSTRMPAVQKLVESMVPGKKPNRSINPDEVVAIGAAIQG
32 | GVLRGDVKDVLLLDVTPLTLSIETLGGVATPLIKRNTTIPVSKSQIFSTAQDNQESVDVV
33 | VCQGERPMSRDNKSLGRFNLGGIQPAPKGKPQIEITFSLDANGILNVKAKDLTTQKENSI
34 | TISDNGNLSEEEIQKMIRDAEANKERDNIIRERIELRNEGEGIVNTIKEILASPDAKNFP
35 | KEEKEKLEKLTGNIDAAIKANDYAKLKVEIENFKKWREEMAKKYNPTGEQGPQAK
36 | >sp|P47543|G3P_MYCGE
37 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK
38 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK
39 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM
40 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP
41 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT
42 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL
43 | >sp|P47641|ATPA_MYCGE
44 | MADKLNEYVALIKTEIKKYSKKIFNSEIGQVISVADGIAKVSGLENALLNELIQFENNIQ
45 | GIVLNLEQNTVGIALFGDYSSLREGSTAKRTHSVMKTPVGDVMLGRIVNALGEAIDGRGD
46 | IKATEYDQIEKIAPGVMKRKSVNQPLETGILTIDALFPIGKGQRELIVGDRQTGKTAIAI
47 | DTIINQKDKDVYCVYVAIGQKNSSVAQIVHQLEVNDSMKYTTVVCATASDSDSMVYLSPF
48 | TGITIAEYWLKKGKDVLIVFDDLSKHAVAYRTLSLLLKRPPGREAFPGDVFYLHSRLLER
49 | ACKLNDENGGGSITALPIIETQAGDISAYIPTNVISITDGQLFMVSSLFNAGQRPAIQIG
50 | LSVSRVGSAAQTKAIKQQTGSLKLELAQYSELDSFSQFGSDLDENTKKVLEHGKRVMEMI
51 | KQPNGKPYSQVHEALFLFAINKAFIKFIPVDEIAKFKQRITEEFNGSHPLFKELSNKKEF
52 | TEDLESKTKTAFKMLVKRFISTLTDYDITKFGSIEELN
53 | 


--------------------------------------------------------------------------------
/testdata/in_folder/species_tree.nwk:
--------------------------------------------------------------------------------
1 | ((AQUAE,CHLTR)inter1,MYCGE)inter2;


--------------------------------------------------------------------------------
/tests/data/HOG_0890520.fa:
--------------------------------------------------------------------------------
 1 | >sp|P47500|RF1_MYCGE||MYCGE||1000000000 sp|P47500|RF1_MYCGE
 2 | MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSFEQIKVINKELKHKQPLIVKFKELQKLVE
 3 | NANEAEQILNNSSLKELHEEAKKELEKIKASLPSLEEEIKFLLLPVDENNQKNVIVEIRP
 4 | AAGGDESCIFLSDLFNMYKNYCTSKNWTVELNEIIPASVGINFVSFAVNGTDVFAKLKFE
 5 | SGVHRVQRVPLTEAKGRVHTSTVTVAVLPQLEEVEITINPSDLRIDTYRASGAGGQHVNR
 6 | TESAVRITHLPTGIVVACQEGKSQFSNRDKAMKMLRAKLWENAQNKQLSTQADLRKSQVG
 7 | SGERAEKIRTYNYPQNRITDHRIKLTINKLNTVILGDLDEIIEALQADEKKQQLEKFIS
 8 | >sp|O84026|RF1_CHLTR||CHLTR||1001000001 sp|O84026|RF1_CHLTR
 9 | MEIKVLECLKRLEEVEKQISDPNIFSNPKEYSSLSKEHARLSEIKNAHESLVATKKILQD
10 | DKLALSTEKDPEIVAMLEEGVLVGEEAVERLSKQLENLLIPPDPDDDLSVIMELRAGTGG
11 | DEAALFVGDCVRMYHLYAASKGWQCEVLSTSESDLGGYKEYVMGISGASVKRFLQYEAGT
12 | HRVQRVPETETQGRVHTSAVTVAVLPEPAEDDEEVFIDEKDLRIDTFRSSGAGGQHVNVT
13 | DSAVRITHIPSGVVVTCQDERSQHKNKAKAMRVLKARIRDAEVQKRAQEASAMRSAQVGS
14 | GDRSERIRTYNFPQNRVTDHRIGLTLYNLDRVMEGELDMITTALVTHVHRQLFGHEETA
15 | >sp|O67032|RF1_AQUAE||AQUAE||1002000001 sp|O67032|RF1_AQUAE
16 | MLKEAYISRLDKLQEKYRKLQEELSKPEVIQDVEKYKKLSKELKELQEINELYERYKKAQ
17 | KELKEAKELLKSSDKDLRELAEEEVNRLTEEMKKLEEELKVHLVPKDPNDTKNVILEIRA
18 | GAGGEEAALFAADLFRMYQKYAEEKGWKVSILSSNKTGLGGYKEVIALIEGEGAYSRLKY
19 | ESGVHRVQRVPVTESSGRIHTSTATVAVLPEVDETDIKIKPEELKIETFRASGAGGQYVN
20 | TTETAVRITHIPTGIVVQCQDERSQFQNKQKALKILYAKLKDYYERKKQEEIAKERKEQV
21 | GTGERSEKIRTYNFPQNRVTDHRINLTLYKLQDVLEGKLDEIIDALRAKEIEKKLELVEK
22 | EG
23 | 


--------------------------------------------------------------------------------
/tests/data/correct-msa.fa:
--------------------------------------------------------------------------------
 1 | >HUMAN01350 | OMA1057741 | PBLD_HUMAN | [Homo sapiens]
 2 | --MKLPIFIADAFTARAFRGNPAAVC----LLENELDEDMHQKIAREMNLSETAFIRKLH
 3 | PTDNFAQSSCFGLRWFTPASEVPLCGHATLASAAVLFHKIK-NMNSTLTFVTLSGELRAR
 4 | RAEDGIVLDLPLYPAHPQDFHEV-EDLI---KTAIGNTLVQDICYSPDTQKLLVRLSDVY
 5 | NRSFLENLKVNTENLLQVENTGKVKGLILTLKGEPGGQTQAFDFYSRYFAPWVGVAEDPV
 6 | TGSAHAVLSSYWSQHLGKKEMHAFQCSHRGGELGISLRPDGRVD----------------
 7 | ----IRGGAAVVLEGTLTA
 8 | >YEAST02880 | OMA1057741 | YHI9_YEAST | [Saccharomyces cerevisiae (strain ATCC 204508 / S288c)]
 9 | MTLMVPFKQVDVFTEKPFMGNPVAVINFLEIDENEVSQEELQAIANWTNLSETTFLFK--
10 | PSD---KKYDYKLRIFTPRSELPFAGHPTIGSCKAFLEFTKNTTATSLVQECKIGAVPIT
11 | INEGLISFKAPM-----ADYESISSEMIADYEKAIGLKFIKPPALLHTGPEWIVALVEDA
12 | ETCF--NANPNFAMLAHQTKQNDHVGIILAGPKKEAAIKNSYEM--RAFAPVINVYEDPV
13 | CGSGSVALARYL------QEVYKFEKT-----TDITISEGGRLKRNGLMLASIKKEADNS
14 | TSYYIAGHATTVIDGKIKV
15 | >MOUSE00277 | OMA1057741 | K3W4L7 | [Mus musculus]
16 | --MKLPIFIADAFTATAFRGNPAAVC----LLERTLEEDAHQQIAREMNLSETAFIRKLQ
17 | PTDSFTQSSRFGLRWFTPVSEVPLCGHATLASAAVLFHKIRNNRNSTLTFVTMSGELKAR
18 | RAEDGIVLDFPVYPTFPQDFHEV-EDLI---KAAIGDTLVQDIRYSTDTRKLLVRLSDSY
19 | DRSFLESLKVNTEPLPAIEKTGKVRGLILTVKGEPGGQTAPYDFYSRYFAPWVGIAEDPV
20 | TGSAHTVLSSYWSQQLRKKEMRAFQCSRRGGELDISLRPDGRVD----------------
21 | ----IKGGAVIVLEGTLTA
22 | >PANTR00757 | OMA1057741 | A0A6D2W9P7 | [Pan troglodytes]
23 | --MKLPIFIADAFTARAFRGNPAAVC----LLENELDEDMHQKIAREMNLSETAFIRKLH
24 | PTDNFAQSSCFGLRWFTPASEVPLCGHATLASAAVLFHKIK-NMNSTLTFVTLSGELRAR
25 | RAEDGIVLDLPLYPAHPQDFHEV-EDLI---KTAIGNTLVQDICYSPDTRKLLVRLSDVY
26 | NRSFLENLKVNTENLLQVENTGKVKGLILTLKGEPGGQTQAFDFYSRYFAPWVGVAEDPV
27 | TGSAHAVLSSYWSQHLGKKEMHAFQCSRRGGELGISLRPDGRVD----------------
28 | ----IRGCAAVVLEGTLTA
29 | 


--------------------------------------------------------------------------------
/tests/test_fasttree_wrapper.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | 
 4 | from Bio import AlignIO
 5 | from FastOMA._wrappers import infer_gene_tree
 6 | from FastOMA.zoo.wrappers import WrapperError
 7 | import pathlib
# Directory containing this test module; the tests below load their
# alignment fixtures from its "data" subdirectory.
this_dir = pathlib.Path(__file__).parent
 9 | 
10 | 
class FastTreeTester(unittest.TestCase):
    """Checks ``infer_gene_tree`` against a failing and a valid alignment fixture."""

    def test_failing_tree_building_reports_error_from_fasttree(self):
        """The failing fixture must raise ``WrapperError`` and log the cause.

        The test requires an ERROR-level record on the "FastOMA" logger whose
        text contains "Non-unique name".
        NOTE(review): presumably ``failing-msa.fa`` contains duplicated
        sequence names; the fixture is not visible here, confirm.
        """
        fixture = this_dir / "data" / "failing-msa.fa"
        alignment = AlignIO.read(fixture, "fasta")
        with self.assertLogs("FastOMA", level="ERROR") as captured:
            # Callable form: passes only if the call raises WrapperError.
            self.assertRaises(WrapperError, infer_gene_tree, alignment)
            # Checked while still inside the log-capturing context, so a
            # missing message is reported by this assertion first.
            logged_text = "\n".join(captured.output)
            self.assertIn("Non-unique name", logged_text)

    def test_treebuilding_with_correct_msa(self):
        """A valid alignment yields a tree that mentions the HUMAN01350 record."""
        fixture = this_dir / "data" / "correct-msa.fa"
        alignment = AlignIO.read(fixture, "fasta")
        inferred = infer_gene_tree(alignment)
        self.assertIn("HUMAN01350", inferred)


--------------------------------------------------------------------------------
/tests/test_infer_subhog.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from ete3 import Tree, TreeNode
 3 | from Bio.Seq import Seq
 4 | from Bio.SeqRecord import SeqRecord
 5 | from argparse import Namespace
 6 | from FastOMA._hog_class import HOG, Representative
 7 | from FastOMA._infer_subhog import LevelHOGProcessor
 8 | 
 9 | 
10 | class TestLevelHogProcessor(TestCase):
11 |     def setUp(self):
12 |         genetree = Tree(
13 |             '(((((G00100_SE001||SE001:153.567,G00100_SE008||SE008:153.567)1:39.499[&&NHX:evoltype=S],(G00100_SE006||SE006:173.507,G00100_SE007||SE007:173.507)1:19.5597[&&NHX:evoltype=S])1:14.0196[&&NHX:evoltype=S],(G00100_SE003||SE003:198.481,((((G00100_SE011||SE011:136.533,G00100_SE012||SE012:136.533)1:7.60673[&&NHX:evoltype=S],(G00100_SE010||SE010:36.1782,G00342_SE010||SE010:36.1782)1:107.961[&&NHX:evoltype=D])1:8.49419[&&NHX:evoltype=S],G00100_SE009||SE009:152.634)1:13.723[&&NHX:evoltype=S],(((G00186_SE004||SE004:143.819,(G00186_SE011||SE011:136.533,(G00186_SE012||SE012:116.411,G00242_SE012||SE012:116.411)1:20.1214[&&NHX:evoltype=D])1:7.28662[&&NHX:evoltype=S])1:0.32011[&&NHX:evoltype=S],(G00186_SE010||SE010:31.4887,G00350_SE010||SE010:31.4887)1:112.651[&&NHX:evoltype=D])1:8.49419[&&NHX:evoltype=S],G00186_SE009||SE009:152.634)1:13.723[&&NHX:evoltype=S])1:32.1245[&&NHX:evoltype=D])1:8.60492[&&NHX:evoltype=S])1:36.2336[&&NHX:evoltype=S],(((G00110_SE001||SE001:153.567,G00110_SE008||SE008:153.567)1:39.499[&&NHX:evoltype=S],(G00110_SE006||SE006:173.507,G00110_SE007||SE007:173.507)1:19.5597[&&NHX:evoltype=S])1:14.0196[&&NHX:evoltype=S],(G00110_SE003||SE003:198.481,(((G00110_SE004||SE004:143.819,(G00110_SE011||SE011:136.533,G00110_SE012||SE012:136.533)1:7.28662[&&NHX:evoltype=S])1:0.32011[&&NHX:evoltype=S],G00110_SE010||SE010:144.139)1:8.49419[&&NHX:evoltype=S],G00110_SE009||SE009:152.634)1:45.8474[&&NHX:evoltype=S])1:8.60492[&&NHX:evoltype=S])1:36.2336[&&NHX:evoltype=S])1:6.68041[&&NHX:evoltype=D],(G00100_SE002||SE002:119.545,(G00100_SE013||SE013:97.4899,(G00100_SE014||SE014:87.2367,G00100_SE015||SE015:87.2367)1:10.2532[&&NHX:evoltype=S])1:22.055[&&NHX:evoltype=S])1:130.455[&&NHX:evoltype=S]);')
14 |         sptree = Tree("dummy;")
15 |         hogs = [HOG(SeqRecord(Seq("AAAAAA"), id=n.name), sptree, "test1") for n in genetree.iter_leaves()]
16 |         conf = Namespace(msa_write=False, gene_trees_write=False, number_of_samples_per_hog=5, msa_filter_method="col-row-threshold",
17 |                          gap_ratio_row=0.3, gap_ratio_col=0.5, min_col_trim=400)
18 |         self.genetree = genetree
19 |         self.lp = LevelHOGProcessor(sptree, hogs, "test1", conf)
20 | 
21 |     def test_propose_representatives(self):
22 |         rep = self.lp.find_most_divergent_representatives_from_genetree(self.genetree)
23 |         self.assertEqual(len(rep), self.lp.conf.number_of_samples_per_hog)
24 |         self.assertIn(rep[self.lp.conf.number_of_samples_per_hog-1].get_id(), ("G00100_SE013||SE013","G00100_SE015||SE015","G00100_SE014||SE014","G00100_SE002||SE002"))
25 | 
26 |     def test_reconcilation(self):
27 |         exp = self.genetree.write(features=['evoltype'])
28 |         self.lp.infer_reconciliation(genetree=self.genetree)
29 |         self.assertEqual(exp, self.genetree.write(features=['evoltype']))
30 |         self.assertEqual(self.genetree.sos, 0)
31 |         self.assertEqual(self.genetree.children[0].sos, 1)
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/tests/test_roothog_example.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | 
3 | class RootHOGExampleTestCase(TestCase):
4 | 
5 |     def setUpClass(cls):
6 |         pass


--------------------------------------------------------------------------------
/utils/filter_orthoxml_completeness.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | ###  How to use:  python  filter_orthoxml_completeness.py  FastOMA_HOGs.orthoxml  0.3  
 5 | 
 6 | import sys
 7 | import logging
 8 | logging.basicConfig(level=logging.DEBUG)
 9 | from FastOMA.zoo.hog import filter_orthoxml_file, HOGFilter
10 | 
11 | print("started ")
12 | 
13 | input_orthoxml_add = sys.argv[1]
14 | threshold_filt = float(sys.argv[2])
15 | 
16 | score_type = "CompletenessScore"
17 | 
18 | 
19 | output_name = input_orthoxml_add + "_filt_"+str(threshold_filt)+".orthoxml"
20 | with open(output_name, 'wb') as output_file:
21 |     filt = HOGFilter(score_type, threshold_filt)
22 |     filter_orthoxml_file(input_orthoxml_add, output_file, filt)
23 | 
24 | print("we wrote the output in "+output_name)
25 | 
26 | 


--------------------------------------------------------------------------------
/utils/find_unfinished_rhogs.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | import sys
 5 | 
 6 | import os
 7 | folder = sys.argv[1]  #"/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/out_folder/"
 8 | 
 9 | from os import listdir
10 | 
11 | project_files = listdir(folder + "/rhogs_all/")
12 | rhogs = []
13 | for file in project_files:
14 |     file_name_split = file.split(".")
15 |     if file_name_split[-1] == "fa":
16 |         rhog_id = int(file_name_split[0].split("_")[1])
17 |         rhogs.append(rhog_id)
18 | 
19 | print("number of rhogs is ", len(rhogs))
20 | 
21 | folder_pickle = folder + "/pickle_rhogs/"
22 | project_files = listdir(folder_pickle)
23 | pickles = []
24 | for file in project_files:
25 |     if os.path.getsize(folder_pickle + file) > 2:
26 |         file_name_split = file.split(".")
27 |         if file_name_split[-1] == "pickle":
28 |             rhog_id = int(file_name_split[0].split("_")[1])
29 |             pickles.append(rhog_id)
30 |     else:
31 |         print("this file is empty", file)
32 | 
33 | print("number of pickles is ", len(pickles))
34 | 
35 | no_pickle_list = set(rhogs) - set(pickles)
36 | 
37 | print("number of rhogs not finished is ", len(no_pickle_list))
38 | 
39 | print("\n \n ", no_pickle_list)
40 | 


--------------------------------------------------------------------------------
/utils/orthoxml2OG.py:
--------------------------------------------------------------------------------
  1 | 
  2 | """
  3 | This code converts an OrthoXML file into a set of FASTA files, one per orthologous group (OG).
  4 | 
  5 | How to run:
  6 | cd out_folder
  7 | python orthoxml2OG.py output_hog_.orthoxml rhogs_all 
  8 | 
  9 | 
 10 | Output
 11 |  - Gene names per OG in maximal_og_prot.tsv
 12 |  - Fasta files in OGs_maximal
 13 | """
 14 | 
 15 | 
 16 | from ete3 import Tree
 17 | import sys
 18 | import os
 19 | from FastOMA.zoo.hog.convert import orthoxml_to_newick
 20 | from Bio import SeqIO
 21 | 
 22 | 
 23 | 
 24 | 
 25 | def max_og_tree(tree):
 26 |     for node in tree.traverse("preorder"):
 27 |         # for node in xml_tree.traverse(strategy="preorder", is_leaf_fn=lambda n: hasattr(n, "attriremoved") and n.attriremoved==True):
 28 |         if not node.is_leaf() and hasattr(node,"Ev") and node.Ev == 'duplication':       # node.name[:3] == "dup"
 29 |             dup_node = node
 30 |             children = dup_node.get_children()
 31 |             list_num_species = []
 32 |             for child in children:
 33 |                 child_name_leaves = child.get_leaves()
 34 |                 species_list = []
 35 |                 for leaf in child_name_leaves:
 36 |                     name = leaf.name
 37 |                     if name[:3] == "no_":
 38 |                         name = leaf.name.split("_")[-1]
 39 |                     if name in species_dic:
 40 |                         species_name = species_dic[name]
 41 |                         species_list.append(species_name)
 42 |                     else:
 43 |                         print("species not in the dic ",name)
 44 |                 species_set = set(species_list)
 45 |                 list_num_species.append(len(species_set))
 46 |             index_max_species = list_num_species.index(max(list_num_species))
 47 |             # if there are few children with identical number of species, the case would be not a polytomi but two children with one species
 48 |             # num_occurence = [1 for i in list_num_species if i == max(list_num_species)]
 49 |             # if len(num_occurence) > 1:
 50 |             #    print("please check this case with the developer the tool. The tree has polytomy.")
 51 |             child_max_species = children[index_max_species]
 52 |             children_to_remove = [i for i in children if i != child_max_species]
 53 |             for child_to_remove in children_to_remove:
 54 |                 for i in child_to_remove.get_leaves():
 55 |                     i.in_og = "no"
 56 | 
 57 | 
 58 |     og_prot_list = []
 59 |     for node in tree.traverse("preorder"):
 60 |         if node.is_leaf():
 61 |             if hasattr(node,"in_og") and node.in_og == "no":
 62 |                 pass # print(node.name)
 63 |             else:
 64 |                 og_prot_list.append(node.name)
 65 | 
 66 |     return og_prot_list
 67 | 
 68 | 
 69 | 
 70 | input_orthoxml=sys.argv[1] # "out_folder/output_hog_.orthoxml" 
 71 | rhog_all_folder = sys.argv[2]+"/" # "out_folder/rhogs_all/" 
 72 | fasta_format = "fa" # of the rhogs_all
 73 | 
 74 | 
 75 | output_file = "maximal_og_prot.tsv"
 76 | 
 77 | 
 78 | trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True) # encode_levels_as_nhx=False,  xref_tag="protId",
 79 | print("We extracted "+str(len(trees))+" trees  in NHX format from the input HOG orthoxml"+input_orthoxml)
 80 | 
 81 | 
 82 | OGs = {}
 83 | for hog_id, tree_string in trees.items():
 84 | 
 85 |     tree = Tree(tree_string,format=1)
 86 |     og_prot_list = max_og_tree(tree)
 87 |     OGs[hog_id] = og_prot_list
 88 | 
 89 | 
 90 | print("done")
 91 | 
 92 | 
 93 | with open(output_file, 'w') as handle:
 94 |     for hog_id, og_prot_list in OGs.items():
 95 |         line_text = str(hog_id)+"\t"+str(og_prot_list)+"\n"
 96 |         handle.write(line_text)
 97 | handle.close()
 98 | 
 99 | print("We wrote the protein families information in the file "+output_file)
100 | 
101 | 
102 | out_folder_ogs = "OGs_maximal/"
103 | os.makedirs(out_folder_ogs)
104 | 
105 | print("start writing "+str(len(OGs))+" OGs as fasta files in folder " +out_folder_ogs )
106 | for hog_id, og_prot_list in OGs.items(): #hog_id="HOG_0667494_sub10524"
107 |     rhog_id = "_".join(hog_id.split("_")[:2]) 
108 | 
109 |     rhogs_all_address = rhog_all_folder + rhog_id + "."+fasta_format
110 |     rhogs_all_prots = list(SeqIO.parse(rhogs_all_address, "fasta"))
111 | 
112 |     og_prots = []
113 |     og_prot_list = OGs[hog_id]
114 |     for rhogs_prot in rhogs_all_prots:
115 |         if rhogs_prot.id.split("||")[0] in og_prot_list:
116 |             sp= rhogs_prot.id.split("||")[1]
117 |             rhogs_prot.description += " ["+ sp +"]"         
118 |             og_prots.append(rhogs_prot)
119 | 
120 |     og_id =  "OG_" + hog_id  # one OG per rootHOG      # "/HOG_"+ str(rhogid_num).zfill(7)
121 |     SeqIO.write(og_prots, out_folder_ogs+og_id+".fa", "fasta")   
122 | print("writing done")
123 | 
124 | 


--------------------------------------------------------------------------------
/utils/orthoxml2family.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | import sys
 6 | 
 7 | from FastOMA.zoo.hog import extract_flat_groups_at_level
 8 | 
 9 | 
10 | """
11 | how to run
12 |     python orthoxml2family.py my_hogs.orthoxml
13 |     
14 | - to convert orthoxml to rootHOG (protein families)
15 | """
16 | 
17 | input_orthoxml = sys.argv[1]
18 | output_file = "families_prot.tsv"
19 | 
20 | toplevel_groups = []
21 | for grp in extract_flat_groups_at_level(input_orthoxml):
22 |     toplevel_groups.append(set(g.xref for g in grp))
23 | 
24 | # toplevel_groups is a list of sets
25 | 
26 | print("We extracted "+str(len(toplevel_groups))+" protein families from the input HOG orthoxml"+input_orthoxml)
27 | print("The first one contain "+str(len(toplevel_groups[0]))+" proteins.")
28 | 
29 | with open(output_file, 'w') as handle:
30 |     for toplevel_group_idx, toplevel_group in enumerate(toplevel_groups):
31 |         line_text = str(toplevel_group_idx)+"\t"+str(toplevel_group)+"\n"
32 |         handle.write(line_text)
33 | handle.close()
34 | 
35 | print("We wrote the protein families information in the file "+output_file)
36 | 
37 | 
38 | # we need to know the species name of each prot,  as prot_specis dic
39 | # prot_name_universal = []
40 | # for group in toplevel_groups:
41 | #     if len(group) > 0.9 * 2181:
42 | #         species = [prot_specis[prot] for prot in group]
43 | #         species_unq = set(species)
44 | #         if len(species_unq) > 0.9 * 2181:
45 | #             prot_name_universal.append(group)
46 | #
47 | # len(prot_name_universal)


--------------------------------------------------------------------------------
/utils/orthoxml2newick.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | import sys
 5 | import os
 6 | from FastOMA.zoo.hog.convert import orthoxml_to_newick
 7 | 
 8 | """
 9 | how to run
10 |     python orthoxml2newick.py my_hogs.orthoxml
11 | """
12 | 
13 | input_orthoxml = sys.argv[1]
14 | output_folder = "output_folder_trees"
15 | 
16 | os.mkdir(output_folder)
17 | 
18 | trees = orthoxml_to_newick(input_orthoxml)
19 | 
20 | print("We extracted "+str(len(trees))+" trees from the input HOG orthoxml"+input_orthoxml)
21 | 
22 | # write them as files
23 | for treeid_hog, tree in trees.items():
24 |     tree_file_i = output_folder+"/tree_"+str(treeid_hog)+".nwk"
25 |     with open(tree_file_i,'w') as handle:
26 |         handle.write(tree)
27 |     handle.close()
28 |     # tree_i.write(format=1, format_root_node=True, outfile=tree_file_i)
29 | print("We wrote "+str(len(trees))+" trees  in nhx format from the input HOG orthoxml"+input_orthoxml+"in "+output_folder)
30 | print("You can visualise each tree using https://beta.phylo.io/viewer/ as extendeed newick format.")
31 | 


--------------------------------------------------------------------------------
/utils/orthoxml2pairs.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from FastOMA.zoo.hog import transform
 3 | 
 4 | #from zoo.tree_utils import collapse, gene_species, transform, HOG_coverages
 5 | 
 6 | import io
 7 | import lxml.etree
 8 | import sys
 9 | orthoxml_file = sys.argv[1]
10 | #"/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/benchmark-webservice3/orthoxml/euk_omamer200.dev8_13oct.orthoxml"
11 | 
12 | 
13 | orthxml_str = []
14 | with open(orthoxml_file, "r") as f:
15 |     for i in f:
16 |         orthxml_str.append(i)
17 | print(len(orthxml_str))
18 | dic_gene_integer={}
19 | for line in orthxml_str:
20 |     if "gene id" in line:
21 |         found=False
22 |         gene_int= line.split("\"")[1]
23 |         gene_name = line.split("\"")[3]
24 |         dic_gene_integer[gene_int] = gene_name
25 | 
26 | 
27 | 
28 | orthoxml_etree=lxml.etree.parse(orthoxml_file)
29 | 
30 | pw_orthologs_integer = sorted(list(transform.iter_pairwise_relations(orthoxml_etree)))
31 | # iter_pairwise_relations(obj, rel_type=None    (def:'ortholog' , but possible to use 'paralog')
32 | print(len(pw_orthologs_integer))
33 | print(pw_orthologs_integer[:2])
34 | pw_orthologs_gene =[]
35 | for pair in pw_orthologs_integer:
36 |     pw_orthologs_gene.append((dic_gene_integer[pair[0]],dic_gene_integer[pair[1]]))
37 | 
38 | 
39 | 
40 | print(len(pw_orthologs_gene))
41 | print(pw_orthologs_gene[:2])
42 | 
43 | 
44 | output_file = open(orthoxml_file+"_pairs.tsv","w")
45 | for  pair in pw_orthologs_gene:
46 |     output_file.write(pair[0]+"\t"+pair[1]+"\n")
47 | 
48 | output_file.close()
49 | 


--------------------------------------------------------------------------------
/utils/orthoxml2perrhog.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | # import OrthoXMLSplitter
 5 | 
 6 | 
 7 | folder="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/out_folder/"
 8 | hog_file = folder + "/output_hog_.orthoxml"
 9 | outdir=folder+"/perrhog_folder"
10 | 
11 | from OrthoXMLSplitter import OrthoXMLSplitter
12 | 
13 | splitter = OrthoXMLSplitter(hog_file, outdir)
14 | 
15 | splitter()
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/utils/orthoxml2phylostratigraphy.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # you need to install pyham https://github.com/DessimozLab/pyham 
 3 | 
 4 | import pyham       
 5 | 
 6 | import logging
 7 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
 8 | 
 9 | 
10 | working_folder="./"
11 | 
12 | nwk_path= working_folder+"in_folder/species_tree.nwk" # species tree should be pruned (no extra leaves)
13 | 
14 | tree_str = pyham.utils.get_newick_string(nwk_path, type="nwk")
15 | print(tree_str[:10])
16 | 
17 | orthoxml_path=working_folder+"out_folder/output_hog.orthoxml"
18 | ham_analysis = pyham.Ham(tree_str, orthoxml_path, use_internal_name=True)
19 | print("Ham analysis done") # for a big orthoxml file it can take ~30mins
20 | 
21 | #phylostratigraphy
22 | 
23 | #create tree profile, classify all genomes by extant or ancestral, and get % of dup, lost, retained, and gained
24 | treeprofile = ham_analysis.create_tree_profile(outfile= working_folder+"/out_folder/phylostratigraphy.html")
25 | treemap = treeprofile.compute_tree_profile_full()
26 | 
27 | 


--------------------------------------------------------------------------------
/utils/pickle2orthoxml.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | from xml.dom import minidom
  4 | import xml.etree.ElementTree as ET
  5 | import pickle
  6 | from FastOMA._utils_subhog import read_species_tree
  7 | from FastOMA.collect_subhogs import convert_speciestree_to_orthoxml_taxonomy
  8 | import sys
  9 | from FastOMA.transformer import header_transformer
 10 | 
 11 | from FastOMA.collect_subhogs import iter_hogs
 12 | from FastOMA.collect_subhogs import update_hogids
 13 | from pathlib import Path
 14 | 
 15 | ```
 16 | python pickle2orthoxml.py  "no_header" "file_D0680685.pickle"
 17 | 
 18 | python pickle2orthoxml.py "selected_genes"  pickle_folder gene_id_dic_xml.pickle  "species_tree_checked.nwk"     # this will be slow.  gene_id_dic_xml.pickle is in the output of infer_roothogs
 19 | ```
 20 | 
 21 | mode = sys.argv[1]  #"selected_genes" #"no_header" # "selected_genes"  "all_genes"
 22 | 
 23 | if mode=="no_header":
 24 | 
 25 |     input_pickle= sys.argv[2]   # "file_D0680685.pickle"
 26 |     handle=open(input_pickle,'rb')
 27 |     orthoxml_file = pickle.load(handle)
 28 | 
 29 |     print(len(orthoxml_file))
 30 |     xml_str = minidom.parseString(ET.tostring(orthoxml_file[0])).toprettyxml(indent="   ")
 31 | 
 32 |     with open(input_pickle+"_noheader.orthoxml","w") as out_file:
 33 |         out_file.write(xml_str)
 34 | 
 35 | if mode =="selected_genes":
 36 | 
 37 |     input_pickle = sys.argv[2] # a folder  of pickles  pickle_folder
 38 |     gene_id_pickle_file = sys.argv[3] # generated in infer_roothogs.
 39 |     # available in out_folder/temp_output/gene_id_dic_xml.pickle
 40 |     # this keeps the gene name and the gene integer ID used in orthoxml.
 41 |     species_tree = sys.argv[4] # "species_tree_checked.nwk"
 42 | 
 43 |     handle=open(input_pickle,'rb')
 44 |     orthoxml_file1 = pickle.load(handle) # todo might have two elements inside?
 45 |     gene_int_set = set()
 46 |     num_digit = 10  # integer ids  # assumption ?
 47 |     for orthoxml_part in orthoxml_file1:
 48 |         xml_str = minidom.parseString(ET.tostring(orthoxml_part)).toprettyxml(indent="   ")
 49 |         gene_int_set_i = set([int(i[1:num_digit+1]) for i in xml_str.split("geneRef id=")[1:] ])
 50 |         gene_int_set.update(gene_int_set_i)
 51 | 
 52 |     from datetime import datetime
 53 |     fastoma_version= "0"
 54 |     orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/",
 55 |                                                    "origin": "FastOMA " + fastoma_version,
 56 |                                                    "originVersion": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
 57 |                                                    "version": "0.5"})  #
 58 | 
 59 |     with open(gene_id_pickle_file, 'rb') as handle:
 60 |         gene_id_name = pickle.load(handle)  # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
 61 |     print("We read the gene_id_name dictionary with %d items", len(gene_id_name))
 62 | 
 63 |     speciestree = read_species_tree(species_tree)
 64 |     taxonomy, name2taxid = convert_speciestree_to_orthoxml_taxonomy(speciestree)
 65 |     print("Now creating the header of orthoxml")
 66 | 
 67 |     id_transform_= "noop" #  noop:No transformation, "UniProt":   '>sp|P68250|1433B_BOVIN' --> P68250""")
 68 | 
 69 |     id_transformer = header_transformer(id_transform_)
 70 | 
 71 |     #  #### create the header of orthoxml ####
 72 |     for query_species_name, list_prots in gene_id_name.items():
 73 |         first=True
 74 |         for (gene_idx_integer, query_prot_name) in list_prots:
 75 |             if gene_idx_integer in gene_int_set:
 76 |                 if first:
 77 |                     species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "taxonId": str(name2taxid[query_species_name]), "NCBITaxId": "0"})
 78 |                     database_xml = ET.SubElement(species_xml, "database", attrib={"name": "database", "version": "2023"})
 79 |                     genes_xml = ET.SubElement(database_xml, "genes")
 80 |                     prot_id = id_transformer.transform(query_prot_name)
 81 |                     gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
 82 |                     first=False
 83 |                 else:
 84 |                     prot_id = id_transformer.transform(query_prot_name)
 85 |                     gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
 86 | 
 87 | 
 88 | 
 89 |     print("gene_xml is created.")
 90 |     # orthoxml_file.append(taxonomy)
 91 | 
 92 |     scores = ET.SubElement(orthoxml_file, "scores")
 93 |     ET.SubElement(scores, "scoreDef", {"id": "CompletenessScore",
 94 |                                        "desc": "Fraction of expected species with genes in the (Sub)HOG"})
 95 | 
 96 |     #  #### create the groups of orthoxml   ####
 97 |     groups_xml = ET.SubElement(orthoxml_file, "groups")
 98 | 
 99 |     with open(input_pickle, 'rb') as handle:
100 |         hogs_a_rhog_xml = pickle.load(handle)
101 |     for idx, hog_a_rhog_xml in enumerate(hogs_a_rhog_xml):
102 |         fam = idx # this could be improved
103 |         groups_xml.append(update_hogids(fam, hog_a_rhog_xml, name2taxid))
104 |     #for fam, hogs_a_rhog_xml in enumerate(iter_hogs(Path(pickle_folder)), start=1):
105 |     #    groups_xml.append(update_hogids(fam, hogs_a_rhog_xml, name2taxid))
106 |     print("converting the xml object to string.")
107 | 
108 |     output_xml_name= input_pickle+".orthoxml"
109 |     with open(output_xml_name, 'wb') as fh:
110 |         ET.indent(orthoxml_file, space='  ', level=0)
111 |         orthoxml = ET.ElementTree(orthoxml_file)
112 |         orthoxml.write(fh, encoding="utf-8", xml_declaration=True, )
113 |     print("orthoxml is written in %s", output_xml_name)
114 | 
115 | 
116 | 
117 | 


--------------------------------------------------------------------------------
/utils/write_orthoxml_per_rHOG.py:
--------------------------------------------------------------------------------
 1 | import xml.etree.ElementTree as ET
 2 | 
 3 | from os import listdir
 4 | from xml.dom import minidom
 5 | 
 6 | import pickle
 7 | 
 8 | 
 9 | 
10 | folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/"
11 | 
12 | # create this folder /out_folder/orthoxml_out/
13 | 
14 | gene_id_pickle_file = folder + "/out_folder/gene_id_dic_xml.pickle"
15 | 
16 | with open(gene_id_pickle_file, 'rb') as handle:
17 |     gene_id_name = pickle.load(handle)
18 |     # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
19 | print("gene_id_name read ", len(gene_id_name))
20 | 
21 | pickle_folder = folder + "/out_folder/pickle_rhogs_/"
22 | pickle_files_adress = listdir(pickle_folder)
23 | 
24 | orthoxml_out_folder = folder + "/out_folder/orthoxml_out/"
25 | check = listdir(orthoxml_out_folder)
26 | 
27 | 
28 | print("gene_xml created ")
29 | # hogs_a_rhog_xml_all = []
30 | for idx, pickle_file_adress in enumerate(pickle_files_adress):
31 | 
32 |     if idx % 100 == 0: print(idx)
33 |     with open(pickle_folder + pickle_file_adress, 'rb') as handle:
34 |         hogs_a_rhog_xml_batch = pickle.load(
35 |             handle)  # hogs_a_rhog_xml_batch is orthoxml_to_newick.py list of hog object.
36 |     handle.close()
37 |     # hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch)
38 |     # hogs_rhogs_xml_all is orthoxml_to_newick.py list of hog object.
39 |     # print("number of hogs is batch is ", len(hogs_a_rhog_xml_batch))
40 | 
41 |     xml_str = ""
42 |     for i in hogs_a_rhog_xml_batch:
43 |         xml_str += minidom.parseString(ET.tostring(i)).toprettyxml(indent="   ")
44 |     xs = xml_str.split("\n")
45 |     list_geneid = []
46 |     for x in xs:
47 |         if "geneRef id" in x:
48 |             list_geneid.append(int(x.split("\"")[1]))
49 |     print(len(list_geneid))
50 | 
51 |     query_species_name_list = []
52 |     for query_species_name, list_prots in gene_id_name.items():
53 | 
54 |         for (gene_idx_integer, query_prot_name) in list_prots:
55 |             if gene_idx_integer in list_geneid:
56 |                 query_species_name_list.append(query_species_name)
57 | 
58 |     query_species_name_set = list(set(query_species_name_list))
59 | 
60 |     output_xml_name = orthoxml_out_folder + pickle_files_adress[0] + "_.orthoxml"
61 |     orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA",
62 |                                                    "originVersion": "Nov 2021", "version": "0.3"})  #
63 | 
64 |     for query_species_name, list_prots in gene_id_name.items():
65 |         if query_species_name in query_species_name_set:
66 |             species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "NCBITaxId": "1"})
67 |             database_xml = ET.SubElement(species_xml, "database", attrib={"name": " database ", "version": "2020"})
68 |             genes_xml = ET.SubElement(database_xml, "genes")
69 | 
70 |             for (gene_idx_integer, query_prot_name) in list_prots:
71 |                 if gene_idx_integer in list_geneid:  # +[1007003758]
72 |                     query_prot_name_pure = query_prot_name
73 |                     gene_xml = ET.SubElement(genes_xml, "gene",
74 |                                              attrib={"id": str(gene_idx_integer), "protId": query_prot_name_pure})
75 | 
76 |     groups_xml = ET.SubElement(orthoxml_file, "groups")
77 | 
78 |     for hogs_a_rhog_xml in hogs_a_rhog_xml_batch:
79 |         groups_xml.append(hogs_a_rhog_xml)
80 |     # print("convert to string")
81 | 
82 |     xml_str = minidom.parseString(ET.tostring(orthoxml_file)).toprettyxml(indent="   ")
83 | 
84 |     with open(output_xml_name, "w") as file_xml:
85 |         file_xml.write(xml_str)
86 |     file_xml.close()
87 | 
88 |     print("orthoxml is written in  " + output_xml_name)
89 | 


--------------------------------------------------------------------------------