├── .dockerignore
├── .github
├── dependabot.yml
└── workflows
│ ├── docker-image.yml
│ └── publish-pypi-release.yml
├── .gitignore
├── .idea
└── remote-mappings.xml
├── Dockerfile
├── FastOMA.nf
├── FastOMA
├── __init__.py
├── _hog_class.py
├── _infer_subhog.py
├── _utils_frag_SO_detection.py
├── _utils_roothog.py
├── _utils_subhog.py
├── _wrappers.py
├── batch_roothogs.py
├── check_input.py
├── collect_subhogs.py
├── fastoma_notebook_stat.ipynb
├── helper_scripts.py
├── infer_roothogs.py
├── infer_subhogs.py
├── transformer.py
└── zoo
│ ├── README.md
│ ├── __init__.py
│ ├── familyanalyzer
│ ├── __init__.py
│ ├── genetree.py
│ ├── newick.py
│ ├── orthoxmlquery.py
│ ├── taxonomy.py
│ └── tools.py
│ ├── file_utils
│ ├── __init__.py
│ ├── context_managers.py
│ └── extractors.py
│ ├── hog
│ ├── __init__.py
│ ├── convert.py
│ ├── extract_groups.py
│ ├── extract_hog_info.py
│ ├── filter_orthoxml.py
│ ├── orthoxml_merge.py
│ └── transform.py
│ ├── seq_utils
│ ├── __init__.py
│ └── utils.py
│ ├── unionfind.py
│ ├── utils.py
│ └── wrappers
│ ├── __init__.py
│ ├── abstract_cli.py
│ ├── aligners
│ ├── __init__.py
│ ├── base_aligner.py
│ ├── mafft.py
│ ├── muscle.py
│ ├── probcons.py
│ └── prographmsa.py
│ ├── modeltesters
│ ├── __init__.py
│ ├── base_modeltester.py
│ ├── parsers.py
│ └── prottest.py
│ ├── options.py
│ ├── treebuilders
│ ├── __init__.py
│ ├── base_treebuilder.py
│ ├── fasttree.py
│ ├── guenomu.py
│ ├── iqtree.py
│ ├── parsers.py
│ ├── phyml.py
│ └── raxml.py
│ └── trimmers
│ ├── __init__.py
│ ├── base_trimmer.py
│ └── trimal.py
├── README.md
├── archive
├── analysis
│ ├── edit_orthxml_file.py
│ ├── find_unfinished_rhog.py
│ ├── preprocess_qfo_files.py
│ ├── write_gene_id_pickle_old_code.py
│ └── xml_.py
├── fastOMA_logo.png
└── test_curn.py
├── conf
└── base.config
├── environment-conda.yml
├── license
├── nextflow.config
├── nextflow_slurm.config
├── pyproject.toml
├── testdata
├── README.md
├── expected_output
│ ├── .DS_Store
│ ├── FastOMA_HOGs.orthoxml
│ ├── OrthologousGroups.tsv
│ ├── OrthologousGroupsFasta
│ │ ├── OG_0000001.fa
│ │ ├── OG_0000001.fa.gz
│ │ ├── OG_0000002.fa.gz
│ │ ├── OG_0000003.fa.gz
│ │ ├── OG_0000004.fa.gz
│ │ ├── OG_0000005.fa.gz
│ │ ├── OG_0000006.fa.gz
│ │ ├── OG_0000007.fa.gz
│ │ ├── OG_0000008.fa.gz
│ │ ├── OG_0000009.fa.gz
│ │ ├── OG_0000010.fa.gz
│ │ ├── OG_0000011.fa.gz
│ │ └── OG_0000012.fa.gz
│ ├── RootHOGs.tsv
│ ├── RootHOGsFasta
│ │ ├── HOG0000001.fa
│ │ ├── HOG0000001.fa.gz
│ │ ├── HOG0000002.fa.gz
│ │ ├── HOG0000003.fa.gz
│ │ ├── HOG0000004.fa.gz
│ │ ├── HOG0000005.fa.gz
│ │ ├── HOG0000006.fa.gz
│ │ ├── HOG0000007.fa.gz
│ │ ├── HOG0000008.fa.gz
│ │ ├── HOG0000009.fa.gz
│ │ ├── HOG0000010.fa.gz
│ │ ├── HOG0000011.fa.gz
│ │ └── HOG0000012.fa.gz
│ ├── hogmap
│ │ ├── AQUAE.fa.hogmap
│ │ ├── CHLTR.fa.hogmap
│ │ └── MYCGE.fa.hogmap
│ ├── orthologs.tsv
│ ├── phylostratigraphy.html
│ ├── report.html
│ ├── report.ipynb
│ ├── species_tree_checked.nwk
│ └── stats
│ │ └── report_2024-10-18_02-43-20.html
└── in_folder
│ ├── proteome
│ ├── AQUAE.fa
│ ├── CHLTR.fa
│ └── MYCGE.fa
│ └── species_tree.nwk
├── tests
├── data
│ ├── HOG_0890520.fa
│ ├── correct-msa.fa
│ └── failing-msa.fa
├── test_fasttree_wrapper.py
├── test_infer_subhog.py
└── test_roothog_example.py
└── utils
├── OrthoXMLSplitter.py
├── filter_orthoxml_completeness.py
├── find_unfinished_rhogs.py
├── orthoxml2OG.py
├── orthoxml2family.py
├── orthoxml2newick.py
├── orthoxml2pairs.py
├── orthoxml2perrhog.py
├── orthoxml2phylostratigraphy.py
├── pickle2orthoxml.py
└── write_orthoxml_per_rHOG.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | work
2 | .nextflow*
3 | .idea
4 | .git
5 | output
6 | testdata
7 | dist
8 | archive/
9 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | # Maintain dependencies for GitHub Actions
4 | - package-ecosystem: "github-actions"
5 | directory: "/"
6 | schedule:
7 | interval: "daily"
8 |
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
3 | on:
4 | push:
5 | pull_request:
6 | release:
7 |     types: [published]
8 |
9 | env:
10 | TEST_TAG: dessimozlab/fastoma:test
11 |
12 | jobs:
13 |
14 | build:
15 |
16 | runs-on: ubuntu-latest
17 |
18 | steps:
19 | - name: Checkout
20 | uses: actions/checkout@v4
21 | with:
22 | submodules: recursive
23 |
24 | - name: Docker meta
25 | id: meta
26 | uses: docker/metadata-action@v5
27 | with:
28 | # list of Docker images to use as base name for tags
29 | images: |
30 | dessimozlab/fastoma
31 | # generate Docker tags based on the following events/attributes
32 | tags: |
33 | type=schedule
34 | type=ref,event=branch
35 | type=ref,event=pr
36 | type=semver,pattern={{version}}
37 | type=semver,pattern={{major}}.{{minor}}
38 | type=semver,pattern={{major}}
39 | type=sha
40 |
41 | - name: Set up QEMU
42 | uses: docker/setup-qemu-action@v3
43 |
44 | - name: Set up Docker Buildx
45 | uses: docker/setup-buildx-action@v3
46 |
47 | - name: Build and export to docker for testing
48 | uses: docker/build-push-action@v6
49 | with:
50 | context: .
51 | load: true
52 | tags: ${{ env.TEST_TAG }}
53 |
54 | #- name: Test
55 | # run: |
56 | # docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out
57 | # if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
58 |
59 | - name: Login to DockerHub
60 | uses: docker/login-action@v3
61 | with:
62 | username: ${{ secrets.DOCKER_HUB_USERNAME }}
63 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
64 |
65 | - name: Build and push
66 | uses: docker/build-push-action@v6
67 | with:
68 | context: .
69 | platforms: linux/amd64,linux/arm64
70 | push: true
71 | #${{ github.event_name != 'push' && github.event_name != 'pull_request' }}
72 | tags: ${{ steps.meta.outputs.tags }}
73 | labels: ${{ steps.meta.outputs.labels }}
74 |
--------------------------------------------------------------------------------
/.github/workflows/publish-pypi-release.yml:
--------------------------------------------------------------------------------
1 |
2 | name: Upload FastOMA to pypi
3 |
4 | on:
5 | push:
6 | tags:
7 | - v*
8 |
9 | jobs:
10 | deploy:
11 |
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v4
16 | - name: Set up Python
17 | uses: actions/setup-python@v5
18 | with:
19 | python-version: '3.x'
20 | - name: Install dependencies
21 | run: |
22 | python -m pip install --upgrade pip
23 | pip install hatch
24 | - name: Build package
25 | run: hatch build
26 | - name: Publish package
27 | uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc
28 | with:
29 | user: __token__
30 | password: ${{ secrets.PYPI_API_TOKEN }}
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .nextflow*
2 | work/
3 | .idea/
4 | dist/
5 | archive
6 | .git
7 | .gitignore
8 | __pycache__
9 | *.orig
10 |
--------------------------------------------------------------------------------
/.idea/remote-mappings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim as basis
2 |
3 | # set environment varibles
4 | ENV PYTHONDONTWRITEBYTECODE 1
5 | ENV PYTHONUNBUFFERED 1
6 |
7 |
8 | FROM basis as builder
9 | RUN apt-get update \
10 | && apt-get install -y --no-install-recommends \
11 | build-essential \
12 | fasttree \
13 | libxml2 \
14 | mafft \
15 | && rm -rf /var/lib/apt/lists/*
16 |
17 | WORKDIR /src
18 | RUN pip install --upgrade hatch pip
19 | COPY pyproject.toml .
20 | RUN python -m venv /app \
21 | && hatch dep show requirements --all > requirements.txt \
22 | && /app/bin/pip install wheel setuptools \
23 | && /app/bin/pip install -r requirements.txt
24 |
25 | COPY . .
26 | RUN ls -la \
27 | && hatch build \
28 | && ls -la dist/ \
29 | && /app/bin/pip install dist/*.whl
30 |
31 |
32 | FROM basis as runtime
33 | RUN apt-get update \
34 | && apt-get install -y --no-install-recommends \
35 | fasttree \
36 | libxml2 \
37 | mafft \
38 | mmseqs2 \
39 | procps \
40 | && apt-get -y autoremove \
41 | && apt-get -y autoclean \
42 | && rm -rf /var/lib/apt/lists/*
43 |
44 | COPY --from=builder /app /app
45 | ENV PATH="/app/bin:$PATH"
46 |
--------------------------------------------------------------------------------
/FastOMA/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | __packagename__ = "FastOMA"
3 | __version__ = "0.3.5"
4 |
--------------------------------------------------------------------------------
/FastOMA/batch_roothogs.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | from pathlib import Path
4 | from ._wrappers import logger
5 | from . import __version__ as fastoma_version
6 |
7 | big_rhog_filesize_thresh = 400 * 1000
8 | sum_list_rhogs_filesize_thresh = 2 * 1e6
9 |
10 |
11 | """
12 |
13 | fastoma-batch-roothogs --input-roothogs omamer_rhogs --out-big rhogs_big --out-rest rhogs_rest -vv
14 |
15 | """
16 |
class BatchBuilder:
    """Accumulate roothog fasta files and copy them into numbered batch folders.

    Used as a context manager: files are collected via :meth:`add_hog` and a
    batch folder is written out whenever the accumulated file size exceeds
    ``max_size``; any partially filled batch is flushed on exit.
    """

    def __init__(self, outdir: Path, max_size: int):
        self.outdir = outdir
        self.max_size = max_size

    def __enter__(self):
        # reset the batch state and make sure the output folder exists
        self.cur_batch = []
        self.cur_size = 0
        self.counter = 0
        self.outdir.mkdir(parents=True, exist_ok=True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # write out whatever is left in the last, partially filled batch
        if self.cur_batch:
            self._flush()

    def add_hog(self, hog_file: Path):
        """Add one roothog fasta file; flush the batch once it grows past max_size."""
        size = hog_file.stat().st_size
        self.cur_batch.append(hog_file)
        self.cur_size += size
        logger.debug("adding %s with size %d to batch %d", hog_file, size, self.counter)
        if self.cur_size > self.max_size:
            self._flush()
            self.counter += 1

    def _flush(self):
        # materialize the current batch as a numbered subfolder of copies
        batch_dir = self.outdir / str(self.counter)
        batch_dir.mkdir()
        for fn in self.cur_batch:
            shutil.copy(fn, batch_dir)
        logger.debug("creating batch %s with %d families; total size of files is %d",
                     batch_dir, len(self.cur_batch), self.cur_size)
        self.cur_size = 0
        self.cur_batch = []
50 |
51 |
def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_rest: Path):
    """Split roothog fasta files into 'big' and 'rest' batch folders.

    Families are visited from largest to smallest file size; every file above
    ``big_rhog_filesize_thresh`` bytes becomes its own batch under
    ``output_folder_big`` (max_size=1 forces one family per batch), while the
    remaining families are packed into batches of roughly
    ``sum_list_rhogs_filesize_thresh`` bytes under ``output_folder_rest``.
    """
    sized_hogs = [(fa, fa.stat().st_size) for fa in roothog_path.rglob("*.fa")]
    sized_hogs.sort(key=lambda pair: pair[1], reverse=True)
    with BatchBuilder(output_folder_big, 1) as big_batches, \
            BatchBuilder(output_folder_rest, sum_list_rhogs_filesize_thresh) as rest_batches:
        for fa_path, size in sized_hogs:
            target = big_batches if size > big_rhog_filesize_thresh else rest_batches
            target.add_hog(fa_path)
62 |
63 |
def fastoma_batch_roothogs():
    """CLI entry point: partition rootHOG fasta files into analysis batches.

    Big families (file size above ``big_rhog_filesize_thresh``) each get their
    own batch folder under --out-big; all remaining families are packed into
    batches of roughly ``sum_list_rhogs_filesize_thresh`` bytes under --out-rest.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis")
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored")
    parser.add_argument('--out-big', required=True, help="folder where the big single family hogs should be stored")
    parser.add_argument('--out-rest', required=True, help="folder where the remaining families should be stored in"
                                                          "batch subfolder structure.")
    # typo fix: help text previously read "incrase verbosity"
    parser.add_argument('-v', default=0, action="count", help="increase verbosity")
    conf_batch_roothogs = parser.parse_args()
    # -v / -vv map to WARNING(30) / INFO(20) / DEBUG(10)
    logger.setLevel(level=30 - 10 * min(conf_batch_roothogs.v, 2))
    logger.debug("Arguments: %s", conf_batch_roothogs)

    folder_1h_rhog(Path(conf_batch_roothogs.input_roothogs), Path(conf_batch_roothogs.out_big),
                   Path(conf_batch_roothogs.out_rest))
78 |
79 |
--------------------------------------------------------------------------------
/FastOMA/helper_scripts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from ._wrappers import logger
3 | from .zoo.utils import auto_open
4 |
5 |
def extract_pw_rels(args):
    """Write pairwise ortholog/paralog relations of an orthoxml file as TSV.

    Reads ``args.orthoxml``, iterates relations of kind ``args.type`` and
    writes one tab-separated protein-id pair per line to ``args.out``.
    """
    from lxml import etree
    from .zoo.hog import transform
    doc = etree.parse(args.orthoxml)
    pairs = transform.iter_pairwise_relations(doc, rel_type=args.type, id_attribute="protId")
    with auto_open(args.out, 'wt') as fout:
        for gene_a, gene_b in pairs:
            fout.write(f"{gene_a}\t{gene_b}\n")
13 |
14 |
def main():
    """Entry point of the FastOMA helper-scripts command line tool.

    Builds an argparse parser with one subcommand per helper ('pw-rel' so
    far), configures logger verbosity from repeated -v flags, then dispatches
    to the function registered via ``set_defaults(func=...)``.
    """
    parser = argparse.ArgumentParser(description="FastOMA helper scripts")
    parser.add_argument('-v', default=0, action="count", help="increase verbosity")
    subparsers = parser.add_subparsers(required=True)

    # 'pw-rel': dump pairwise ortholog/paralog relations from an orthoxml file
    parser_pw = subparsers.add_parser('pw-rel')
    parser_pw.add_argument("--type", choices=("ortholog", "paralog"), default="ortholog",
                           help="Type of relations to extract. either 'ortholog' or 'paralog'")
    parser_pw.add_argument("--out", required=True, help="Path to output file")
    parser_pw.add_argument("--orthoxml", required=True, help="Path to input orthoxml file")
    parser_pw.set_defaults(func=extract_pw_rels)

    conf = parser.parse_args()
    # no -v -> WARNING(30), -v -> INFO(20), -vv -> DEBUG(10)
    logger.setLevel(level=30 - 10 * min(conf.v, 2))
    logger.debug(conf)
    conf.func(conf)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/FastOMA/infer_roothogs.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | from shutil import which
3 |
4 | from . import _utils_roothog
5 | from ._wrappers import logger
6 | from . import __version__ as fastoma_version
7 |
8 |
9 |
10 | """
11 |
12 | fastoma-infer-roothogs --proteomes proteome --hogmap hogmap --out-rhog-folder omamer_rhogs -vv
13 |
14 | """
15 |
16 |
def fastoma_infer_roothogs():
    """CLI entry point: group input proteins into rootHOG families.

    Pipeline: parse proteomes and omamer hogmap files, optionally resolve
    splicing isoforms, group proteins into roothogs, refine them (singletons,
    merging, big-family filtering) and write one fasta per rootHOG into
    --out-rhog-folder.  If mmseqs is available, unmapped singleton proteins
    are additionally clustered with linclust and written as extra roothogs.
    """
    import argparse
    parser = argparse.ArgumentParser(description="checking parameters for FastOMA")
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes")
    parser.add_argument("--splice", help="Path to the folder containing the splice information files")
    parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files")
    parser.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written") #out_rhog_folder
    parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug")
    parser.add_argument('--min-sequence-length', required=False, default=50, type=int,
                        help="minimum sequence length. Shorter sequences will be ignored. (Default=50)")

    # thresholds steering the rootHOG merging / filtering heuristics below
    parser.add_argument("--mergHOG-ratioMax-thresh", required=False, type=float, default=0.8, help="For merging rootHOGs, threshold of ratioMax ") # mergHOG_ratioMax_thresh
    parser.add_argument("--mergHOG-ratioMin-thresh", required=False, type=float, default=0.9, help="For merging rootHOGs, threshold of ratioMin ") # mergHOG_ratioMin_thresh
    parser.add_argument("--mergHOG-shared-thresh", required=False, type=float, default=10, help="For merging rootHOGs, threshold of number shared proteins ") # mergHOG_shared_thresh
    parser.add_argument("--mergHOG-fscore-thresh", required=False, type=float, default=70, help="For merging rootHOGs, threshold of famlut score shared proteins ") # mergHOG_fscore_thresh
    parser.add_argument("--big-rhog-size", required=False, type=int, default=50*1000, help= "For big rootHOGs, we have different heuristics") # big_rhog_size
    parser.add_argument("--big-fscore-thresh", required=False, type=int, default=95, help="For huge rootHOGs, we have different heuristics, like filtering low family score protiens") # big_fscore_thresh

    conf = parser.parse_args()
    # no -v -> WARNING(30), -v -> INFO(20), -vv -> DEBUG(10)
    logger.setLevel(level=30 - 10 * min(conf.v, 2))
    logger.debug("Arguments: %s", conf)

    species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes, conf.min_sequence_length) # optional input folder
    prot_recs_all = _utils_roothog.add_species_name_prot_id(prot_recs_lists)

    hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(prot_recs_lists, fasta_format_keep, folder=conf.hogmap) # optional input folder

    # splice handling is only done when the folder was given and actually exists
    splice_files = conf.splice is not None and os.path.exists(conf.splice)
    if splice_files:
        # keep only the best isoform per gene and remap hogmap entries accordingly
        isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, folder=conf.splice)
        isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform(
            species_names, isoform_by_gene_all, hogmaps
        )
        _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists)
        # for each isoform file, there will be a file ending with _selected_isoforms.tsv
        hogmaps = _utils_roothog.handle_splice(hogmaps, isoform_not_selected)

    # group proteins into roothogs, then refine step by step (order matters)
    rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps)
    rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots, hogmaps, conf)
    rhogs_prots = _utils_roothog.merge_rhogs2(hogmaps, rhogs_prots, conf)
    rhogs_prots = _utils_roothog.filter_big_roothogs(hogmaps, rhogs_prots, conf)

    # roothogs with fewer than 2 proteins are not written
    min_rhog_size = 2
    rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size)
    linclust_available=which("mmseqs") # True #
    # if mmseqs is not installed the output will be empty / None
    if linclust_available:
        num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa")
        if num_unmapped_singleton:
            result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa")
            logger.debug(" linclust is done %s", result_linclust)
            num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size)
            logger.debug("we wrote %d new clusters with linclust ", num_clusters)


if __name__ == "__main__":
    fastoma_infer_roothogs()
--------------------------------------------------------------------------------
/FastOMA/infer_subhogs.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from . import _utils_subhog
4 | from . import _infer_subhog
5 | from ._wrappers import logger
6 | from . import __version__ as fastoma_version
7 |
8 | """
9 |
10 | fastoma-infer-subhogs --input-rhog-folder rhogs_rest/0 --output-pickles "pickle_hogs" \
11 | --species-tree species_tree_checked.nwk -vv --parallel # --msa-write --gene-trees-write
12 |
13 | """
14 |
def fastoma_infer_subhogs():
    """CLI entry point: infer sub-HOGs for a batch of rootHOG fasta files.

    Parses the command line options, then runs subhog inference over every
    rootHOG fasta found in --input-rhog-folder, writing one pickle per
    rootHOG into --output-pickles (and per-taxonomic-level pickles under the
    current working directory).
    """
    import argparse
    parser = argparse.ArgumentParser(description="checking parameters for FastOMA",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument("--input-rhog-folder", required=True, help="Path to the input rootHOG folder.")
    parser.add_argument("--parallel", action='store_true', help="use concurrent parallel per rootHOG")
    parser.add_argument("--species-tree", required=True,
                        help="Path to the input species tree file in newick format")
    parser.add_argument("--output-pickles", required=False, default="pickle_hogs",
                        help="Path to the output folder")

    parser.add_argument("--threshold-dubious-sd", required=False, type=float, default=1/10,
                        help="Threshold to remove proteins in a gene tree due to low species overlap score, not enough evidence for duplication event.") # threshold_dubious_sd
    parser.add_argument("--number-of-samples-per-hog", type=int, default=5,
                        help="Number of representatives (sequences) per HOG. Defaults to ")
    parser.add_argument("--overlap-fragments", required=False, type=float, default=0.15,
                        help="Threshold overlap between two sequences (rows) in MSA to decide whether they are fragments of a gene.") # overlap_fragments
    parser.add_argument("--gene-rooting-method", required=False, default="midpoint", # gene_rooting_method
                        help="The method used for rooting of gene tree : midpoint mad Nevers_rooting .")
    parser.add_argument("--gene-trees-write", action='store_true',
                        help="writing the all gene trees .") # the order seems to be nwk_SD_labeled.nwk, dubious_sd0.nwk_SD_labeled.nwk, dubious_sd1.nwk_SD_labeled.nwk
    parser.add_argument("--msa-write", action='store_true',
                        help="writing the raw MSAs (might have more genes that the final gene tree).")
    parser.add_argument("--msa-filter-method",
                        choices=("col-row-threshold", "col-elbow-row-threshold", "trimal"),
                        default="col-row-threshold",
                        help="The method used for filtering MSAs.")
    parser.add_argument("--gap-ratio-row", required=False, type=float, default=0.3,
                        help="For trimming the MSA, the threshold of ratio of gaps for each row.")
    parser.add_argument("--gap-ratio-col", required=False, type=float, default=0.5,
                        help="For trimming the MSA, the threshold of ratio of gaps for each column.")
    parser.add_argument("--min-col-trim", required=False, type=int, default=50, # todo min rows trim
                        help="min no. columns in msa to consider for filtering")
    parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug")
    conf_infer_subhhogs = parser.parse_args()
    # no -v -> WARNING(30), -v -> INFO(20), -vv -> DEBUG(10)
    logger.setLevel(level=30 - 10 * min(conf_infer_subhhogs.v, 2))
    logger.debug("Arguments: %s", conf_infer_subhhogs)

    address_rhogs_folder = conf_infer_subhhogs.input_rhog_folder
    inferhog_concurrent_on = conf_infer_subhhogs.parallel
    if inferhog_concurrent_on:
        print("parallelization for subhog inference is on.")

    os.makedirs(conf_infer_subhhogs.output_pickles, exist_ok=True)

    pickles_subhog_folder_all = "./"  # pickle per taxonomic level

    # list the input rootHOG fasta files once and reuse the result
    # (the original code listed the same folder twice)
    list_rhog_fastas_files = _utils_subhog.list_rhog_fastas(address_rhogs_folder)
    print("there are ", len(list_rhog_fastas_files), "rhogs in the input folder")

    rhogs_fa_folder = address_rhogs_folder

    list_rhog_fastas_files_rem = list_rhog_fastas_files
    print("there are ", len(list_rhog_fastas_files_rem), "rhogs remained in the input folder", list_rhog_fastas_files_rem[:5])

    _infer_subhog.read_infer_xml_rhogs_batch(list_rhog_fastas_files_rem, inferhog_concurrent_on,
                                             conf_infer_subhhogs.output_pickles,
                                             pickles_subhog_folder_all, rhogs_fa_folder,
                                             conf_infer_subhhogs)

    # typo fix: message previously read "finsihed"; also removed the dead
    # trailing assignment `threshold_dubious_sd = 0.1` (value came from argparse)
    print("finished ", address_rhogs_folder)


if __name__ == "__main__":
    fastoma_infer_subhogs()
--------------------------------------------------------------------------------
/FastOMA/transformer.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import re
3 | from ._wrappers import logger
4 |
5 |
class FastaHeaderTransformer(metaclass=abc.ABCMeta):
    """Interface for objects that map a fasta header line to an identifier."""

    @abc.abstractmethod
    def transform(self, header):
        """Return the (possibly rewritten) identifier for *header*."""
        return header
10 |
11 |
class NoOpFastaHeaderTransformer(FastaHeaderTransformer):
    """Transformer that leaves fasta headers untouched."""

    def transform(self, header):
        """Return *header* unchanged."""
        return header
15 |
16 |
class ExtractUniProtAccessionFastaHeaderTransformer(FastaHeaderTransformer):
    """Extract the UniProt accession from headers like ``sp|P12345|NAME ...``."""

    def __init__(self):
        # UniProtKB accession pattern (see uniprot.org/help/accession_numbers)
        # wrapped in the canonical "sp|ACC|..." / "tr|ACC|..." fasta header form.
        # bugfix: the named group 'acc' was missing from the pattern
        # ("(?P[OPQ]..." instead of "(?P<acc>[OPQ]..."), which makes
        # re.compile raise and transform()'s m.group('acc') unreachable.
        self._up_re = re.compile(
            r"[sptr]{2}\|(?P<acc>[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})\|.*")

    def transform(self, header):
        """Return the accession embedded in *header*, or *header* itself if none found."""
        m = self._up_re.match(header)
        if m:
            return m.group('acc')
        logger.warning("cannot extract uniprot accession from header: %s", header)
        return header
27 |
28 |
def header_transformer(name):
    """Return the fasta header transformer registered under *name*.

    :param name: 'noop' or 'uniprot' (case-insensitive).
    :raises ValueError: if *name* is not a known transformer.
        (previously an unknown name silently returned None, which only
        surfaced later as an AttributeError at the call site)
    """
    key = name.lower()
    if key == "noop":
        return NoOpFastaHeaderTransformer()
    elif key == "uniprot":
        return ExtractUniProtAccessionFastaHeaderTransformer()
    raise ValueError(f"unknown fasta header transformer: {name}")
34 |
--------------------------------------------------------------------------------
/FastOMA/zoo/README.md:
--------------------------------------------------------------------------------
1 | zoo
2 | ===
3 |
4 |
5 |
6 | This is part of the [zoo](https://zoo.cs.ucl.ac.uk/doc/zoo/wrappers.html)
--------------------------------------------------------------------------------
/FastOMA/zoo/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | __version__ = "0.0.5"
--------------------------------------------------------------------------------
/FastOMA/zoo/familyanalyzer/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | from __future__ import print_function
3 | from __future__ import division
4 | from __future__ import absolute_import
5 | from future import standard_library
6 | standard_library.install_hooks()
7 | from .genetree import *
8 | from .taxonomy import *
9 |
--------------------------------------------------------------------------------
/FastOMA/zoo/familyanalyzer/orthoxmlquery.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | from __future__ import print_function
3 | from __future__ import division
4 | from __future__ import absolute_import
5 | from future.builtins import str
6 | from future import standard_library
7 | standard_library.install_hooks()
8 |
9 |
class ElementError(Exception):
    """Raised when an orthoxml query yields structurally unexpected elements."""

    def __init__(self, msg):
        # keep the message accessible as an attribute for callers
        self.msg = msg

    def __str__(self):
        return str(self.msg)
16 |
17 |
class OrthoXMLQuery(object):
    """Helper class with predefined queries on an orthoxml tree.

    All methods are classmethods operating on (lxml / ElementTree) elements
    of a document in the http://orthoXML.org/2011/ namespace.
    """

    ns = {"ns0": "http://orthoXML.org/2011/"}  # xml namespace

    @classmethod
    def getToplevelOrthologGroups(cls, root):
        """returns a list with the toplevel orthologGroup elements
        of the given root element."""
        xquery = ".//{{{ns0}}}groups/{{{ns0}}}orthologGroup".format(**cls.ns)
        return root.findall(xquery)

    @classmethod
    def getTaxRangeNodes(cls, root, recursively=True):
        """Return property nodes named 'TaxRange' below *root*.

        With recursively=False only direct children are searched."""
        xPrefix = ".//" if recursively else "./"
        xquery = '{}{{{}}}property[@name="TaxRange"]'.format(xPrefix,
                                                             cls.ns['ns0'])
        return root.findall(xquery)

    @classmethod
    def getTaxidNodes(cls, root, recursively=True):
        """Return property nodes named 'taxid' below *root*."""
        xPrefix = ".//" if recursively else "./"
        xquery = '{}{{{}}}property[@name="taxid"]'.format(xPrefix, cls.ns['ns0'])
        return root.findall(xquery)

    @classmethod
    def getGeneRefNodes(cls, root, recursively=True):
        """Return all geneRef elements below *root* as a list.

        NOTE(review): the non-recursive branch uses ``iterchildren``, which is
        an lxml-only API — confirm root is an lxml element in that case."""
        iterfn = root.iter if recursively else root.iterchildren
        iterator = iterfn('{{{}}}geneRef'.format(cls.ns['ns0']))
        return list(iterator)

    @classmethod
    def getGeneFromId(cls, id_, root):
        """Return the single gene element with the given id, or None.

        :raises ElementError: if more than one gene carries that id.
        NOTE(review): the ".*//" path prefix and the unescaped id in the
        predicate look fragile — confirm against the callers' inputs."""
        xquery = ".*//{{{}}}gene[@id='{}']".format(cls.ns['ns0'], id_)
        genes = root.findall(xquery)
        if len(genes) > 1:
            raise ElementError('several gene nodes with id {} '
                               'exist'.format(id_))
        gene = genes[0] if len(genes)>0 else None
        return gene

    @classmethod
    def getGroupsAtLevel(cls, level, root):
        """returns a list with the orthologGroup elements which have a
        TaxRange property equals to the requested level."""
        xquery = (".//{{{0}}}property[@name='TaxRange'][@value='{1}']/..".
                  format(cls.ns['ns0'], level))
        return root.findall(xquery)

    @classmethod
    def getSubNodes(cls, targetNode, root, recursively=True):
        """method which returns a list of all (if recursively
        is set to true) or only the direct children nodes
        having 'targetNode' as their tagname.
        The namespace is automatically added to the tagname."""
        xPrefix = ".//" if recursively else "./"
        xquery = "{}{{{}}}{}".format(xPrefix, cls.ns['ns0'], targetNode)
        return root.findall(xquery)

    @classmethod
    def is_geneRef_node(cls, element):
        """check whether a given element is an instance of a geneRef
        element."""
        return element.tag == '{{{ns0}}}geneRef'.format(**cls.ns)

    @classmethod
    def getLevels(cls, element):
        """returns a list of the TaxRange levels associated to the
        passed orthologGroup element. If the element does not have
        any TaxRange property tags associated, an empty list is
        returned."""
        propTags = cls.getSubNodes("property", element, recursively=False)
        res = [t.get('value') for t in propTags if t.get('name') == 'TaxRange']
        return res

    @classmethod
    def getInputGenes(cls, root, species=None):
        """returns a list of all gene elements in the orthoxml inside
        <genes> tags, i.e. the list of genes prior to running
        OMA-HOGS. Optionally filtered by species."""
        filter_ = ('[@name="{}"]'.format(species)
                   if species is not None else '')
        # non-empty filter -> restrict the xpath to the given species element
        if filter_ > '':
            xquery = ('/ns:orthoXML/ns:species{}/ns:database/'
                      'ns:genes//ns:gene'.format(filter_))
        else:
            xquery = '//ns:gene'
        return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']})

    @classmethod
    def getGroupedGenes(cls, root, species=None):
        """ returns a list of all geneRef elements inside <orthologGroup> tags,
        i.e. the list of genes clustered into families after running OMA-HOGS.
        Optionally filtered by species (matched against the TaxRange property)."""
        filter_ = ('[@name="TaxRange"and@value="{}"]'.format(species)
                   if species is not None else '')
        # non-empty filter -> only geneRefs that follow a matching property node
        if filter_ > '':
            xquery = ('/ns:orthoXML/ns:groups/ns:orthologGroup//ns:property{}/'
                      'following-sibling::ns:geneRef'.format(filter_))
        else:
            xquery = '//ns:geneRef'
        return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']})

    @classmethod
    def getScoreNodes(cls, root, score_id=None):
        """returns the associated score nodes for a certain (orthologGroup) node.
        If score_id is not specified, all scores will be returned"""
        xquery = './ns:score'
        if score_id is not None:
            xquery += "[@id='{}']".format(score_id)
        return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']})
129 |
--------------------------------------------------------------------------------
/FastOMA/zoo/familyanalyzer/tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
3 | from __future__ import absolute_import
4 | from future.builtins import dict
5 | from future.builtins import zip
6 | from future.builtins import range
7 | from future import standard_library
8 | standard_library.install_hooks()
9 |
10 | try:
11 | from progressbar import ProgressBar, Percentage, Timer, ETA, Bar
12 | PROGRESSBAR = True
13 | except ImportError:
14 | PROGRESSBAR = False
15 |
16 | from collections import deque
17 |
def setup_progressbar(msg, size):
    """Build a ProgressBar titled *msg* with percentage, bar, timer and ETA widgets.

    :param msg: label shown in front of the bar (': ' appended if missing)
    :param size: maximum value of the progress counter
    """
    if not msg.endswith(': '):
        msg += ': '

    widgets = [
        msg,
        Percentage(), ' ',
        Bar(), ' ',
        Timer(), ' ',
        ETA(),
    ]
    return ProgressBar(widgets=widgets, maxval=size)
30 |
def enum(*sequential, **named):
    """Create an Enum-like type whose members map to integer values.

    Positional names are numbered 0..n-1; keyword arguments give explicit
    values.  The generated type also carries a ``reverse`` dict mapping each
    value back to its member name.
    """
    members = {name: index for index, name in enumerate(sequential)}
    members.update(named)
    members['reverse'] = {value: key for key, value in members.items()}
    return type('Enum', (object,), members)
36 |
37 |
class IterableClassException(Exception):
    """Raised by py2_iterable when the decorated class has no __iter__ method."""
    pass
40 |
def py2_iterable(Class):
    """
    Use as a class decorator to make a class that has a python 3 next method --
    __next__() -- also iterable with python 2, which uses next(). Also checks
    for an __iter__ method -- if this is missing the class won't be iterable anyway.

    e.g.
        @py2_iterable
        class Py2and3Iterator(object):
            def __iter__(self):
                return self

            def __next__(self):
                ...

    :param Class: the class being decorated
    :raises IterableClassException: if Class has no __iter__ method
    :return: Class: the decorated class, which is iterable in py2 and py3
    """
    if not hasattr(Class, '__iter__'):
        # bug fix: Class.__class__.__name__ is the *metaclass* name (always
        # 'type' for plain classes); report the decorated class's own name.
        raise IterableClassException('Class "{}" has no __iter__ method and will not be iterable'
                                     .format(Class.__name__))

    if hasattr(Class, '__next__'):
        # alias the py3 __next__ method under the py2 name `next`
        next_method = getattr(Class, '__next__')
        setattr(Class, 'next', next_method)

    return Class
79 |
80 |
@py2_iterable
class Queue(object):
    """Simple FIFO queue backed by a deque, iterable under py2 and py3.

    Iterating the queue consumes it: each step dequeues the front item
    until the queue is empty."""

    def __init__(self):
        self.__items = deque()

    def __iter__(self):
        return self

    def __len__(self):
        return len(self.__items)

    def __next__(self):
        if self.isempty():
            raise StopIteration
        return self.dequeue()

    def enqueue(self, item):
        """Append item at the back of the queue."""
        self.__items.append(item)

    def dequeue(self):
        """Remove and return the front item; raises on an empty queue."""
        if self.isempty():
            raise Exception('empty queue')
        return self.__items.popleft()

    def isempty(self):
        return not self.__items
108 |
--------------------------------------------------------------------------------
/FastOMA/zoo/file_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .context_managers import *
2 | from .extractors import *
3 |
4 |
--------------------------------------------------------------------------------
/FastOMA/zoo/file_utils/context_managers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tempfile
4 |
5 |
6 |
7 | __all__ = ['TempFile', 'TempDir', 'ChDir', 'MkDir', 'NonDeletingTempDir']
8 |
class TempFile(object):
    """
    Context manager for working with a temporary file
    that automatically cleans up.

    Usage:

        with TempFile() as tmp:
            # In scope, the file at path `tmp` exists on disk
            # Do some work with it, e.g. open(tmp, 'w') ...

        # Out of scope, tmp is deleted

        with TempFile('local_temp_space') as tmp:
            # tmp is created in the directory 'local_temp_space'
            # The specified directory must exist, or an error is thrown

    :raises IOError: if dir_ is given but does not exist
    """

    def __init__(self, dir_=None):
        if dir_ is not None and not os.path.exists(dir_):
            # fixed stray quote that used to appear in this message
            raise IOError('Directory "{}" does not exist'.format(dir_))
        self.dir = dir_

    def __enter__(self):
        # mkstemp returns an open low-level fd plus the path; keep the fd
        # around so it can be closed again on exit.
        self._fd, self._wrapped_tmp = tempfile.mkstemp(dir=self.dir)
        return os.path.abspath(self._wrapped_tmp)

    def __exit__(self, type, value, tb):
        os.close(self._fd)
        try:
            os.remove(self._wrapped_tmp)
        except FileNotFoundError:
            # tolerate the managed file having been removed inside the block
            pass
40 |
41 |
class TempDir(object):
    """
    Context manager providing a temporary directory that is removed,
    together with all of its content, when the block is left.

    Usage:

        with TempDir() as tmpd:
            # In scope, tmpd exists on the disk
            # Do some work with tmpd ...

        # Out of scope, tmpd is deleted along with all its content

    Can be nested with TempFile, e.g.

        with TempDir() as tmpd, TempFile(tmpd) as tmpf:
            # tempfile tmpf is created inside temporary directory tmpd
            # On exit, everything is deleted
    """

    def __enter__(self):
        self._wrapped_tmpdir = tempfile.mkdtemp()
        path = os.path.abspath(self._wrapped_tmpdir)
        return path

    def __exit__(self, type, value, tb):
        # recursively delete the directory and everything below it
        shutil.rmtree(self._wrapped_tmpdir)
69 |
70 |
class NonDeletingTempDir(TempDir):
    """Variant of TempDir that leaves the directory in place on exit."""

    def __exit__(self, exc_type, value, tb):
        # deliberately skip TempDir's cleanup so the directory survives
        pass
74 |
75 |
class ChDir(object):
    """
    Context manager to switch to a working directory,
    and return to the current directory (like 'Dir.chdir do' block in Ruby)

    Usage:

        with TempDir() as dir, ChDir(dir):
            # Do some work in the working temp directory 'dir'

        # Exit 'dir'

    :raises IOError: if working_dir does not exist
    """

    def __init__(self, working_dir):
        if not os.path.exists(working_dir):
            # fixed stray quote that used to appear in this message
            raise IOError('Directory "{}" does not exist'.format(working_dir))
        self._cdir = os.getcwd()   # directory to return to on exit
        self._wdir = working_dir

    def __enter__(self):
        os.chdir(self._wdir)

    def __exit__(self, type, value, tb):
        os.chdir(self._cdir)
100 |
101 |
class MkDir(ChDir):
    """
    Context manager to create and switch to a working directory,
    then return to the current directory.

    NOTE(review): despite the docstring and the ChDir base class, __enter__
    and __exit__ are no-ops here and ChDir.__init__ is never called, so the
    process never actually changes directory -- only the directory creation
    in __init__ takes effect. Confirm whether chdir behaviour was intended.

    Usage:

        with TempDir() as dir, MkDir(dir):
            # Do some work in the working temp directory 'dir'

        # Exit 'dir'
    """

    def __init__(self, working_dir):
        # create the directory if needed; deliberately does not record the
        # current working directory (no chdir happens in this class)
        if not os.path.exists(working_dir):
            try:
                os.makedirs(working_dir)
            except OSError as e:
                # errno 17 == EEXIST: directory appeared concurrently
                if e.errno != 17:
                    raise
                pass  # path was created by another thread / process
                # this is a race condition, but probably benign

    def __enter__(self):
        pass

    def __exit__(self, type, value, tb):
        pass
130 |
--------------------------------------------------------------------------------
/FastOMA/zoo/file_utils/extractors.py:
--------------------------------------------------------------------------------
1 | import os
2 | import collections
3 | import re
4 |
5 |
6 |
7 | __all__ = ['tail', 'fall_back_tail', 'grep']
8 |
9 |
def tail(fh, lines=20, block_size=1024):
    """Returns the last n lines from a file

    This function returns the last n lines from an file-like
    object. It does this efficiently without reading the whole
    file, but rather by loading blocks from the end of the file.

    .. note::

        If the file is opened in text mode, i.e. open('/path', 'rt'),
        python3 cannot efficiently move in the file. In this case,
        the function fall back to a slow method that goes through
        the whole file.

    Example:

        >>> with open("/etc/passwd", 'rb') as f:
        ...     last_lines = tail(f, 2)
        ...
        >>> print(last_lines)

    :param fh: file-like object to read from
    :param int lines: number of lines to be returned
    :param int block_size: size of block to be read at once.
        intended for optimisation.
    :raises ValueError: if lines is not positive
    :returns: The last lines as a list of bytes/str object"""

    if lines <= 0:
        raise ValueError('invalid lines value %r' % lines)

    # text-mode handles expose an `encoding` attribute; python3 cannot seek
    # relative to the end in text mode, so fall back to a full scan there
    encoded = getattr(fh, 'encoding', False)
    if encoded:
        return fall_back_tail(fh, lines)
    # work in str or bytes consistently with the handle's mode
    CR = '\n' if encoded else b'\n'
    data = '' if encoded else b''
    fh.seek(0, os.SEEK_END)
    fsize = fh.tell()
    block = -1          # index of the block being read, counted from the end
    loaded_enough_data = False
    while not loaded_enough_data:
        step = (block * block_size)
        if abs(step) >= fsize:
            # the next block would start before the beginning of the file:
            # read just the remaining head portion and stop afterwards
            fh.seek(0)
            newdata = fh.read(block_size - (abs(step) - fsize))
            loaded_enough_data = True
        else:
            fh.seek(step, os.SEEK_END)
            newdata = fh.read(block_size)
        data = newdata + data        # prepend: we walk backwards through the file
        if data.count(CR) > lines:
            # strictly more newlines than requested lines guarantees that
            # the first (possibly partial) line is not among the last n
            break
        else:
            block -= 1
    return data.splitlines()[-lines:]
64 |
65 |
def fall_back_tail(fh, lines):
    """Slow tail variant for text-mode handles: scans the whole file.

    Keeps only the last `lines` lines in a bounded deque and returns them
    with the trailing newline stripped."""
    fh.seek(0)
    window = collections.deque(maxlen=lines)
    for line in fh:
        window.append(line)
    return [entry.rstrip('\n') for entry in window]
70 |
71 |
def grep(fh, pat):
    """Yields lines matching a pattern

    This generator yields every line of `fh` that matches `pat`. The pattern
    may be a plain str/bytes or an already compiled regex; str/bytes patterns
    are converted to match the handle's text/binary mode. The newline
    character is not removed.

    Example:
        >>> with open('/etc/hosts', 'rb') as fh:
        ...     for line in grep(fh, b'127.0.0.1'):
        ...         print(line)
        127.0.0.1	localhost

    :param fh: file-like object
    :param pat: search pattern, either str, bytes or compiled regex
    :returns: generator yielding lines matching pattern.

    """
    if isinstance(pat, (str, bytes)):
        # text-mode handles expose an `encoding` attribute
        wants_text = bool(getattr(fh, 'encoding', False))
        if wants_text and isinstance(pat, bytes):
            pat = pat.decode()
        elif not wants_text and isinstance(pat, str):
            pat = pat.encode('utf-8')
        pat = re.compile(pat)
    fh.seek(0)
    for line in fh:
        if pat.search(line):
            yield line
102 |
--------------------------------------------------------------------------------
/FastOMA/zoo/hog/__init__.py:
--------------------------------------------------------------------------------
1 | from .filter_orthoxml import *
2 | from .convert import orthoxml_to_newick
3 | from .orthoxml_merge import merge_orthoxml_files
4 | from .extract_groups import TaxLevel, extract_flat_groups_at_level, extract_marker_groups_at_level
5 |
--------------------------------------------------------------------------------
/FastOMA/zoo/hog/convert.py:
--------------------------------------------------------------------------------
1 | from xml.etree.ElementTree import XMLParser
2 | __all__ = ["orthoxml_to_newick"]
3 |
4 |
class TaxonNHXMixin:
    """Mixin that renders a node's taxonomic info as NHX tags.

    Expects the host class to provide `level` and `taxid` attributes;
    empty/None values are simply omitted."""

    def get_tax_nhx(self):
        """Return the list of NHX tag strings (':S=...', ':T=...')."""
        parts = []
        if self.level:
            parts.append(":S={}".format(self.level))
        if self.taxid:
            parts.append(":T={}".format(self.taxid))
        return parts
13 |
14 |
class Speciation:
    """Internal tree node for a speciation event in a HOG hierarchy.

    Nodes form a tree via the parent/children links; as_nhx() serialises
    the subtree rooted here into a newick/NHX fragment."""

    # value of the NHX ':Ev=' tag; None means a plain speciation node
    type = None

    def __init__(self, parent=None):
        self.level = ""
        self.taxid = None
        self.children = []
        self.parent = parent
        if parent is not None:
            parent.add_child(self)

    def add_child(self, node):
        self.children.append(node)

    def set_level(self, level):
        self.level = level

    def set_taxid(self, taxid):
        self.taxid = taxid

    def get_newick_node_name(self):
        # NHX-mixin subclasses provide get_tax_nhx and encode the level as
        # an NHX tag instead, so their plain node label stays empty.
        if hasattr(self, 'get_tax_nhx'):
            return ""
        return self.level.replace(' ', '_')

    def as_nhx(self):
        """Serialise this subtree as a newick string with NHX annotations."""
        subtree = ",".join(child.as_nhx() for child in self.children)
        if subtree:
            subtree = "({})".format(subtree)
        label = self.get_newick_node_name()
        annotation = "[&&NHX"
        if self.type:
            annotation += ":Ev={}".format(self.type)
        if hasattr(self, "get_tax_nhx"):
            annotation += "".join(self.get_tax_nhx())
        annotation += "]"
        # only attach the NHX block if it carries any content
        if annotation != "[&&NHX]":
            label += annotation
        return "{}{}".format(subtree, label)
55 |
56 |
class Duplication(Speciation):
    """Tree node for a duplication event; as_nhx() emits ':Ev=duplication'."""
    type = "duplication"
59 |
60 |
class Leaf(Speciation):
    """Terminal tree node representing one gene of a given species."""

    def __init__(self, xref, species, parent=None):
        super().__init__(parent=parent)
        self.name = xref       # gene cross-reference, used as leaf label
        self.level = species

    def get_newick_node_name(self):
        # leaves are always labelled with the gene xref, never the level
        return self.name
69 |
70 |
class NHXSpeciation(Speciation, TaxonNHXMixin):
    """Speciation node that also emits its taxon info as NHX tags (:S=/:T=)."""
    pass
73 |
class NHXDuplication(Duplication, TaxonNHXMixin):
    """Duplication node that also emits its taxon info as NHX tags (:S=/:T=)."""
    pass
76 |
class NHXLeaf(Leaf, TaxonNHXMixin):
    """Leaf node that also emits its species/taxid as NHX tags (:S=/:T=)."""
    pass
79 |
80 |
class OrthoxmlToNewick:
    """SAX-style target for xml.etree's XMLParser that converts every
    toplevel orthologGroup of an orthoxml document into a newick string.

    The parser calls start()/end() per element; an event tree of
    Speciation/Duplication/Leaf nodes (NHX variants if requested) is built
    and serialised when the toplevel group closes. close() returns the
    collected {family id: newick} dict, optionally together with a
    gene -> species mapping."""

    def __init__(self, xref_tag="protId", encode_levels_as_nhx=True, return_gene_to_species=False):
        self.xref_tag = xref_tag       # gene attribute used to label leaves
        self.gene2xref = {}            # internal gene id -> (xref, species)
        self.trees = {}                # family id -> finished newick string
        self.depth = 0                 # current orthologGroup nesting depth
        self.famid = None              # id of the toplevel group being read
        self.cur_event = None          # node currently being filled
        self.cur_species = None        # species whose gene list is being read
        self._use_nhx = encode_levels_as_nhx
        self._return_gene_to_species= return_gene_to_species

    def start(self, tag, attrib):
        # element-open callback: build up the gene table and the event tree
        if tag == "{http://orthoXML.org/2011/}species":
            self.cur_species = attrib['name']
        if tag == "{http://orthoXML.org/2011/}gene":
            self.gene2xref[attrib['id']] = (attrib[self.xref_tag], self.cur_species)
        elif tag == "{http://orthoXML.org/2011/}geneRef":
            leaf_cls = NHXLeaf if self._use_nhx else Leaf
            self.cur_event.add_child(leaf_cls(*self.gene2xref[attrib['id']]))
        elif tag == "{http://orthoXML.org/2011/}orthologGroup":
            if self.depth == 0:
                self.famid = attrib['id']
            speciation_cls = NHXSpeciation if self._use_nhx else Speciation
            self.cur_event = speciation_cls(self.cur_event)
            self.depth += 1
        elif tag == "{http://orthoXML.org/2011/}paralogGroup":
            dupl_cls = NHXDuplication if self._use_nhx else Duplication
            self.cur_event = dupl_cls(self.cur_event)
        elif tag == "{http://orthoXML.org/2011/}property":
            # taxonomic annotations of the current group node
            if attrib['name'] == "TaxRange":
                self.cur_event.set_level(attrib['value'])
            elif attrib['name'].lower() in ("taxid", "taxonid", "taxon_id", "ncbi_taxon_id"):
                self.cur_event.set_taxid(attrib['value'])

    def end(self, tag):
        # element-close callback: walk back up; serialise finished roothogs
        if tag == "{http://orthoXML.org/2011/}paralogGroup":
            self.cur_event = self.cur_event.parent
        elif tag == "{http://orthoXML.org/2011/}orthologGroup":
            self.depth -= 1
            if self.depth == 0:
                assert(self.cur_event.parent is None)
                self.trees[self.famid] = self.cur_event.as_nhx() + ";"
            self.cur_event = self.cur_event.parent

    def close(self):
        # called by XMLParser.close(); its return value is propagated
        if self._return_gene_to_species:
            gene2species = {k[0]: k[1] for k in self.gene2xref.values()}
            return self.trees, gene2species
        return self.trees
132 |
133 |
def orthoxml_to_newick(filename, xref_tag="protId", encode_levels_as_nhx=False, return_gene_to_species=False):
    """Convert all toplevel HOGs of an orthoxml file into newick trees.

    Every toplevel orthologGroup becomes one newick string; paralogGroup
    nodes are marked as duplications through the NHX label
    [&&NHX:Ev=duplication].

    :param filename: path of the input orthoxml file

    :param xref_tag: gene attribute used to label the leaves

    :param encode_levels_as_nhx: if True, species/taxid information of
           internal and leaf nodes is emitted as NHX tags (:S=..., :T=...);
           otherwise the TaxRange value becomes the plain newick label of
           internal nodes.

    :param return_gene_to_species: if True, additionally return a mapping
           from gene to species.

    :returns: a dict {roothogid: newick} or, if return_gene_to_species is
              set, a tuple of that dict and a {gene: species} mapping.
    """
    builder = OrthoxmlToNewick(
        xref_tag=xref_tag,
        encode_levels_as_nhx=encode_levels_as_nhx,
        return_gene_to_species=return_gene_to_species)
    parser = XMLParser(target=builder)
    with open(filename, 'rb') as xml_handle:
        for raw_chunk in xml_handle:
            parser.feed(raw_chunk)
    # XMLParser.close() forwards the return value of the target's close()
    return parser.close()
168 |
--------------------------------------------------------------------------------
/FastOMA/zoo/hog/extract_hog_info.py:
--------------------------------------------------------------------------------
1 | from ..utils import auto_open
2 | import collections
3 | from time import time
4 | import xml.etree.ElementTree as etree
5 | from pathlib import Path
6 | import logging
7 | logger = logging.getLogger(__name__)
8 |
# Minimal record for one gene of the species-definition part of an orthoxml.
Gene = collections.namedtuple("Gene", "xref species internal_id")


class SpeciesAnalyser:
    """Collects the gene inventory per species while parsing an orthoxml file.

    Genes are registered via add_genome_genes(); every gene later reported
    to be part of a group (gene_in_group) is removed again, so at the end
    only the singletons remain in self.genes."""

    def __init__(self, gene_attr="protId"):
        self.gene_attr = gene_attr   # xml attribute used as gene cross-reference
        self.genes = {}              # internal gene id -> Gene; shrinks as groups are seen
        self.nr_genes_per_species = collections.defaultdict(int)

    def add_genome_genes(self, genome_node):
        """Register all <gene> elements found below a <species> element."""
        genome_name = genome_node.get('name', None)
        if genome_name is None:
            # fall back to the taxonomy id when no species name is given
            genome_name = genome_node.get("NCBITaxId")

        generef_2_xref = {}
        for gene in genome_node.findall('.//{http://orthoXML.org/2011/}gene'):
            gene_id = gene.get('id')
            gene_prot_id = gene.get(self.gene_attr)
            generef_2_xref[gene_id] = Gene(gene_prot_id, genome_name, gene_id)
            self.nr_genes_per_species[genome_name] += 1
        self.genes.update(generef_2_xref)

    def gene_in_group(self, gene_id):
        """Mark a gene as belonging to a group.

        Safe to call repeatedly for the same gene (e.g. a gene referenced
        from several groups) and for unknown ids -- the bare dict.pop()
        used before raised a KeyError in those cases."""
        self.genes.pop(gene_id, None)

    def get_singletons(self):
        """Return the genes that were never reported in any group."""
        return self.genes

    def summary(self):
        """Return per-species stats: total genes and number not in any group."""
        single = collections.defaultdict(int)
        for g in self.genes.values():
            single[g.species] += 1
        return [{'species': g, 'genes': self.nr_genes_per_species[g], 'not_in_group': single[g]}
                for g in self.nr_genes_per_species]
43 |
44 |
def parse_orthoxml(fh, genome_watcher: SpeciesAnalyser):
    """Iterate over an orthoxml file and yield one summary dict per
    orthologGroup: id, level, the group's scores, nr_members, is_roothog.

    genome_watcher (may be None) is notified of every geneRef seen in a
    group and of every species' gene definitions, so it can track
    singletons. Only orthoxml version 0.5 documents are accepted.

    :param fh: file handle of the orthoxml document
    :param genome_watcher: a SpeciesAnalyser instance or None
    :raises RuntimeError: if the document is not orthoxml version 0.5"""
    taxonomy = {}     # taxonId -> taxon name, filled from <taxon> elements
    og_level = 0      # current orthologGroup nesting depth

    def collect_genes(elem):
        # Count the geneRefs in the subtree of elem. Counts of already
        # processed nested orthologGroups were stashed as int in their
        # .text (see below), so they are simply summed up here.
        genes = 0
        for child in elem.iter():
            if child == elem:
                continue
            if child.tag == "{http://orthoXML.org/2011/}geneRef":
                genes += 1
                if genome_watcher is not None:
                    genome_watcher.gene_in_group(child.get('id'))
            elif child.tag == "{http://orthoXML.org/2011/}orthologGroup":
                genes += child.text
        # free the processed subtree and cache this group's member count in
        # .text -- a non-str value (works as long as elem isn't serialised)
        elem.clear()
        elem.text = genes
        return genes

    logger.info("start mapping of orthoxml formatted input file")
    for event, elem in etree.iterparse(fh, events=('start', 'end')):
        if event == "start":
            if elem.tag == "{http://orthoXML.org/2011/}orthoXML":
                if elem.get('version') != "0.5":
                    raise RuntimeError(f"Expecting orthoXML version 0.5, but is {elem.get('version')}")
            elif elem.tag == '{http://orthoXML.org/2011/}orthologGroup':
                og_level += 1
        elif event == 'end':
            if elem.tag == "{http://orthoXML.org/2011/}orthologGroup":
                og_level -= 1
                # NOTE(review): assumes every orthologGroup has a taxonId
                # attribute present in the <taxonomy> section -- confirm.
                data = {'id': elem.get('id'), 'level': taxonomy[elem.get('taxonId')]}
                for child in elem.findall('./{http://orthoXML.org/2011/}score'):
                    data[child.get('id')] = float(child.get('value'))
                data['nr_members'] = collect_genes(elem)
                data['is_roothog'] = og_level == 0
                yield data
                if og_level == 0:
                    elem.clear()
            elif elem.tag == "{http://orthoXML.org/2011/}species":
                if genome_watcher is not None:
                    genome_watcher.add_genome_genes(elem)
                # species definitions are no longer needed once registered
                elem.clear()
            elif elem.tag == "{http://orthoXML.org/2011/}taxon":
                taxonomy[elem.get('id')] = elem.get('name')
89 |
90 |
if __name__ == "__main__":
    # Minimal CLI: print one summary dict per ortholog group of the given
    # orthoxml file while collecting per-species coverage statistics.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--orthoxml", required=True)
    conf = parser.parse_args()
    genome_coverage_stats = SpeciesAnalyser()
    with open(conf.orthoxml, 'rt') as xml:
        for group in parse_orthoxml(xml, genome_coverage_stats):
            print(group)
--------------------------------------------------------------------------------
/FastOMA/zoo/hog/filter_orthoxml.py:
--------------------------------------------------------------------------------
1 |
2 | from ..utils import auto_open
3 | # import collections
4 | # from time import time
5 | from lxml import etree as ET
6 | # import Bio.Phylo
7 | from typing import Iterable
8 | from pathlib import Path
9 | import logging
10 | logger = logging.getLogger(__name__)
11 |
class HOGFilter:
    """Threshold filter for HOG score annotations.

    A hog is flagged for removal when its score with id `score` has a
    value strictly below `value`."""

    def __init__(self, score: str, value: float):
        self.score = score
        self.value = value

    def remove(self, score_id, value):
        """Return True if a hog carrying this score entry should be dropped."""
        if score_id != self.score:
            return False
        return float(value) < self.value
19 |
20 |
class OrthoXMLFilterProcesser:
    """Removes orthologGroup elements whose score is flagged by a HOGFilter.

    Parent groups that become empty through the pruning are removed as well."""

    def __init__(self, filters: Iterable[HOGFilter] = None):
        # accept any iterable of filters; the previous `list(filters)` raised
        # a TypeError when the default None was used
        self.filters = list(filters) if filters is not None else []

    def add_filter(self, filter: HOGFilter):
        self.filters.append(filter)

    def process(self, fh):
        """Parse the orthoxml from fh and prune every hog flagged by a filter.

        The pruning cascades: parents that end up without any orthologGroup
        child are appended to the work list while it is being iterated and
        are removed in later iterations."""
        NS = "http://orthoXML.org/2011/"
        self.doc = ET.parse(fh)
        root = self.doc.getroot()
        to_rem = []
        for hog in root.iterfind('.//{{{0}}}orthologGroup'.format(NS)):
            score = hog.find('./{{{0}}}score'.format(NS))
            if score is None:
                continue
            for filt in self.filters:
                if filt.remove(score.get('id'), score.get('value')):
                    to_rem.append(hog)
                    break
        logger.info(f"will remove {len(to_rem)} hogs")
        for h in to_rem:
            parent = h.getparent()
            if 'id' in h.attrib:
                logger.info("removing hog " + str(h) + " line " + str(h.sourceline) + " " + str(h.attrib['id']))
            else:
                logger.info("removing hog " + str(h) + " line " + str(h.sourceline))
            # `if parent:` would test the lxml element's child count (and is
            # deprecated); we only need to know whether a parent exists
            if parent is not None:
                parent.remove(h)
                if sum(c.tag == "{{{0}}}orthologGroup".format(NS) for c in parent) == 0:
                    if 'id' in parent.attrib:
                        logger.info("consider deleting the empty parent hog " + str(parent) + " line " + str(parent.sourceline) + " " + str(parent.attrib['id']))
                    else:
                        logger.info("consider deleting the empty parent hog " + str(parent) + " line " + str(parent.sourceline))
                    to_rem.append(parent)

    def write(self, fh):
        """Serialise the (filtered) document to fh."""
        self.doc.write(fh, xml_declaration=True, encoding="UTF-8")
60 |
61 |
62 |
def filter_orthoxml_file(source_orthoxml, out, filter: HOGFilter):
    """Filter the hogs of an orthoxml document and write the result to out.

    source_orthoxml may be a path (str/bytes/Path), opened via auto_open,
    or an already opened file handle."""
    processor = OrthoXMLFilterProcesser([filter])
    if not isinstance(source_orthoxml, (str, bytes, Path)):
        processor.process(source_orthoxml)
    else:
        with auto_open(source_orthoxml, 'rt') as fh:
            processor.process(fh)
    processor.write(out)
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/FastOMA/zoo/hog/orthoxml_merge.py:
--------------------------------------------------------------------------------
1 | from xml.etree import ElementTree as ET
2 | from typing import List, Iterable
3 | from random import randint
4 |
5 |
6 |
class GeneRefManager:
    """Tracks the internal gene ids and protIds seen across orthoxml files
    and resolves clashes while merging (see merge_orthoxml_files)."""

    def __init__(self):
        self.xrefs = {}    # protId -> internal gene id
        self.ids = set()   # all internal gene ids in use

    def _random_unused_id(self):
        """Return a fresh random id (as str) not yet present in self.ids."""
        while True:
            cand = randint(100000, 1000000000)
            if str(cand) not in self.ids:
                return str(cand)

    def register_and_reassign(self, gene_nodes: Iterable[ET.Element]):
        """Register gene nodes and resolve id collisions.

        :returns: tuple (update_ids, to_rem); update_ids maps old -> new
            internal ids for genes that were reassigned (their geneRef
            elements must be updated accordingly), to_rem lists ids of
            exact duplicates that can be dropped from the species block.
        :raises ValueError: if the same protId occurs with two different
            internal gene ids (previously this case was only detected when
            the id also collided; otherwise the xref mapping was silently
            overwritten, violating the documented merge contract)."""
        update_ids = {}
        to_rem = []
        for gene in gene_nodes:
            gene_id = gene.attrib['id']
            prot_id = gene.attrib['protId']
            if gene_id in self.ids:
                if prot_id in self.xrefs:
                    if self.xrefs[prot_id] != gene_id:
                        raise ValueError("protId '{}' is used several times with different gene id :'{},'{}'"
                                         .format(prot_id, self.xrefs[prot_id], gene_id))
                    else:
                        # identical gene already registered -> duplicate
                        to_rem.append(gene_id)
                        continue
                else:
                    # id collides with a different gene -> assign a fresh id
                    new_id = self._random_unused_id()
                    update_ids[gene_id] = new_id
                    gene.attrib['id'] = new_id
            elif prot_id in self.xrefs:
                # same protId registered under a different id: conflict
                raise ValueError("protId '{}' is used several times with different gene id :'{},'{}'"
                                 .format(prot_id, self.xrefs[prot_id], gene_id))

            self.xrefs[prot_id] = gene.attrib['id']
            self.ids.add(gene.attrib['id'])
        return update_ids, to_rem
40 |
41 |
class Merger:
    """Merges several orthoxml documents into the first one.

    The first document acts as the accumulator: merge_file() folds the
    species/gene definitions and the groups of a further document into it;
    write() serialises the accumulated result."""

    def __init__(self, first):
        self.NS = "http://orthoXML.org/2011/"
        ET.register_namespace("", self.NS)
        self.doc = ET.parse(first)
        self.root = self.doc.getroot()

        # species names already present in the accumulated document
        self.all_species = set(z.attrib['name'] for z in self.doc.findall('./{{{}}}species'.format(self.NS)))
        self.all_genes = GeneRefManager()
        self.all_genes.register_and_reassign(
            self.doc.findall("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene".format(self.NS))
        )

    def merge_file(self, other):
        # register the new genes; duplicates are dropped and clashing ids
        # reassigned, with the geneRefs in `other` updated accordingly
        gene_id_updates, to_rem = self.all_genes.register_and_reassign(
            other.findall("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene".format(self.NS)))
        self._remove_unnecessary_genes(other, to_rem)
        self._update_geneRef_ids(other.find('./{{{}}}groups'.format(self.NS)), gene_id_updates)

        for sp in other.findall("./{{{}}}species".format(self.NS)):
            if sp.attrib['name'] not in self.all_species:
                # insert the new species right after the existing run of
                # species elements (i stops on the first non-species element
                # after at least one species has been seen).
                # NOTE(review): if nothing follows the species run, the new
                # species is inserted before the last element -- confirm a
                # <groups> element is always present after the species.
                species_seen = False
                for i, el in enumerate(self.root):
                    if el.tag == "{{{}}}species".format(self.NS):
                        species_seen = True
                    elif species_seen:
                        break
                self.root.insert(i, sp)
                self.all_species.add(sp.attrib['name'])
            else:
                # known species: append its genes to the existing genes block
                db = self.root.find("./{{{0}}}species[@name='{1}']/{{{0}}}database/{{{0}}}genes".format(self.NS, sp.attrib['name']))
                for g in sp.iterfind(".//{{{}}}gene".format(self.NS)):
                    db.append(g)
        # finally move all toplevel groups over
        grps = self.root.find("./{{{}}}groups".format(self.NS))
        for g in other.find("./{{{}}}groups".format(self.NS)):
            grps.append(g)

    def _update_geneRef_ids(self, root, gene_id_updates):
        # rewrite geneRef ids for genes that received a new internal id
        for old_id, new_id in gene_id_updates.items():
            for g in root.iterfind(".//{{{0}}}geneRef[@id='{1}']".format(self.NS, old_id)):
                g.attrib['id'] = new_id

    def _remove_unnecessary_genes(self, root, to_rem):
        # drop duplicated <gene> elements from their enclosing <genes> block
        for e in to_rem:
            parent = root.find("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene[@id='{1}']/.."
                               .format(self.NS, e))
            child = parent.find("./{{{0}}}gene[@id='{1}']".format(self.NS, e))
            parent.remove(child)




    def write(self, fh):
        """Serialise the merged document to fh."""
        self.doc.write(fh, xml_declaration=True, encoding="UTF-8", default_namespace=None)
96 |
97 |
def merge_orthoxml_files(out, files):
    """function to merge several orthoxml files into a single orthoxml file that contains all groups.

    This function combines several orthoxml files into a single orthoxml file that
    contains all the groups and maintains a valid definition block of the species
    and their genes. The protId attributes among all the orthoxml files need to be
    either unique or being at least assigned to the same internal gene id; in that
    case it is assumed that it is the same gene across the different files and it
    can be merged.
    if the gene id attribute is the same two or more orthoxml files, but their
    protId value is different, a new gene id value is generated and the geneRef
    values are updated accordingly.

    :param out: a path or a filehandle object where the combined orthoxml data should
        be written to.

    :param files: an iterable of paths or filehandle objects (of valid orthoxml
        format) that should be merged. The iterable itself is left untouched
        (previously the passed-in list was mutated by pop()).

    """
    # work on a copy so the caller's sequence is not mutated; like before,
    # the last entry serves as the base document all others are folded into
    remaining = list(files)
    merger = Merger(remaining.pop())
    for f in remaining:
        merger.merge_file(ET.parse(f).getroot())

    return merger.write(out)
125 |
--------------------------------------------------------------------------------
/FastOMA/zoo/seq_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 |
3 |
4 |
--------------------------------------------------------------------------------
/FastOMA/zoo/unionfind.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | """UnionFind.py
4 |
5 | Union-find data structure. Based on Josiah Carlson's code,
6 | http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912
7 | with significant additional changes by D. Eppstein and
8 | Adrian Altenhoff.
9 | """
10 |
11 |
class UnionFind(object):
    """Union-find data structure.

    Each unionFind instance X maintains a family of disjoint sets of
    hashable objects, supporting the following two methods:

    - X[item] returns a name for the set containing the given item.
      Each set is named by an arbitrarily-chosen one of its members; as
      long as the set remains unchanged it will keep the same name. If
      the item is not yet part of a set in X, a new singleton set is
      created for it.

    - X.union(item1, item2, ...) merges the sets containing each item
      into a single larger set. If any item is not yet part of a set
      in X, it is added to X as one of the members of the merged set.
    """

    def __init__(self, elements=None):
        """Create a new union-find structure.

        If elements is not None, the structure gets initialized
        with each element as a singleton component.

        :param elements: an iterable to initialize the structure.
        """

        self.weights = {}   # component representative -> component size
        self.parents = {}   # element -> parent element (roots map to themselves)
        if elements is not None:
            for elem in iter(elements):
                self.parents[elem] = elem
                self.weights[elem] = 1

    def __getitem__(self, obj):
        """return the name of set which contains obj.

        :param obj: the query object

        :SeeAlso: :meth:`find`"""
        return self.find(obj)

    def find(self, obj):
        """Find and return the name of the set containing the obj.

        If the object is not found in any set, a new singleton set
        is created that holds only this object until it is further merged."""

        # check for previously unknown obj. If unknown, add it
        # as a new cluster
        if obj not in self.parents:
            self.parents[obj] = obj
            self.weights[obj] = 1
            return obj

        # find path of objects leading to the root
        path = [obj]
        root = self.parents[obj]
        while root != path[-1]:
            path.append(root)
            root = self.parents[root]

        # compress the path and return
        for ancestor in path:
            self.parents[ancestor] = root
        return root

    def remove(self, obj):
        """Remove an object from the sets.

        Removes an object entirely from the datastructure. The
        containing set will shrink by this one element.

        NOTE(review): if obj is itself the representative (root) of a
        multi-element set, other members may still reference it through
        self.parents after the pop below, and a later find() on them will
        raise a KeyError -- confirm callers only remove non-representative
        elements.

        :Note: If one tries to access it afterwards using
            :meth:`find`, it will be created newly and put as a
            singleton.
        """
        if obj not in self.parents:
            return
        comp = self.find(obj)
        self.weights[comp] -= 1
        self.parents.pop(obj)

    def __iter__(self):
        """Iterate through all items ever found or unioned by this structure."""
        return iter(self.parents)

    def union(self, *objects):
        """Find the sets containing the objects and merge them.

        any number of objects can be passed to this method and
        all of them will be merged into one set containing at
        least these objects.

        :param objects: the objects to be merged. they have to be all
            hashable. If they haven't been initially added to the UnionFind
            datastructure at instantiation time, they are added at this point
            in time.
        """
        roots = [self[x] for x in objects]
        # weighted union: attach smaller components under the heaviest root
        heaviest = max([(self.weights[r], r) for r in roots], key=lambda x: x[0])[1]
        for r in roots:
            if r != heaviest:
                self.weights[heaviest] += self.weights[r]
                self.parents[r] = heaviest

    def get_components(self):
        """return a list of sets corresponding to the connected
        components of the structure."""
        comp_dict = collections.defaultdict(set)
        for elem in iter(self):
            comp_dict[self[elem]].add(elem)
        comp = list(comp_dict.values())
        return comp
125 |
--------------------------------------------------------------------------------
/FastOMA/zoo/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | Utilities for zoo files.
4 | '''
5 | from io import BytesIO
6 | import bz2
7 | import gzip
8 | import os
9 | import logging
10 | logger = logging.getLogger(__name__)
11 |
12 |
# File opening. This is based on the example on SO here:
# http://stackoverflow.com/a/26986344
# maps magic bytes at the start of a file to the matching open function
fmagic = {b'\x1f\x8b\x08': gzip.open,
          b'\x42\x5a\x68': bz2.BZ2File}


def auto_open(fn, *args, **kwargs):
    r"""function to open regular or compressed files for read / write.

    This function opens files based on their "magic bytes". Supports bz2
    and gzip. If it finds neither of these, presumption is it is a
    standard, uncompressed file. For new (non-existing or empty) files the
    decision is based on the filename suffix instead.

    Example::

        with auto_open("/path/to/file/maybe/compressed", mode="rb") as fh:
            fh.read()

        with auto_open("/tmp/test.txt.gz", mode="wb") as fh:
            fh.write(b"my big testfile")

    :param fn: either a string or a path-like object (e.g. pathlib.Path)
        of an existing or new file path, or a BytesIO handle
    :param \*\*kwargs: additional arguments that are understood by the
        underlying open handler
    :returns: a file handler
    """
    if isinstance(fn, BytesIO):
        return fn
    # accept pathlib.Path objects as well: the suffix checks below use
    # str.endswith, which would raise an AttributeError on a Path
    fn = os.fspath(fn)

    if os.path.isfile(fn) and os.stat(fn).st_size > 0:
        # existing, non-empty file: sniff the magic bytes
        with open(fn, 'rb') as fp:
            fs = fp.read(max(len(x) for x in fmagic))
        for (magic, _open) in fmagic.items():
            if fs.startswith(magic):
                return _open(fn, *args, **kwargs)
    else:
        # new or empty file: fall back to the filename suffix
        if fn.endswith('gz'):
            return gzip.open(fn, *args, **kwargs)
        elif fn.endswith('bz2'):
            return bz2.BZ2File(fn, *args, **kwargs)

    return open(fn, *args, **kwargs)
56 |
57 |
class LazyProperty(object):
    """Descriptor that evaluates a property once and caches the result.

    Based on the Python Cookbook recipe by Denis Otkidach
    (http://stackoverflow.com/users/168352/denis-otkidach). On first
    access the wrapped method is executed and its return value is
    stored on the instance under the same attribute name, shadowing
    this descriptor; all later accesses are plain attribute lookups.

    Example::

        class Circle:
            def __init__(self, radius):
                self.radius = radius

            @LazyProperty
            def area(self):
                print("computing area")
                return 3.14 * self.radius ** 2

        >>> c = Circle(4)
        >>> c.area
        computing area
        50.24
        >>> c.area
        50.24

    The property method is only executed once.
    """

    def __init__(self, method, name=None):
        # keep the unbound method, the attribute name and its docstring
        self.method = method
        self.name = name if name is not None else method.__name__
        self.__doc__ = method.__doc__

    def __get__(self, inst, cls):
        if inst is None:
            # accessed on the class itself, not an instance
            return self
        value = self.method(inst)
        # Shadow the descriptor on the instance so __get__ never runs again.
        setattr(inst, self.name, value)
        return value
101 |
102 |
def unique(seq):
    """Return the elements of a list uniquely while preserving the order

    :param seq: an iterable of hashable elements
    :returns: new list with first occurence of elements of seq"""
    # dict preserves insertion order (Python >= 3.7), keeping the first
    # occurrence of each element. Replaces the previous side-effect-in-
    # comprehension trick (`... and not seen.add(x)`), which is harder
    # to read while producing the same result.
    return list(dict.fromkeys(seq))
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/__init__.py:
--------------------------------------------------------------------------------
class WrapperError(Exception):
    """Error raised by the zoo wrapper modules, e.g. when the wrapped
    external binary cannot be located (see Aligner._init_cli handling
    in base_aligner)."""
    pass
3 |
4 |
5 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/__init__.py:
--------------------------------------------------------------------------------
1 | from .mafft import Mafft
2 | from .muscle import Muscle
3 | from .prographmsa import ProGraphMSA
4 | from .probcons import ProbCons
5 | from .base_aligner import AlignmentInput, DataType, WrapperError
6 |
7 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/base_aligner.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from abc import ABCMeta, abstractmethod
3 | from enum import Enum
4 | from Bio import AlignIO, SeqIO
5 |
6 |
7 | from ...seq_utils import is_dna, identify_input, AlignmentInput
8 | from .. import WrapperError
9 |
10 |
11 |
12 |
# Kind of sequences in the input; UNKNOWN makes Aligner.__init__ guess it.
DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
14 |
15 |
class Aligner(object):
    """
    Base class for wrappers of Multiple Sequence Aligner software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteAligner(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result

    """
    # NOTE: Python-2 style; has no effect on Python 3, kept for layout parity.
    __metaclass__ = ABCMeta

    def __init__(self, input_, datatype=DataType.UNKNOWN, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the Aligner to work on it.

        :param input_: can be either a filename or a biopython multiple
            sequence alignment (a collection of :class:`Bio.SeqRecord.SeqRecord`)

        :param binary: is the alignment's executable file, or None. If set to
            None, it is assumed to be found in the PATH.

        :param datatype: means is it DNA or protein?
        """
        self.input_type = identify_input(input_)  # Figure out what it is - file or object

        if isinstance(datatype, str):
            try:
                datatype = getattr(DataType, datatype.upper())
            except AttributeError:
                raise ValueError("\"{}\" is an invalid datatype for an Aligner".format(datatype))
        if datatype == DataType.UNKNOWN:
            # BUG FIX: the datatype used to be guessed twice here; the first
            # guess was immediately overwritten and, for generator input,
            # consumed the records *before* itertools.tee could duplicate
            # them. Guess exactly once.
            if self.input_type == AlignmentInput.OBJECT:
                # tee so that guessing does not exhaust the caller's iterable
                dup, input_ = itertools.tee(input_)
                self.datatype = guess_datatype(dup, False)
            else:
                self.datatype = guess_datatype(input_, True)
        else:
            self.datatype = datatype

        self.input = input_  # store it
        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying aligner
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """Return the concrete CLI wrapper for the given binary path."""
        pass
91 |
# Module logger. Previously this used the root logger (getLogger());
# use a named module logger for consistency with the rest of the zoo
# package (e.g. utils.py, base_modeltester.py). Import kept here to
# preserve the original file layout.
import logging
logger = logging.getLogger(__name__)
94 |
95 |
def guess_datatype(alignment, from_filename=False):
    """Guess whether *alignment* holds DNA or protein sequences.

    :param alignment: a filename (when from_filename is True) or an
        iterable of sequence records
    :param from_filename: if True, parse the file as fasta, falling
        back to phylip-relaxed on a parse error
    :returns: DataType.DNA or DataType.PROTEIN
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        # BUG FIX: SeqIO.parse is lazy, so format errors only surfaced
        # later (inside is_dna), making the fallback unreachable.
        # Materialise the records so a fasta parse problem actually
        # triggers the phylip-relaxed retry here, matching the sibling
        # implementation in base_modeltester.py. Also narrowed the bare
        # except to ValueError (what Biopython raises on bad formats).
        try:
            alignment = list(SeqIO.parse(alignment, 'fasta'))
        except ValueError:
            alignment = list(SeqIO.parse(alignment, 'phylip-relaxed'))
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
104 |
105 |
106 | # TODO: Break the identify_input function into two parts - one to work out the datatype, one to work out whether
107 | # this is a file or an object
108 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/muscle.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import time
3 | from Bio import AlignIO, SeqIO
4 | from six import StringIO
5 | from ..abstract_cli import AbstractCLI
6 | from .base_aligner import Aligner, AlignmentInput, DataType
7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, TreeInputOption, OptionSet
8 |
9 |
10 |
11 |
class MuscleCLI(AbstractCLI):
    """
    Low-level command line interface to the muscle executable.

    example:
        muscle_cli = MuscleCLI()
        process = muscle_cli(cmd='muscle args...')
        stdout = muscle_cli.get_stdout()
    """

    @property
    def _default_exe(self):
        # executable name looked up on the PATH when no binary is given
        return 'muscle'
28 |
def set_default_dna_options(aligner):
    """
    Install the default option set on the aligner.

    NOTE(review): the original docstring said "sensible default already
    provided by mafft --auto" -- apparently copy-pasted from the mafft
    wrapper; this is the muscle module.
    """
    aligner.options = get_default_options()
34 |
35 |
def set_default_protein_options(aligner):
    """
    Install the default option set on the aligner.

    NOTE(review): the original docstring said "sensible default already
    provided by mafft --auto" -- apparently copy-pasted from the mafft
    wrapper; this is the muscle module.
    """
    aligner.options = get_default_options()
41 |
class Muscle(Aligner):
    """
    Wrapper around the muscle multiple sequence aligner.

    Callable-class wrapper: construct it with the input, call it to run
    the aligner. Results, wall-clock time and the raw process output are
    kept on the instance afterwards.

    :Example:

    ::

        callable_wrapper = Muscle(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        super(Muscle, self).__init__(input_, *args, **kwargs)
        self.options = get_default_options()
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """Run muscle on the stored input and return the parsed alignment."""
        started = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:
            # in-memory records must be written to a temporary fasta file
            # first, since muscle reads its input from disk
            with tempfile.NamedTemporaryFile(mode="wt") as tmp:
                SeqIO.write(self.input, tmp, 'fasta')
                tmp.seek(0)
                output, error = self._call(tmp.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error
        self.elapsed_time = time.time() - started
        return self.result

    def _call(self, filename, *args, **kwargs):
        """Invoke the low-level MuscleCLI: '<options> -in <file>'."""
        self.cli('{} -in {}'.format(self.command(), filename), wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Render the currently active options as a command-line string."""
        return str(self.options)

    def _read_result(self, output):
        """Parse the aligner's fasta output into a Biopython alignment."""
        return AlignIO.read(StringIO(output), 'fasta')

    def _init_cli(self, binary):
        return MuscleCLI(executable=binary)
122 |
123 |
def get_default_options():
    """Return the default (all inactive) muscle option set."""
    return OptionSet([
        # find diagonals -- faster for similar sequences
        FlagOption('-diags', False, active=False),
        # maximum number of iterations (integer, default 16)
        IntegerOption('-maxiters', 16, active=False),
        # maximum time to iterate, in hours (default: no limit)
        FloatOption('-maxhours', 0.0, active=False),
    ])
139 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/probcons.py:
--------------------------------------------------------------------------------
1 | import time
2 | from Bio import AlignIO, SeqIO
3 | from six import StringIO
4 | from ..abstract_cli import AbstractCLI
5 | from .base_aligner import Aligner, AlignmentInput, DataType
6 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
7 | import tempfile
8 |
9 |
10 |
11 |
class ProbConsCLI(AbstractCLI):
    """
    ProbCons low-level command line interface

    :Example:

    ::

        probcons_cli = ProbConsCLI()
        process = probcons_cli(cmd='probcons args...')
        stdout = probcons_cli.get_stdout()
    """

    @property
    def _default_exe(self):
        # executable name looked up on the PATH when no binary is given
        return 'probcons'
31 |
32 |
def set_default_dna_options(aligner):
    """
    Install the default option set on the aligner.

    NOTE(review): the original docstring said "sensible default already
    provided by mafft --auto" -- copy-pasted from the mafft wrapper;
    this is the probcons module.
    """
    aligner.options = get_default_options()
38 |
39 |
def set_default_protein_options(aligner):
    """
    Install the default option set on the aligner.

    NOTE(review): the original docstring said "sensible default already
    provided by mafft --auto" -- copy-pasted from the mafft wrapper;
    this is the probcons module.
    """
    aligner.options = get_default_options()
45 |
46 |
class ProbCons(Aligner):
    """
    Wrapper around the ProbCons multiple sequence aligner.

    Callable-class wrapper: construct it with the input, call it to run
    the aligner. Results, wall-clock time and the raw process output are
    kept on the instance afterwards.

    :Example:

    ::

        callable_wrapper = ProbCons(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result


    .. note:: There exists an ipython notebook on how to work with wrappers,
        including dealing with non-default parameters.
    """

    def __init__(self, input_, *args, **kwargs):
        super(ProbCons, self).__init__(input_, *args, **kwargs)
        self.options = get_default_options()
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """Run probcons on the stored input and return the parsed alignment."""
        started = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:
            # in-memory records must be written to a temporary fasta file
            # first, since probcons reads its input from disk
            with tempfile.NamedTemporaryFile(mode='wt') as tmp:
                SeqIO.write(self.input, tmp, 'fasta')
                tmp.seek(0)
                output, error = self._call(tmp.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error
        self.elapsed_time = time.time() - started
        return self.result

    def _call(self, filename, *args, **kwargs):
        """Invoke the low-level ProbConsCLI: '<options> <file>'."""
        self.cli('{} {}'.format(self.command(), filename), wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Render the currently active options as a command-line string."""
        return str(self.options)

    def _read_result(self, output):
        """Parse the aligner's fasta output into a Biopython alignment."""
        return AlignIO.read(StringIO(output), 'fasta')

    def _init_cli(self, binary):
        return ProbConsCLI(executable=binary)
131 |
132 |
def get_default_options():
    """Return the default (all inactive) probcons option set."""
    return OptionSet([
        # use CLUSTALW output format instead of MFA
        FlagOption('-clustalw', False, active=False),
        # passes of consistency transformation, 0 <= REPS <= 5 (default: 2)
        IntegerOption('-c', 0, active=False),
        # passes of iterative refinement, 0 <= REPS <= 1000 (default: 100)
        IntegerOption('-ir', 100, active=False),
        # rounds of pretraining, 0 <= REPS <= 20 (default: 0)
        IntegerOption('-pre', 0, active=False),
        # generate all-pairs pairwise alignments
        FlagOption('-pairs', False, active=False),
        # use Viterbi algorithm for all pairs (automatically enables -pairs)
        FlagOption('-viterbi', False, active=False),
        # write annotation for multiple alignment to FILENAME
        StringOption('-annot', '', active=False),
        # print sequences in alignment order rather than input order
        FlagOption('-a', False, active=False),
    ])
162 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/aligners/prographmsa.py:
--------------------------------------------------------------------------------
1 | import time
2 | from Bio import AlignIO, SeqIO
3 | import tempfile
4 | from six import StringIO
5 | from ..abstract_cli import AbstractCLI
6 | from .base_aligner import Aligner, AlignmentInput, DataType
7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
8 |
9 |
10 |
11 |
class ProGraphMSACLI(AbstractCLI):
    """
    ProGraphMSA low-level command line interface

    :Example:

    ::

        prograph_cli = ProGraphMSACLI()
        process = prograph_cli(cmd='mafft args...')
        stdout = prograph_cli.get_stdout()
    """

    @property
    def _default_exe(self):
        # executable name looked up on the PATH when no binary is given
        return 'ProGraphMSA'
28 |
29 |
def set_default_dna_options(aligner):
    """
    Install the default option set on the aligner.

    NOTE(review): the original docstring said "sensible default already
    provided by mafft --auto" -- copy-pasted from the mafft wrapper;
    this is the ProGraphMSA module.
    """
    aligner.options = get_default_options()
35 |
36 |
def set_default_protein_options(aligner):
    """
    Install the default option set on the aligner.

    NOTE(review): the original docstring said "sensible default already
    provided by mafft --auto" -- copy-pasted from the mafft wrapper;
    this is the ProGraphMSA module.
    """
    aligner.options = get_default_options()
42 |
43 |
class ProGraphMSA(Aligner):
    """
    Wrapper around the ProGraphMSA multiple sequence aligner.

    Callable-class wrapper: construct it with the input, call it to run
    the aligner. Results, wall-clock time and the raw process output are
    kept on the instance afterwards.

    :Example:

    ::

        callable_wrapper = ProGraphMSA(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        super(ProGraphMSA, self).__init__(input_, *args, **kwargs)
        self.options = get_default_options()
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """Run ProGraphMSA on the stored input and return the parsed alignment."""
        started = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:
            # in-memory records must be written to a temporary fasta file
            # first, since ProGraphMSA reads its input from disk
            with tempfile.NamedTemporaryFile(mode="wt") as tmp:
                SeqIO.write(self.input, tmp, 'fasta')
                tmp.seek(0)
                output, error = self._call(tmp.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error
        self.elapsed_time = time.time() - started
        return self.result

    def _call(self, filename, *args, **kwargs):
        """Invoke the low-level ProGraphMSACLI: '<options> <file>'."""
        self.cli('{} {}'.format(self.command(), filename), wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Render the currently active options as a command-line string."""
        return str(self.options)

    def _read_result(self, output):
        """Parse the aligner's fasta output into a Biopython alignment."""
        return AlignIO.read(StringIO(output), 'fasta')

    def _init_cli(self, binary):
        return ProGraphMSACLI(executable=binary)
124 |
125 |
def get_default_options():
    """Return the default ProGraphMSA option set (--fasta enabled)."""
    return OptionSet([
        # fasta output instead of stockholm; better because no tree
        # output is produced
        FlagOption('--fasta', True, active=True),
        # output all ancestral sequences
        FlagOption('--ancestral_seqs', False, active=False),
        # output sequences in input order (default: tree order)
        FlagOption('--input_order', False, active=False),
        # output all intermediate guide trees
        FlagOption('--all_trees', False, active=False),
        # use ML distances with gap
        FlagOption('--mldist_gap', False, active=False),
        # use ML distances
        FlagOption('--mldist', False, active=False),
        # guide tree to use
        StringOption('--tree', '', active=False),
    ])
152 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/__init__.py:
--------------------------------------------------------------------------------
1 | from .prottest import ProtTest
2 |
3 |
4 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/base_modeltester.py:
--------------------------------------------------------------------------------
1 | import os, types, itertools
2 | from abc import ABCMeta, abstractmethod
3 | from enum import Enum
4 | from Bio import AlignIO, SeqIO
5 | from Bio.Align import MultipleSeqAlignment
6 | from ...seq_utils import is_dna
7 |
8 |
9 |
# BUG FIX(review): these imports used the old absolute "zoo." package
# path, which does not resolve now that the package lives under
# FastOMA.zoo (sibling modules such as base_aligner.py import
# relatively). Note: identify_input imported here is shadowed by the
# function of the same name defined later in this module.
from .. import WrapperError
from ..aligners.base_aligner import identify_input
12 |
13 | import logging
14 | logger = logging.getLogger(__name__)
15 |
# Whether the input is an in-memory object or a file path (mirrors base_aligner).
AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
# Sequence type; UNKNOWN triggers guessing in ModelTester.__init__.
DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
18 |
19 |
class ModelTester(object):
    """
    Base class for wrappers of model testers for phylogeny inference

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteModelTester(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    # NOTE: Python-2 style; has no effect on Python 3, kept for layout parity.
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the model tester to work on it.

        :param alignment: one of: a filename, a Biopython MSA, a list of
            Seq objects, or None (no input stored; anything else raises
            inside identify_input)
        :param datatype: DataType.DNA / DataType.PROTEIN, or
            DataType.UNKNOWN to guess from the alignment
        :param binary: the model tester's executable file, or None
        """

        if alignment is not None:
            self.input_type = identify_input(alignment)  # Figure out what it is - file or object
            if datatype == DataType.UNKNOWN:
                self.datatype = guess_datatype(alignment, from_filename=self.input_type == AlignmentInput.FILENAME)
            else:
                self.datatype = datatype

            self.input = alignment  # store it
        else:
            self.input_type = None
            self.input = None
            # BUG FIX: self.datatype was left unset on this branch, so any
            # subclass reading it (e.g. ProtTest.__init__) failed with
            # AttributeError when constructed without an alignment.
            self.datatype = datatype

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        self.cli = self._init_cli(binary)
        # TODO: the wrapper error is not compatible with calling a function with java!
        # try:
        #     self.cli = self._init_cli(binary)
        # except IOError as err:
        #     raise WrapperError('Error searching for binary: {}'.format(err))
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying model tester
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
100 |
101 |
def guess_datatype(alignment, from_filename=False):
    """Guess whether *alignment* holds DNA or protein sequences.

    :param alignment: a filename (when from_filename is True) or an
        iterable of sequence records
    :param from_filename: if True, parse the file as fasta, falling
        back to phylip-relaxed on a parse error
    :returns: DataType.DNA or DataType.PROTEIN
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        try:
            alignment = list(SeqIO.parse(alignment, 'fasta'))
        except ValueError:
            # narrowed from a bare except: Biopython reports format
            # problems as ValueError; anything else should propagate.
            alignment = list(SeqIO.parse(alignment, 'phylip-relaxed'))
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
110 |
111 |
def identify_input(alignment):
    """
    Work out if we're dealing with an alignment object
    (return AlignmentInput.OBJECT), a file (return
    AlignmentInput.FILENAME), or invalid input (raise ValueError).

    :param alignment: either an Biopython MultipleSequenceAlignment or
        a filename pointing to an existing msa file.
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is a Biopython MultipleSequenceAlignment (or
        # generator/list of records)
        return AlignmentInput.OBJECT

    if isinstance(alignment, str) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # BUG FIX: the original wrapped the isinstance checks in
    # try/except, but isinstance never raises here, so unrecognised
    # input silently fell through and returned None. Raise explicitly.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
132 |
133 |
134 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/parsers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import dendropy as dpy
3 | from pyparsing import Suppress, SkipTo, Word, Regex, Literal, OneOrMore, Group, LineEnd, CharsNotIn, nums, alphanums, \
4 | ParseException
5 |
6 |
7 | logger = logging.getLogger(__name__)
8 | logger.addHandler(logging.StreamHandler())
9 |
10 |
# pyparsing building blocks; FLOAT and INT attach parse actions that
# convert the matched text to the corresponding Python number type.
FLOAT = Word(nums + '.-').setParseAction(lambda x: float(x[0]))
INT = Word(nums).setParseAction(lambda x: int(x[0]))
WORD = Word(alphanums + '_')
SPACEDWORD = Word(alphanums+' _')
15 |
16 |
class ProtTestParser(object):
    """
    Simple prottest result parser.

    Extracts the "Best model according to <criterion>: <model>" lines
    from prottest's standard output.
    """

    def __init__(self):
        self.MODEL = Regex(r'Best model according to\s+')
        # These are all the models that are possible to be tested using phyml
        self.model = OneOrMore(Group(Suppress(SkipTo(self.MODEL)) + Suppress(self.MODEL) + WORD + Suppress(":") + WORD))

    def parse(self, s):
        """Parse prottest output *s*.

        :returns: a list of [criterion, model] pairs, or None if the
            text could not be parsed (the error is logged).
        """
        model = None
        try:
            model = self.model.parseString(s).asList()
        except ParseException as err:
            logger.error(err)

        return model

    def to_dict(self, stats_filename):
        """Return a {criterion: model} dict parsed from prottest output.

        NOTE(review): despite its name, *stats_filename* receives the
        output text itself (it is forwarded to parse(), which calls
        parseString on it), not a file path.

        :returns: dict, or None when parsing failed.
        """
        model = self.parse(stats_filename)
        # BUG FIX: on a parse failure `model` is None and the original
        # for-loop raised TypeError (its `except IOError` could never
        # match, since iteration does not raise IOError).
        if model is None:
            return
        return {mg[0]: mg[1] for mg in model}
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/modeltesters/prottest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import tempfile
4 | import logging
5 |
6 |
7 | from pyparsing import ParseException
8 | from Bio import AlignIO, SeqIO
9 |
10 | from .parsers import ProtTestParser
11 | from .base_modeltester import ModelTester, AlignmentInput, DataType
12 |
13 | from ..abstract_cli import AbstractCLI
14 | from ..options import StringOption, FlagOption, OptionSet
15 |
16 | logger = logging.getLogger(__name__)
17 | logger.addHandler(logging.StreamHandler())
18 | logger.setLevel(logging.INFO)
19 |
20 |
class ProtTestCLI(AbstractCLI):
    """
    Especially in this case it is important that the $PROTTEST_HOME environmental variable is set to the installation directory of the prottest tool
    """
    @property
    def _default_exe(self):
        # NOTE(review): raises KeyError when PROTTEST_HOME is not set in
        # the environment -- consider a clearer error message.
        return 'java -jar ' + os.environ['PROTTEST_HOME'] + '/prottest-3.4.2.jar'
28 |
29 |
def set_default_dna_options(modeltester):
    """
    Dummy function as sensible default
    (simply installs the default option set on the model tester).
    """
    modeltester.options = get_default_options()
35 |
36 |
def set_default_protein_options(modeltester):
    """
    Dummy function as sensible default
    (simply installs the default option set on the model tester).
    """
    modeltester.options = get_default_options()
42 |
43 |
class ProtTest(ModelTester):
    """ ProtTest to determine the best model for a specific alignment
    This wrapper can be called to test various models for phylogeny inference.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input multiple sequence alignment. This can be either
            a filename or an biopython SeqRecord collection.
        """
        self.options = get_default_options()
        super(ProtTest, self).__init__(alignment=alignment, *args, **kwargs)
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run prottest on the stored alignment and return the parsed
        {criterion: model} result. Extra arguments are forwarded to the
        underlying CLI call.
        """
        start = time.time()  # time the execution
        if self.input_type == AlignmentInput.OBJECT:
            # in-memory records must be written to a temporary fasta file first
            with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
                SeqIO.write(self.input, filehandle, 'fasta')
                filehandle.seek(0)
                output, error = self._call(filehandle.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level _ProtTest wrapper.
        Options are passed via *args and **kwargs
        [This only covers the simplest automatic
        case]
        """
        self.cli('{} -i {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Render the currently active options as a command-line string."""
        return str(self.options)

    def _read_result(self, output):
        """Parse prottest stdout into a {criterion: model} dict (None on error)."""
        parser = ProtTestParser()

        try:
            result = parser.to_dict(output)
        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # BUG FIX: the original logger.error('Other parse error',
            # parseerr) passed the exception as a %-format argument with
            # no placeholder in the message, which itself triggers a
            # logging formatting error. Use lazy %-formatting instead.
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        return ProtTestCLI(executable=binary)
119 |
120 |
def get_default_options():
    """Return the default (all inactive) prottest option set."""
    return OptionSet([
        # display models sorted by Akaike Information Criterion (AIC)
        FlagOption('-AIC', False, active=False),
        # display models sorted by Decision Theory Criterion
        FlagOption('-DT', False, active=False),
        # tree file (optional) [default: NJ tree]
        StringOption('-t', '', active=False),
        # display models sorted by corrected Akaike Information Criterion (AICc)
        FlagOption('-AICC', False, active=False),
        # enable/disable PhyML logging into the log directory
        # (see prottest.properties)
        FlagOption('-log', False, active=False),
    ])
140 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/options.py:
--------------------------------------------------------------------------------
1 | from numbers import Integral, Real
2 | from six import string_types
3 | from abc import ABCMeta, abstractproperty
4 | from dendropy import Tree
5 |
6 |
class Option(object):
    """Abstract base class for an option.

    An Option bridges a wrapper object and one concrete command-line
    switch of the wrapped program.
    """
    __metaclass__ = ABCMeta

    def __init__(self, name, default=None, active=False):
        self._name = name
        self.set_value(default)
        # the explicit `active` argument wins over the activation that a
        # non-None default triggers inside set_value()
        self.active = active

    def __repr__(self):
        state = 'on' if self.active else 'off'
        return '{}({}={}) <{}>'.format(
            self.__class__.__name__, self.name, self.get_value(), state)

    def __str__(self):
        # rendered as "<name> <value>" when active, nothing otherwise
        if not self.active:
            return ''
        return ' '.join([self._name, str(self.get_value())])

    @property
    def active(self):
        return self._active

    @active.setter
    def active(self, val):
        self._active = bool(val)

    @property
    def name(self):
        return self._name

    def set_value(self, value):
        """Store *value*; any non-None value also activates the option."""
        self._value = value
        if value is not None:
            self.active = True

    def get_value(self):
        return self._value

    def set_and_activate(self, value):
        """Store *value* and force the option active (even for None)."""
        self.set_value(value)
        self.active = True

    def status(self):
        """Return a human-readable multi-line summary of this option."""
        return 'Name: {}\nValue: {}\nActive: {}\nStr: {}'.format(
            self.name, self.get_value(), self.active, str(self) or "''")
54 |
55 |
class ValueOption(Option):
    """Abstract marker base class for options that carry a value."""
    __metaclass__ = ABCMeta
58 |
59 |
class TypedValueOption(ValueOption):
    """A TypedValueOption is an option that only accepts options of a given type.

    This abstract class provides the functionality to check the type
    of a passed value and raises a ValueError if it doesn't match
    the expected type.

    A TypedValueOption must overwrite the abstract property _type.
    """

    __metaclass__ = ABCMeta

    @abstractproperty
    def _type(self):
        pass

    def set_value(self, value):
        """Store *value* and activate the option.

        :raises ValueError: if *value* is not an instance of ``_type``.
        """
        if isinstance(value, self._type):
            self._value = value
            self.active = True
        else:
            # bug fix: this read `self.type`, an attribute that does not
            # exist, so callers got an AttributeError instead of the
            # intended ValueError with the expected-type message
            raise ValueError('Value should be of type {}'.format(self._type))
83 |
84 |
85 | ### Concrete classes from here on
86 |
class IntegerOption(TypedValueOption):
    """option to hold an integer value"""
    @property
    def _type(self):
        # numbers.Integral accepts int and any registered integer type
        return Integral
92 |
93 |
class FloatOption(TypedValueOption):
    """Option to hold a real number value"""

    @property
    def _type(self):
        return Real

    def get_value(self):
        # always hand back a float, even when an int was stored
        return float(self._value)
103 |
104 |
class StringOption(TypedValueOption):
    """Option to hold a string value"""

    def __init__(self, name, value=None, active=False):
        # default None to an empty string so the type check in
        # TypedValueOption.set_value passes
        if value is None:
            value = str()
        super(StringOption, self).__init__(name, value, active)

    @property
    def _type(self):
        return string_types
116 |
117 |
class FlagOption(TypedValueOption):
    """Option to hold a boolean flag value, i.e. True or False"""
    @property
    def _type(self):
        return bool

    def __str__(self):
        # a flag renders as its bare name, and only when it is both
        # active and set to True
        return (self._name if self.active and self.get_value() else '')
126 |
127 |
class TreeInputOption(TypedValueOption):
    """Option to hold a phylogenetic tree argument.

    As of now, Trees are represented as :class:`dendropy.Tree` objects."""

    @property
    def _type(self):
        # accepted value type, enforced by TypedValueOption.set_value
        return Tree
136 |
137 |
class MultiOption(Option):
    """Option that holds a list of values.

    Each item is rendered as ``<name> <item>``, with all renderings
    joined by single spaces.
    """

    @property
    def _type(self):
        return list

    def __str__(self):
        items = self.get_value()
        # inactive or unset contributes nothing to the command line
        # (previously the active check ran redundantly once per item)
        if not self.active or items is None:
            return ''
        # truthiness replaces the non-idiomatic `item_string > ''` string
        # comparison; `' '.join([name, str(item)])` is never empty anyway
        return ' '.join(' '.join([self._name, str(item)]) for item in items)
155 |
156 |
class OptionSet(object):
    """A collection of Option objects, keyed by option name."""

    def __init__(self, options):
        """Build the set from a list/tuple of options or a ready name->option dict.

        NOTE: with a list/tuple, duplicate names collapse — the last
        entry wins, silently shadowing earlier ones.

        :raises ValueError: for any other input type.
        """
        if isinstance(options, (list, tuple)):
            self.options = {opt.name: opt for opt in options}
        elif isinstance(options, dict):
            self.options = options
        else:
            raise ValueError('Expected a list, tuple or dict of options, not {}'.format(type(options)))

    def __str__(self):
        # join the non-empty renderings of all options; truthiness
        # replaces the non-idiomatic `option_string > ''` comparison
        return ' '.join(s for s in (str(opt) for opt in self.options.values()) if s)

    def __getitem__(self, item):
        return self.options[item]

    def list(self):
        """Return the options as a list of (name, option) pairs."""
        return [(name, option) for (name, option) in self.options.items()]
181 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/__init__.py:
--------------------------------------------------------------------------------
1 | from .phyml import Phyml
2 | from .raxml import Raxml
3 | from .iqtree import Iqtree
4 | from .fasttree import Fasttree
5 | from .guenomu import Guenomu
6 |
7 |
8 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/base_treebuilder.py:
--------------------------------------------------------------------------------
1 | import os, types, itertools
2 | from abc import ABCMeta, abstractmethod
3 | from enum import Enum
4 | from Bio import AlignIO, SeqIO
5 | from Bio.Align import MultipleSeqAlignment
6 | from ...seq_utils import is_dna
7 |
8 | from FastOMA.zoo.wrappers import WrapperError
9 | from FastOMA.zoo.wrappers.aligners.base_aligner import identify_input
10 |
11 | import logging
12 | logger = logging.getLogger(__name__)
13 |
# Enums describing how the alignment was supplied (in-memory object vs.
# path on disk) and what kind of sequence data it contains.
AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
16 |
17 |
class TreeBuilder(object):
    """
    Base class for wrappers of tree building software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteAligner(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    # NOTE(review): `__metaclass__ = ...` is the Python 2 spelling and has no
    # effect on Python 3, so the @abstractmethod markers below are not
    # actually enforced at instantiation time.
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
        """
        ..note:: TODO: this documentation is not correct. it needs to be updated.

        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the Aligner to work on it.

        alignment is one of 4 things:
            a filename
            a Biopython MSA
            a list of Seq objects
            anything else (throw an exception)

        binary is the alignment's executable file, or None
        """

        if alignment is not None:
            self.input_type = identify_input(alignment)  # Figure out what it is - file or object
            if datatype == DataType.UNKNOWN:
                # dup, input_ = itertools.tee(input_)
                # guess from the sequence content when the caller did not say
                self.datatype = guess_datatype(alignment, from_filename=self.input_type == AlignmentInput.FILENAME)
            else:
                self.datatype = datatype

            self.input = alignment  # store it
        else:
            # NOTE(review): self.datatype is NOT set on this branch, so
            # subclasses must not read it when the input is None
            self.input_type = None
            self.input = None

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            # concrete subclasses supply the CLI via _init_cli()
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
    # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying aligner
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
95 |
96 |
def guess_datatype(alignment, from_filename=False):
    """Guess whether *alignment* contains DNA or protein sequences.

    :param alignment: a collection of sequence records, or a filename
        when *from_filename* is True
    :param from_filename: parse the file first (fasta, falling back to
        phylip-relaxed) before guessing
    :return: DataType.DNA or DataType.PROTEIN
    """
    if from_filename:
        try:
            alignment = list(SeqIO.parse(alignment, 'fasta'))
        except Exception:
            # bug fix: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; keep the phylip fallback
            alignment = list(SeqIO.parse(alignment, 'phylip-relaxed'))
    if is_dna(alignment):
        logger.warning("Guessed datatype=DNA. But better explicitly specify the sequence type with option datatype={DNA, PROTEIN}.")
        return DataType.DNA
    else:
        logger.warning("Guessed datatype=PROTEIN. But better explicitly specify the sequence type with option datatype={DNA, PROTEIN}.")
        return DataType.PROTEIN
109 |
110 |
def identify_input(alignment):
    """
    Work out whether *alignment* is an in-memory alignment or a file and
    return the matching AlignmentInput enum member.

    :param alignment: either a Biopython MultipleSeqAlignment (or a
        list/generator of records) or a filename pointing to an existing
        msa file.
    :raises ValueError: for anything else, including a path that does
        not exist.
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory sequence collection
        return AlignmentInput.OBJECT
    if isinstance(alignment, str) and os.path.exists(alignment):
        # `alignment` is a path to an existing file
        return AlignmentInput.FILENAME
    # bug fix: the old try/except could never trigger (isinstance does not
    # raise here), so invalid input silently fell through and returned None
    # despite the documented contract; raise explicitly instead
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
131 |
132 |
133 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/fasttree.py:
--------------------------------------------------------------------------------
1 | # Author: Ivana Pilizota
2 | # Date: 1 November 2016
3 |
4 | import logging
5 | import os
6 | import time
7 |
8 | from Bio import SeqIO
9 | from pyparsing import ParseException
10 | import tempfile
11 |
12 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType, WrapperError
13 | from .parsers import FasttreeParser
14 |
15 | from ..abstract_cli import AbstractCLI
16 | from ..options import OptionSet, StringOption, IntegerOption
17 | from ...file_utils import TempFile, TempDir
18 |
19 | logger = logging.getLogger(__name__)
20 | logger.addHandler(logging.StreamHandler())
21 | logger.setLevel(logging.INFO)
22 |
23 |
24 |
25 |
class FasttreeCLI(AbstractCLI):
    # Thin CLI wrapper; executable resolution is handled by AbstractCLI.
    @property
    def _default_exe(self):
        # candidate executable names, tried in order
        return ['fasttree', 'FastTree']
30 |
31 |
def set_default_dna_options(treebuilder):
    """
    Dummy function as sensible default
    """
    # no DNA-specific tweaks for FastTree; just install the defaults
    treebuilder.options = get_default_options()
37 |
38 |
def set_default_protein_options(treebuilder):
    """
    Dummy function as sensible default
    """
    # no protein-specific tweaks for FastTree; just install the defaults
    treebuilder.options = get_default_options()
44 |
45 |
class Fasttree(TreeBuilder):
    """TreeBuilder wrapper around the FastTree executable."""

    def __init__(self, alignment, *args, **kwargs):
        # options must exist before super().__init__ so the
        # datatype-specific defaults below can overwrite them
        self.options = get_default_options()
        super(Fasttree, self).__init__(alignment=alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Sets up temporary output file location and calls FastTree using _call() function.
        Writes temporary input file if we're working with SeqIO object
        Saves the stdout and stderr and returns
        """
        start = time.time()  # time the execution
        if self.input_type == AlignmentInput.OBJECT:  # in-memory records need a temp file on disk
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'fasta')  # default interleaved # 'phylip-relaxed'
                fh.seek(0)  # flushes buffered records so FastTree sees the whole file
                output, error = self._call(fh.name, *args, **kwargs)
                self.result = self._read_result(output, error)  # store result
        else:
            filename = os.path.abspath(self.input)
            output, error = self._call(filename, *args, **kwargs)
            self.result = self._read_result(output, error)  # store result

        end = time.time()
        self.elapsed_time = end - start
        return self.result["tree"]
    # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level FastTree wrapper.
        Options are passed via *args and **kwargs
        [This only covers the simplest automatic
        case]

        :raises WrapperError: if FastTree exits with a non-zero code.
        """
        self.cli('{} {seq_file}'.format(self.command(), seq_file=filename), wait=True)
        self.returncode = self.cli.process.returncode

        if self.returncode != 0:
            self.stderr = self.cli.get_stderr()
            last_error_line = self.stderr.split('\n')[-1].strip()
            # bug fix: the message hard-coded "(unknown)" instead of the
            # input filename, making failures hard to trace
            msg = f"Fasttree failed on {filename}: {last_error_line}"
            logger.error(msg)
            raise WrapperError(msg, self.stderr)

        return (self.cli.get_stdout(), self.cli.get_stderr())

    def command(self):
        """Return the option part of the FastTree command line."""
        return str(self.options)

    def _read_result(self, stdout, stderr):
        """
        Read back the result.

        :return: parser dict with the tree, or None on read/parse failure
        """
        parser = FasttreeParser()

        try:
            parser.parse(tree=stdout, other=stderr)
            result = parser.to_dict()
        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # bug fix: lazy %-formatting instead of a stray positional
            # argument that broke logging's message formatting
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Build the FastTree command-line interface wrapper around *binary*."""
        return FasttreeCLI(executable=binary)
125 |
126 |
def get_default_options():
    """Return the default FastTree option set (all switches inactive)."""
    return OptionSet([
        # Algorithm

        # Set datatype to DNA (nt) or AA alignment: AA by default. If set to True will assume DNA format.
        StringOption('-nt', active=False),

        # Set the WAG model for AA alignment. Default Jones-Taylor-Thorton
        StringOption('-wag', active=False),

        # Set the GTR model for nt alignment. Default Jones-Taylor-Thorton
        StringOption('-gtr', active=False),

        # Set the gamma model. Default Jones-Taylor-Thorton
        StringOption('-gamma', active=False),

        # Specify the number of rate categories of sites. Default 20.
        IntegerOption('-cat', 20, active=False),

        IntegerOption('-seed', 1234, active=False),

        # Specify starting tree
        StringOption('-intree', '', active=False),

        # Speed up the neighbor joining phase & reduce memory usage (recommended for >50,000 sequences)
        StringOption('-fastest', active=False),

        # Quote sequence names in the output and allow spaces, commas,
        # parentheses, and colons in them, but not ' characters (fasta
        # input only; FastTree will not be able to read these trees back in).
        # bug fix: '-quote' used to appear twice; OptionSet keys options by
        # name, so the second (inactive) entry silently shadowed the first
        # (active) one. Keep a single inactive entry to preserve the
        # effective behaviour.
        StringOption('-quote', active=False),

        # Set the number of rounds of maximum-likelihood NNIs. Default 4*log2(N), N = the number of unique sequences
        IntegerOption('-mlnni', 0, active=False),

    ])
164 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/iqtree.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import logging
4 | import random
5 | from pyparsing import ParseException
6 | import shutil
7 | from Bio import SeqIO
8 |
9 |
10 | from .parsers import IqtreeParser
11 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
12 |
13 |
14 | from ..abstract_cli import AbstractCLI
15 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
16 |
17 | from ...file_utils import TempFile, TempDir
18 |
19 | logger = logging.getLogger(__name__)
20 | logger.addHandler(logging.StreamHandler())
21 | logger.setLevel(logging.INFO)
22 |
23 |
class IqtreeCLI(AbstractCLI):
    # Thin CLI wrapper; executable resolution is handled by AbstractCLI.
    @property
    def _default_exe(self):
        # candidate executable names, tried in order
        return ['iqtree-omp', 'iqtree']
28 |
29 |
def set_default_dna_options(treebuilder):
    """
    Dummy function as sensible default
    """
    treebuilder.options = get_default_options()
    # tell IQ-TREE the sequences are nucleotides
    treebuilder.options['-st'].set_value('DNA')
36 |
37 |
def set_default_protein_options(treebuilder):
    """
    Dummy function as sensible default
    """
    treebuilder.options = get_default_options()
    # tell IQ-TREE the sequences are amino acids
    treebuilder.options['-st'].set_value('AA')
44 |
45 |
class Iqtree(TreeBuilder):
    """TreeBuilder wrapper around the IQ-TREE executable."""

    def __init__(self, input_, *args, **kwargs):
        """
        :param input_: multiple sequence alignment, either a filename or a
            Biopython SeqRecord collection (may be None).
        """
        super(Iqtree, self).__init__(alignment=input_, *args, **kwargs)
        self.options = get_default_options()
        # robustness fix: TreeBuilder only sets self.datatype when an
        # alignment was given; guard against input_ being None so we don't
        # hit an AttributeError (mirrors the Fasttree/Raxml wrappers)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            elif self.datatype == DataType.PROTEIN:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Sets up temporary output file location and calls iqtree using _call() function.
        Writes temporary input file if we're working with SeqIO object
        Saves the stdout and stderr and returns
        """
        start = time.time()  # time the execution

        # iqtree writes its output files next to the given prefix, so give
        # it a temporary directory to work in
        with TempDir() as tmpd:
            if self.input_type is AlignmentInput.OBJECT:  # in-memory records need a temp file
                with TempFile() as filename:
                    SeqIO.write(self.input, filename, 'phylip-relaxed')  # default interleaved
                    output, error = self._call(filename, tmpd, *args, **kwargs)
            elif self.input_type is AlignmentInput.FILENAME:
                filename = self.input
                output, error = self._call(filename, tmpd, *args, **kwargs)
            else:
                output, error = self._call(None, tmpd, *args, **kwargs)
            self.result = self._read_result(tmpd)  # store result
            self.stdout = output
            self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
    # End call

    # Any other accessory methods
    def _call(self, filename, tmpd, *args, **kwargs):
        """
        Call underlying low level _iqtree wrapper.
        Options are passed via *args and **kwargs
        [This only covers the simplest automatic
        case]
        """
        self.cli('{} -pre {tmp_path} -s {seqfile}'.format(self.command(),
                                                          tmp_path=os.path.join(tmpd, 'tmp_output'),
                                                          seqfile=filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the option part of the iqtree command line."""
        return str(self.options)

    def _read_result(self, tmpd):
        """
        Read back the result from the .iqtree/.treefile pair in *tmpd*.

        :return: parser dict, or None on read/parse failure
        """
        expected_outfiles = [os.path.join(tmpd, 'tmp_output.iqtree'),
                             os.path.join(tmpd, 'tmp_output.treefile')]
        parser = IqtreeParser()
        try:
            result = parser.to_dict(*expected_outfiles)
        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # bug fix: lazy %-formatting instead of a stray positional
            # argument that broke logging's message formatting
            logger.error('Other parse error: %s', parseerr)
            result = None
        return result

    def _init_cli(self, binary):
        """Build the iqtree command-line interface wrapper around *binary*."""
        return IqtreeCLI(executable=binary)
121 |
122 |
def get_default_options():
    """Return the default iqtree option set (2 threads active, rest inactive)."""
    return OptionSet([
        # Number of threads
        IntegerOption('-nt', 2, active=True),

        # Set the model for either DNA or AA alignment
        StringOption('-m', '', active=False),

        # Ultrafast bootstrap (>=1000)
        IntegerOption('-bb', 0, active=False),

        # SH-like approximate likelihood ratio test (SH-aLRT)
        IntegerOption('-alrt', 0, active=False),

        # Bootstrap + ML tree + consensus tree (>=100)
        IntegerOption('-b', 0, active=False)
    ])
140 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/phyml.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import tempfile
4 | import logging
5 | from pyparsing import ParseException
6 | from Bio import AlignIO, SeqIO
7 |
8 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
9 | from .parsers import PhymlParser
10 |
11 | from ..abstract_cli import AbstractCLI
12 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 | logger.addHandler(logging.StreamHandler())
17 | logger.setLevel(logging.INFO)
18 |
19 |
class PhymlCLI(AbstractCLI):
    # Thin CLI wrapper; executable resolution is handled by AbstractCLI.
    @property
    def _default_exe(self):
        return 'phyml'
24 |
25 |
def set_default_dna_options(treebuilder):
    """
    Dummy function as sensible default
    """
    treebuilder.options = get_default_options()
    # switch phyml's datatype from the default 'aa' to nucleotides
    treebuilder.options['-d'].set_value('nt')
32 |
33 |
def set_default_protein_options(treebuilder):
    """
    Dummy function as sensible default
    """
    # protein is already the default ('-d aa' is active in the defaults)
    treebuilder.options = get_default_options()
39 |
40 |
class Phyml(TreeBuilder):
    """ Phyml tree reconstruction

    This wrapper can be called to reconstruct a phylogenetic tree
    using PhyML.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input multiple sequence alignment. This can be either
            a filename or an biopython SeqRecord collection.
        """
        super(Phyml, self).__init__(alignment, *args, **kwargs)
        self.options = get_default_options()
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run PhyML on the stored alignment and return the inferred tree.

        Extra arguments are forwarded to _call().
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:  # in-memory records need a temp file
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
                fh.seek(0)  # flushes buffered records so phyml sees the whole file
                output, error = self._call(fh.name, *args, **kwargs)
                self.result = self._read_result(fh.name)  # store result
        else:
            path = os.path.dirname(self.input)
            filename = os.path.basename(self.input)
            # phyml cannot deal with long paths, so run from the input's
            # directory and hand it just the bare filename.
            # bug fix: this used `with os.chdir(path):`, which raises
            # TypeError at runtime because os.chdir returns None (not a
            # context manager). Change directory manually and always
            # restore the previous working directory.
            prev_dir = os.getcwd()
            os.chdir(path or '.')  # dirname is '' for a bare filename
            try:
                output, error = self._call(filename, *args, **kwargs)
                self.result = self._read_result(filename)  # store result
            finally:
                os.chdir(prev_dir)

        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result["tree"]
    # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level _Phyml wrapper.
        Options are passed via *args and **kwargs
        [This only covers the simplest automatic
        case]
        """
        self.cli('{} -i {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the option part of the phyml command line."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result.

        Phyml writes two outfiles next to the input, a stats file and a
        tree file; depending on the platform they may or may not carry a
        .txt suffix, so both spellings are tried.
        """
        # TODO: change the output dictionary into a better format
        expected_outfiles = ['{}_phyml_stats'.format(output), '{}_phyml_tree'.format(output)]
        parser = PhymlParser()

        try:
            # Check if these are the .txt style outfiles
            if not os.path.exists(expected_outfiles[0]):
                expected_outfiles = [x + '.txt' for x in expected_outfiles]
            result = parser.to_dict(*expected_outfiles)
        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # bug fix: lazy %-formatting instead of a stray positional
            # argument that broke logging's message formatting
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Build the phyml command-line interface wrapper around *binary*."""
        return PhymlCLI(executable=binary)
135 |
136 |
def get_default_options():
    """Return the default phyml option set ('-d aa' active, rest inactive)."""
    return OptionSet([
        # Algorithm

        # Set datatype to nt or aa
        StringOption('-d', 'aa', active=True),

        # Set the model for either DNA or AA alignment
        StringOption('-m', '', active=False),

        # If set to true will assume sequential format
        FlagOption('-q', False, active=False),

        # Set bootstrap value
        IntegerOption('-b', 0, active=False),

        # Tree topology search operation option
        StringOption('-s', 'NNI', active=False)
    ])
156 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/treebuilders/raxml.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import logging
4 | import random
5 | from pyparsing import ParseException
6 | import shutil
7 | from Bio import AlignIO, SeqIO
8 |
9 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
10 | from .parsers import RaxmlParser
11 |
12 | from ..abstract_cli import AbstractCLI
13 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
14 |
15 | from ...file_utils import TempFile,TempDir
16 |
17 | logger = logging.getLogger(__name__)
18 | logger.addHandler(logging.StreamHandler())
19 | logger.setLevel(logging.INFO)
20 |
21 |
class RaxmlCLI(AbstractCLI):
    # Thin CLI wrapper; executable resolution is handled by AbstractCLI.
    @property
    def _default_exe(self):
        # candidate executable names, tried in order
        return ['raxmlHPC','raxmlHPC-PTHREADS']
26 |
27 |
def set_default_dna_options(treebuilder):
    """
    Dummy function as sensible default
    """
    # no DNA-specific tweaks for RAxML; just install the defaults
    treebuilder.options = get_default_options()
33 |
34 |
def set_default_protein_options(treebuilder):
    """
    Dummy function as sensible default
    """
    # no protein-specific tweaks for RAxML; just install the defaults
    treebuilder.options = get_default_options()
40 |
41 |
class Raxml(TreeBuilder):
    """TreeBuilder wrapper around the RAxML executable."""

    def __init__(self, alignment, *args, **kwargs):
        # options must exist before super().__init__ so the
        # datatype-specific defaults below can overwrite them
        self.options = get_default_options()
        super(Raxml, self).__init__(alignment=alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Sets up temporary output files and calls raxml using _call() function.
        Writes temporary input file if we're working with SeqIO object
        Saves the stdout and stderr and returns
        """
        start = time.time()  # time the execution

        # Need to create temp directory to put raxml output here
        with TempDir() as tmpd:
            if self.input_type is AlignmentInput.OBJECT:  # in-memory records need a temp file
                with TempFile() as filename:
                    SeqIO.write(self.input, filename, 'phylip-relaxed')  # default interleaved
                    output, error = self._call(filename, tmpd, *args, **kwargs)
            elif self.input_type is AlignmentInput.FILENAME:
                filename = self.input
                output, error = self._call(filename, tmpd, *args, **kwargs)
            else:
                output, error = self._call(None, tmpd, *args, **kwargs)
            self.result = self._read_result(tmpd)  # store result
            self.stdout = output
            self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
    # End call

    # Any other accessory methods
    def _call(self, filename, tmpd, *args, **kwargs):
        """
        Call underlying low level _Raxml wrapper.
        Options are passed via *args and **kwargs
        [This only covers the simplest automatic
        case]
        """
        # hard code tmp_output as the output name since we don't save it anyway
        self.cli('{} -n tmp_output -w {tmp_path} -s {seqfile}'.format(self.command(), tmp_path=tmpd, seqfile=filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the option part of the raxml command line."""
        return str(self.options)

    def _read_result(self, tmpd):
        """
        Read back the result from the RAxML output files in *tmpd*.

        :return: parser dict, or None on read/parse failure
        """
        expected_outfiles = [os.path.join(tmpd, 'RAxML_info.tmp_output'), os.path.join(tmpd, 'RAxML_bestTree.tmp_output')]

        parser = RaxmlParser()

        try:
            # bug fix: was `is not ''` — an identity test against a literal
            # (a SyntaxWarning on modern CPython and wrong semantics);
            # use an equality comparison instead
            if self.options['-f'].get_value() != '':
                f_value = os.path.splitext(os.path.basename(self.options['-f'].get_value()))[0]
                result = parser.to_dict(*expected_outfiles, dash_f=f_value)
            else:
                result = parser.to_dict(*expected_outfiles, dash_f=None)
        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # bug fix: lazy %-formatting instead of a stray positional
            # argument that broke logging's message formatting
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Build the raxml command-line interface wrapper around *binary*."""
        return RaxmlCLI(executable=binary)
128 |
129 |
def get_default_options():
    """Return the default RAxML option set ('-m' and '-p' active, rest inactive)."""
    return OptionSet([
        # Algorithm

        # Set the model for either DNA or AA alignment
        StringOption('-m', 'PROTGAMMAGTR', active=True),

        # Random seed for parsimony inference
        IntegerOption('-p', 12345, active=True),

        # If set to true will assume sequential format
        FlagOption('-q', False, active=False),

        # Turn on bootstrapping - set seed
        IntegerOption('-b', 0, active=False),

        # Number of replicates
        IntegerOption('-#', 0, active=False),

        # Turn on rapid bootstrap - specify seed
        IntegerOption('-x', 0, active=False),

        # Set number of bootstrap replicates
        IntegerOption('-N', 0, active=False),

        # Set number of threads
        IntegerOption('-T', 0, active=False),

        # Tree topology search operation option
        StringOption('-s', 'NNI', active=False),

        # Select algorithm
        StringOption('-f', '', active=False),

        # Specify starting tree
        StringOption('-t', '', active=False),

        # Specify filename of file containing multiple trees
        StringOption('-z', '', active=False),

    ])
171 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/trimmers/__init__.py:
--------------------------------------------------------------------------------
1 | from .trimal import TrimAl
2 |
3 |
--------------------------------------------------------------------------------
/FastOMA/zoo/wrappers/trimmers/base_trimmer.py:
--------------------------------------------------------------------------------
1 | import os, types, itertools
2 | from abc import ABCMeta, abstractmethod
3 | from Bio import AlignIO, SeqIO
4 | from Bio.Align import MultipleSeqAlignment
5 | from ...seq_utils import identify_input
6 | from ...wrappers import WrapperError
7 |
8 | import logging
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 |
class MSATrimmer:
    """
    Base class for wrappers of msa trimming software.

    Written as a callable class: the instance holds state about the
    operation (inputs, results, timings and other metadata) and performs
    the work when called.  Extend it by implementing __init__ (setup)
    and __call__ (the actual trimming); everything else is a helper.

    :Example:

    ::

        callable_wrapper = ConcreteTrimmer(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, binary=None):
        """
        Accepts a filename, a Biopython MSA, or a list of Seq objects
        (anything else makes identify_input complain).  A Biopython
        object will be written temporarily to disk for the trimmer to
        work on.  `binary` is the trimmer's executable file, or None.
        """
        if alignment is None:
            self.input_type = None
            self.input = None
        else:
            # figure out what we were given - a file or an object
            self.input_type = identify_input(alignment)
            self.input = alignment

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
    # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying trimmer.
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software.

        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
82 |
83 |
84 |
--------------------------------------------------------------------------------
/archive/analysis/edit_orthxml_file.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 |
4 | I accidanetly comment
10 |
11 |
12 |
13 | """
14 |
15 |
16 | file_in = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/archive/xml_output/out_27aug_6pm.xml_no_property"
17 | file_out = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/archive/xml_output/out_27aug_6pm_property.xml"
18 |
19 | file_in_handle = open(file_in, 'r')
20 | file_out_handle = open(file_out, 'w')
21 | property_str =""
22 | print("started")
23 | for line in file_in_handle:
24 | if not " 2:
29 | file_name_split = file.split(".")
30 | if file_name_split[-1] == "pickle":
31 | rhog_id = int(file_name_split[0].split("_")[1])
32 | pickles.append(rhog_id)
33 | else:
34 | print("this file is empty", file)
35 |
36 | print("number of pickles is ", len(pickles))
37 |
38 | no_pickle_list = set(rhogs) - set(pickles)
39 |
40 | print("number of rhogs not finished is ", len(no_pickle_list))
41 |
42 | print("\n \n ", no_pickle_list)
43 |
--------------------------------------------------------------------------------
/archive/analysis/preprocess_qfo_files.py:
--------------------------------------------------------------------------------


# Proteins in each file belong to the same species.
# Rename each proteome/hogmap file to the species code embedded in the
# protein IDs (e.g. 'tr|E3JPS4|E3JPS4_PUCGT' -> species 'PUCGT').


from os import listdir
from Bio import SeqIO
import os
import shutil

working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/"
prot_folder = working_folder + "/omamer_search_old/proteome/"
project_files = listdir(prot_folder)
query_species_names_old = []
query_species_names_new = []
for file in project_files:
    # Only consider fasta proteome files ('<name>.fa').
    if file.endswith(".fa"):
        query_species_name_old = file[:-len(".fa")]
        prot_address = prot_folder + query_species_name_old + ".fa"
        prots_record = list(SeqIO.parse(prot_address, "fasta"))
        prot_record = prots_record[0]
        prot_name = prot_record.name  # 'tr|E3JPS4|E3JPS4_PUCGT
        # Species code is the suffix after the last '_' of the first record's id.
        query_species_name_new = prot_name.split("|")[-1].split("_")[-1].strip()
        # if query_species_name_new == 'RAT': query_species_name_new = "RATNO"
        query_species_names_old.append(query_species_name_old)
        query_species_names_new.append(query_species_name_new)

# makedirs with exist_ok: creates parents too and is safe on re-runs
# (the original os.mkdir chain crashed if any folder already existed).
os.makedirs(working_folder + "/omamer_search/proteome/", exist_ok=True)
os.makedirs(working_folder + "/omamer_search/hogmap", exist_ok=True)


for query_species_name_old, query_species_name_new in zip(query_species_names_old, query_species_names_new):
    # shutil.copy instead of os.system('cp ...'): no shell involved, works with
    # unusual characters in paths, and raises on failure instead of silently
    # returning a non-zero exit code.
    prot_address_old = working_folder + "omamer_search_old/proteome/" + query_species_name_old + ".fa"
    prot_address_new = working_folder + "omamer_search/proteome/" + query_species_name_new + "_.fa"
    shutil.copy(prot_address_old, prot_address_new)

    hogmap_address_old = working_folder + "omamer_search_old/hogmap/" + query_species_name_old + ".hogmap"
    hogmap_address_new = working_folder + "omamer_search/hogmap/" + query_species_name_new + "_.hogmap"
    shutil.copy(hogmap_address_old, hogmap_address_new)


# 13:54:16 - the species DANRE already exists in the oma database, remove them first



print("done")
--------------------------------------------------------------------------------
/archive/analysis/write_gene_id_pickle_old_code.py:
--------------------------------------------------------------------------------


"""Archived helper: build the orthoXML skeleton (species / database / gene
sections) from the rootHOG fasta files, then pickle the
(groups_xml, gene_id_name, orthoxml_file) triple for later use."""

import xml.etree.ElementTree as ET
import dill as dill_pickle
from os import listdir
from xml.dom import minidom
import os
from Bio import SeqIO
import dill as pickle


address_working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/ali_code_31aug/"

address_rhogs_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/rhog_all_v3_g2_s500/"
address_group_xml_ortho = address_working_folder+"group_xml_ortho_adjusted_family_40_2sep5pm_dill.pickle"


# Collect numeric rootHOG ids from the fasta file names (HOG_B0000012.fa -> 12).
# (The original listed the folder twice; the first result was dead code.)
rhog_files = listdir(address_rhogs_folder)
rhogid_num_list = []
for rhog_file in rhog_files:
    if rhog_file.endswith(".fa"):
        rhogid_num = int(rhog_file.split(".")[0].split("_")[1][1:])
        rhogid_num_list.append(rhogid_num)

# Map species name -> list of protein ids found across all rootHOG fasta files.
species_prot_dic = {}
for rhogid_num in rhogid_num_list:
    prot_address = address_rhogs_folder + "HOG_B" + str(rhogid_num).zfill(7) + ".fa"
    rhog_i = list(SeqIO.parse(prot_address, "fasta"))
    for prot_i in rhog_i:
        prot_i_name = prot_i.id  # record id format: 'tr|E3JPS4|E3JPS4_PUCGT||PUCGT||...'
        # Species code is the second '||'-delimited field.
        # NOTE(review): the trailing [:-1] strips the last character of that
        # field — presumably a trailing separator in the id format; confirm.
        species_i = prot_i.id.split("||")[1][:-1]
        species_prot_dic.setdefault(species_i, []).append(prot_i_name)

print("there are species ", len(species_prot_dic))
orthoxml_file = ET.Element("orthoXML",
                           attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021",
                                   "version": "0.3"})  #

# Assign each protein a unique integer gene id, starting at 100000, and emit
# one <species>/<database>/<genes> section per species.
gene_counter = 100000
gene_id_name = {}
for species_name in species_prot_dic:
    species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": species_name, "NCBITaxId": "1"})
    database_xml = ET.SubElement(species_xml, "database", attrib={"name": "QFO database ", "version": "2020"})
    genes_xml = ET.SubElement(database_xml, "genes")

    for prot_i_name in species_prot_dic[species_name]:
        gene_id_name[prot_i_name] = gene_counter
        # protId is the accession between the pipes: tr|E3JPS4|E3JPS4_PUCGT -> E3JPS4
        prot_i_name_short = prot_i_name.split("||")[0].split("|")[1].strip()
        ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_counter), "protId": prot_i_name_short})
        gene_counter += 1

# Empty <groups> element; filled in by a later stage of the pipeline.
groups_xml = ET.SubElement(orthoxml_file, "groups")



with open(address_group_xml_ortho, 'wb') as handle:
    pickle.dump((groups_xml, gene_id_name, orthoxml_file), handle, protocol=pickle.HIGHEST_PROTOCOL)

print("saved as ", address_group_xml_ortho)
--------------------------------------------------------------------------------
/archive/analysis/xml_.py:
--------------------------------------------------------------------------------


"""Archived helper: assemble the final orthoXML output by combining the
per-species gene-id pickle with the per-batch HOG pickles, then pretty-print
the XML tree to disk."""

import xml.etree.ElementTree as ET
import dill as dill_pickle
from os import listdir
from xml.dom import minidom

working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/"
# gene_trees_folder = "" # in_folder + "/gene_trees_/"

pickle_folder = working_folder + "/pickle_folder_all_collect/"
# NOTE(review): assumes the pickle folder contains only batch pickles and is
# non-empty — there is no warning if it isn't; confirm before reuse.
output_xml_name = "out_27aug_6pm.xml"


orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA",
                                               "originVersion": "Nov 2021", "version": "0.3"})  #

# gene_id_name maps species name -> list of (gene_idx_integer, query_prot_name).
with open(working_folder + '/file_gene_id_name.pickle', 'rb') as handle:
    gene_id_name = dill_pickle.load(handle)

for query_species_name, list_prots in gene_id_name.items():

    species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "NCBITaxId": "1"})
    database_xml = ET.SubElement(species_xml, "database", attrib={"name": "QFO database ", "version": "2020"})
    genes_xml = ET.SubElement(database_xml, "genes")

    for (gene_idx_integer, query_prot_name) in list_prots:
        # protId is the accession between the pipes: tr|E3JPS4|... -> E3JPS4
        query_prot_name_pure = query_prot_name.split("||")[0].strip().split("|")[1]
        ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": query_prot_name_pure})

# Each batch pickle holds a list of HOG xml elements; concatenate them all.
hogs_a_rhog_xml_all = []
for pickle_file_adress in listdir(pickle_folder):
    with open(pickle_folder + pickle_file_adress, 'rb') as handle:
        hogs_a_rhog_xml_all.extend(dill_pickle.load(handle))

print("number of hogs in all batches is ", len(hogs_a_rhog_xml_all))

groups_xml = ET.SubElement(orthoxml_file, "groups")

for hogs_a_rhog_xml in hogs_a_rhog_xml_all:
    groups_xml.append(hogs_a_rhog_xml)

xml_str = minidom.parseString(ET.tostring(orthoxml_file)).toprettyxml(indent="   ")

# The 'with' block closes the file on exit; the original's explicit
# file_xml.close() inside the block was redundant and has been removed.
with open(working_folder + output_xml_name, "w") as file_xml:
    file_xml.write(xml_str)

print("orthoxml is written in "+ working_folder +output_xml_name)
--------------------------------------------------------------------------------
/archive/fastOMA_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/archive/fastOMA_logo.png
--------------------------------------------------------------------------------
/archive/test_curn.py:
--------------------------------------------------------------------------------

"""Archived scratch driver: run fastoma_infer_roothogs from an IDE.

Appends the CLI flags to sys.argv by hand (instead of going through the
``fastoma-infer-roothogs`` console entry point) and invokes the roothog
inference on a local test folder. Kept in archive/ for reference only.
"""

from FastOMA.infer_roothogs import fastoma_infer_roothogs
from FastOMA._wrappers import logger
from FastOMA.infer_subhogs import fastoma_infer_subhogs

import sys

# Other flags that have been experimented with here:
#   --low-so-detection --fragment-detection
#   --input-rhog-folder ./bb/ --parrallel True --species-tree species_tree.nwk
# (the latter set is for fastoma_infer_subhogs)

# Equivalent of:
#   fastoma-infer-roothogs --proteomes <folder>/proteome --hogmap <folder>/hogmaps \
#       --splice <folder>/splice --out-rhog-folder <folder>/out -vv
folder = "pycharm_projects/fastoma_test/"
sys.argv.extend(['--proteomes', folder + "proteome"])
sys.argv.extend(['--hogmap', folder + "hogmaps"])
sys.argv.extend(['--splice', folder + "splice"])
sys.argv.extend(['--out-rhog-folder', folder + "out"])
sys.argv.extend(['-vv'])
fastoma_infer_roothogs()

# A large block of dead, commented-out experiment code (converting an orthoxml
# file to a pairwise-orthologs TSV via
# FastOMA.zoo.hog.transform.iter_pairwise_relations) was removed from here;
# see git history if it is ever needed again.
--------------------------------------------------------------------------------
/conf/base.config:
--------------------------------------------------------------------------------
1 | /*
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 | dessimozlab/FastOMA Nextflow base config file
4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 | A 'blank slate' config file, appropriate for general use on most high performance
6 | compute environments. Assumes that all software is installed and available on
7 | the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
8 | ----------------------------------------------------------------------------------------
9 | */
10 |
11 | process {
12 |
13 | cpus = { check_max( 1 * task.attempt, 'cpus' ) }
14 | memory = { check_max( 6.GB * task.attempt, 'memory' ) }
15 | time = { check_max( 4.h * task.attempt, 'time' ) }
16 | shell = ['/bin/bash', '-euo', 'pipefail']
17 |
18 | //errorStrategy = { task.exitStatus in (130..145) ? 'retry' : 'finish' }
19 | errorStrategy = 'retry'
20 | maxRetries = 3
21 |
22 | withLabel:process_single {
23 | cpus = { check_max( 1 , 'cpus' ) }
24 | memory = { check_max( 12.GB * task.attempt, 'memory' ) }
25 | time = { check_max( 4.h * task.attempt, 'time' ) }
26 | }
27 | withLabel:process_low {
28 | cpus = { check_max( 2 * task.attempt, 'cpus' ) }
29 | memory = { check_max( 12.GB * task.attempt, 'memory' ) }
30 | time = { check_max( 4.h * task.attempt, 'time' ) }
31 | }
32 | withLabel:process_medium {
33 | cpus = { check_max( 6 * task.attempt, 'cpus' ) }
34 | memory = { check_max( 36.GB * task.attempt, 'memory' ) }
35 | time = { check_max( 8.h * task.attempt, 'time' ) }
36 | }
37 | withLabel:process_high {
38 | cpus = { check_max( 12 * task.attempt, 'cpus' ) }
39 | memory = { check_max( 72.GB * task.attempt, 'memory' ) }
40 | time = { check_max( 16.h * task.attempt, 'time' ) }
41 | }
42 | withLabel:process_long {
43 | time = { check_max( 20.h * task.attempt, 'time' ) }
44 | }
45 | withLabel:process_high_memory {
46 | memory = { check_max( 200.GB * task.attempt, 'memory' ) }
47 | }
48 | withLabel:error_ignore {
49 | errorStrategy = 'ignore'
50 | }
51 | withLabel:error_retry {
52 | errorStrategy = 'retry'
53 | maxRetries = 2
54 | }
55 | }
--------------------------------------------------------------------------------
/environment-conda.yml:
--------------------------------------------------------------------------------
1 | name: fastoma-env
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - defaults
6 | dependencies:
7 | - omamer
8 | - mafft
9 | - fasttree
10 | - nextflow
11 | - papermill
12 | - seaborn
13 | - matplotlib
14 | - pyparsing
15 | - networkx
16 | - jupyter
17 | - mmseqs2
18 | - pip
19 | - pip:
20 | - .[report]
21 |
--------------------------------------------------------------------------------
/nextflow.config:
--------------------------------------------------------------------------------
1 | // General configuration used in all profiles
2 | manifest {
3 | name = "dessimozlab/FastOMA"
4 | description = """FastOMA computes Hierarchical Orthologous Groups from proteomes."""
5 | author = "Sina Majidian, Adrian Altenhoff"
6 | homePage = "https://omabrowser.org"
7 | mainScript = "FastOMA.nf"
8 | nextflowVersion = ">=22.10.4"
9 | defaultBranch = "main"
10 | doi = "10.1101/2024.01.29.577392"
11 | version = "0.3.5"
12 | }
13 |
params {
    // Container image coordinates used by the docker/singularity profiles below.
    container_name = "dessimozlab/fastoma"
    container_version = "0.3.5"
    // Default OMAmer database URL.
    // NOTE(review): presumably overridable on the command line — confirm in FastOMA.nf.
    omamer_db = "https://omabrowser.org/All/LUCA.h5"
    debug_enabled = false
    help = false
    // When true, the timeline/report/trace/dag blocks at the bottom of this
    // file are enabled and written to params.statsdir.
    report = false
    write_msas = false
    write_genetrees = false
    // Filtering / inference tuning defaults.
    // NOTE(review): presumably consumed by the infer-subhogs step — confirm in FastOMA.nf.
    filter_method = "col-row-threshold"
    filter_gap_ratio_row = 0.3
    filter_gap_ratio_col = 0.5
    nr_repr_per_hog = 5
    min_sequence_length = 40
    force_pairwise_ortholog_generation = false

    output_folder = "Output"
    statsdir = "${params.output_folder}/stats"

    // Max resource options
    // Defaults only, expecting to be overwritten
    max_memory = '128.GB'
    max_cpus = 24
    max_time = '120.h'
}
39 |
40 | // Profiles configure nextflow depending on the environment (local, docker, singularity)
41 | profiles {
42 |
43 | docker {
44 | process {
45 | container = "$params.container_name:$params.container_version"
46 | }
47 | docker.enabled = true
48 | }
49 |
50 | singularity {
51 | process {
52 | container = "$params.container_name:$params.container_version"
53 | }
54 | singularity.enabled = true
55 | singularity.autoMounts = true
56 | }
57 |
58 | standard {
59 | process.executor = 'local'
60 | }
61 |
62 | slurm {
63 | process.executor = "slurm"
64 | time = 4.h
65 | }
66 |
67 | conda {
68 | process.conda = "${projectDir}/environment-conda.yml"
69 | conda.enabled = true
70 | conda.createTimeout = '3 h'
71 | }
72 |
73 | slurm_singularity {
74 | process {
75 | container = "$params.container_name:$params.container_version"
76 | executor = "slurm"
77 | time = 4.h
78 | memory = 20.GB
79 | }
80 | singularity.enabled = true
81 | singularity.autoMounts = true
82 | }
83 |
84 | slurm_conda {
85 | process {
86 | conda = "${projectDir}/environment-conda.yml"
87 | executor = "slurm"
88 | time = 4.h
89 | memory = 20.GB
90 | }
91 | conda.enabled = true
92 | conda.createTimeout = '3 h'
93 | }
94 | }
95 |
96 | def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
97 | timeline {
98 | enabled = params.report
99 | file = "${params.statsdir}/timeline_${trace_timestamp}.html"
100 | }
101 | report {
102 | enabled = params.report
103 | file = "${params.statsdir}/report_${trace_timestamp}.html"
104 | }
105 | trace {
106 | enabled = params.report
107 | file = "${params.statsdir}/trace_${trace_timestamp}.txt"
108 | }
109 | dag {
110 | enabled = params.report
111 | file = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html"
112 | }
113 |
114 | includeConfig "conf/base.config"
115 |
// function to check maximum resources
// Caps a requested resource at the corresponding params.max_* limit.
//   obj  - the requested value (MemoryUnit, Duration, or int)
//   type - one of 'memory', 'time', 'cpus'
// Falls back to the requested value (with a console warning) when the
// configured limit cannot be parsed.
def check_max(obj, type) {
    if (type == 'memory') {
        try {
            // compareTo() only guarantees the sign of its result, so test
            // `> 0` rather than the exact value 1 the original relied on.
            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) > 0)
                return params.max_memory as nextflow.util.MemoryUnit
            else
                return obj
        } catch (all) {
            println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
            return obj
        }
    } else if (type == 'time') {
        try {
            if (obj.compareTo(params.max_time as nextflow.util.Duration) > 0)
                return params.max_time as nextflow.util.Duration
            else
                return obj
        } catch (all) {
            println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
            return obj
        }
    } else if (type == 'cpus') {
        try {
            return Math.min( obj, params.max_cpus as int )
        } catch (all) {
            println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
            return obj
        }
    }
}
147 |
148 |
--------------------------------------------------------------------------------
/nextflow_slurm.config:
--------------------------------------------------------------------------------
// Minimal SLURM site configuration for FastOMA.


process.executor = "slurm"
process.queue = "cpu"
process.time = 10.h
process.memory = 95.GB
executor {
    name = 'slurm'
    queueSize = 550
}

// Retry on common transient exit codes, terminate otherwise.
// Must be assigned in the `process` scope — a bare top-level `errorStrategy`
// is not a recognised config setting and the original assignment had no effect.
process.errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'terminate' }
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "FastOMA"
7 | dynamic = ["version"]
8 | description = "FastOMA - a package to infer orthology information among proteomes"
9 | readme = "README.md"
10 | license = "MIT"
11 | requires-python = ">=3.8"
12 | authors = [
13 | { name = "Sina Majidian", email = "sina.majidian@gmail.com" },
14 | { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" }
15 | ]
16 | dependencies = [
17 | "biopython ~=1.81",
18 | "DendroPy >=4.5,<=4.6.1",
19 | "ete3 ~=3.1",
20 | "lxml >=4.6,<6",
21 | "omamer ~=2.0",
22 | "pyham ~=1.1",
23 | "numpy <2", # temporary fix as pytables does not yet work with numpy 2.0
24 | "pyparsing",
25 | "networkx",
26 | ]
27 |
28 | [project.optional-dependencies]
29 | nextflow = [
30 | "nextflow"
31 | ]
32 | report = [
33 | "papermill",
34 | "jupyter",
35 | "matplotlib",
36 | "seaborn",
37 | ]
38 |
39 |
40 | [project.scripts]
41 | fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs"
42 | fastoma-check-input = "FastOMA.check_input:fastoma_check_input"
43 | fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs"
44 | fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs"
45 | fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs"
46 | fastoma-helper = "FastOMA.helper_scripts:main"
47 |
48 | [project.urls]
49 | Homepage = "https://github.com/DessimozLab/FastOMA"
50 |
51 | [tool.hatch.version]
52 | path = "FastOMA/__init__.py"
53 |
54 | [tool.hatch.build.targets.sdist]
55 | include = [
56 | "/FastOMA",
57 | ]
58 |
59 | [tool.hatch.envs.default]
60 | features = [
61 | "report",
62 | ]
63 |
--------------------------------------------------------------------------------
/testdata/README.md:
--------------------------------------------------------------------------------
FastOMA test data
2 | ======
3 |
4 |
5 | This repo contains a small dataset as the test example.
6 |
7 | 1- The proteome folder including three fasta files `AQUAE.fa`, `CHLTR.fa` and `MYCGE.fa` corresponding to three species.
8 |
9 | 2- A dummy species tree in Newick format.
10 |
11 | 3- You can download the omamer database as follows
12 | ```
cd FastOMA/testdata
14 | wget https://omabrowser.org/All/Primates-v2.0.0.h5 # 105MB
15 | mv Primates-v2.0.0.h5 in_folder/omamerdb.h5
16 | ```
17 |
--------------------------------------------------------------------------------
/testdata/expected_output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/.DS_Store
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroups.tsv:
--------------------------------------------------------------------------------
1 | Group Protein
2 | OG_0000001 sp|P0CE13|G3P_CHLTR
3 | OG_0000001 sp|O67161|G3P_AQUAE
4 | OG_0000001 sp|P47543|G3P_MYCGE
5 | OG_0000002 sp|O67118|DNAK_AQUAE
6 | OG_0000002 sp|P47547|DNAK_MYCGE
7 | OG_0000002 sp|P17821|DNAK_CHLTR
8 | OG_0000003 sp|O67618|LEPA_AQUAE
9 | OG_0000003 sp|O84067|LEPA_CHLTR
10 | OG_0000004 sp|P0CD71|EFTU_CHLTR
11 | OG_0000004 sp|P13927|EFTU_MYCGE
12 | OG_0000004 sp|O66429|EFTU_AQUAE
13 | OG_0000005 sp|O84081|FOLD_CHLTR
14 | OG_0000005 sp|O67736|FOLD_AQUAE
15 | OG_0000006 sp|O84332|TPIS_CHLTR
16 | OG_0000006 sp|O66686|TPIS_AQUAE
17 | OG_0000007 sp|P0C0Z7|CH60_CHLTR
18 | OG_0000007 sp|O67943|CH60_AQUAE
19 | OG_0000008 sp|P47639|ATPB_MYCGE
20 | OG_0000008 sp|O67828|ATPB_AQUAE
21 | OG_0000009 sp|P47641|ATPA_MYCGE
22 | OG_0000009 sp|O66907|ATPA_AQUAE
23 | OG_0000010 sp|O66778|ENO_AQUAE
24 | OG_0000010 sp|O84591|ENO_CHLTR
25 | OG_0000011 sp|O84026|RF1_CHLTR
26 | OG_0000011 sp|O67032|RF1_AQUAE
27 | OG_0000011 sp|P47500|RF1_MYCGE
28 | OG_0000012 tr|O84829|O84829_CHLTR
29 | OG_0000012 sp|O67547|SUCD_AQUAE
30 |
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa:
--------------------------------------------------------------------------------
1 | >sp|P47543|G3P_MYCGE sp|P47543|G3P_MYCGE||MYCGE||1000000005 sp|P47543|G3P_MYCGE [MYCGE]
2 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK
3 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK
4 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM
5 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP
6 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT
7 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL
8 | >sp|P0CE13|G3P_CHLTR sp|P0CE13|G3P_CHLTR||CHLTR||1001000009 sp|P0CE13|G3P_CHLTR [CHLTR]
9 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS
10 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA
11 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT
12 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV
13 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA
14 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK
15 | >sp|O67161|G3P_AQUAE sp|O67161|G3P_AQUAE||AQUAE||1002000010 sp|O67161|G3P_AQUAE [AQUAE]
16 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA
17 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT
18 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA
19 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS
20 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA
21 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL
22 |
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000002.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000002.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000003.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000003.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000004.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000004.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000005.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000005.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000006.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000006.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000007.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000007.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000008.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000008.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000009.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000009.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000010.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000010.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000011.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000011.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/OrthologousGroupsFasta/OG_0000012.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000012.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGs.tsv:
--------------------------------------------------------------------------------
1 | RootHOG Protein OMAmerRootHOG
2 | HOG:0000001 sp|P0CE13|G3P_CHLTR HOG:E1027400
3 | HOG:0000001 sp|O67161|G3P_AQUAE HOG:E1027400
4 | HOG:0000001 sp|P47543|G3P_MYCGE HOG:E1027400
5 | HOG:0000002 sp|O67118|DNAK_AQUAE HOG:E0990770
6 | HOG:0000002 sp|P47547|DNAK_MYCGE HOG:E0990770
7 | HOG:0000002 sp|P17821|DNAK_CHLTR HOG:E0990770
8 | HOG:0000003 sp|O67618|LEPA_AQUAE HOG:E0990677
9 | HOG:0000003 sp|O84067|LEPA_CHLTR HOG:E0990677
10 | HOG:0000004 sp|P0CD71|EFTU_CHLTR HOG:E0990677
11 | HOG:0000004 sp|P13927|EFTU_MYCGE HOG:E0990677
12 | HOG:0000004 sp|O66429|EFTU_AQUAE HOG:E0990677
13 | HOG:0000005 sp|O84081|FOLD_CHLTR HOG:E1027325
14 | HOG:0000005 sp|O67736|FOLD_AQUAE HOG:E1027325
15 | HOG:0000006 sp|O84332|TPIS_CHLTR HOG:E1027829
16 | HOG:0000006 sp|O66686|TPIS_AQUAE HOG:E1027829
17 | HOG:0000007 sp|P0C0Z7|CH60_CHLTR HOG:E1027301
18 | HOG:0000007 sp|O67943|CH60_AQUAE HOG:E1027301
19 | HOG:0000008 sp|P47639|ATPB_MYCGE HOG:E0990823
20 | HOG:0000008 sp|O67828|ATPB_AQUAE HOG:E0990823
21 | HOG:0000009 sp|P47641|ATPA_MYCGE HOG:E0990823
22 | HOG:0000009 sp|O66907|ATPA_AQUAE HOG:E0990823
23 | HOG:0000010 sp|O66778|ENO_AQUAE HOG:E1027309
24 | HOG:0000010 sp|O84591|ENO_CHLTR HOG:E1027309
25 | HOG:0000011 sp|O84026|RF1_CHLTR HOG:E0990790
26 | HOG:0000011 sp|O67032|RF1_AQUAE HOG:E0990790
27 | HOG:0000011 sp|P47500|RF1_MYCGE HOG:E0990790
28 | HOG:0000012 tr|O84829|O84829_CHLTR HOG:E1027626
29 | HOG:0000012 sp|O67547|SUCD_AQUAE HOG:E1027626
30 |
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000001.fa:
--------------------------------------------------------------------------------
1 | >sp|P47543|G3P_MYCGE sp|P47543|G3P_MYCGE||MYCGE||1000000005 sp|P47543|G3P_MYCGE [MYCGE]
2 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK
3 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK
4 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM
5 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP
6 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT
7 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL
8 | >sp|P0CE13|G3P_CHLTR sp|P0CE13|G3P_CHLTR||CHLTR||1001000009 sp|P0CE13|G3P_CHLTR [CHLTR]
9 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS
10 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA
11 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT
12 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV
13 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA
14 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK
15 | >sp|O67161|G3P_AQUAE sp|O67161|G3P_AQUAE||AQUAE||1002000010 sp|O67161|G3P_AQUAE [AQUAE]
16 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA
17 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT
18 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA
19 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS
20 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA
21 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL
22 |
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000001.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000001.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000002.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000002.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000003.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000003.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000004.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000004.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000005.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000005.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000006.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000006.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000007.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000007.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000008.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000008.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000009.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000009.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000010.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000010.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000011.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000011.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/RootHOGsFasta/HOG0000012.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000012.fa.gz
--------------------------------------------------------------------------------
/testdata/expected_output/orthologs.tsv:
--------------------------------------------------------------------------------
1 | sp|O67161|G3P_AQUAE sp|P0CE13|G3P_CHLTR
2 | sp|P47543|G3P_MYCGE sp|O67161|G3P_AQUAE
3 | sp|P47543|G3P_MYCGE sp|P0CE13|G3P_CHLTR
4 | sp|O67118|DNAK_AQUAE sp|P17821|DNAK_CHLTR
5 | sp|P47547|DNAK_MYCGE sp|P17821|DNAK_CHLTR
6 | sp|P47547|DNAK_MYCGE sp|O67118|DNAK_AQUAE
7 | sp|O67618|LEPA_AQUAE sp|O84067|LEPA_CHLTR
8 | sp|O66429|EFTU_AQUAE sp|P0CD71|EFTU_CHLTR
9 | sp|P13927|EFTU_MYCGE sp|P0CD71|EFTU_CHLTR
10 | sp|P13927|EFTU_MYCGE sp|O66429|EFTU_AQUAE
11 | sp|O67736|FOLD_AQUAE sp|O84081|FOLD_CHLTR
12 | sp|O66686|TPIS_AQUAE sp|O84332|TPIS_CHLTR
13 | sp|O67943|CH60_AQUAE sp|P0C0Z7|CH60_CHLTR
14 | sp|O67828|ATPB_AQUAE sp|P47639|ATPB_MYCGE
15 | sp|O66907|ATPA_AQUAE sp|P47641|ATPA_MYCGE
16 | sp|O66778|ENO_AQUAE sp|O84591|ENO_CHLTR
17 | sp|O67032|RF1_AQUAE sp|O84026|RF1_CHLTR
18 | sp|P47500|RF1_MYCGE sp|O84026|RF1_CHLTR
19 | sp|P47500|RF1_MYCGE sp|O67032|RF1_AQUAE
20 | sp|O67547|SUCD_AQUAE tr|O84829|O84829_CHLTR
21 |
--------------------------------------------------------------------------------
/testdata/expected_output/phylostratigraphy.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Phylo.io
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
32 |
33 |
34 |
35 |
36 |
53 |
54 |
55 |
56 |
59 |
60 |
61 |
62 |
65 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/testdata/expected_output/species_tree_checked.nwk:
--------------------------------------------------------------------------------
1 | ((AQUAE:1,CHLTR:1)inter1:1,MYCGE:1)inter2:0;
--------------------------------------------------------------------------------
/testdata/in_folder/proteome/AQUAE.fa:
--------------------------------------------------------------------------------
1 | >sp|O67618|LEPA_AQUAE
2 | MEQKNVRNFCIIAHVDHGKSTLADRLLEYTGAISEREKREQLLDTLDVERERGITVKMQA
3 | VRMFYKAKDGNTYKLHLIDTPGHVDFSYEVSRALAACEGALLLIDASQGIEAQTVANFWK
4 | AVEQDLVIIPVINKIDLPSADVDRVKKQIEEVLGLDPEEAILASAKEGIGIEEILEAIVN
5 | RIPPPKGDPQKPLKALIFDSYYDPYRGAVAFVRIFDGEVKPGDKIMLMSTGKEYEVTEVG
6 | AQTPKMTKFDKLSAGDVGYIAASIKDVRDIRIGDTITHAKNPTKEPVPGFQPAKPMVYAG
7 | IYPAEDTTYEELRDALEKYAINDAAIVYEPESSPALGMGFRVGFLGLLHMEIVQERLERE
8 | YGVKIITTAPNVIYRVKKKFTDEVIEVRNPMDFPDNAGLIEYVEEPFVLVTIITPKEYVG
9 | PIIQLCQEKRGIQKNMTYLDPNTVYLEYEMPLSEIIVDFHDKIKSISRGFASYDYEFIGY
10 | RPSDLIKLTVLINKKPVDALSFIVHADRAQKFARRVAEKLRETIPRQLFEVHIQVAKGGK
11 | VIASERIKPLRANVTAKCYGGDVTRKKKLLENQKEGKKRMKQFGKVQLPQEAFLSVLKVE
12 | >sp|O67032|RF1_AQUAE
13 | MLKEAYISRLDKLQEKYRKLQEELSKPEVIQDVEKYKKLSKELKELQEINELYERYKKAQ
14 | KELKEAKELLKSSDKDLRELAEEEVNRLTEEMKKLEEELKVHLVPKDPNDTKNVILEIRA
15 | GAGGEEAALFAADLFRMYQKYAEEKGWKVSILSSNKTGLGGYKEVIALIEGEGAYSRLKY
16 | ESGVHRVQRVPVTESSGRIHTSTATVAVLPEVDETDIKIKPEELKIETFRASGAGGQYVN
17 | TTETAVRITHIPTGIVVQCQDERSQFQNKQKALKILYAKLKDYYERKKQEEIAKERKEQV
18 | GTGERSEKIRTYNFPQNRVTDHRINLTLYKLQDVLEGKLDEIIDALRAKEIEKKLELVEK
19 | EG
20 | >sp|O66778|ENO_AQUAE
21 | MSRIKRVHGREVLDSRGNPTVEVEVELESGALGRAIVPSGASTGEREALELRDGDPKRYL
22 | GKGVLKAVDNVNGVIAKALVGLEPYNQREIDQILIELDGTENKSKLGANAILGTSMAVAR
23 | AAANELGIPLYEYLGGKFGYRLPVPLMNVINGGAHADNNLDIQEFMIVPVCGGAFREALR
24 | AGVETFHHLKKILKEKGYSTNVGDEGGFAPNLNSSEEALDILMQAIEKAGYKPGEDILLA
25 | LDVASSEFYENGVYKFEGKERSAEEMIEFYEKLIQKYPIISIEDPMSENDWEGWKEITKR
26 | LGDKVQLVGDDLFTTNPKILRKGIEEGVANAILVKLNQIGTVSETLDTVMLAKERNYSAI
27 | ISHRSGETEDTFISHLAVATNAGQIKTGSASRTDRIAKYNELLRIEERLGNGAVFWGREE
28 | FYRFTS
29 | >sp|O66429|EFTU_AQUAE
30 | MAKEKFERTKEHVNVGTIGHVDHGKSTLTSAITCVLAAGLVEGGKAKCFKYEEIDKAPEE
31 | KERGITINITHVEYETAKRHYAHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQT
32 | REHVLLARQVNVPYIVVFMNKCDMVDDEELLELVELEVRELLSKYEYPGDEVPVIRGSAL
33 | GALQELEQNSPGKWVESIKELLNAMDEYIPTPQREVDKPFLMPIEDVFSISGRGTVVTGR
34 | VERGVLRPGDEVEIVGLREEPLKTVATSIEMFRKVLDEALPGDNIGVLLRGVGKDDVERG
35 | QVLAQPGSVKAHKRFRAQVYVLSKEEGGRHTPFFVNYRPQFYFRTADVTGTVVKLPEGVE
36 | MVMPGDNVELEVELIAPVALEEGLRFAIREGGRTVGAGVVTKILD
37 | >sp|O67547|SUCD_AQUAE
38 | MAILVNKDTKVVVQGITGKEGSFHAKQCKEYGTQVVAGVTPGKGGMEVEGIPVFNTVKEA
39 | VKETGANCSLIFVPAPFAADAIVEALDAGIELVVCITEGIPVKDMMMVKDYMLKNYPNAK
40 | LVGPNCPGVITPGEAKVGIMPGHIFKRGKIGIVSRSGTLTYEAAYQLTKYGLGQSTAVGI
41 | GGDPVHGLTHRDVIEMFNKDPETEAILMIGEIGGTEEEEAAEYIEKEVDKPVFAYIAGIT
42 | APPGKRMGHAGAIIMGGKGTAKAKMEALEKAGAYVIENPAKIGETVAKILKVIELEEEER
43 | TSDAE
44 | >sp|O66686|TPIS_AQUAE
45 | MRRLIAANWKMNKTVKETEEYINTFLKFVEHPESREILICPPFTSLYVAGKMLQGTGVKL
46 | GAQNCHYEKRGAFTGEISIPMLQEVGCEYVIVGHSERRHIFGESDELIHKKIVACLEMGI
47 | RPILCVGEKKEEREAGMTFKVIETQIKLALTGVEEHTDKIDIAYEPVWAIGTGTPATPED
48 | AVEVHTFIRNLINQLNPKNEGKTRILYGGSVNPQNAKEFMKHEEINGLLVGTASLDPESF
49 | AKIVYSF
50 | >sp|O67828|ATPB_AQUAE
51 | MAEVIKGKVVQVIGPVVDVEFEGVKELPKIKDGLKTIRRAIDDRGNWFEEVLFMEVAQHI
52 | GEHRVRAIAMGPTDGLVRGQEVEYLGGPIKIPVGKEVLGRIFNVAGQPIDEQGPVEAKEY
53 | WPMFRNPPELVEQSTKVEILETGIKVIDLLQPIIKGGKVGLFGGAGVGKTVLMQELIHNI
54 | ARFHEGYSVVVGVGERTREGNDLWLEMKESGVLPYTVMVYGQMNEPPGVRFRVAHTGLTM
55 | AEYFRDVEGQDVLIFIDNIFRFVQAGAEVSTLLGRLPSAVGYQPTLNTDVGEVQERITST
56 | KKGSITAIQAVYVPADDITDPAPWSIFAHLDATTVLTRRLAELGIYPAIDPLESTSKYLA
57 | PEYVGEEHYEVAMEVKRILQRYKELQEIIAILGMEELSDEDKAIVNRARRIQKFLSQPFH
58 | VAEQFTGMPGKYVKLEDTIRSFKEVLTGKYDHLPENAFYMVGTIEDVIEKAKQMGAKV
59 | >sp|O67118|DNAK_AQUAE
60 | MAEKKEKIIGIDLGTTNSVVSVMMGDEAVVIQNQEGSRLTPSVVSWTKEKEILVGEPAKR
61 | RAILDPENTVYESKRFIGRKFEEVKEEAKRVSYKVVPDEKGDAAFDIPNAGKLVRPEEVG
62 | AHVLRKLKEAAEAFLGEPVKKAVITVPAYFNERQRQATKDAGKIAGLEVVRILNEPTAAA
63 | MAYGLHKKDNVRILVYDFGGGTFDVSILEGGEGVIEVKVTAGDTHLGGANIDERIMDWLI
64 | EEFKKETGIDLRKDRTALQRLKEASEQAKKELSFKMETEINLPFITIDPNTNQPLHLQKK
65 | LTRARLEEMIKDIVDRTIDIVKQALEDAKLKPSDIDEVVLVGGSTRIPLVQQRIKEFFGK
66 | EPHKGLNPDEVVAMGAAIQAGVLAGEVKEIVLVDVTPLSLGVETYGGVMTVLIPRNTPIP
67 | VRKCEIFTTAHDYQTEVEIHVLQGERPLAKDNKSLAKFYLTGIPPAPRGVPKIEVCFDID
68 | ADGILHVTAKDLGTGKEQSVRVEISSGLTPEEIERIIKEAEEHAEEDRKKKELIEAKNQL
69 | DHLVYQLEKALKEAGDKVPADVKSEAEKVIEEAKKTIETATEIEQVKQVTEKVLQVSSKM
70 | GTTLYGEAGKQAGGGEKKDEGGEGEVEAKPVD
71 | >sp|O67736|FOLD_AQUAE
72 | MALILDGKSLSKKIREEIKKEVENFTSKGFRPPALAVILVGNDPASEIYVNNKRKACEKV
73 | GIKSLFYHLPQDVSEEKLLGLIYELNMNEEVDGILVQLPLPKHIDQTRVILSISPEKDVD
74 | GFHPENMGKLVAQIEDGFIPCTPLGIDILLKHYGIDVKGKDVTIVGAGFIVGRPLSLLML
75 | WRNATVSVCHIHTKDVKKFTKEADILISATGVPHLIKEDMIKEGAVVVDVGISRLNGKIV
76 | GDVDFERVKEKASAITPVPGGVGPMTVTALLLNTLKSYKRKFAHLISTTNP
77 | >sp|O67161|G3P_AQUAE
78 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA
79 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT
80 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA
81 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS
82 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA
83 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL
84 | >sp|O67943|CH60_AQUAE
85 | MAAKAIIYNEEARAKLKAGVDKLANAVKVTLGPKGREVILGKNWGTPVVTKDGVTVAKEI
86 | ELKDKFENIGAQLVKEVASKTADVAGDGTTTATVLAQAIFHEGLRVAASGANVMEVKRGI
87 | DKAVKKIVEELKKLSKDVKERKEIEQVATISANNDPEIGKIIADAMEEVGKDGVITVEES
88 | KSAETTLEVVKGMQFDRGYLSPYFVTDPEKMECVLENPYILIYEKKITNVKELLPILEQV
89 | VRSGRPLLVIAEDVEGEALATLVVNHIKGVLKACAVKAPGFGQRRKDYLGDIAVLTGGQA
90 | ITEDLGIKLESVTLDMLGQAEKVVVDKEHTTIIGGKGDPEQIKARIEQIKRQIQETTSDY
91 | DREKLQERLAKLSGGVAIIRVGAATEAELKEKKYRVEDAVHATKAAVEEGIVPGGGVALV
92 | RASEALEDLKGDNHDQQLGIDIIKKAVRTPLKQIAYNAGYDGSVVLEKVIELGKEKGVSW
93 | GFNAATGEYVDMYEAGIIDPTKVVRTAIENAASVAGTMLTAEALIADLPEEKKKDITPTD
94 | MPELD
95 | >sp|O66907|ATPA_AQUAE
96 | MATLTYEEALEILRQQIKDFEPEAKMEEVGVVYYVGDGVARAYGLENVMAMEIVEFQGGQ
97 | QGIAFNLEEDNVGIIILGSETGIEEGHIVKRTGRILDAPVGEGLVGRVIDPLGNPLDGKG
98 | PIQFEYRSPVEKIAPGVVKRKPVHEPLQTGIKAIDAMIPIGRGQRELIIGDRATGKTTVA
99 | IDTILAQKNSDVYCIYVAVGQKRAAIARLIELLEREGAMEYTTVVVASASDPASLQYLAP
100 | FVGCTIGEYFRDNGKHALIIYDDLSKHAEAYRQLSLLMRRPPGREAYPGDVFYLHSRLLE
101 | RAAKLNDDLGAGSLTALPIIETKAGDVAAYIPTNVISITDGQIYLEADLFNKGIRPAINV
102 | GLSVSRVGGAAQIKAMKQVAGTLRLELAQFRELEAFVQFASELDKATQQQINRGLRLVEL
103 | LKQEPYNPIPVEKQIVLIYAGTHGYLDDIPVESVRKFEKELYAYLDNERPDILKEISEKK
104 | KLDEELEKKIKEALDAFKQKFVP
105 |
--------------------------------------------------------------------------------
/testdata/in_folder/proteome/CHLTR.fa:
--------------------------------------------------------------------------------
1 | >sp|O84067|LEPA_CHLTR
2 | MKPYKIENIRNFSIIAHIDHGKSTIADRLLESTSTIEQREMREQLLDSMDLERERGITIK
3 | AHPVTMTYEYEGETYELNLIDTPGHVDFSYEVSRSLAACEGALLIVDAAQGVQAQSLANV
4 | YLALERDLEIIPVLNKIDLPAAQPEAIKKQIEEFIGLDTSNTIACSAKTGQGIPEILESI
5 | IRLVPPPKPPQETELKALIFDSHYDPYVGIMVYVRVISGEIKKGDRITFMATKGSSFEVL
6 | GIGAFLPEATLMEGSLRAGQVGYFIANLKKVKDVKIGDTVTTVKHPAKEPLEGFKEIKPV
7 | VFAGIYPIDSSDFDTLKDALGRLQLNDSALTIEQENSHSLGFGFRCGFLGLLHLEIIFER
8 | ISREFDLDIIATAPSVIYKVVLKNGKTLFIDNPTAYPDPALIEHMEEPWVHVNIITPQEY
9 | LSNIMSLCMDKRGICLKTDMLDQHRLVLSYELPLNEIVSDFNDKLKSVTKGYGSFDYRLG
10 | DYKKGAIIKLEILINDEAVDAFSCLVHRDKAESKGRSICEKLVDVIPPQLFKIPIQAAIN
11 | KKIIARETIRALAKNVTAKCYGGDITRKRKLWDKQKKGKKRMKEFGKVSIPNTAFVEVLK
12 | ME
13 | >sp|O84026|RF1_CHLTR
14 | MEIKVLECLKRLEEVEKQISDPNIFSNPKEYSSLSKEHARLSEIKNAHESLVATKKILQD
15 | DKLALSTEKDPEIVAMLEEGVLVGEEAVERLSKQLENLLIPPDPDDDLSVIMELRAGTGG
16 | DEAALFVGDCVRMYHLYAASKGWQCEVLSTSESDLGGYKEYVMGISGASVKRFLQYEAGT
17 | HRVQRVPETETQGRVHTSAVTVAVLPEPAEDDEEVFIDEKDLRIDTFRSSGAGGQHVNVT
18 | DSAVRITHIPSGVVVTCQDERSQHKNKAKAMRVLKARIRDAEVQKRAQEASAMRSAQVGS
19 | GDRSERIRTYNFPQNRVTDHRIGLTLYNLDRVMEGELDMITTALVTHVHRQLFGHEETA
20 | >sp|O84591|ENO_CHLTR
21 | MFDVVISDIEAREILDSRGYPTLCVKVITNTGTFGEACVPSGASTGIKEALELRDKDPKR
22 | YQGKGVLQAISNVEKVLMPALQGFSVFDQITADAIMIDADGTPNKEKLGANAILGVSLAL
23 | AKAAANTLQRPLYRYLGGSFSHVLPCPMMNLINGGMHATNGLQFQEFMIRPISAPSLTEA
24 | VRMGAEVFNALKKILQNRQLATGVGDEGGFAPNLASNAEALDLLLTAIETAGFTPREDIS
25 | LALDCAASSFYNTQDKTYDGKSYADQVGILAELCEHYPIDSIEDGLAEEDFEGWKLLSET
26 | LGDRVQLVGDDLFVTNSALIAEGIAQGLANAVLIKPNQIGTLTETAEAIRLATIQGYATI
27 | LSHRSGETEDTTIADLAVAFNTGQIKTGSLSRSERIAKYNRLMAIEEEMGPEALFQDSNP
28 | FSKA
29 | >sp|P0CD71|EFTU_CHLTR
30 | MSKETFQRNKPHINIGTIGHVDHGKTTLTAAITRALSGDGLADFRDYSSIDNTPEEKARG
31 | ITINASHVEYETANRHYAHVDCPGHADYVKNMITGAAQMDGAILVVSATDGAMPQTKEHI
32 | LLARQVGVPYIVVFLNKIDMISEEDAELVDLVEMELVELLEEKGYKGCPIIRGSALKALE
33 | GDAAYIEKVRELMQAVDDNIPTPEREIDKPFLMPIEDVFSISGRGTVVTGRIERGIVKVS
34 | DKVQLVGLRDTKETIVTGVEMFRKELPEGRAGENVGLLLRGIGKNDVERGMVVCLPNSVK
35 | PHTQFKCAVYVLQKEEGGRHKPFFTGYRPQFFFRTTDVTGVVTLPEGIEMVMPGDNVEFE
36 | VQLISPVALEEGMRFAIREGGRTIGAGTISKIIA
37 | >tr|O84829|O84829_CHLTR
38 | MLELLSKDLPIITQGITGKAGSFHTTQCVAYGSNFVGGVTPGKGGSQFLDLPIFDSVLEA
39 | KQATGCRASMIFVPPPFAAEAIFEAEDAGIELIVCITEGIPIKDMLEVASLMEKSASSLI
40 | GPNCPGVIKPGVCKIGIMPGYIHLPGKVGVVSRSGTLTYEAVWQLTQRKIGQSVCIGIGG
41 | DPLNGTSFIDALQEFEKDSQTEAVLMIGEIGGSAEEEAADWTRQHSSKPVIAFIAGATAP
42 | KGKRMGHAGAIISGKSGDAFSKQEALRQAGVTVVESLALIGEAVASVLKPR
43 | >sp|O84332|TPIS_CHLTR
44 | MFTDKETHRKPFPTWAHLLHSEPSKQFVFGNWKMNKTLTEAQTFLKSFISSDILSNPQII
45 | TGIIPPFTLLSACQQAVSDSPIFLGAQTTHEADSGAFTGEISAPMLKDIGVDFVLIGHSE
46 | RRHIFHEQNPVLAEKAAAAIHSGMIPVLCIGETLEEQESGATQDILLNQLTTGLSKLPEQ
47 | ASFILAYEPVWAIGTGKVAHPDLVQETHAFCRKTIASLFSKDIAERTPILYGGSVKADNA
48 | RSLSLCPDVNGLLVGGASLSSENFLSIIQQIDIP
49 | >sp|P17821|DNAK_CHLTR
50 | MSEKRKSNKIIGIDLGTTNSCVSVMEGGQPKVIASSEGTRTTPSIVAFKGGETLVGIPAK
51 | RQAVTNPEKTLASTKRFIGRKFSEVESEIKTVPYKVAPNSKGDAVFDVEQKLYTPEEIGA
52 | QILMKMKETAEAYLGETVTEAVITVPAYFNDSQRASTKDAGRIAGLDVKRIIPEPTAAAL
53 | AYGIDKEGDKKIAVFDLGGGTFDISILEIGDGVFEVLSTNGDTHLGGDDFDGVIINWMLD
54 | EFKKQEGIDLSKDNMALQRLKDAAEKAKIELSGVSSTEINQPFITIDANGPKHLALTLTR
55 | AQFEHLASSLIERTKQPCAQALKDAKLSASDIDDVLLVGGMSRMPAVQAVVKEIFGKEPN
56 | KGVNPDEVVAIGAAIQGGVLGGEVKDVLLLDVIPLSLGIETLGGVMTPLVERNTTIPTQK
57 | KQIFSTAADNQPAVTIVVLQGERPMAKDNKEIGRFDLTDIPPAPRGHPQIEVTFDIDANG
58 | ILHVSAKDAASGREQKIRIEASSGLKEDEIQQMIRDAELHKEEDKQRKEASDVKNEADGM
59 | IFRAEKAVKDYHDKIPAELVKEIEEHIEKVRQAIKEDASTTAIKAASDELSTHMQKIGEA
60 | MQAQSASAAASSAANAQGGPNINSEDLKKHSFSTRPPAGGSASSTDNIEDADVEIVDKPE
61 | >sp|O84081|FOLD_CHLTR
62 | MLLKGAPAADHILATIKENIRACSKAPGLAVVLIGNNPASEIYVNMKIKRATDLGMVSKS
63 | YRKPSDATLSDILALIHQLNNDENIHGILVQLPLPKHLDAQAILSTITPDKDVDGLHPVN
64 | VGKLLLGETDGFIPCTPAGIVELCKYYEIPLHGKHVVILGRSNIVGKPLAALLMQRHADT
65 | NASVTLLHSQSEHLTEITRTADILISAIGVPLFVNKEMIAEKTVIMDVGTSRIPAANPKG
66 | YILVGDVDFNNVVPVCRAITPVPGGVGPMTVAMLMRNTWESFLRHTS
67 | >sp|P0CE13|G3P_CHLTR
68 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS
69 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA
70 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT
71 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV
72 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA
73 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK
74 | >sp|P0C0Z7|CH60_CHLTR
75 | MVAKNIKYNEEARKKIQKGVKTLAEAVKVTLGPKGRHVVIDKSFGSPQVTKDGVTVAKEV
76 | ELADKHENMGAQMVKEVASKTADKAGDGTTTATVLAEAIYTEGLRNVTAGANPMDLKRGI
77 | DKAVKVVVDQIRKISKPVQHHKEIAQVATISANNDAEIGNLIAEAMEKVGKNGSITVEEA
78 | KGFETVLDIVEGMNFNRGYLSSYFATNPETQECVLEDALVLIYDKKISGIKDFLPVLQQV
79 | AESGRPLLIIAEDIEGEALATLVVNRIRGGFRVCAVKAPGFGDRRKAMLEDIAILTGGQL
80 | ISEELGMKLENANLAMLGKAKKVIVSKEDTTIVEGMGEKEALEARCESIKKQIEDSSSDY
81 | DKEKLQERLAKLSGGVAVIRVGAATEIEMKEKKDRVDDAQHATIAAVEEGILPGGGTALI
82 | RCIPTLEAFLPMLTNEDEQIGARIVLKALSAPLKQIAANAGKEGAIIFQQVMSRSANEGY
83 | DALRDAYTDMLEAGILDPAKVTRSALESAASVAGLLLTTEALIAEIPEEKPAAAPAMPGA
84 | GMDY
85 |
--------------------------------------------------------------------------------
/testdata/in_folder/proteome/MYCGE.fa:
--------------------------------------------------------------------------------
1 | >sp|P47500|RF1_MYCGE
2 | MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSFEQIKVINKELKHKQPLIVKFKELQKLVE
3 | NANEAEQILNNSSLKELHEEAKKELEKIKASLPSLEEEIKFLLLPVDENNQKNVIVEIRP
4 | AAGGDESCIFLSDLFNMYKNYCTSKNWTVELNEIIPASVGINFVSFAVNGTDVFAKLKFE
5 | SGVHRVQRVPLTEAKGRVHTSTVTVAVLPQLEEVEITINPSDLRIDTYRASGAGGQHVNR
6 | TESAVRITHLPTGIVVACQEGKSQFSNRDKAMKMLRAKLWENAQNKQLSTQADLRKSQVG
7 | SGERAEKIRTYNYPQNRITDHRIKLTINKLNTVILGDLDEIIEALQADEKKQQLEKFIS
8 | >sp|P13927|EFTU_MYCGE
9 | MAREKFDRSKPHVNVGTIGHIDHGKTTLTAAICTVLAKEGKSAATRYDEIDKAPEEKARG
10 | ITINSAHVEYSSDKRHYAHVDCPGHADYIKNMITGAAQMDGAILVVSATDSVMPQTREHI
11 | LLARQVGVPKMVVFLNKCDIASDEEVQELVAEEVRDLLTSYGFDGKNTPIIYGSALKALE
12 | GDPKWEAKIHDLIKAVDEWIPTPTREVDKPFLLAIEDTMTITGRGTVVTGRVERGELKVG
13 | QEVEIVGLKPIRKAVVTGIEMFKKELDSAMAGDNAGVLLRGVERKEVERGQVLAKPGSIK
14 | PHKKFKAEIYALKKEEGGRHTGFLNGYRPQFYFRTTDVTGSIALAENTEMVLPGDNASIT
15 | VELIAPIACEKGSKFSIREGGRTVGAGTVTEVLE
16 | >sp|P47639|ATPB_MYCGE
17 | MIKKENLTYGKVHQVIGPVVDVIFSESKQLPRVYDCLSVQLKKSELFLEATQLIGDDIVR
18 | CIALGPTEGLARNVKVTNYNHPIEVPVGKNVLGRMFNVLGEPIDGKEPLPKKPKLSIHRN
19 | PPAFDEQPNTVDIFETGIKVIDLLTPYVRGGKIGLFGGAGVGKTVLVQELIHNIAKEHSG
20 | LSVFAGVGERTREGNDLYYEMIQGGVIDKTVLVFGQMNEPPGARMRVALTALTMAEYFRD
21 | HDNQNVLLFIDNIFRFTQAGSEVSALLGRMPSAVGYQPTLAIEMGKLQERIASTKTGSIT
22 | SVQAIYVPADDLTDPAPATTFTHLDAKTVLDRNIAALGIFPAINPLESTSRLLDPSVVGI
23 | NHYKVALGVQNILQRFAELQDIIAILGIDELSDEDKIIVERARRIRNFLSQPFFVAEKFS
24 | GIAGKYVSLNDTVQSFKEILEGKHDHLPEQAFFYVGTIQEAVEKAKRLNQEFDKTK
25 | >sp|P47547|DNAK_MYCGE
26 | MSADNGLIIGIDLGTTNSCVSVMEGGRPVVLENPEGKRTTPSIVSYKNNEIIVGDAAKRQ
27 | MVTNPNTIVSIKRLMGTSNKVKVQNADGTTKELSPEQVSAQILSYLKDFAEKKIGKKISR
28 | AVITVPAYFNDAERNATKTAGKIAGLNVERIINEPTAAALAYGIDKASREMKVLVYDLGG
29 | GTFDVSLLDIAEGTFEVLATAGDNRLGGDDWDNKIIEYISAYIAKEHQGLNLSKDKMAMQ
30 | RLKEAAERAKIELSAQLETIISLPFLTVTQKGPVNVELKLTRAKFEELTKPLLERTRNPI
31 | SDVIKEAKIKPEEINEILLVGGSTRMPAVQKLVESMVPGKKPNRSINPDEVVAIGAAIQG
32 | GVLRGDVKDVLLLDVTPLTLSIETLGGVATPLIKRNTTIPVSKSQIFSTAQDNQESVDVV
33 | VCQGERPMSRDNKSLGRFNLGGIQPAPKGKPQIEITFSLDANGILNVKAKDLTTQKENSI
34 | TISDNGNLSEEEIQKMIRDAEANKERDNIIRERIELRNEGEGIVNTIKEILASPDAKNFP
35 | KEEKEKLEKLTGNIDAAIKANDYAKLKVEIENFKKWREEMAKKYNPTGEQGPQAK
36 | >sp|P47543|G3P_MYCGE
37 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK
38 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK
39 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM
40 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP
41 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT
42 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL
43 | >sp|P47641|ATPA_MYCGE
44 | MADKLNEYVALIKTEIKKYSKKIFNSEIGQVISVADGIAKVSGLENALLNELIQFENNIQ
45 | GIVLNLEQNTVGIALFGDYSSLREGSTAKRTHSVMKTPVGDVMLGRIVNALGEAIDGRGD
46 | IKATEYDQIEKIAPGVMKRKSVNQPLETGILTIDALFPIGKGQRELIVGDRQTGKTAIAI
47 | DTIINQKDKDVYCVYVAIGQKNSSVAQIVHQLEVNDSMKYTTVVCATASDSDSMVYLSPF
48 | TGITIAEYWLKKGKDVLIVFDDLSKHAVAYRTLSLLLKRPPGREAFPGDVFYLHSRLLER
49 | ACKLNDENGGGSITALPIIETQAGDISAYIPTNVISITDGQLFMVSSLFNAGQRPAIQIG
50 | LSVSRVGSAAQTKAIKQQTGSLKLELAQYSELDSFSQFGSDLDENTKKVLEHGKRVMEMI
51 | KQPNGKPYSQVHEALFLFAINKAFIKFIPVDEIAKFKQRITEEFNGSHPLFKELSNKKEF
52 | TEDLESKTKTAFKMLVKRFISTLTDYDITKFGSIEELN
53 |
--------------------------------------------------------------------------------
/testdata/in_folder/species_tree.nwk:
--------------------------------------------------------------------------------
1 | ((AQUAE,CHLTR)inter1,MYCGE)inter2;
--------------------------------------------------------------------------------
/tests/data/HOG_0890520.fa:
--------------------------------------------------------------------------------
1 | >sp|P47500|RF1_MYCGE||MYCGE||1000000000 sp|P47500|RF1_MYCGE
2 | MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSFEQIKVINKELKHKQPLIVKFKELQKLVE
3 | NANEAEQILNNSSLKELHEEAKKELEKIKASLPSLEEEIKFLLLPVDENNQKNVIVEIRP
4 | AAGGDESCIFLSDLFNMYKNYCTSKNWTVELNEIIPASVGINFVSFAVNGTDVFAKLKFE
5 | SGVHRVQRVPLTEAKGRVHTSTVTVAVLPQLEEVEITINPSDLRIDTYRASGAGGQHVNR
6 | TESAVRITHLPTGIVVACQEGKSQFSNRDKAMKMLRAKLWENAQNKQLSTQADLRKSQVG
7 | SGERAEKIRTYNYPQNRITDHRIKLTINKLNTVILGDLDEIIEALQADEKKQQLEKFIS
8 | >sp|O84026|RF1_CHLTR||CHLTR||1001000001 sp|O84026|RF1_CHLTR
9 | MEIKVLECLKRLEEVEKQISDPNIFSNPKEYSSLSKEHARLSEIKNAHESLVATKKILQD
10 | DKLALSTEKDPEIVAMLEEGVLVGEEAVERLSKQLENLLIPPDPDDDLSVIMELRAGTGG
11 | DEAALFVGDCVRMYHLYAASKGWQCEVLSTSESDLGGYKEYVMGISGASVKRFLQYEAGT
12 | HRVQRVPETETQGRVHTSAVTVAVLPEPAEDDEEVFIDEKDLRIDTFRSSGAGGQHVNVT
13 | DSAVRITHIPSGVVVTCQDERSQHKNKAKAMRVLKARIRDAEVQKRAQEASAMRSAQVGS
14 | GDRSERIRTYNFPQNRVTDHRIGLTLYNLDRVMEGELDMITTALVTHVHRQLFGHEETA
15 | >sp|O67032|RF1_AQUAE||AQUAE||1002000001 sp|O67032|RF1_AQUAE
16 | MLKEAYISRLDKLQEKYRKLQEELSKPEVIQDVEKYKKLSKELKELQEINELYERYKKAQ
17 | KELKEAKELLKSSDKDLRELAEEEVNRLTEEMKKLEEELKVHLVPKDPNDTKNVILEIRA
18 | GAGGEEAALFAADLFRMYQKYAEEKGWKVSILSSNKTGLGGYKEVIALIEGEGAYSRLKY
19 | ESGVHRVQRVPVTESSGRIHTSTATVAVLPEVDETDIKIKPEELKIETFRASGAGGQYVN
20 | TTETAVRITHIPTGIVVQCQDERSQFQNKQKALKILYAKLKDYYERKKQEEIAKERKEQV
21 | GTGERSEKIRTYNFPQNRVTDHRINLTLYKLQDVLEGKLDEIIDALRAKEIEKKLELVEK
22 | EG
23 |
--------------------------------------------------------------------------------
/tests/data/correct-msa.fa:
--------------------------------------------------------------------------------
1 | >HUMAN01350 | OMA1057741 | PBLD_HUMAN | [Homo sapiens]
2 | --MKLPIFIADAFTARAFRGNPAAVC----LLENELDEDMHQKIAREMNLSETAFIRKLH
3 | PTDNFAQSSCFGLRWFTPASEVPLCGHATLASAAVLFHKIK-NMNSTLTFVTLSGELRAR
4 | RAEDGIVLDLPLYPAHPQDFHEV-EDLI---KTAIGNTLVQDICYSPDTQKLLVRLSDVY
5 | NRSFLENLKVNTENLLQVENTGKVKGLILTLKGEPGGQTQAFDFYSRYFAPWVGVAEDPV
6 | TGSAHAVLSSYWSQHLGKKEMHAFQCSHRGGELGISLRPDGRVD----------------
7 | ----IRGGAAVVLEGTLTA
8 | >YEAST02880 | OMA1057741 | YHI9_YEAST | [Saccharomyces cerevisiae (strain ATCC 204508 / S288c)]
9 | MTLMVPFKQVDVFTEKPFMGNPVAVINFLEIDENEVSQEELQAIANWTNLSETTFLFK--
10 | PSD---KKYDYKLRIFTPRSELPFAGHPTIGSCKAFLEFTKNTTATSLVQECKIGAVPIT
11 | INEGLISFKAPM-----ADYESISSEMIADYEKAIGLKFIKPPALLHTGPEWIVALVEDA
12 | ETCF--NANPNFAMLAHQTKQNDHVGIILAGPKKEAAIKNSYEM--RAFAPVINVYEDPV
13 | CGSGSVALARYL------QEVYKFEKT-----TDITISEGGRLKRNGLMLASIKKEADNS
14 | TSYYIAGHATTVIDGKIKV
15 | >MOUSE00277 | OMA1057741 | K3W4L7 | [Mus musculus]
16 | --MKLPIFIADAFTATAFRGNPAAVC----LLERTLEEDAHQQIAREMNLSETAFIRKLQ
17 | PTDSFTQSSRFGLRWFTPVSEVPLCGHATLASAAVLFHKIRNNRNSTLTFVTMSGELKAR
18 | RAEDGIVLDFPVYPTFPQDFHEV-EDLI---KAAIGDTLVQDIRYSTDTRKLLVRLSDSY
19 | DRSFLESLKVNTEPLPAIEKTGKVRGLILTVKGEPGGQTAPYDFYSRYFAPWVGIAEDPV
20 | TGSAHTVLSSYWSQQLRKKEMRAFQCSRRGGELDISLRPDGRVD----------------
21 | ----IKGGAVIVLEGTLTA
22 | >PANTR00757 | OMA1057741 | A0A6D2W9P7 | [Pan troglodytes]
23 | --MKLPIFIADAFTARAFRGNPAAVC----LLENELDEDMHQKIAREMNLSETAFIRKLH
24 | PTDNFAQSSCFGLRWFTPASEVPLCGHATLASAAVLFHKIK-NMNSTLTFVTLSGELRAR
25 | RAEDGIVLDLPLYPAHPQDFHEV-EDLI---KTAIGNTLVQDICYSPDTRKLLVRLSDVY
26 | NRSFLENLKVNTENLLQVENTGKVKGLILTLKGEPGGQTQAFDFYSRYFAPWVGVAEDPV
27 | TGSAHAVLSSYWSQHLGKKEMHAFQCSRRGGELGISLRPDGRVD----------------
28 | ----IRGCAAVVLEGTLTA
29 |
--------------------------------------------------------------------------------
/tests/test_fasttree_wrapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 |
4 | from Bio import AlignIO
5 | from FastOMA._wrappers import infer_gene_tree
6 | from FastOMA.zoo.wrappers import WrapperError
7 | import pathlib
8 | this_dir = pathlib.Path(__file__).parent
9 |
10 |
class FastTreeTester(unittest.TestCase):
    """Exercises the FastTree wrapper (``FastOMA._wrappers.infer_gene_tree``)."""

    def test_failing_tree_building_reports_error_from_fasttree(self):
        """A problematic MSA must raise WrapperError and surface FastTree's log message."""
        alignment = AlignIO.read(this_dir / "data" / "failing-msa.fa", "fasta")
        with self.assertLogs("FastOMA", level="ERROR") as log_ctx, \
                self.assertRaises(WrapperError):
            infer_gene_tree(alignment)
        # FastTree complains about duplicated sequence identifiers with this phrase.
        self.assertIn("Non-unique name", "\n".join(log_ctx.output))

    def test_treebuilding_with_correct_msa(self):
        """A valid MSA yields a tree (newick string) containing the input labels."""
        alignment = AlignIO.read(this_dir / "data" / "correct-msa.fa", "fasta")
        newick = infer_gene_tree(alignment)
        self.assertIn("HUMAN01350", newick)
--------------------------------------------------------------------------------
/tests/test_infer_subhog.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from ete3 import Tree, TreeNode
3 | from Bio.Seq import Seq
4 | from Bio.SeqRecord import SeqRecord
5 | from argparse import Namespace
6 | from FastOMA._hog_class import HOG, Representative
7 | from FastOMA._infer_subhog import LevelHOGProcessor
8 |
9 |
class TestLevelHogProcessor(TestCase):
    """Tests for :class:`FastOMA._infer_subhog.LevelHOGProcessor`.

    Builds a fixed, reconciled gene tree (NHX with ``evoltype`` S/D labels)
    whose leaves each become a single-sequence HOG, then checks
    representative sampling and reconciliation on that processor.
    """

    def setUp(self):
        # Gene tree with speciation/duplication annotations (evoltype=S/D)
        # already present in the NHX comments; leaf names follow the
        # "<gene>||<species>" convention used by FastOMA internally.
        genetree = Tree(
            '(((((G00100_SE001||SE001:153.567,G00100_SE008||SE008:153.567)1:39.499[&&NHX:evoltype=S],(G00100_SE006||SE006:173.507,G00100_SE007||SE007:173.507)1:19.5597[&&NHX:evoltype=S])1:14.0196[&&NHX:evoltype=S],(G00100_SE003||SE003:198.481,((((G00100_SE011||SE011:136.533,G00100_SE012||SE012:136.533)1:7.60673[&&NHX:evoltype=S],(G00100_SE010||SE010:36.1782,G00342_SE010||SE010:36.1782)1:107.961[&&NHX:evoltype=D])1:8.49419[&&NHX:evoltype=S],G00100_SE009||SE009:152.634)1:13.723[&&NHX:evoltype=S],(((G00186_SE004||SE004:143.819,(G00186_SE011||SE011:136.533,(G00186_SE012||SE012:116.411,G00242_SE012||SE012:116.411)1:20.1214[&&NHX:evoltype=D])1:7.28662[&&NHX:evoltype=S])1:0.32011[&&NHX:evoltype=S],(G00186_SE010||SE010:31.4887,G00350_SE010||SE010:31.4887)1:112.651[&&NHX:evoltype=D])1:8.49419[&&NHX:evoltype=S],G00186_SE009||SE009:152.634)1:13.723[&&NHX:evoltype=S])1:32.1245[&&NHX:evoltype=D])1:8.60492[&&NHX:evoltype=S])1:36.2336[&&NHX:evoltype=S],(((G00110_SE001||SE001:153.567,G00110_SE008||SE008:153.567)1:39.499[&&NHX:evoltype=S],(G00110_SE006||SE006:173.507,G00110_SE007||SE007:173.507)1:19.5597[&&NHX:evoltype=S])1:14.0196[&&NHX:evoltype=S],(G00110_SE003||SE003:198.481,(((G00110_SE004||SE004:143.819,(G00110_SE011||SE011:136.533,G00110_SE012||SE012:136.533)1:7.28662[&&NHX:evoltype=S])1:0.32011[&&NHX:evoltype=S],G00110_SE010||SE010:144.139)1:8.49419[&&NHX:evoltype=S],G00110_SE009||SE009:152.634)1:45.8474[&&NHX:evoltype=S])1:8.60492[&&NHX:evoltype=S])1:36.2336[&&NHX:evoltype=S])1:6.68041[&&NHX:evoltype=D],(G00100_SE002||SE002:119.545,(G00100_SE013||SE013:97.4899,(G00100_SE014||SE014:87.2367,G00100_SE015||SE015:87.2367)1:10.2532[&&NHX:evoltype=S])1:22.055[&&NHX:evoltype=S])1:130.455[&&NHX:evoltype=S]);')
        # Minimal placeholder species tree; the processor only needs an object here.
        sptree = Tree("dummy;")
        # One single-sequence HOG per gene-tree leaf (dummy 6-residue sequence).
        hogs = [HOG(SeqRecord(Seq("AAAAAA"), id=n.name), sptree, "test1") for n in genetree.iter_leaves()]
        # Mimics the parsed CLI configuration that LevelHOGProcessor expects.
        conf = Namespace(msa_write=False, gene_trees_write=False, number_of_samples_per_hog=5, msa_filter_method="col-row-threshold",
                         gap_ratio_row=0.3, gap_ratio_col=0.5, min_col_trim=400)
        self.genetree = genetree
        self.lp = LevelHOGProcessor(sptree, hogs, "test1", conf)

    def test_propose_representatives(self):
        # Sampling must return exactly number_of_samples_per_hog representatives;
        # the last one is expected to come from the divergent SE002/SE013-15 clade.
        rep = self.lp.find_most_divergent_representatives_from_genetree(self.genetree)
        self.assertEqual(len(rep), self.lp.conf.number_of_samples_per_hog)
        self.assertIn(rep[self.lp.conf.number_of_samples_per_hog-1].get_id(), ("G00100_SE013||SE013","G00100_SE015||SE015","G00100_SE014||SE014","G00100_SE002||SE002"))

    def test_reconcilation(self):
        # Reconciliation must not alter the existing evoltype annotations...
        exp = self.genetree.write(features=['evoltype'])
        self.lp.infer_reconciliation(genetree=self.genetree)
        self.assertEqual(exp, self.genetree.write(features=['evoltype']))
        # ...and must attach sos (species-overlap score) attributes: the root is
        # a speciation (sos == 0) while its first child is a duplication (sos == 1).
        self.assertEqual(self.genetree.sos, 0)
        self.assertEqual(self.genetree.children[0].sos, 1)
34 |
--------------------------------------------------------------------------------
/tests/test_roothog_example.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
class RootHOGExampleTestCase(TestCase):
    """Placeholder test case for root-HOG examples."""

    # ``setUpClass`` must be a classmethod: unittest invokes it as
    # ``cls.setUpClass()`` with no arguments, so without the decorator the
    # plain function would raise TypeError (missing positional arg ``cls``).
    @classmethod
    def setUpClass(cls):
        pass
--------------------------------------------------------------------------------
/utils/filter_orthoxml_completeness.py:
--------------------------------------------------------------------------------
"""Filter HOGs out of an orthoxml file based on their CompletenessScore.

How to use:
    python filter_orthoxml_completeness.py FastOMA_HOGs.orthoxml 0.3
"""

import sys
import logging
logging.basicConfig(level=logging.DEBUG)
from FastOMA.zoo.hog import filter_orthoxml_file, HOGFilter

# Fail early with a usage message instead of an IndexError when the two
# required command line arguments are missing.
if len(sys.argv) != 3:
    sys.exit("usage: python filter_orthoxml_completeness.py <input.orthoxml> <threshold>")

print("started ")

input_orthoxml_add = sys.argv[1]
threshold_filt = float(sys.argv[2])

# Score name as recorded in the orthoxml <scoreDef> element.
score_type = "CompletenessScore"

output_name = input_orthoxml_add + "_filt_"+str(threshold_filt)+".orthoxml"
with open(output_name, 'wb') as output_file:
    filt = HOGFilter(score_type, threshold_filt)
    filter_orthoxml_file(input_orthoxml_add, output_file, filt)

print("we wrote the output in "+output_name)
25 |
26 |
--------------------------------------------------------------------------------
/utils/find_unfinished_rhogs.py:
--------------------------------------------------------------------------------
"""Report roothog families (rHOGs) for which no non-empty pickle output exists.

How to use:
    python find_unfinished_rhogs.py <out_folder>

Compares the rHOG fasta files in <out_folder>/rhogs_all/ with the pickle
files in <out_folder>/pickle_rhogs/ and prints the ids with no pickle yet.
"""

import sys
from pathlib import Path


def _extract_id(file_name):
    # Names look like "HOG_0001234.fa" / "file_0001234.pickle"; the numeric
    # id is the token after the first underscore, before the first dot.
    return int(file_name.split(".")[0].split("_")[1])


folder = Path(sys.argv[1])  # e.g. ".../out_folder/"

# ids of all rHOG fasta inputs
rhogs = [_extract_id(p.name) for p in (folder / "rhogs_all").iterdir()
         if p.name.endswith(".fa")]
print("number of rhogs is ", len(rhogs))

# ids of all successfully written (non-empty) pickles
pickles = []
for p in (folder / "pickle_rhogs").iterdir():
    # Pickles of only a couple of bytes are considered empty/failed runs.
    if p.stat().st_size > 2:
        if p.name.endswith(".pickle"):
            pickles.append(_extract_id(p.name))
    else:
        print("this file is empty", p.name)

print("number of pickles is ", len(pickles))

no_pickle_list = set(rhogs) - set(pickles)

print("number of rhogs not finished is ", len(no_pickle_list))

print("\n \n ", no_pickle_list)
40 |
--------------------------------------------------------------------------------
/utils/orthoxml2OG.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | this code is for converting an OrthoXML file to a set of Fasta files as Ortholougous groups
4 |
5 | How to run:
6 | cd out_folder
7 | python orthoxml2OG.py output_hog_.orthoxml rhogs_all
8 |
9 |
10 | Output
11 | - Gene names per OG in maximal_og_prot.tsv
12 | - Fasta files in OGs_maximal
13 | """
14 |
15 |
16 | from ete3 import Tree
17 | import sys
18 | import os
19 | from FastOMA.zoo.hog.convert import orthoxml_to_newick
20 | from Bio import SeqIO
21 |
22 |
23 |
24 |
def max_og_tree(tree):
    """Select the maximal orthologous group from an NHX-labelled gene tree.

    At every duplication node (``Ev == 'duplication'``) only the child
    subtree covering the largest number of distinct species is kept; the
    leaves of every other subtree are flagged with ``in_og = "no"``.
    Returns the names of the leaves that were never flagged.  Relies on the
    module-level ``species_dic`` mapping gene names to species names.
    """
    def _distinct_species(subtree_root):
        # Number of distinct species among the leaves of this subtree.
        seen = []
        for leaf in subtree_root.get_leaves():
            gene = leaf.name
            if gene[:3] == "no_":
                # strip the "no_..." prefix; species is the last "_" token
                gene = leaf.name.split("_")[-1]
            if gene in species_dic:
                seen.append(species_dic[gene])
            else:
                print("species not in the dic ", gene)
        return len(set(seen))

    for node in tree.traverse("preorder"):
        # Only internal nodes explicitly labelled as duplications matter.
        if node.is_leaf() or not (hasattr(node, "Ev") and node.Ev == 'duplication'):
            continue
        children = node.get_children()
        coverage = [_distinct_species(child) for child in children]
        # Keep the child with the widest species coverage (first on ties,
        # matching list.index semantics).
        keep = children[coverage.index(max(coverage))]
        for child in children:
            if child != keep:
                for leaf in child.get_leaves():
                    leaf.in_og = "no"

    # Collect every leaf that was not excluded by some duplication node.
    kept_names = []
    for node in tree.traverse("preorder"):
        if node.is_leaf() and not (hasattr(node, "in_og") and node.in_og == "no"):
            kept_names.append(node.name)

    return kept_names
67 |
68 |
69 |
# ---- command line arguments ----
input_orthoxml = sys.argv[1]          # e.g. "out_folder/output_hog_.orthoxml"
rhog_all_folder = sys.argv[2]+"/"     # e.g. "out_folder/rhogs_all/"
fasta_format = "fa"                   # extension of the rhogs_all fasta files

output_file = "maximal_og_prot.tsv"

# Extract one NHX gene tree per HOG, plus the gene -> species mapping that
# max_og_tree() reads as a module-level global.
trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True)
print("We extracted "+str(len(trees))+" trees in NHX format from the input HOG orthoxml"+input_orthoxml)

# For each HOG keep only the maximal orthologous group of its gene tree.
OGs = {}
for hog_id, tree_string in trees.items():
    tree = Tree(tree_string, format=1)
    OGs[hog_id] = max_og_tree(tree)

print("done")

# One tab-separated line per HOG: the hog id and its protein name list.
# (The "with" block closes the handle; an extra close() is not needed.)
with open(output_file, 'w') as handle:
    for hog_id, og_prot_list in OGs.items():
        handle.write(str(hog_id)+"\t"+str(og_prot_list)+"\n")

print("We wrote the protein families information in the file "+output_file)

out_folder_ogs = "OGs_maximal/"
os.makedirs(out_folder_ogs, exist_ok=True)  # do not fail when re-running

print("start writing "+str(len(OGs))+" OGs as fasta files in folder " +out_folder_ogs )
for hog_id, og_prot_list in OGs.items():    # hog_id="HOG_0667494_sub10524"
    # roothog id is the first two "_"-separated fields, e.g. "HOG_0667494"
    rhog_id = "_".join(hog_id.split("_")[:2])

    rhogs_all_address = rhog_all_folder + rhog_id + "."+fasta_format
    rhogs_all_prots = list(SeqIO.parse(rhogs_all_address, "fasta"))

    # set membership is O(1) per protein instead of O(n) on the list
    og_prot_set = set(og_prot_list)

    # Keep only proteins of the maximal OG; tag each with its species name.
    og_prots = []
    for rhogs_prot in rhogs_all_prots:
        if rhogs_prot.id.split("||")[0] in og_prot_set:
            sp = rhogs_prot.id.split("||")[1]
            rhogs_prot.description += " ["+ sp +"]"
            og_prots.append(rhogs_prot)

    og_id = "OG_" + hog_id  # one OG per rootHOG
    SeqIO.write(og_prots, out_folder_ogs+og_id+".fa", "fasta")
print("writing done")
123 |
124 |
--------------------------------------------------------------------------------
/utils/orthoxml2family.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | import sys
6 |
7 | from FastOMA.zoo.hog import extract_flat_groups_at_level
8 |
9 |
10 | """
11 | how to run
12 | python orthoxml2family.py my_hogs.orthoxml
13 |
14 | - to convert orthoxml to rootHOG (protein families)
15 | """
16 |
input_orthoxml = sys.argv[1]
output_file = "families_prot.tsv"

# One set of protein cross-references (xrefs) per top-level HOG.
toplevel_groups = []
for grp in extract_flat_groups_at_level(input_orthoxml):
    toplevel_groups.append(set(g.xref for g in grp))

# toplevel_groups is a list of sets

print("We extracted "+str(len(toplevel_groups))+" protein families from the input HOG orthoxml"+input_orthoxml)
if toplevel_groups:  # guard: an orthoxml without groups would IndexError here
    print("The first one contain "+str(len(toplevel_groups[0]))+" proteins.")

# One tab-separated line per family: index and protein set.
# (The "with" block closes the handle; an extra close() is not needed.)
with open(output_file, 'w') as handle:
    for toplevel_group_idx, toplevel_group in enumerate(toplevel_groups):
        handle.write(str(toplevel_group_idx)+"\t"+str(toplevel_group)+"\n")

print("We wrote the protein families information in the file "+output_file)
36 |
37 |
38 | # we need to know the species name of each prot, as prot_specis dic
39 | # prot_name_universal = []
40 | # for group in toplevel_groups:
41 | # if len(group) > 0.9 * 2181:
42 | # species = [prot_specis[prot] for prot in group]
43 | # species_unq = set(species)
44 | # if len(species_unq) > 0.9 * 2181:
45 | # prot_name_universal.append(group)
46 | #
47 | # len(prot_name_universal)
--------------------------------------------------------------------------------
/utils/orthoxml2newick.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | import sys
5 | import os
6 | from FastOMA.zoo.hog.convert import orthoxml_to_newick
7 |
8 | """
9 | how to run
10 | python orthoxml2newick.py my_hogs.orthoxml
11 | """
12 |
input_orthoxml = sys.argv[1]
output_folder = "output_folder_trees"

# exist_ok avoids a crash when the script is re-run on the same folder
os.makedirs(output_folder, exist_ok=True)

trees = orthoxml_to_newick(input_orthoxml)

print("We extracted "+str(len(trees))+" trees from the input HOG orthoxml"+input_orthoxml)

# write each gene tree (an NHX newick string) to its own file
for treeid_hog, tree in trees.items():
    tree_file_i = output_folder+"/tree_"+str(treeid_hog)+".nwk"
    with open(tree_file_i, 'w') as handle:  # "with" closes the file itself
        handle.write(tree)

print("We wrote "+str(len(trees))+" trees in nhx format from the input HOG orthoxml"+input_orthoxml+"in "+output_folder)
print("You can visualise each tree using https://beta.phylo.io/viewer/ as extendeed newick format.")
31 |
--------------------------------------------------------------------------------
/utils/orthoxml2pairs.py:
--------------------------------------------------------------------------------
"""Extract all pairwise orthologous relations from an orthoxml file.

How to use:
    python orthoxml2pairs.py my_hogs.orthoxml

Writes a two-column tsv (<input>_pairs.tsv) with one gene-name pair per line.
"""

from FastOMA.zoo.hog import transform

import io
import lxml.etree
import sys

orthoxml_file = sys.argv[1]

# Map the integer gene ids used inside <geneRef> elements back to protein
# names by scanning the <gene id="..." protId="..."/> header lines.
# Streaming the file line by line avoids holding it all in memory.
dic_gene_integer = {}
num_lines = 0
with open(orthoxml_file, "r") as f:
    for line in f:
        num_lines += 1
        if "gene id" in line:
            gene_int = line.split("\"")[1]
            gene_name = line.split("\"")[3]
            dic_gene_integer[gene_int] = gene_name
print(num_lines)

orthoxml_etree = lxml.etree.parse(orthoxml_file)

# iter_pairwise_relations(obj, rel_type=None) defaults to 'ortholog'
# ('paralog' is also possible).
pw_orthologs_integer = sorted(list(transform.iter_pairwise_relations(orthoxml_etree)))
print(len(pw_orthologs_integer))
print(pw_orthologs_integer[:2])

# translate integer-id pairs to gene-name pairs
pw_orthologs_gene = []
for pair in pw_orthologs_integer:
    pw_orthologs_gene.append((dic_gene_integer[pair[0]], dic_gene_integer[pair[1]]))

print(len(pw_orthologs_gene))
print(pw_orthologs_gene[:2])

# "with" guarantees the output file is closed even if a write fails
with open(orthoxml_file+"_pairs.tsv", "w") as output_file:
    for pair in pw_orthologs_gene:
        output_file.write(pair[0]+"\t"+pair[1]+"\n")
49 |
--------------------------------------------------------------------------------
/utils/orthoxml2perrhog.py:
--------------------------------------------------------------------------------
"""Split one big orthoxml file into a separate orthoxml file per roothog.

How to use:
    python orthoxml2perrhog.py [out_folder]

The optional argument overrides the hard-coded default folder.
"""

import sys

from OrthoXMLSplitter import OrthoXMLSplitter

# Allow the folder to be given on the command line instead of editing the
# script; fall back to the historical hard-coded path for compatibility.
if len(sys.argv) > 1:
    folder = sys.argv[1]
else:
    folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/out_folder/"

hog_file = folder + "/output_hog_.orthoxml"
outdir = folder + "/perrhog_folder"

splitter = OrthoXMLSplitter(hog_file, outdir)
splitter()
18 |
--------------------------------------------------------------------------------
/utils/orthoxml2phylostratigraphy.py:
--------------------------------------------------------------------------------
# Build a phylostratigraphy tree profile with pyham
# (install from https://github.com/DessimozLab/pyham).

import pyham

import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s")


working_folder = "./"

# species tree should be pruned (no extra leaves)
species_tree_file = working_folder + "in_folder/species_tree.nwk"

newick_str = pyham.utils.get_newick_string(species_tree_file, type="nwk")
print(newick_str[:10])

hog_orthoxml = working_folder + "out_folder/output_hog.orthoxml"
analysis = pyham.Ham(newick_str, hog_orthoxml, use_internal_name=True)
print("Ham analysis done")  # for a big orthoxml file it can take ~30mins

# phylostratigraphy: create the tree profile, classify all genomes as extant
# or ancestral, and get % of duplicated, lost, retained and gained genes
profile = analysis.create_tree_profile(outfile=working_folder + "/out_folder/phylostratigraphy.html")
profile_map = profile.compute_tree_profile_full()
27 |
--------------------------------------------------------------------------------
/utils/pickle2orthoxml.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from xml.dom import minidom
4 | import xml.etree.ElementTree as ET
5 | import pickle
6 | from FastOMA._utils_subhog import read_species_tree
7 | from FastOMA.collect_subhogs import convert_speciestree_to_orthoxml_taxonomy
8 | import sys
9 | from FastOMA.transformer import header_transformer
10 |
11 | from FastOMA.collect_subhogs import iter_hogs
12 | from FastOMA.collect_subhogs import update_hogids
13 | from pathlib import Path
14 |
# How to use (the markdown ``` fence that used to be here was a SyntaxError):
#
#   python pickle2orthoxml.py "no_header" "file_D0680685.pickle"
#
#   python pickle2orthoxml.py "selected_genes" pickle_folder gene_id_dic_xml.pickle "species_tree_checked.nwk"
#   (the second form is slow; gene_id_dic_xml.pickle is in the output of infer_roothogs)
20 |
mode = sys.argv[1]  # "no_header" or "selected_genes" (or "all_genes")

if mode == "no_header":
    # Dump the raw group elements of one pickle as orthoxml without any
    # <species>/<gene> header — quick inspection of a single roothog.
    input_pickle = sys.argv[2]  # "file_D0680685.pickle"
    with open(input_pickle, 'rb') as handle:  # "with" closes the file handle
        orthoxml_file = pickle.load(handle)

    print(len(orthoxml_file))
    xml_str = minidom.parseString(ET.tostring(orthoxml_file[0])).toprettyxml(indent=" ")

    with open(input_pickle+"_noheader.orthoxml", "w") as out_file:
        out_file.write(xml_str)

if mode == "selected_genes":
    # Build a complete orthoxml (header + groups) containing only the genes
    # that actually occur in the given pickle.
    input_pickle = sys.argv[2]         # one pickle file out of pickle_folder
    gene_id_pickle_file = sys.argv[3]  # generated in infer_roothogs.
    # available in out_folder/temp_output/gene_id_dic_xml.pickle
    # this keeps the gene name and the gene integer ID used in orthoxml.
    species_tree = sys.argv[4]         # "species_tree_checked.nwk"

    with open(input_pickle, 'rb') as handle:
        orthoxml_file1 = pickle.load(handle)  # todo might have two elements inside?

    # Collect the integer gene ids referenced by <geneRef id="..."> in the
    # groups of the pickle (ids are assumed to be num_digit digits long).
    gene_int_set = set()
    num_digit = 10  # integer ids # assumption ?
    for orthoxml_part in orthoxml_file1:
        xml_str = minidom.parseString(ET.tostring(orthoxml_part)).toprettyxml(indent=" ")
        gene_int_set_i = set(int(i[1:num_digit+1]) for i in xml_str.split("geneRef id=")[1:])
        gene_int_set.update(gene_int_set_i)

    from datetime import datetime
    fastoma_version = "0"
    orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/",
                                                   "origin": "FastOMA " + fastoma_version,
                                                   "originVersion": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                                   "version": "0.5"})

    with open(gene_id_pickle_file, 'rb') as handle:
        gene_id_name = pickle.load(handle)  # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
    # note: %-formatting applied explicitly — print() has no lazy %s args
    print("We read the gene_id_name dictionary with %d items" % len(gene_id_name))

    speciestree = read_species_tree(species_tree)
    taxonomy, name2taxid = convert_speciestree_to_orthoxml_taxonomy(speciestree)
    print("Now creating the header of orthoxml")

    id_transform_ = "noop"  # noop: no transformation; "UniProt": '>sp|P68250|1433B_BOVIN' --> P68250

    id_transformer = header_transformer(id_transform_)

    # #### create the header of orthoxml ####
    # Only species contributing at least one selected gene get a
    # <species>/<database>/<genes> element; genes_xml doubles as the
    # "first selected gene seen" flag to avoid duplicated element code.
    for query_species_name, list_prots in gene_id_name.items():
        genes_xml = None
        for (gene_idx_integer, query_prot_name) in list_prots:
            if gene_idx_integer in gene_int_set:
                if genes_xml is None:
                    species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "taxonId": str(name2taxid[query_species_name]), "NCBITaxId": "0"})
                    database_xml = ET.SubElement(species_xml, "database", attrib={"name": "database", "version": "2023"})
                    genes_xml = ET.SubElement(database_xml, "genes")
                prot_id = id_transformer.transform(query_prot_name)
                gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})

    print("gene_xml is created.")
    # orthoxml_file.append(taxonomy)

    scores = ET.SubElement(orthoxml_file, "scores")
    ET.SubElement(scores, "scoreDef", {"id": "CompletenessScore",
                                       "desc": "Fraction of expected species with genes in the (Sub)HOG"})

    # #### create the groups of orthoxml ####
    groups_xml = ET.SubElement(orthoxml_file, "groups")

    with open(input_pickle, 'rb') as handle:
        hogs_a_rhog_xml = pickle.load(handle)
    for idx, hog_a_rhog_xml in enumerate(hogs_a_rhog_xml):
        fam = idx  # this could be improved
        groups_xml.append(update_hogids(fam, hog_a_rhog_xml, name2taxid))
    #for fam, hogs_a_rhog_xml in enumerate(iter_hogs(Path(pickle_folder)), start=1):
    #    groups_xml.append(update_hogids(fam, hogs_a_rhog_xml, name2taxid))
    print("converting the xml object to string.")

    output_xml_name = input_pickle + ".orthoxml"
    with open(output_xml_name, 'wb') as fh:
        ET.indent(orthoxml_file, space=' ', level=0)
        orthoxml = ET.ElementTree(orthoxml_file)
        orthoxml.write(fh, encoding="utf-8", xml_declaration=True)
    print("orthoxml is written in %s" % output_xml_name)
116 |
117 |
--------------------------------------------------------------------------------
/utils/write_orthoxml_per_rHOG.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 |
3 | from os import listdir
4 | from xml.dom import minidom
5 |
6 | import pickle
7 |
8 |
9 |
folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/"

# create this folder /out_folder/orthoxml_out/ beforehand

gene_id_pickle_file = folder + "/out_folder/gene_id_dic_xml.pickle"

with open(gene_id_pickle_file, 'rb') as handle:  # "with" closes the handle
    gene_id_name = pickle.load(handle)
    # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
print("gene_id_name read ", len(gene_id_name))

pickle_folder = folder + "/out_folder/pickle_rhogs_/"
pickle_files_adress = listdir(pickle_folder)

orthoxml_out_folder = folder + "/out_folder/orthoxml_out/"
check = listdir(orthoxml_out_folder)  # raises early if the output folder is missing


print("gene_xml created ")
for idx, pickle_file_adress in enumerate(pickle_files_adress):

    if idx % 100 == 0: print(idx)
    # each pickle holds a list of hog xml elements (one batch of roothogs)
    with open(pickle_folder + pickle_file_adress, 'rb') as handle:
        hogs_a_rhog_xml_batch = pickle.load(handle)

    # collect every integer gene id referenced in this batch
    xml_str = ""
    for i in hogs_a_rhog_xml_batch:
        xml_str += minidom.parseString(ET.tostring(i)).toprettyxml(indent=" ")
    list_geneid = []
    for x in xml_str.split("\n"):
        if "geneRef id" in x:
            list_geneid.append(int(x.split("\"")[1]))
    print(len(list_geneid))
    # set membership is O(1); repeated "in list" lookups were O(n) each
    geneid_set = set(list_geneid)

    # species that contribute at least one gene to this batch
    query_species_name_list = []
    for query_species_name, list_prots in gene_id_name.items():
        for (gene_idx_integer, query_prot_name) in list_prots:
            if gene_idx_integer in geneid_set:
                query_species_name_list.append(query_species_name)
    query_species_name_set = set(query_species_name_list)

    # BUGFIX: name the output after THIS iteration's pickle;
    # pickle_files_adress[0] wrote every batch to the same file.
    output_xml_name = orthoxml_out_folder + pickle_file_adress + "_.orthoxml"
    orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA",
                                                   "originVersion": "Nov 2021", "version": "0.3"})

    # header: one <species>/<database>/<genes> element per contributing species
    for query_species_name, list_prots in gene_id_name.items():
        if query_species_name in query_species_name_set:
            species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "NCBITaxId": "1"})
            database_xml = ET.SubElement(species_xml, "database", attrib={"name": " database ", "version": "2020"})
            genes_xml = ET.SubElement(database_xml, "genes")

            for (gene_idx_integer, query_prot_name) in list_prots:
                if gene_idx_integer in geneid_set:
                    gene_xml = ET.SubElement(genes_xml, "gene",
                                             attrib={"id": str(gene_idx_integer), "protId": query_prot_name})

    groups_xml = ET.SubElement(orthoxml_file, "groups")

    for hogs_a_rhog_xml in hogs_a_rhog_xml_batch:
        groups_xml.append(hogs_a_rhog_xml)

    xml_str = minidom.parseString(ET.tostring(orthoxml_file)).toprettyxml(indent=" ")

    with open(output_xml_name, "w") as file_xml:  # "with" closes the file
        file_xml.write(xml_str)

    print("orthoxml is written in " + output_xml_name)
89 |
--------------------------------------------------------------------------------