├── .dockerignore ├── .github ├── dependabot.yml └── workflows │ ├── docker-image.yml │ └── publish-pypi-release.yml ├── .gitignore ├── .idea └── remote-mappings.xml ├── Dockerfile ├── FastOMA.nf ├── FastOMA ├── __init__.py ├── _hog_class.py ├── _infer_subhog.py ├── _utils_frag_SO_detection.py ├── _utils_roothog.py ├── _utils_subhog.py ├── _wrappers.py ├── batch_roothogs.py ├── check_input.py ├── collect_subhogs.py ├── fastoma_notebook_stat.ipynb ├── helper_scripts.py ├── infer_roothogs.py ├── infer_subhogs.py ├── transformer.py └── zoo │ ├── README.md │ ├── __init__.py │ ├── familyanalyzer │ ├── __init__.py │ ├── genetree.py │ ├── newick.py │ ├── orthoxmlquery.py │ ├── taxonomy.py │ └── tools.py │ ├── file_utils │ ├── __init__.py │ ├── context_managers.py │ └── extractors.py │ ├── hog │ ├── __init__.py │ ├── convert.py │ ├── extract_groups.py │ ├── extract_hog_info.py │ ├── filter_orthoxml.py │ ├── orthoxml_merge.py │ └── transform.py │ ├── seq_utils │ ├── __init__.py │ └── utils.py │ ├── unionfind.py │ ├── utils.py │ └── wrappers │ ├── __init__.py │ ├── abstract_cli.py │ ├── aligners │ ├── __init__.py │ ├── base_aligner.py │ ├── mafft.py │ ├── muscle.py │ ├── probcons.py │ └── prographmsa.py │ ├── modeltesters │ ├── __init__.py │ ├── base_modeltester.py │ ├── parsers.py │ └── prottest.py │ ├── options.py │ ├── treebuilders │ ├── __init__.py │ ├── base_treebuilder.py │ ├── fasttree.py │ ├── guenomu.py │ ├── iqtree.py │ ├── parsers.py │ ├── phyml.py │ └── raxml.py │ └── trimmers │ ├── __init__.py │ ├── base_trimmer.py │ └── trimal.py ├── README.md ├── archive ├── analysis │ ├── edit_orthxml_file.py │ ├── find_unfinished_rhog.py │ ├── preprocess_qfo_files.py │ ├── write_gene_id_pickle_old_code.py │ └── xml_.py ├── fastOMA_logo.png └── test_curn.py ├── conf └── base.config ├── environment-conda.yml ├── license ├── nextflow.config ├── nextflow_slurm.config ├── pyproject.toml ├── testdata ├── README.md ├── expected_output │ ├── .DS_Store │ ├── FastOMA_HOGs.orthoxml │ 
├── OrthologousGroups.tsv │ ├── OrthologousGroupsFasta │ │ ├── OG_0000001.fa │ │ ├── OG_0000001.fa.gz │ │ ├── OG_0000002.fa.gz │ │ ├── OG_0000003.fa.gz │ │ ├── OG_0000004.fa.gz │ │ ├── OG_0000005.fa.gz │ │ ├── OG_0000006.fa.gz │ │ ├── OG_0000007.fa.gz │ │ ├── OG_0000008.fa.gz │ │ ├── OG_0000009.fa.gz │ │ ├── OG_0000010.fa.gz │ │ ├── OG_0000011.fa.gz │ │ └── OG_0000012.fa.gz │ ├── RootHOGs.tsv │ ├── RootHOGsFasta │ │ ├── HOG0000001.fa │ │ ├── HOG0000001.fa.gz │ │ ├── HOG0000002.fa.gz │ │ ├── HOG0000003.fa.gz │ │ ├── HOG0000004.fa.gz │ │ ├── HOG0000005.fa.gz │ │ ├── HOG0000006.fa.gz │ │ ├── HOG0000007.fa.gz │ │ ├── HOG0000008.fa.gz │ │ ├── HOG0000009.fa.gz │ │ ├── HOG0000010.fa.gz │ │ ├── HOG0000011.fa.gz │ │ └── HOG0000012.fa.gz │ ├── hogmap │ │ ├── AQUAE.fa.hogmap │ │ ├── CHLTR.fa.hogmap │ │ └── MYCGE.fa.hogmap │ ├── orthologs.tsv │ ├── phylostratigraphy.html │ ├── report.html │ ├── report.ipynb │ ├── species_tree_checked.nwk │ └── stats │ │ └── report_2024-10-18_02-43-20.html └── in_folder │ ├── proteome │ ├── AQUAE.fa │ ├── CHLTR.fa │ └── MYCGE.fa │ └── species_tree.nwk ├── tests ├── data │ ├── HOG_0890520.fa │ ├── correct-msa.fa │ └── failing-msa.fa ├── test_fasttree_wrapper.py ├── test_infer_subhog.py └── test_roothog_example.py └── utils ├── OrthoXMLSplitter.py ├── filter_orthoxml_completeness.py ├── find_unfinished_rhogs.py ├── orthoxml2OG.py ├── orthoxml2family.py ├── orthoxml2newick.py ├── orthoxml2pairs.py ├── orthoxml2perrhog.py ├── orthoxml2phylostratigraphy.py ├── pickle2orthoxml.py └── write_orthoxml_per_rHOG.py /.dockerignore: -------------------------------------------------------------------------------- 1 | work 2 | .nextflow* 3 | .idea 4 | .git 5 | output 6 | testdata 7 | dist 8 | archive/ 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub 
Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "daily" 8 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | release: 7 | type: [published] 8 | 9 | env: 10 | TEST_TAG: dessimozlab/fastoma:test 11 | 12 | jobs: 13 | 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v4 21 | with: 22 | submodules: recursive 23 | 24 | - name: Docker meta 25 | id: meta 26 | uses: docker/metadata-action@v5 27 | with: 28 | # list of Docker images to use as base name for tags 29 | images: | 30 | dessimozlab/fastoma 31 | # generate Docker tags based on the following events/attributes 32 | tags: | 33 | type=schedule 34 | type=ref,event=branch 35 | type=ref,event=pr 36 | type=semver,pattern={{version}} 37 | type=semver,pattern={{major}}.{{minor}} 38 | type=semver,pattern={{major}} 39 | type=sha 40 | 41 | - name: Set up QEMU 42 | uses: docker/setup-qemu-action@v3 43 | 44 | - name: Set up Docker Buildx 45 | uses: docker/setup-buildx-action@v3 46 | 47 | - name: Build and export to docker for testing 48 | uses: docker/build-push-action@v6 49 | with: 50 | context: . 51 | load: true 52 | tags: ${{ env.TEST_TAG }} 53 | 54 | #- name: Test 55 | # run: | 56 | # docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out 57 | # if [ ! 
-f output/tree_sample_1.nwk ] ; then exit 1; fi 58 | 59 | - name: Login to DockerHub 60 | uses: docker/login-action@v3 61 | with: 62 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 63 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 64 | 65 | - name: Build and push 66 | uses: docker/build-push-action@v6 67 | with: 68 | context: . 69 | platforms: linux/amd64,linux/arm64 70 | push: true 71 | #${{ github.event_name != 'push' && github.event_name != 'pull_request' }} 72 | tags: ${{ steps.meta.outputs.tags }} 73 | labels: ${{ steps.meta.outputs.labels }} 74 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi-release.yml: -------------------------------------------------------------------------------- 1 | 2 | name: Upload FastOMA to pypi 3 | 4 | on: 5 | push: 6 | tags: 7 | - v* 8 | 9 | jobs: 10 | deploy: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.x' 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install hatch 24 | - name: Build package 25 | run: hatch build 26 | - name: Publish package 27 | uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc 28 | with: 29 | user: __token__ 30 | password: ${{ secrets.PYPI_API_TOKEN }} 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .nextflow* 2 | work/ 3 | .idea/ 4 | dist/ 5 | archive 6 | .git 7 | .gitignore 8 | __pycache__ 9 | *.orig 10 | -------------------------------------------------------------------------------- /.idea/remote-mappings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim as basis 2 | 3 | # set environment variables 4 | ENV PYTHONDONTWRITEBYTECODE 1 5 | ENV PYTHONUNBUFFERED 1 6 | 7 | 8 | FROM basis as builder 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends \ 11 | build-essential \ 12 | fasttree \ 13 | libxml2 \ 14 | mafft \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | WORKDIR /src 18 | RUN pip install --upgrade hatch pip 19 | COPY pyproject.toml . 20 | RUN python -m venv /app \ 21 | && hatch dep show requirements --all > requirements.txt \ 22 | && /app/bin/pip install wheel setuptools \ 23 | && /app/bin/pip install -r requirements.txt 24 | 25 | COPY . . 26 | RUN ls -la \ 27 | && hatch build \ 28 | && ls -la dist/ \ 29 | && /app/bin/pip install dist/*.whl 30 | 31 | 32 | FROM basis as runtime 33 | RUN apt-get update \ 34 | && apt-get install -y --no-install-recommends \ 35 | fasttree \ 36 | libxml2 \ 37 | mafft \ 38 | mmseqs2 \ 39 | procps \ 40 | && apt-get -y autoremove \ 41 | && apt-get -y autoclean \ 42 | && rm -rf /var/lib/apt/lists/* 43 | 44 | COPY --from=builder /app /app 45 | ENV PATH="/app/bin:$PATH" 46 | -------------------------------------------------------------------------------- /FastOMA/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __packagename__ = "FastOMA" 3 | __version__ = "0.3.5" 4 | -------------------------------------------------------------------------------- /FastOMA/batch_roothogs.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | from pathlib import Path 4 | from ._wrappers import logger 5 | from . 
class BatchBuilder:
    """Group rootHOG fasta files into numbered batch sub-folders of ``outdir``.

    Files are collected with :meth:`add_hog`. As soon as the summed file size
    of the pending batch exceeds ``max_size`` (bytes), the pending files are
    copied into ``outdir/<counter>`` and a new batch is started. When used as
    a context manager, a trailing partial batch is written on exit.

    :param outdir: folder that receives the numbered batch sub-folders.
    :param max_size: size threshold in bytes; a batch is flushed once its
        accumulated size is strictly greater than this value.
    """

    def __init__(self, outdir: Path, max_size: float):
        self.outdir = outdir
        self.max_size = max_size
        # The batching state is created here (and reset in __enter__) so that
        # add_hog() does not fail with AttributeError on a builder that was
        # never entered as a context manager.
        self.cur_batch = []
        self.cur_size = 0
        self.counter = 0

    def __enter__(self):
        self.cur_batch = []
        self.cur_size = 0
        self.counter = 0
        self.outdir.mkdir(parents=True, exist_ok=True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Write whatever is left over as a final (possibly small) batch.
        if self.cur_batch:
            self._flush()

    def add_hog(self, hog_file: Path):
        """Add ``hog_file`` to the pending batch and flush if it grew too big."""
        # stat() once: avoids a second syscall and keeps the logged size
        # identical to the size that was accounted for.
        fsize = hog_file.stat().st_size
        self.cur_batch.append(hog_file)
        self.cur_size += fsize
        logger.debug("adding %s with size %d to batch %d", hog_file, fsize, self.counter)
        if self.cur_size > self.max_size:
            self._flush()
            self.counter += 1

    def _flush(self):
        """Copy the pending files into ``outdir/<counter>`` and reset the batch."""
        batch_dir = self.outdir / str(self.counter)
        # parents=True: works even if outdir was not created by __enter__.
        # An already existing batch folder still raises FileExistsError.
        batch_dir.mkdir(parents=True)
        for fn in self.cur_batch:
            shutil.copy(fn, batch_dir)
        logger.debug("creating batch %s with %d families; total size of files is %d",
                     batch_dir, len(self.cur_batch), self.cur_size)
        self.cur_size = 0
        self.cur_batch = []
def extract_pw_rels(args):
    """Dump pairwise relations of an orthoXML file as tab-separated lines.

    ``args`` is the parsed command-line namespace. This function reads
    ``args.orthoxml`` (input file), ``args.type`` (passed on as ``rel_type``)
    and ``args.out`` (output path, opened through ``auto_open`` in text-write
    mode). One line ``<id1>\\t<id2>`` is written per relation.
    """
    from lxml import etree
    from .zoo.hog import transform

    tree = etree.parse(args.orthoxml)
    with auto_open(args.out, 'wt') as sink:
        relations = transform.iter_pairwise_relations(tree, rel_type=args.type, id_attribute="protId")
        for left, right in relations:
            sink.write(f"{left}\t{right}\n")
def _infer_roothogs_cli():
    """Build the argument parser used by ``fastoma_infer_roothogs``."""
    import argparse

    cli = argparse.ArgumentParser(description="checking parameters for FastOMA")
    cli.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    cli.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes")
    cli.add_argument("--splice", help="Path to the folder containing the splice information files")
    cli.add_argument("--hogmap", help="Path to the folder containing the hogmap files")
    cli.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written")
    cli.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug")
    cli.add_argument('--min-sequence-length', required=False, default=50, type=int,
                     help="minimum sequence length. Shorter sequences will be ignored. (Default=50)")

    # thresholds steering the merging of rootHOGs
    cli.add_argument("--mergHOG-ratioMax-thresh", required=False, type=float, default=0.8,
                     help="For merging rootHOGs, threshold of ratioMax ")
    cli.add_argument("--mergHOG-ratioMin-thresh", required=False, type=float, default=0.9,
                     help="For merging rootHOGs, threshold of ratioMin ")
    cli.add_argument("--mergHOG-shared-thresh", required=False, type=float, default=10,
                     help="For merging rootHOGs, threshold of number shared proteins ")
    cli.add_argument("--mergHOG-fscore-thresh", required=False, type=float, default=70,
                     help="For merging rootHOGs, threshold of famlut score shared proteins ")
    # heuristics for very large rootHOGs
    cli.add_argument("--big-rhog-size", required=False, type=int, default=50*1000,
                     help= "For big rootHOGs, we have different heuristics")
    cli.add_argument("--big-fscore-thresh", required=False, type=int, default=95,
                     help="For huge rootHOGs, we have different heuristics, like filtering low family score protiens")
    return cli


def fastoma_infer_roothogs():
    """Command-line entry point that builds rootHOG fasta files from proteomes.

    Steps, all delegated to ``_utils_roothog``: parse the proteomes and the
    hogmap files, optionally handle splice/isoform information when
    ``--splice`` points to an existing path, group proteins into rootHOGs,
    post-process them (singletons, merging, filtering big rootHOGs) and write
    them to ``--out-rhog-folder``. If an ``mmseqs`` executable is on the PATH,
    unmapped/singleton proteins are additionally clustered with linclust.
    """
    conf = _infer_roothogs_cli().parse_args()
    verbosity = min(conf.v, 2)
    logger.setLevel(level=30 - 10 * verbosity)
    logger.debug("Arguments: %s", conf)

    utils = _utils_roothog
    species_names, prot_recs_lists, fasta_format_keep = utils.parse_proteomes(conf.proteomes, conf.min_sequence_length)
    prot_recs_all = utils.add_species_name_prot_id(prot_recs_lists)

    hogmaps, unmapped = utils.parse_hogmap_omamer(prot_recs_lists, fasta_format_keep, folder=conf.hogmap)

    if conf.splice is not None and os.path.exists(conf.splice):
        isoform_by_gene_all = utils.parse_isoform_file(species_names, folder=conf.splice)
        isoform_selected, isoform_not_selected = utils.find_nonbest_isoform(
            species_names, isoform_by_gene_all, hogmaps
        )
        # for each isoform file, there will be a file ending with _selected_isoforms.tsv
        utils.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists)
        hogmaps = utils.handle_splice(hogmaps, isoform_not_selected)

    rhogs_prots = utils.group_prots_roothogs(hogmaps)
    rhogs_prots = utils.handle_singleton(rhogs_prots, hogmaps, conf)
    rhogs_prots = utils.merge_rhogs2(hogmaps, rhogs_prots, conf)
    rhogs_prots = utils.filter_big_roothogs(hogmaps, rhogs_prots, conf)

    min_rhog_size = 2
    rhogid_written_list = utils.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size)

    # which() yields None when mmseqs is not installed; nothing more to do then
    if not which("mmseqs"):
        return
    num_unmapped_singleton = utils.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa")
    if not num_unmapped_singleton:
        return
    result_linclust = utils.run_linclust(fasta_to_cluster="singleton_unmapped.fa")
    logger.debug(" linclust is done %s", result_linclust)
    num_clusters = utils.write_clusters(conf.out_rhog_folder, min_rhog_size)
    logger.debug("we wrote %d new clusters with linclust ", num_clusters)
def fastoma_infer_subhogs():
    """Command-line entry point: infer sub-HOGs for all rootHOG fasta files of a folder.

    Parses the command line, configures the logger verbosity, makes sure the
    ``--output-pickles`` folder exists, lists the rootHOG fasta files of
    ``--input-rhog-folder`` via ``_utils_subhog.list_rhog_fastas`` and hands
    them, together with the parsed configuration, to
    ``_infer_subhog.read_infer_xml_rhogs_batch``.
    """
    import argparse
    parser = argparse.ArgumentParser(description="checking parameters for FastOMA",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--version", action="version", version="FastOMA v"+fastoma_version)
    parser.add_argument("--input-rhog-folder", required=True, help="Path to the input rootHOG folder.")
    parser.add_argument("--parallel", action='store_true', help="use concurrent parallel per rootHOG")
    parser.add_argument("--species-tree", required=True,
                        help="Path to the input species tree file in newick format")
    parser.add_argument("--output-pickles", required=False, default="pickle_hogs",
                        help="Path to the output folder")

    parser.add_argument("--threshold-dubious-sd", required=False, type=float, default=1/10,
                        help="Threshold to remove proteins in a gene tree due to low species overlap score, not enough evidence for duplication event.")
    # The default value is appended to the help text by ArgumentDefaultsHelpFormatter,
    # so the help string must not end in a dangling "Defaults to ".
    parser.add_argument("--number-of-samples-per-hog", type=int, default=5,
                        help="Number of representatives (sequences) per HOG.")
    parser.add_argument("--overlap-fragments", required=False, type=float, default=0.15,
                        help="Threshold overlap between two sequences (rows) in MSA to decide whether they are fragments of a gene.")
    parser.add_argument("--gene-rooting-method", required=False, default="midpoint",
                        help="The method used for rooting of gene tree : midpoint mad Nevers_rooting .")
    parser.add_argument("--gene-trees-write", action='store_true',
                        help="writing the all gene trees .")
    parser.add_argument("--msa-write", action='store_true',
                        help="writing the raw MSAs (might have more genes that the final gene tree).")
    parser.add_argument("--msa-filter-method",
                        choices=("col-row-threshold", "col-elbow-row-threshold", "trimal"),
                        default="col-row-threshold",
                        help="The method used for filtering MSAs.")
    parser.add_argument("--gap-ratio-row", required=False, type=float, default=0.3,
                        help="For trimming the MSA, the threshold of ratio of gaps for each row.")
    parser.add_argument("--gap-ratio-col", required=False, type=float, default=0.5,
                        help="For trimming the MSA, the threshold of ratio of gaps for each column.")
    parser.add_argument("--min-col-trim", required=False, type=int, default=50,  # todo min rows trim
                        help="min no. columns in msa to consider for filtering")
    parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug")
    conf_infer_subhhogs = parser.parse_args()
    logger.setLevel(level=30 - 10 * min(conf_infer_subhhogs.v, 2))
    logger.debug("Arguments: %s", conf_infer_subhhogs)

    address_rhogs_folder = conf_infer_subhhogs.input_rhog_folder
    inferhog_concurrent_on = conf_infer_subhhogs.parallel
    if inferhog_concurrent_on:
        print("parallelization for subhog inference is on.")

    # exist_ok avoids the check-then-create race when several batch jobs
    # start at the same time and share the output folder.
    os.makedirs(conf_infer_subhhogs.output_pickles, exist_ok=True)

    pickles_subhog_folder_all = "./"  # pickle per taxonomic level

    # The folder is scanned once; both progress messages report on this list.
    list_rhog_fastas_files = _utils_subhog.list_rhog_fastas(address_rhogs_folder)
    print("there are ", len(list_rhog_fastas_files), "rhogs in the input folder")
    print("there are ", len(list_rhog_fastas_files), "rhogs remained in the input folder", list_rhog_fastas_files[:5] )

    rhogs_fa_folder = address_rhogs_folder
    _infer_subhog.read_infer_xml_rhogs_batch(list_rhog_fastas_files, inferhog_concurrent_on,
                                             conf_infer_subhhogs.output_pickles, pickles_subhog_folder_all,
                                             rhogs_fa_folder, conf_infer_subhhogs)

    print("finsihed ", address_rhogs_folder)
class ExtractUniProtAccessionFastaHeaderTransformer(FastaHeaderTransformer):
    """Reduce UniProt-style fasta headers to their accession.

    A header of the form ``<db>|<accession>|<rest>`` (e.g.
    ``sp|P12345|NAME_HUMAN ...``) is transformed into ``<accession>``.
    Headers that do not match are returned unchanged and a warning is logged.
    """

    def __init__(self):
        # The accession must be captured in a group named "acc", because
        # transform() reads m.group('acc'). Without the "<acc>" name the
        # pattern "(?P[...]" is not valid regex syntax and re.compile raises.
        self._up_re = re.compile(
            r"[sptr]{2}\|(?P<acc>[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})\|.*")

    def transform(self, header):
        """Return the accession found in ``header``, or ``header`` itself if none is found."""
        m = self._up_re.match(header)
        if m:
            return m.group('acc')
        logger.warning("cannot extract uniprot accession from header: %s", header)
        return header
class ElementError(Exception):
    """Error carrying a message about an unexpected orthoXML element situation.

    The message is kept in the ``msg`` attribute and is what ``str()`` of the
    exception returns.
    """

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        text = str(self.msg)
        return text
return gene 58 | 59 | @classmethod 60 | def getGroupsAtLevel(cls, level, root): 61 | """returns a list with the orthologGroup elements which have a 62 | TaxRange property equals to the requested level.""" 63 | xquery = (".//{{{0}}}property[@name='TaxRange'][@value='{1}']/..". 64 | format(cls.ns['ns0'], level)) 65 | return root.findall(xquery) 66 | 67 | @classmethod 68 | def getSubNodes(cls, targetNode, root, recursively=True): 69 | """method which returns a list of all (if recursively 70 | is set to true) or only the direct children nodes 71 | having 'targetNode' as their tagname. 72 | The namespace is automatically added to the tagname.""" 73 | xPrefix = ".//" if recursively else "./" 74 | xquery = "{}{{{}}}{}".format(xPrefix, cls.ns['ns0'], targetNode) 75 | return root.findall(xquery) 76 | 77 | @classmethod 78 | def is_geneRef_node(cls, element): 79 | """check whether a given element is an instance of a geneRef 80 | element.""" 81 | return element.tag == '{{{ns0}}}geneRef'.format(**cls.ns) 82 | 83 | @classmethod 84 | def getLevels(cls, element): 85 | """returns a list of the TaxRange levels associated to the 86 | passed orthologGroup element. If the element does not have 87 | any TaxRange property tags associated, an empty list is 88 | returned.""" 89 | propTags = cls.getSubNodes("property", element, recursively=False) 90 | res = [t.get('value') for t in propTags if t.get('name') == 'TaxRange'] 91 | return res 92 | 93 | @classmethod 94 | def getInputGenes(cls, root, species=None): 95 | """returns a list of all gene elements in the orthoxml inside 96 | tags, i.e. the list of genes prior to running 97 | OMA-HOGS. 
Optionally filtered by species.""" 98 | filter_ = ('[@name="{}"]'.format(species) 99 | if species is not None else '') 100 | if filter_ > '': 101 | xquery = ('/ns:orthoXML/ns:species{}/ns:database/' 102 | 'ns:genes//ns:gene'.format(filter_)) 103 | else: 104 | xquery = '//ns:gene' 105 | return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']}) 106 | 107 | @classmethod 108 | def getGroupedGenes(cls, root, species=None): 109 | """ returns a list of all geneRef elements inside tags, i.e. 110 | the list of genes clustered into families after running OMA-HOGS. 111 | Optionally filtered by species.""" 112 | filter_ = ('[@name="TaxRange"and@value="{}"]'.format(species) 113 | if species is not None else '') 114 | if filter_ > '': 115 | xquery = ('/ns:orthoXML/ns:groups/ns:orthologGroup//ns:property{}/' 116 | 'following-sibling::ns:geneRef'.format(filter_)) 117 | else: 118 | xquery = '//ns:geneRef' 119 | return root.xpath(xquery, namespaces={'ns': cls.ns['ns0']}) 120 | 121 | @classmethod 122 | def getScoreNodes(cls, root, score_id=None): 123 | """returns the associated score nodes for a certain (orthologGroup) node. 
class IterableClassException(Exception):
    """Raised by ``py2_iterable`` when the decorated class has no ``__iter__``."""
    pass


def py2_iterable(Class):
    """
    Use as a class decorator to make a class that has a python 3 next method --
    __next__() -- also iterable with python 2, which uses next(). Also checks
    for an __iter__ method -- if this is missing the class won't be iterable anyway.


    e.g.
    @py2_iterable
    class Py2and3Iterator(object):
        def __init__(self):
            self.data = list('somestuff')
            self._pos = 0

        def __iter__(self):
            return self

        def __next__(self):
            if self._pos == len(self.data):
                self._pos = 0
                raise StopIteration
            char = self.data[self._pos]
            self._pos += 1
            return char


    :param Class: the class being decorated
    :return: Class: the decorated class, which is iterable in py2 and py3
    :raises IterableClassException: if ``Class`` defines no ``__iter__`` method
    """
    if not hasattr(Class, '__iter__'):
        # Class is itself a class object, so Class.__class__ is its metaclass
        # (normally ``type``); the name to report is Class.__name__.
        raise IterableClassException('Class "{}" has no __iter__ method and will not be iterable'
                                     .format(Class.__name__))

    if hasattr(Class, '__next__'):
        next_method = getattr(Class, '__next__')
        setattr(Class, 'next', next_method)

    return Class


@py2_iterable
class Queue(object):
    """FIFO queue on top of ``collections.deque``.

    Iterating over the queue consumes it: every ``__next__`` call dequeues
    the oldest item, and iteration stops once the queue is empty.
    """

    def __init__(self):
        self.__queue = deque()

    def __iter__(self):
        return self

    def __len__(self):
        return len(self.__queue)

    def __next__(self):
        if self.isempty():
            raise StopIteration
        return self.dequeue()

    def enqueue(self, item):
        """Append ``item`` at the end of the queue."""
        self.__queue.append(item)

    def dequeue(self):
        """Remove and return the oldest item; raises ``Exception`` if the queue is empty."""
        if self.isempty():
            raise Exception('empty queue')
        return self.__queue.popleft()

    def isempty(self):
        """Return True if the queue holds no items."""
        return len(self.__queue) == 0
class TempFile(object):
    """
    Context manager for working with a temporary file
    that automatically cleans up.

    Entering the context creates the file with ``tempfile.mkstemp`` and
    yields its absolute *path* (a string, not a file object).

    Usage:

        with TempFile() as tmp:
            # In scope, the file at path `tmp` exists on the disk
            # Do some work with it, e.g. open(tmp, 'w').write('something')

        # Out of scope, the file is deleted

        with TempFile('local_temp_space') as tmp:
            # the file is created in the directory 'local_temp_space'
            # The specified directory must exist, or an error is thrown

    """

    def __init__(self, dir_=None):
        """
        :param dir_: existing directory to create the file in, or None
            to use the default temporary directory.
        :raises IOError: if ``dir_`` is given but does not exist.
        """
        if dir_ is not None and not os.path.exists(dir_):
            raise IOError('Directory "{}" does not exist'.format(dir_))
        self.dir = dir_

    def __enter__(self):
        # mkstemp returns an open OS-level descriptor plus the path; the
        # descriptor is kept so it can be closed on exit.
        self._fd, self._wrapped_tmp = tempfile.mkstemp(dir=self.dir)
        return os.path.abspath(self._wrapped_tmp)

    def __exit__(self, type, value, tb):
        try:
            os.close(self._fd)
        finally:
            # Remove the file even if closing the descriptor failed, so a
            # close error cannot leak the temporary file.
            os.remove(self._wrapped_tmp)
class ChDir(object):
    """
    Context manager to switch to a working directory,
    and return to the current directory (like 'Dir.chdir do' block in Ruby)

    The directory to return to is the process' working directory at the
    time the ``ChDir`` object is constructed.

    Usage:

        with TempDir() as dir, ChDir(dir):
            # Do some work in the working temp directory 'dir'

        # Exit 'dir'
    """

    def __init__(self, working_dir):
        """
        :param working_dir: existing directory to switch into on enter.
        :raises IOError: if ``working_dir`` does not exist.
        """
        if not os.path.exists(working_dir):
            raise IOError('Directory "{}" does not exist'.format(working_dir))
        self._cdir = os.getcwd()   # directory restored on exit
        self._wdir = working_dir   # directory entered on enter

    def __enter__(self):
        os.chdir(self._wdir)

    def __exit__(self, type, value, tb):
        os.chdir(self._cdir)
def tail(fh, lines=20, block_size=1024):
    """Returns the last n lines from a file

    This function returns the last n lines from a file-like
    object. For binary streams it does this without reading the whole
    file, by loading blocks from the end of the file until enough
    newlines have been seen.

    .. note::

        If the file is opened in text mode, i.e. open('/path', 'rt'),
        or is any other text stream (e.g. ``io.StringIO``), the function
        falls back to a slow method that goes through the whole file.

    Example:

    >>> with open("/etc/passwd", 'rb') as f:
    ...     last_lines = tail(f, 2)
    ...
    >>> print(last_lines)

    :param fh: seekable file-like object to read from
    :param int lines: number of lines to be returned
    :param int block_size: size of block to be read at once.
        intended for optimisation.
    :returns: The last lines as a list of bytes/str object
    :raises ValueError: if ``lines`` is not positive"""
    import io  # local import so the block does not depend on module imports

    if lines <= 0:
        raise ValueError('invalid lines value %r' % lines)

    # Text streams cannot be read block-wise from the end. StringIO-like
    # objects report ``encoding is None``, so also test the base class.
    if isinstance(fh, io.TextIOBase) or getattr(fh, 'encoding', False):
        return fall_back_tail(fh, lines)

    fh.seek(0, os.SEEK_END)
    fsize = fh.tell()
    data = b''
    block = -1
    while True:
        step = block * block_size
        if abs(step) >= fsize:
            # The next block would start before the beginning of the file:
            # read only the part that has not been loaded yet and stop.
            fh.seek(0)
            data = fh.read(block_size - (abs(step) - fsize)) + data
            break
        fh.seek(step, os.SEEK_END)
        data = fh.read(block_size) + data
        # Strictly more newlines than requested lines guarantees that the
        # first of the returned lines is complete.
        if data.count(b'\n') > lines:
            break
        block -= 1
    return data.splitlines()[-lines:]


def fall_back_tail(fh, lines):
    """Return the last ``lines`` lines of a text stream by scanning all of it.

    The stream is rewound to the start and iterated completely; only the
    final ``lines`` lines are kept, each with trailing newlines stripped.
    """
    fh.seek(0)
    data = collections.deque(fh, maxlen=lines)
    return [e.rstrip('\n') for e in data]
class TaxonNHXMixin:
    """Mixin that renders a node's taxonomic attributes as NHX tag strings.

    The host class must provide ``level`` and ``taxid`` attributes.
    """

    def get_tax_nhx(self):
        """Return the ``:S=<level>`` and ``:T=<taxid>`` tags, in that order.

        A tag is left out when the corresponding attribute is falsy.
        """
        labelled = (("S", self.level), ("T", self.taxid))
        return [":{}={}".format(key, val) for key, val in labelled if val]
class OrthoxmlToNewick:
    """Event-driven parser target that turns orthoXML groups into newick strings.

    Provides the ``start`` / ``end`` / ``close`` callbacks; ``close`` returns
    the collected trees, keyed by the ``id`` of each top-level orthologGroup.
    """

    def __init__(self, xref_tag="protId", encode_levels_as_nhx=True, return_gene_to_species=False):
        """
        :param xref_tag: attribute of ``gene`` elements used as leaf label
        :param encode_levels_as_nhx: choose the NHX* node classes instead of
            the plain ones when building the tree
        :param return_gene_to_species: make :meth:`close` additionally return
            a ``{xref: species}`` mapping
        """
        self.xref_tag = xref_tag
        self.gene2xref = {}        # gene id -> (xref, species name)
        self.trees = {}            # top-level group id -> newick string
        self.depth = 0             # nesting depth of open orthologGroup elements
        self.famid = None          # id of the top-level group currently open
        self.cur_event = None      # innermost open speciation/duplication node
        self.cur_species = None    # name of the most recently opened species element
        self._use_nhx = encode_levels_as_nhx
        self._return_gene_to_species= return_gene_to_species

    def start(self, tag, attrib):
        """Handle an opening tag; builds the event tree while groups are open."""
        if tag == "{http://orthoXML.org/2011/}species":
            self.cur_species = attrib['name']
        if tag == "{http://orthoXML.org/2011/}gene":
            # remember label and species for later geneRef lookups
            self.gene2xref[attrib['id']] = (attrib[self.xref_tag], self.cur_species)
        elif tag == "{http://orthoXML.org/2011/}geneRef":
            leaf_cls = NHXLeaf if self._use_nhx else Leaf
            self.cur_event.add_child(leaf_cls(*self.gene2xref[attrib['id']]))
        elif tag == "{http://orthoXML.org/2011/}orthologGroup":
            if self.depth == 0:
                self.famid = attrib['id']
            speciation_cls = NHXSpeciation if self._use_nhx else Speciation
            # the new node registers itself as a child of the current event
            self.cur_event = speciation_cls(self.cur_event)
            self.depth += 1
        elif tag == "{http://orthoXML.org/2011/}paralogGroup":
            dupl_cls = NHXDuplication if self._use_nhx else Duplication
            self.cur_event = dupl_cls(self.cur_event)
        elif tag == "{http://orthoXML.org/2011/}property":
            if attrib['name'] == "TaxRange":
                self.cur_event.set_level(attrib['value'])
            elif attrib['name'].lower() in ("taxid", "taxonid", "taxon_id", "ncbi_taxon_id"):
                self.cur_event.set_taxid(attrib['value'])

    def end(self, tag):
        """Handle a closing tag; pops the current event and stores finished trees."""
        if tag == "{http://orthoXML.org/2011/}paralogGroup":
            self.cur_event = self.cur_event.parent
        elif tag == "{http://orthoXML.org/2011/}orthologGroup":
            self.depth -= 1
            if self.depth == 0:
                # a top-level group has been closed: serialise its tree
                assert(self.cur_event.parent is None)
                self.trees[self.famid] = self.cur_event.as_nhx() + ";"
            self.cur_event = self.cur_event.parent

    def close(self):
        """Return the trees dict, or ``(trees, {xref: species})`` if requested."""
        if self._return_gene_to_species:
            gene2species = {k[0]: k[1] for k in self.gene2xref.values()}
            return self.trees, gene2species
        return self.trees
class SpeciesAnalyser:
    """Tracks the genes of every genome and which of them are still ungrouped."""

    def __init__(self, gene_attr="protId"):
        """
        :param gene_attr: attribute of ``gene`` elements stored as the xref
        """
        self.gene_attr = gene_attr
        self.genes = {}
        self.nr_genes_per_species = collections.defaultdict(int)

    def add_genome_genes(self, genome_node):
        """Register all ``gene`` descendants of a ``species`` element.

        The genome is identified by its ``name`` attribute, falling back to
        ``NCBITaxId`` when ``name`` is absent.
        """
        species = genome_node.get('name')
        if species is None:
            species = genome_node.get("NCBITaxId")

        # Collect into a staging dict first; ``self.genes`` is only touched
        # once the whole genome has been walked.
        staged = {}
        for node in genome_node.findall('.//{http://orthoXML.org/2011/}gene'):
            internal_id = node.get('id')
            staged[internal_id] = Gene(node.get(self.gene_attr), species, internal_id)
            self.nr_genes_per_species[species] += 1
        self.genes.update(staged)

    def gene_in_group(self, gene_id):
        """Mark ``gene_id`` as grouped by dropping it (KeyError if unknown)."""
        del self.genes[gene_id]

    def get_singletons(self):
        """Return the mapping of genes not yet reported as grouped."""
        return self.genes

    def summary(self):
        """Return one dict per species with total and ungrouped gene counts."""
        ungrouped = collections.Counter(g.species for g in self.genes.values())
        return [
            {'species': name, 'genes': total, 'not_in_group': ungrouped[name]}
            for name, total in self.nr_genes_per_species.items()
        ]
class OrthoXMLFilterProcesser:
    """Removes orthologGroup elements from an orthoXML document based on scores.

    A group is removed when any of the registered filters votes to remove
    it for any of the group's direct ``score`` children.
    """

    def __init__(self, filters: "Iterable[HOGFilter] | None" = None):
        """
        :param filters: initial filters; ``None`` starts with no filters.
        """
        # list(None) would raise, so the documented default needs a guard.
        self.filters = [] if filters is None else list(filters)

    def add_filter(self, filter: "HOGFilter"):
        """Register an additional filter."""
        self.filters.append(filter)

    def process(self, fh):
        """Parse ``fh`` and prune the groups rejected by the filters.

        The parsed (and pruned) document is kept in ``self.doc`` for
        :meth:`write`.

        :param fh: path or file object accepted by ``ET.parse``
        """
        NS = "http://orthoXML.org/2011/"
        group_tag = "{{{0}}}orthologGroup".format(NS)
        score_path = './{{{0}}}score'.format(NS)

        self.doc = ET.parse(fh)
        root = self.doc.getroot()
        to_rem = []
        for hog in root.iterfind('.//' + group_tag):
            # Look at every score of the group, not only the first one:
            # a filter only matches the score whose id it was built for.
            scores = hog.findall(score_path)
            if any(filt.remove(score.get('id'), score.get('value'))
                   for score in scores for filt in self.filters):
                to_rem.append(hog)
        logger.info(f"will remove {len(to_rem)} hogs")

        # to_rem grows while it is iterated: parents that lose their last
        # sub-group are queued and handled in a later iteration.
        for h in to_rem:
            parent = h.getparent()
            if 'id' in h.attrib:
                logger.info("removing hog %s line %s %s", h, h.sourceline, h.attrib['id'])
            else:
                logger.info("removing hog %s line %s", h, h.sourceline)
            # Explicit None test: element truthiness reflects the number of
            # children and must not be used as an existence check.
            if parent is not None:
                parent.remove(h)
                # NOTE(review): the parent is queued as soon as it has no
                # orthologGroup child left, even if it still holds other
                # children - confirm this is intended.
                if sum(c.tag == group_tag for c in parent) == 0:
                    if 'id' in parent.attrib:
                        logger.info("consider deleting the empty parent hog %s line %s %s",
                                    parent, parent.sourceline, parent.attrib['id'])
                    else:
                        logger.info("consider deleting the empty parent hog %s line %s",
                                    parent, parent.sourceline)
                    to_rem.append(parent)

    def write(self, fh):
        """Serialise the processed document to ``fh`` as UTF-8 with XML declaration."""
        self.doc.write(fh, xml_declaration=True, encoding="UTF-8")
class Merger:
    """Accumulates several orthoXML documents into the first one.

    The first document is parsed and kept as ``self.doc``; further documents
    are folded into it with :meth:`merge_file`.
    """

    def __init__(self, first):
        """
        :param first: path or file object of the base orthoXML document
        """
        self.NS = "http://orthoXML.org/2011/"
        # register the orthoXML namespace under the empty prefix
        ET.register_namespace("", self.NS)
        self.doc = ET.parse(first)
        self.root = self.doc.getroot()

        # names of the species already present in the merged document
        self.all_species = set(z.attrib['name'] for z in self.doc.findall('./{{{}}}species'.format(self.NS)))
        self.all_genes = GeneRefManager()
        self.all_genes.register_and_reassign(
            self.doc.findall("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene".format(self.NS))
        )

    def merge_file(self, other):
        """Merge the species, genes and groups of ``other`` into this document.

        :param other: root element of another parsed orthoXML document; it is
            modified in place (gene ids rewritten, duplicate genes removed).
        """
        gene_id_updates, to_rem = self.all_genes.register_and_reassign(
            other.findall("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene".format(self.NS)))
        self._remove_unnecessary_genes(other, to_rem)
        self._update_geneRef_ids(other.find('./{{{}}}groups'.format(self.NS)), gene_id_updates)

        for sp in other.findall("./{{{}}}species".format(self.NS)):
            if sp.attrib['name'] not in self.all_species:
                # unknown species: find the index of the first element that
                # follows the run of species elements and insert there
                species_seen = False
                for i, el in enumerate(self.root):
                    if el.tag == "{{{}}}species".format(self.NS):
                        species_seen = True
                    elif species_seen:
                        break
                # NOTE(review): if no element follows the species run, `i` is
                # the index of the last child - confirm inputs always have one.
                self.root.insert(i, sp)
                self.all_species.add(sp.attrib['name'])
            else:
                # known species: append its remaining genes to the existing block
                db = self.root.find("./{{{0}}}species[@name='{1}']/{{{0}}}database/{{{0}}}genes".format(self.NS, sp.attrib['name']))
                for g in sp.iterfind(".//{{{}}}gene".format(self.NS)):
                    db.append(g)
        # finally move every child of the other groups element over
        grps = self.root.find("./{{{}}}groups".format(self.NS))
        for g in other.find("./{{{}}}groups".format(self.NS)):
            grps.append(g)

    def _update_geneRef_ids(self, root, gene_id_updates):
        """Rewrite the ``id`` of geneRef elements below ``root`` per the old->new mapping."""
        for old_id, new_id in gene_id_updates.items():
            for g in root.iterfind(".//{{{0}}}geneRef[@id='{1}']".format(self.NS, old_id)):
                g.attrib['id'] = new_id

    def _remove_unnecessary_genes(self, root, to_rem):
        """Delete the gene elements whose ids are listed in ``to_rem`` from ``root``."""
        for e in to_rem:
            # the trailing '/..' selects the genes element holding the gene
            parent = root.find("./{{{0}}}species/{{{0}}}database/{{{0}}}genes/{{{0}}}gene[@id='{1}']/.."
                               .format(self.NS, e))
            child = parent.find("./{{{0}}}gene[@id='{1}']".format(self.NS, e))
            parent.remove(child)

    def write(self, fh):
        """Serialise the merged document to ``fh`` as UTF-8 with XML declaration."""
        self.doc.write(fh, xml_declaration=True, encoding="UTF-8", default_namespace=None)
class UnionFind(object):
    """Union-find data structure.

    Each unionFind instance X maintains a family of disjoint sets of
    hashable objects, supporting the following two methods:

    - X[item] returns a name for the set containing the given item.
      Each set is named by an arbitrarily-chosen one of its members; as
      long as the set remains unchanged it will keep the same name. If
      the item is not yet part of a set in X, a new singleton set is
      created for it.

    - X.union(item1, item2, ...) merges the sets containing each item
      into a single larger set. If any item is not yet part of a set
      in X, it is added to X as one of the members of the merged set.
    """

    def __init__(self, elements=None):
        """Create a new union-find structure.

        If elements is not None, the structure gets initialized
        with each element as a singleton component.

        :param elements: an iterable to initialize the structure.
        """
        self.weights = {}   # set size, only meaningful for set names (roots)
        self.parents = {}   # element -> parent element; roots map to themselves
        if elements is not None:
            for elem in iter(elements):
                self.parents[elem] = elem
                self.weights[elem] = 1

    def __getitem__(self, obj):
        """return the name of set which contains obj.

        :param obj: the query object

        :SeeAlso: :meth:`find`"""
        return self.find(obj)

    def find(self, obj):
        """Find and return the name of the set containing the obj.

        If the object is not found in any set, a new singleton set
        is created that holds only this object until it is further merged."""

        # check for previously unknown obj. If unknown, add it
        # as a new cluster
        if obj not in self.parents:
            self.parents[obj] = obj
            self.weights[obj] = 1
            return obj

        # find path of objects leading to the root
        path = [obj]
        root = self.parents[obj]
        while root != path[-1]:
            path.append(root)
            root = self.parents[root]

        # compress the path and return
        for ancestor in path:
            self.parents[ancestor] = root
        return root

    def remove(self, obj):
        """Remove an object from the sets.

        Removes an object entirely from the data structure. The
        containing set will shrink by this one element; all other
        members stay together in one set. If the removed object was the
        name of its set, another member becomes the new name.

        :Note: If one tries to access it afterwards using
            :meth:`find`, it will be created newly and put as a
            singleton.
        """
        if obj not in self.parents:
            return
        root = self.find(obj)
        # Elements that point directly at obj would be left with a dangling
        # parent once obj is gone, so they have to be re-attached.
        children = [e for e, p in self.parents.items() if p == obj and e != obj]
        if root == obj:
            remaining = self.weights.pop(obj) - 1
            if children:
                # promote one former child to be the new name of the set
                root = children[0]
                self.weights[root] = remaining
        else:
            self.weights[root] -= 1
            self.weights.pop(obj, None)
        for child in children:
            self.parents[child] = root
        del self.parents[obj]

    def __iter__(self):
        """Iterate through all items currently held by this structure."""
        return iter(self.parents)

    def union(self, *objects):
        """Find the sets containing the objects and merge them.

        any number of objects can be passed to this method and
        all of them will be merged into one set containing at
        least these objects. Calling it without objects is a no-op.

        :param objects: the objects to be merged. they have to be all
            hashable. If they haven't been initially added to the UnionFind
            data structure at instantiation time, they are added at this point
            in time.
        """
        # De-duplicate the roots (keeping first-seen order) so that the size
        # of a set is not added more than once when several of the objects
        # already belong to the same set.
        roots = list(dict.fromkeys(self[x] for x in objects))
        if not roots:
            return
        heaviest = max(roots, key=self.weights.__getitem__)
        for r in roots:
            if r != heaviest:
                self.weights[heaviest] += self.weights[r]
                self.parents[r] = heaviest

    def get_components(self):
        """return a list of sets corresponding to the connected
        components of the structure."""
        comp_dict = collections.defaultdict(set)
        for elem in iter(self):
            comp_dict[self[elem]].add(elem)
        comp = list(comp_dict.values())
        return comp
class LazyProperty(object):
    """Descriptor that computes an attribute on first access and caches it.

    The wrapped method runs once per instance; its result is then stored
    on the instance under the property's name, so later lookups find the
    plain instance attribute and never reach the descriptor again.
    (Recipe from the Python Cookbook, Denis Otkidach,
    http://stackoverflow.com/users/168352/denis-otkidach)

    Example::

        class Circle:
            def __init__(self, radius):
                self.radius = radius

            @LazyProperty
            def area(self):
                print("computing area")
                return 3.14 * self.radius ** 2

        >>> c = Circle(4)
        >>> c.area
        computing area
        50.24
        >>> c.area
        50.24
    """

    def __init__(self, method, name=None):
        # keep the undecorated function and the attribute name to cache under
        self.method = method
        self.name = name if name else method.__name__
        self.__doc__ = method.__doc__

    def __get__(self, inst, cls):
        if inst is not None:
            # First access through an instance: evaluate, then store the
            # value on the instance so it is served from there from now on.
            value = self.method(inst)
            setattr(inst, self.name, value)
            return value
        # accessed on the class itself: hand back the descriptor
        return self
DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')


class Aligner(object):
    """
    Base class for wrappers of Multiple Sequence Aligner software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteAligner(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result

    """
    # NOTE(review): Python 2 spelling; on Python 3 this attribute does not
    # install ABCMeta, so @abstractmethod is not enforced at instantiation.
    __metaclass__ = ABCMeta

    def __init__(self, input_, datatype=DataType.UNKNOWN, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the Aligner to work on it.

        :param input_: can be either a filename or a biopython multiple
            sequence alignment (a collection of :class:`Bio.SeqRecord.SeqRecord`)

        :param datatype: a :class:`DataType` member or its name as a string
            (case-insensitive). ``DataType.UNKNOWN`` triggers a guess from the data.

        :param binary: is the alignment's executable file, or None. If set to
            None, it is assumed to be found in the PATH.

        :raises ValueError: if `datatype` is a string that names no DataType member
        :raises WrapperError: if locating the binary fails with an IOError
        """
        self.input_type = identify_input(input_)  # Figure out what it is - file or object

        if isinstance(datatype, str):
            try:
                datatype = getattr(DataType, datatype.upper())
            except AttributeError:
                raise ValueError("\"{}\" is an invalid datatype for an Aligner".format(datatype))
        if datatype == DataType.UNKNOWN:
            if self.input_type == AlignmentInput.OBJECT:
                # Guess on a duplicate so that a one-shot iterator (e.g. a
                # generator of SeqRecords) is still intact when it is stored
                # below. Guessing on `input_` directly would exhaust it.
                dup, input_ = itertools.tee(input_)
                self.datatype = guess_datatype(dup, False)
            else:
                self.datatype = guess_datatype(input_, True)
        else:
            self.datatype = datatype

        self.input = input_  # store it
        self.result = None  # filled in by __call__ of the concrete wrapper
        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying aligner
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """Return the command line interface object for `binary`."""
        pass


import logging
logger = logging.getLogger()


def guess_datatype(alignment, from_filename=False):
    """Guess whether `alignment` holds DNA or protein sequences.

    :param alignment: an iterable of sequence records, or a path to a
        fasta / relaxed-phylip file when `from_filename` is True
    :param from_filename: treat `alignment` as a filename and parse it first
    :returns: ``DataType.DNA`` if ``is_dna`` says so, else ``DataType.PROTEIN``
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        # SeqIO.parse is lazy, so the records must be materialised here for a
        # wrong-format file to be noticed at all.
        try:
            records = list(SeqIO.parse(alignment, 'fasta'))
        except Exception:
            records = []
        if not records:
            # not (valid) fasta -> fall back to relaxed phylip
            records = list(SeqIO.parse(alignment, 'phylip-relaxed'))
        alignment = records
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN


# TODO: Break the identify_input function into two parts - one to work out the datatype, one to work out whether
# this is a file or an object
class MuscleCLI(AbstractCLI):
    """
    Muscle low-level command line interface

    example:
        muscle_cli = MuscleCLI()
        process = muscle_cli(cmd='muscle args...')
        stdout = muscle_cli.get_stdout()
    """
    @property
    def _default_exe(self):
        return 'muscle'

    # def _set_help(self):
    #     self(help=True, wait=True)
    #     self._help = self.get_stdout()


def set_default_dna_options(aligner):
    """
    Assign the default Muscle option set to `aligner`.
    No DNA-specific tuning is applied; this is the same set used for proteins.
    """
    aligner.options = get_default_options()


def set_default_protein_options(aligner):
    """
    Assign the default Muscle option set to `aligner`.
    No protein-specific tuning is applied; this is the same set used for DNA.
    """
    aligner.options = get_default_options()


class Muscle(Aligner):
    """
    Convenient wrapper for Muscle multiple sequence aligner

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a basic implementation that can be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = Muscle(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        """
        :param input_: filename or collection of sequence records; remaining
            arguments are forwarded unchanged to :class:`Aligner`.
        """
        super(Muscle, self).__init__(input_, *args, **kwargs)
        # Defined up front so `.result` can be read before the first call.
        self.result = None

        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Anything to do with calling Muscle should go here.
        If any extra arguments need to be passed they can
        be specified (listed as *args and **kwargs for now).

        :returns: the alignment parsed from Muscle's stdout; also stored in
            ``self.result`` together with ``stdout``, ``stderr`` and ``elapsed_time``.
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
            # In-memory records are written to a temp file that exists only
            # for the duration of the external call.
            with tempfile.NamedTemporaryFile(mode="wt") as filehandle:
                SeqIO.write(self.input, filehandle, 'fasta')
                # push buffered text to disk before the external tool reads it
                filehandle.flush()
                output, error = self._call(filehandle.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level MuscleCLI wrapper on `filename`.
        The command line is built from ``self.options``; *args and **kwargs
        are currently accepted but not used.
        [This only covers the simplest automatic
        case]

        :returns: tuple ``(stdout, stderr)`` of the finished process
        """
        self.cli('{} -in {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the active options rendered as a command line fragment."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result: parse `output` (fasta text) into an alignment.
        """
        fileobj = StringIO(output)
        return AlignIO.read(fileobj, 'fasta')

    def _init_cli(self, binary):
        return MuscleCLI(executable=binary)


def get_default_options():
    """Return a fresh OptionSet of Muscle options, all inactive by default."""
    return OptionSet([
        # Algorithm

        # Find diagonals (faster for similar sequences)
        FlagOption('-diags', False, active=False),

        # Maximum number of iterations(integer, default 16)
        IntegerOption('-maxiters', 16, active=False),

        # Maximum time to iterate in hours (default no limit)
        FloatOption('-maxhours', 0.0, active=False)

        # TreeInputOption('-usetree', '', active=False)
    ])
class ProbCons(Aligner):
    """
    Convenient wrapper for ProbCons multiple sequence aligner

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a basic implementation that can be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ProbCons(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result


    .. note:: There exists an ipython notebook on how to work with wrappers,
       including dealing with non-default parameters.
    """

    def __init__(self, input_, *args, **kwargs):
        """
        :param input_: filename or collection of sequence records; remaining
            arguments are forwarded unchanged to :class:`Aligner`.
        """
        super(ProbCons, self).__init__(input_, *args, **kwargs)
        # Defined up front so `.result` can be read before the first call.
        self.result = None
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Anything to do with calling ProbCons should go here.
        If any extra arguments need to be passed they can
        be specified (listed as *args and **kwargs for now).

        :returns: the alignment parsed from ProbCons' stdout; also stored in
            ``self.result`` together with ``stdout``, ``stderr`` and ``elapsed_time``.
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
            # In-memory records are written to a temp file that exists only
            # for the duration of the external call.
            with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
                SeqIO.write(self.input, filehandle, 'fasta')
                # push buffered text to disk before the external tool reads it
                filehandle.flush()
                output, error = self._call(filehandle.name, *args, **kwargs)

        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level ProbConsCLI wrapper on `filename`.
        The command line is built from ``self.options``; *args and **kwargs
        are currently accepted but not used.
        [This only covers the simplest automatic
        case]

        :returns: tuple ``(stdout, stderr)`` of the finished process
        """
        self.cli('{} {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the active options rendered as a command line fragment."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result: parse `output` (fasta text) into an alignment.
        """
        fileobj = StringIO(output)
        return AlignIO.read(fileobj, 'fasta')

    def _init_cli(self, binary):
        return ProbConsCLI(executable=binary)


def get_default_options():
    """Return a fresh OptionSet of ProbCons options, all inactive by default."""
    return OptionSet([
        # Algorithm

        # use CLUSTALW output format instead of MFA
        FlagOption('-clustalw', False, active=False),

        # use 0 <= REPS <= 5 (default: 2) passes of consistency transformation
        IntegerOption('-c', 0, active=False),

        # use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement
        IntegerOption('-ir', 100, active=False),

        # use 0 <= REPS <= 20 (default: 0) rounds of pretraining
        IntegerOption('-pre', 0, active=False),

        # generate all-pairs pairwise alignments
        FlagOption('-pairs', False, active=False),

        # use Viterbi algorithm to generate all pairs (automatically enables -pairs)
        FlagOption('-viterbi', False, active=False),

        # write annotation for multiple alignment to FILENAME
        StringOption('-annot', '', active=False),

        # print sequences in alignment order rather than input order (default: off)
        FlagOption('-a', False, active=False)

    ])
def set_default_dna_options(aligner):
    """
    Assign the default ProGraphMSA option set to `aligner`.
    No DNA-specific tuning is applied; this is the same set used for proteins.
    """
    aligner.options = get_default_options()


def set_default_protein_options(aligner):
    """
    Assign the default ProGraphMSA option set to `aligner`.
    No protein-specific tuning is applied; this is the same set used for DNA.
    """
    aligner.options = get_default_options()


class ProGraphMSA(Aligner):
    """
    Convenient wrapper for ProGraphMSA multiple sequence aligner

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a basic implementation that can be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ProGraphMSA(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        """
        :param input_: filename or collection of sequence records; remaining
            arguments are forwarded unchanged to :class:`Aligner`.
        """
        super(ProGraphMSA, self).__init__(input_, *args, **kwargs)
        # Defined up front so `.result` can be read before the first call.
        self.result = None
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Anything to do with calling ProGraphMSA should go here.
        If any extra arguments need to be passed they can
        be specified (listed as *args and **kwargs for now).

        :returns: the alignment parsed from ProGraphMSA's stdout; also stored in
            ``self.result`` together with ``stdout``, ``stderr`` and ``elapsed_time``.
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
            # In-memory records are written to a temp file that exists only
            # for the duration of the external call.
            with tempfile.NamedTemporaryFile(mode="wt") as fh:
                SeqIO.write(self.input, fh, 'fasta')
                # push buffered text to disk before the external tool reads it
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)

        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level ProGraphMSA wrapper on `filename`.
        The command line is built from ``self.options``; *args and **kwargs
        are currently accepted but not used.
        [This only covers the simplest automatic
        case]

        :returns: tuple ``(stdout, stderr)`` of the finished process
        """
        self.cli('{} {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the active options rendered as a command line fragment."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result: parse `output` (fasta text) into an alignment.
        """
        fileobj = StringIO(output)
        return AlignIO.read(fileobj, 'fasta')

    def _init_cli(self, binary):
        return ProGraphMSACLI(executable=binary)


def get_default_options():
    """Return a fresh OptionSet of ProGraphMSA options; only ``--fasta`` is active."""
    return OptionSet([
        # Algorithm

        # output fasta format (instead of stockholm), better because no tree output is produced
        FlagOption('--fasta', True, active=True),

        # output all ancestral sequences
        FlagOption('--ancestral_seqs', False, active=False),

        # output sequences in input order (default: tree order)
        FlagOption('--input_order', False, active=False),

        # output all intermediate guide trees
        FlagOption('--all_trees', False, active=False),

        # use ML distances with gap
        FlagOption('--mldist_gap', False, active=False),

        # use ML distances
        FlagOption('--mldist', False, active=False),

        # use of guide tree
        StringOption('--tree', '', active=False)

    ])
class ModelTester(object):
    """
    Base class for wrappers of model testers for phylogeny inference

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteModelTester(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    # NOTE(review): Python 2 spelling; on Python 3 this attribute does not
    # install ABCMeta, so @abstractmethod is not enforced at instantiation.
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the model tester to work on it.

        :param alignment: a filename, a Biopython MSA, a list or generator of
            sequence records, or None for "no input yet"
        :param datatype: a :class:`DataType` member; ``DataType.UNKNOWN``
            triggers a guess from the data when an alignment is given
        :param binary: the model tester's executable file, or None
        """

        if alignment is not None:
            if isinstance(alignment, types.GeneratorType):
                # A generator can be read only once; materialise it so that
                # guessing the datatype below does not leave self.input empty.
                alignment = list(alignment)
            self.input_type = identify_input(alignment)  # Figure out what it is - file or object
            if datatype == DataType.UNKNOWN:
                self.datatype = guess_datatype(alignment, from_filename=self.input_type == AlignmentInput.FILENAME)
            else:
                self.datatype = datatype

            self.input = alignment  # store it
        else:
            self.input_type = None
            self.input = None
            # keep the attribute defined even without input; subclasses read it
            self.datatype = datatype

        self.result = None  # filled in by __call__ of the concrete wrapper
        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        self.cli = self._init_cli(binary)
        #TODO: the wrapper error is not compatible with calling a function with java!
        #try:
        #    self.cli = self._init_cli(binary)
        #except IOError as err:
        #    raise WrapperError('Error searching for binary: {}'.format(err))
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying model tester
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass


def guess_datatype(alignment, from_filename=False):
    """Guess whether `alignment` holds DNA or protein sequences.

    :param alignment: an iterable of sequence records, or a path to a
        fasta / relaxed-phylip file when `from_filename` is True
    :param from_filename: treat `alignment` as a filename and parse it first
    :returns: ``DataType.DNA`` if ``is_dna`` says so, else ``DataType.PROTEIN``
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        try:
            records = list(SeqIO.parse(alignment, 'fasta'))
        except Exception:
            records = []
        if not records:
            # not (valid) fasta -> fall back to relaxed phylip
            records = list(SeqIO.parse(alignment, 'phylip-relaxed'))
        alignment = records
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
def identify_input(alignment):
    """
    Work out if we're dealing with an in-memory alignment
    (return ``AlignmentInput.OBJECT``), a file
    (return ``AlignmentInput.FILENAME``), or invalid input (raise error)

    :param alignment: either a Biopython MultipleSeqAlignment, a list or
        generator of records, or a filename pointing to an existing msa file.
    :raises ValueError: if `alignment` is neither of the above
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory collection of sequences
        return AlignmentInput.OBJECT

    if isinstance(alignment, str) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # Anything else (including a path that does not exist) cannot be handled;
    # fail here instead of silently returning None.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
class ProtTestParser(object):
    """
    Simple prottest result parser.
    """

    def __init__(self):
        self.MODEL = Regex(r'Best model according to\s+')
        # One match per "Best model according to <criterion>: <model>" line;
        # everything before each marker is skipped.
        self.model = OneOrMore(Group(Suppress(SkipTo(self.MODEL)) + Suppress(self.MODEL) + WORD + Suppress(":") + WORD))

    def parse(self, s):
        """Parse the text `s`.

        :returns: list of ``[criterion, model]`` pairs, or None if `s` does
            not match the grammar (the parse error is logged).
        """
        model = None
        try:
            model = self.model.parseString(s).asList()
        except ParseException as err:
            logger.error(err)

        return model

    def to_dict(self, stats_filename):
        """Parse `stats_filename` and map each criterion to its best model.

        NOTE(review): despite its name the argument is passed straight to
        :meth:`parse`, i.e. it is treated as the text to parse, not as a path.

        :returns: dict ``{criterion: model}``, or None if parsing failed
        """
        model = self.parse(stats_filename)
        if model is None:
            # parse() already logged the failure; iterating None would raise
            return None
        return {mg[0]: mg[1] for mg in model}
def set_default_protein_options(modeltester):
    """
    Assign the default ProtTest option set to `modeltester`.
    """
    modeltester.options = get_default_options()


class ProtTest(ModelTester):
    """ ProtTest to determine the best model for a specific alignment
    This wrapper can be called to test various models for phylogeny inference.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input multiple sequence alignment. This can be either
            a filename or an biopython SeqRecord collection.
        """
        self.options = get_default_options()
        super(ProtTest, self).__init__(alignment=alignment, *args, **kwargs)
        # Defined up front so `.result` can be read before the first call.
        self.result = None
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Anything to do with calling ProtTest should go here.
        If any extra arguments need to be passed they can
        be specified (listed as *args and **kwargs for now).

        :returns: the parsed result (dict or None); also stored in
            ``self.result`` together with ``stdout``, ``stderr`` and ``elapsed_time``.
        """
        start = time.time()  # time the execution
        if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
            # In-memory records are written to a temp file that exists only
            # for the duration of the external call.
            with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
                SeqIO.write(self.input, filehandle, 'fasta')
                # push buffered text to disk before the external tool reads it
                filehandle.flush()
                output, error = self._call(filehandle.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        self.result = self._read_result(output)  # store result
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level ProtTestCLI wrapper on `filename`.
        The command line is built from ``self.options``; *args and **kwargs
        are currently accepted but not used.
        [This only covers the simplest automatic
        case]

        :returns: tuple ``(stdout, stderr)`` of the finished process
        """
        self.cli('{} -i {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the active options rendered as a command line fragment."""
        return str(self.options)

    def _read_result(self, output):
        """Parse ProtTest's stdout; return the parser's dict or None on failure."""
        parser = ProtTestParser()

        try:
            result = parser.to_dict(output)

        except (IOError, TypeError) as ioerr:
            # TypeError: to_dict may iterate None when the output did not parse
            logger.error('Error reading results: %s', ioerr)
            result = None
        except ParseException as parseerr:
            # the message needs a placeholder, otherwise logging itself fails
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        return ProtTestCLI(executable=binary)


def get_default_options():
    """Return a fresh OptionSet of ProtTest options, all inactive by default."""
    return OptionSet([
        # Algorithm

        # Display models sorted by Akaike Information Criterion (AIC)
        FlagOption('-AIC', False, active=False),

        # Display models sorted by Decision Theory Criterion
        FlagOption('-DT', False, active=False),

        # Tree file (optional) [default: NJ tree]
        StringOption('-t', '', active=False),

        # Display models sorted by Corrected Akaike Information Criterion (AICc)
        FlagOption('-AICC', False, active=False),

        # Enables / Disables PhyML logging into log directory (see prottest.properties)
        FlagOption('-log', False, active=False)
    ])
9 | 10 | Options provide an interface between the wrapper and the 11 | concrete command line option of the wrapped program.""" 12 | __metaclass__ = ABCMeta 13 | 14 | def __init__(self, name, default=None, active=False): 15 | self._name = name 16 | self.set_value(default) 17 | self.active = active 18 | 19 | def __repr__(self): 20 | return '{}({}={}) <{}>'.format(self.__class__.__name__, self.name, self.get_value(), 'on' if self.active else 'off') 21 | 22 | def __str__(self): 23 | return (' '.join([self._name, str(self.get_value())]) if self.active else '') 24 | 25 | @property 26 | def active(self): 27 | return self._active 28 | 29 | @active.setter 30 | def active(self, val): 31 | self._active = True if val else False 32 | 33 | @property 34 | def name(self): 35 | return self._name 36 | 37 | def set_value(self, value): 38 | self._value = value 39 | if value is not None: 40 | self.active = True 41 | 42 | def get_value(self): 43 | return self._value 44 | 45 | def set_and_activate(self, value): 46 | self.set_value(value) 47 | self.active = True 48 | 49 | def status(self): 50 | return 'Name: {}\nValue: {}\nActive: {}\nStr: {}'.format(self.name, 51 | self.get_value(), 52 | self.active, 53 | str(self) or "''") 54 | 55 | 56 | class ValueOption(Option): 57 | __metaclass__ = ABCMeta 58 | 59 | 60 | class TypedValueOption(ValueOption): 61 | """A TypedValueOption is an option that only accepts options of a given type. 62 | 63 | This abstract class provides the functionality to check the type 64 | of a passed value and raises an ValueError if it doesn't match 65 | the expected type. 66 | 67 | A TypedValueOption must overwrite the abstract property _type. 
68 | """ 69 | 70 | __metaclass__ = ABCMeta 71 | 72 | @abstractproperty 73 | def _type(self): 74 | pass 75 | 76 | def set_value(self, value): 77 | if isinstance(value, self._type): 78 | self._value = value 79 | self.active = True 80 | 81 | else: 82 | raise ValueError('Value should be of type {}'.format(self.type)) 83 | 84 | 85 | ### Concrete classes from here on 86 | 87 | class IntegerOption(TypedValueOption): 88 | """option to hold an integer value""" 89 | @property 90 | def _type(self): 91 | return Integral 92 | 93 | 94 | class FloatOption(TypedValueOption): 95 | """Option to hold a real number value""" 96 | 97 | @property 98 | def _type(self): 99 | return Real 100 | 101 | def get_value(self): 102 | return float(self._value) 103 | 104 | 105 | class StringOption(TypedValueOption): 106 | """Opion to hold a string value""" 107 | 108 | def __init__(self, name, value=None, active=False): 109 | if value is None: 110 | value = str() 111 | super(StringOption, self).__init__(name, value, active) 112 | 113 | @property 114 | def _type(self): 115 | return string_types 116 | 117 | 118 | class FlagOption(TypedValueOption): 119 | """Option to hold a boolean flag value, i.e. True or False""" 120 | @property 121 | def _type(self): 122 | return bool 123 | 124 | def __str__(self): 125 | return (self._name if self.active and self.get_value() else '') 126 | 127 | 128 | class TreeInputOption(TypedValueOption): 129 | """Option to hold a phylogenetic tree argument. 
class OptionSet(object):
    """Named collection of options that renders as one command-line string.

    The options are kept in ``self.options``, a dict mapping option name to
    option object. ``str()`` of the set is the space-joined ``str()`` of
    every option whose own string form is non-empty.
    """

    def __init__(self, options):
        """Build the set from a list/tuple of options or a ready-made dict.

        :param options: list or tuple of objects exposing ``name``, or a dict
            that is stored as-is.
        :raises ValueError: for any other container type.
        """
        if isinstance(options, dict):
            self.options = options
            return
        if not isinstance(options, (list, tuple)):
            raise ValueError('Expected a list, tuple or dict of options, not {}'.format(type(options)))
        self.options = {entry.name: entry for entry in options}

    def __str__(self):
        """Return the non-empty option strings joined by single spaces."""
        rendered = (str(entry) for entry in self.options.values())
        return ' '.join(text for text in rendered if text)

    def __getitem__(self, item):
        """Return the option stored under the name ``item``."""
        return self.options[item]

    def list(self):
        """Return the ``(name, option)`` pairs as a list."""
        return [*self.options.items()]
class TreeBuilder(object):
    """
    Base class for wrappers of tree building software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteTreeBuilder(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
        """
        Record the input, work out its kind and datatype, and set up the CLI.

        :param alignment: a filename, a Biopython MSA, a list/generator of
            sequence records, or None when no input is given up front.
        :param datatype: a ``DataType`` member. When it is
            ``DataType.UNKNOWN`` and an alignment is given, the datatype is
            guessed from the alignment with ``guess_datatype``.
        :param binary: the tree builder's executable file, or None.
        :raises WrapperError: if setting up the CLI raises an IOError.
        """
        if alignment is not None:
            self.input_type = identify_input(alignment)  # Figure out what it is - file or object
            if datatype == DataType.UNKNOWN:
                self.datatype = guess_datatype(alignment,
                                               from_filename=self.input_type == AlignmentInput.FILENAME)
            else:
                self.datatype = datatype
            self.input = alignment  # store it
        else:
            self.input_type = None
            self.input = None
            # Always define `datatype`: subclasses read it right after this
            # constructor returns, and leaving it unset made them fail with
            # AttributeError when no alignment was passed.
            self.datatype = datatype

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err)) from err
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying tree builder
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
def identify_input(alignment):
    """
    Work out whether ``alignment`` is an in-memory object or a file on disk.

    :param alignment: a Biopython MultipleSeqAlignment, a generator or list
        of sequence records, or a path (``str`` or ``os.PathLike``) pointing
        to an existing msa file.
    :return: ``AlignmentInput.OBJECT`` for in-memory input,
        ``AlignmentInput.FILENAME`` for an existing path.
    :raises ValueError: if ``alignment`` is neither of the above, including a
        path that does not exist.
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory alignment / collection of records
        return AlignmentInput.OBJECT

    if isinstance(alignment, (str, os.PathLike)) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # Previously this case fell off the end and returned None, so invalid
    # input was only noticed later, far from its cause.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
class Fasttree(TreeBuilder):
    """Callable wrapper that runs FastTree on an alignment.

    Calling the instance runs the executable on the stored input, parses its
    standard output with ``FasttreeParser`` and returns the ``"tree"`` entry
    of the parsed result. The full parsed result is kept in ``self.result``.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: filename or in-memory alignment; forwarded to
            ``TreeBuilder.__init__`` together with ``*args`` / ``**kwargs``.
        """
        self.options = get_default_options()
        super(Fasttree, self).__init__(alignment=alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run FastTree via ``_call()`` and return the parsed tree.

        An in-memory alignment is first written to a temporary fasta file.
        Stdout and stderr of the run are saved on the instance.

        :return: the ``"tree"`` entry of the parsed result.
        :raises WrapperError: if FastTree exits with a non-zero code or its
            output cannot be parsed.
        """
        start = time.time()  # time the execution
        if self.input_type == AlignmentInput.OBJECT:
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'fasta')
                # Push buffered text to disk before the external process
                # opens the file by name.
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)
        else:
            filename = os.path.abspath(self.input)
            output, error = self._call(filename, *args, **kwargs)

        self.stdout = output
        self.stderr = error
        self.result = self._read_result(output, error)  # store result

        end = time.time()
        self.elapsed_time = end - start
        if self.result is None:
            # _read_result logs and returns None on parse problems; report
            # that explicitly instead of failing on None["tree"].
            raise WrapperError("Fasttree output could not be parsed", error)
        return self.result["tree"]

    def _call(self, filename, *args, **kwargs):
        """
        Call the underlying low level FastTree wrapper on ``filename``.

        The command line is the rendered option set followed by the sequence
        file. ``*args`` and ``**kwargs`` are accepted but not used.

        :return: ``(stdout, stderr)`` of the finished process.
        :raises WrapperError: if the process exits with a non-zero code.
        """
        self.cli('{} {seq_file}'.format(self.command(), seq_file=filename), wait=True)
        self.returncode = self.cli.process.returncode

        if self.returncode != 0:
            self.stderr = self.cli.get_stderr()
            # Use the last non-empty line: stderr usually ends with a
            # newline, which made the plain last element an empty string.
            stderr_lines = self.stderr.strip().splitlines()
            last_error_line = stderr_lines[-1].strip() if stderr_lines else ''
            msg = f"Fasttree failed on {filename}: {last_error_line}"
            logger.error(msg)
            raise WrapperError(msg, self.stderr)

        return (self.cli.get_stdout(), self.cli.get_stderr())

    def command(self):
        """Return the current options rendered as a command-line string."""
        return str(self.options)

    def _read_result(self, stdout, stderr):
        """
        Parse the FastTree output.

        :param stdout: standard output of the run (passed as ``tree``).
        :param stderr: standard error of the run (passed as ``other``).
        :return: the parser's result dict, or None if parsing failed.
        """
        parser = FasttreeParser()

        try:
            parser.parse(tree=stdout, other=stderr)
            result = parser.to_dict()
        except IOError as ioerr:
            logger.error('Error reading results: %s', ioerr)
            result = None
        except ParseException as parseerr:
            # The exception must go through a %s placeholder; passing it as a
            # bare extra argument makes the logging call itself fail.
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Return the CLI object for the given executable (or the default)."""
        return FasttreeCLI(executable=binary)
Default Jones-Taylor-Thorton 142 | StringOption('-gamma', active=False), 143 | 144 | # Specify the number of rate categories of sites. Default 20. 145 | IntegerOption('-cat', 20, active=False), 146 | 147 | IntegerOption('-seed',1234, active=False), 148 | 149 | # Specify starting tree 150 | StringOption('-intree', '', active=False), 151 | 152 | # Speed up the neighbor joining phase & reduce memory usage (recommended for >50,000 sequences) 153 | StringOption('-fastest', active=False), 154 | # allow spaces and other restricted characters (but not ' ) in sequence names and quote names in the output tree (fasta input only; FastTree will not be able to read these trees back in) 155 | StringOption('-quote', active=True), 156 | 157 | #-quote -- quote sequence names in the output and allow spaces, commas, parentheses, and colons in them but not ' characters (fasta files only)\n" 158 | StringOption('-quote', active=False), 159 | 160 | # Set the number of rounds of maximum-likelihood NNIs. Deafault 4*log2(N), N = the number of unique sequences 161 | IntegerOption('-mlnni', 0, active=False), 162 | 163 | ]) 164 | -------------------------------------------------------------------------------- /FastOMA/zoo/wrappers/treebuilders/iqtree.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import logging 4 | import random 5 | from pyparsing import ParseException 6 | import shutil 7 | from Bio import SeqIO 8 | 9 | 10 | from .parsers import IqtreeParser 11 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType 12 | 13 | 14 | from ..abstract_cli import AbstractCLI 15 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet 16 | 17 | from ...file_utils import TempFile, TempDir 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.addHandler(logging.StreamHandler()) 21 | logger.setLevel(logging.INFO) 22 | 23 | 24 | class IqtreeCLI(AbstractCLI): 25 | @property 
def __init__(self, input_, *args, **kwargs):
    """
    :param input_: filename or in-memory alignment (or None); forwarded to
        ``TreeBuilder.__init__`` as ``alignment`` together with ``*args``
        and ``**kwargs``.
    """
    super(Iqtree, self).__init__(alignment=input_, *args, **kwargs)
    self.options = get_default_options()
    # TreeBuilder.__init__ only assigns `datatype` when an alignment is
    # given, so guard on the input as Fasttree and Raxml do; otherwise
    # constructing without input raises AttributeError.
    if self.input is not None:
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        elif self.datatype == DataType.PROTEIN:
            set_default_protein_options(self)
def _call(self, filename, tmpd, *args, **kwargs):
    """
    Run the iqtree command line on ``filename``.

    The command is the rendered option set, followed by ``-pre`` pointing at
    ``tmp_output`` inside ``tmpd`` and ``-s`` with the sequence file.
    ``*args`` and ``**kwargs`` are accepted but not used.

    :return: ``(stdout, stderr)`` as reported by the CLI object.
    """
    rendered_options = self.command()
    output_prefix = os.path.join(tmpd, 'tmp_output')
    command_line = '{} -pre {tmp_path} -s {seqfile}'.format(
        rendered_options, tmp_path=output_prefix, seqfile=filename)
    self.cli(command_line, wait=True)
    captured_out = self.cli.get_stdout()
    captured_err = self.cli.get_stderr()
    return captured_out, captured_err
def get_default_options():
    """Build the default option set for iqtree.

    Only ``-nt 2`` is active by default. All other options are declared
    inactive so that callers can look them up by name and switch them on.

    :return: a fresh ``OptionSet``.
    """
    return OptionSet([
        # Number of threads
        IntegerOption('-nt', 2, active=True),

        # Sequence type. It must be declared here because
        # set_default_dna_options / set_default_protein_options look up
        # options['-st'] and would otherwise raise KeyError.
        StringOption('-st', '', active=False),

        # Set the model for either DNA or AA alignment
        StringOption('-m', '', active=False),

        # Ultrafast bootstrap (>=1000)
        IntegerOption('-bb', 0, active=False),

        # SH-like approximate likelihood ratio test (SH-aLRT)
        IntegerOption('-alrt', 0, active=False),

        # Bootstrap + ML tree + consensus tree (>=100)
        IntegerOption('-b', 0, active=False)
    ])
def __call__(self, *args, **kwargs):
    """
    Run PhyML on the stored input and return the parsed tree.

    An in-memory alignment is written to a temporary relaxed-phylip file.
    A filename input is run from inside its own directory, so that only the
    short basename is passed to PhyML. Extra ``*args`` / ``**kwargs`` are
    forwarded to ``_call``.

    :return: the ``"tree"`` entry of the parsed result.
    """
    start = time.time()  # time the execution

    if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
        with tempfile.NamedTemporaryFile(mode='wt') as fh:
            SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
            fh.seek(0)
            output, error = self._call(fh.name, *args, **kwargs)
            self.result = self._read_result(fh.name)  # store result
    else:
        path = os.path.dirname(self.input)
        filename = os.path.basename(self.input)
        # phyml can not deal with large filenames that are caused due to a
        # large path, so run it from the file's directory. os.chdir() is not
        # a context manager (it returns None), so switch and restore by hand.
        previous_dir = os.getcwd()
        try:
            if path:  # an empty dirname means the file is already in the cwd
                os.chdir(path)
            output, error = self._call(filename, *args, **kwargs)
            self.result = self._read_result(filename)  # store result
        finally:
            os.chdir(previous_dir)

    self.stdout = output
    self.stderr = error

    end = time.time()
    self.elapsed_time = end - start
    return self.result["tree"]
def get_default_options():
    """Build the default option set for PhyML.

    Only ``-d aa`` is created active; every other option is declared with
    ``active=False`` so it can be looked up by name and switched on later.

    :return: a fresh ``OptionSet``.
    """
    return OptionSet([
        # Algorithm

        # Set datatype to nt or aa
        StringOption('-d', 'aa', active=True),

        # Set the model for either DNA or AA alignment
        StringOption('-m', '', active=False),

        # If set to true will assume sequential format
        FlagOption('-q', False, active=False),

        # Set bootstrap value
        IntegerOption('-b', 0, active=False),

        # Tree topology search operation option
        StringOption('-s', 'NNI', active=False)
    ])
class Raxml(TreeBuilder):
    """Callable wrapper that runs RAxML on an alignment.

    Calling the instance runs the executable inside a temporary working
    directory, parses the files it writes there with ``RaxmlParser`` and
    returns the parsed result (also kept in ``self.result``).
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: filename or in-memory alignment; forwarded to
            ``TreeBuilder.__init__`` together with ``*args`` / ``**kwargs``.
        """
        self.options = get_default_options()
        super(Raxml, self).__init__(alignment=alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Set up temporary output files and call raxml using ``_call()``.

        Writes a temporary input file if we're working with an in-memory
        alignment. Saves the stdout and stderr and returns the parsed result.
        """
        start = time.time()  # time the execution

        # Need to create temp directory to put raxml output here
        with TempDir() as tmpd:
            if self.input_type is AlignmentInput.OBJECT:  # different operation depending on what it is
                with TempFile() as filename:
                    SeqIO.write(self.input, filename, 'phylip-relaxed')  # default interleaved
                    output, error = self._call(filename, tmpd, *args, **kwargs)
            elif self.input_type is AlignmentInput.FILENAME:
                filename = self.input
                output, error = self._call(filename, tmpd, *args, **kwargs)
            else:
                output, error = self._call(None, tmpd, *args, **kwargs)
            # Read the output while the temporary directory is still open.
            self.result = self._read_result(tmpd)  # store result
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result

    def _call(self, filename, tmpd, *args, **kwargs):
        """
        Call the underlying low level raxml wrapper.

        The run name is hard coded to ``tmp_output`` since the files are only
        read back from ``tmpd`` and not kept. ``*args`` and ``**kwargs`` are
        accepted but not used.

        :return: ``(stdout, stderr)`` as reported by the CLI object.
        """
        self.cli('{} -n tmp_output -w {tmp_path} -s {seqfile}'.format(self.command(),
                                                                     tmp_path=tmpd,
                                                                     seqfile=filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current options rendered as a command-line string."""
        return str(self.options)

    def _read_result(self, tmpd):
        """
        Read back the result files written to ``tmpd``.

        :return: the parser's result dict, or None if reading/parsing failed.
        """
        expected_outfiles = [os.path.join(tmpd, 'RAxML_info.tmp_output'),
                             os.path.join(tmpd, 'RAxML_bestTree.tmp_output')]

        parser = RaxmlParser()

        try:
            f_option = self.options['-f'].get_value()
            # Compare by value: `is not ''` tests object identity against a
            # literal and emits a SyntaxWarning on current Python versions.
            if f_option != '':
                f_value = os.path.splitext(os.path.basename(f_option))[0]
                result = parser.to_dict(*expected_outfiles, dash_f=f_value)
            else:
                result = parser.to_dict(*expected_outfiles, dash_f=None)

        except IOError as ioerr:
            logger.error('Error reading results: %s', ioerr)
            result = None
        except ParseException as parseerr:
            # The exception must go through a %s placeholder; passing it as a
            # bare extra argument makes the logging call itself fail.
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Return the CLI object for the given executable (or the default)."""
        return RaxmlCLI(executable=binary)
class MSATrimmer:
    """
    Base class for wrappers of msa trimming software.

    Wrappers are callable objects: the instance carries the state of the
    operation (input, captured output, execution time), and calling it
    performs the task.

    Subclasses provide ``_init_cli`` (which CLI object to use) and
    ``__call__`` (how to run it); ``__init__`` does the shared setup.

    :Example:

    ::

        callable_wrapper = ConcreteTrimmer(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, binary=None):
        """
        Store the input and set up the command-line interface.

        :param alignment: a filename, a Biopython MSA, a list of sequence
            objects, or None. Anything that is not None is classified with
            ``identify_input``.
        :param binary: the trimmer's executable file, or None.
        :raises WrapperError: if setting up the CLI raises an IOError.
        """
        # Classify first, store second: if classification raises, nothing
        # has been stored yet.
        self.input_type = None if alignment is None else identify_input(alignment)
        self.input = alignment

        self.elapsed_time = self.stdout = self.stderr = None

        try:
            cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
        self.cli = cli

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying trimmer
        """

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
# One-off preprocessing script for the QfO proteome folder.
#
# Proteins in each file belong to the same species.
#
# Rename each proteome (and its hogmap) after the species code found inside
# the first protein id of the file. The renamed copies are written to a new
# "omamer_search" folder; the "omamer_search_old" inputs are left untouched.


from os import listdir
from Bio import SeqIO
import os

# Hard-coded locations; edit before running elsewhere.
working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/"
prot_folder = working_folder + "/omamer_search_old/proteome/"
project_files = listdir(prot_folder)
# Parallel lists: old file stem -> new species code, matched by index.
query_species_names_old = []
query_species_names_new = []
for file in project_files:
    if file.split(".")[-1] == "fa":
        # File name without its final ".fa" extension
        file_name_split = file.split(".")[:-1]
        query_species_name_old = '.'.join(file_name_split)
        prot_address = prot_folder + query_species_name_old + ".fa"
        prots_record = list(SeqIO.parse(prot_address, "fasta"))
        # Only the first record is inspected; an empty fasta file raises IndexError here.
        prot_record = prots_record[0]
        prot_name = prot_record.name  # 'tr|E3JPS4|E3JPS4_PUCGT
        # Species code = text after the last "_" of the last "|"-separated field
        query_species_name_new = prot_name.split("|")[-1].split("_")[-1].strip()
        # if query_species_name_new == 'RAT': query_species_name_new = "RATNO"
        query_species_names_old.append(query_species_name_old)
        query_species_names_new.append(query_species_name_new)

# os.mkdir raises if a folder already exists, so the script is not re-runnable as-is.
os.mkdir(working_folder+"/omamer_search")
os.mkdir(working_folder+"/omamer_search/proteome/")
os.mkdir(working_folder+"/omamer_search/hogmap")


for idx, query_species_name_old in enumerate(query_species_names_old):
    query_species_name_new = query_species_names_new[idx]

    # Copy the proteome under its new name (note the trailing "_" before the extension).
    prot_address_old = working_folder + "omamer_search_old/proteome/" + query_species_name_old + ".fa"
    prot_address_new = working_folder + "omamer_search/proteome/" + query_species_name_new + "_.fa"
    # The exit status of `cp` is not checked, so a failed copy goes unnoticed.
    os.system('cp ' + prot_address_old + ' ' + prot_address_new)

    # Copy the matching hogmap the same way.
    hogmap_address_old = working_folder + "omamer_search_old/hogmap/" + query_species_name_old + ".hogmap"
    hogmap_address_new = working_folder + "omamer_search/hogmap/" + query_species_name_new + "_.hogmap"
    os.system('cp ' + hogmap_address_old + ' ' + hogmap_address_new)


# 13:54:16 - the species DANRE already exists in the oma database, remove them first



print("done")
| rhog_i = list(SeqIO.parse(prot_address, "fasta")) 37 | for prot_i in rhog_i: 38 | prot_i_name = prot_i.id # .split("||")[0] # .split("|")[1] # tr|E3JPS4|E3JPS4_PUCGT or new || || 39 | species_i = prot_i.id.split("||")[1][:-1] # prot_i.id.split("|")[-1].split("_")[-1] 40 | if species_i in species_prot_dic: 41 | species_prot_dic[species_i].append(prot_i_name) 42 | else: 43 | species_prot_dic[species_i] = [prot_i_name] 44 | # all_prot_temp_list.append(prot_i.id) 45 | 46 | print("there are species ", len(species_prot_dic)) 47 | orthoxml_file = ET.Element("orthoXML", 48 | attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021", 49 | "version": "0.3"}) # 50 | 51 | gene_counter = 100000 52 | gene_id_name = {} 53 | query_species_names_rHOGs = list(species_prot_dic.keys()) 54 | for species_name in query_species_names_rHOGs: 55 | no_gene_species = True # for code development 56 | species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": species_name, "NCBITaxId": "1"}) 57 | database_xml = ET.SubElement(species_xml, "database", attrib={"name": "QFO database ", "version": "2020"}) 58 | genes_xml = ET.SubElement(database_xml, "genes") 59 | 60 | prot_list = species_prot_dic[species_name] 61 | for prot_itr in range(len(prot_list)): # [12:15] 62 | prot_i_name = prot_list[prot_itr] 63 | gene_id_name[prot_i_name] = gene_counter 64 | prot_i_name_short = prot_i_name.split("||")[0].split("|")[1].strip() # tr|E3JPS4|E3JPS4_PUCGT 65 | gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_counter), "protId": prot_i_name_short}) 66 | gene_counter += 1 67 | 68 | groups_xml = ET.SubElement(orthoxml_file, "groups") 69 | 70 | 71 | 72 | with open(address_group_xml_ortho, 'wb') as handle: 73 | # dill_pickle.dump(gene_id_name, handle, protocol=dill_pickle.HIGHEST_PROTOCOL) 74 | pickle.dump((groups_xml, gene_id_name, orthoxml_file), handle, protocol=pickle.HIGHEST_PROTOCOL) 75 | 76 | print("saved as ", address_group_xml_ortho) 77 | 78
| 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /archive/analysis/xml_.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import xml.etree.ElementTree as ET 4 | import dill as dill_pickle 5 | from os import listdir 6 | from xml.dom import minidom 7 | 8 | working_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastget/qfo2/" 9 | # gene_trees_folder = "" # in_folder + "/gene_trees_/" 10 | # check gene_trees_folder exist otherwise mkdir this 11 | 12 | #address_rhogs_folder = in_folder + "/rhog_g501_done/" # old3/rhog_all/ /rhog_size_g2_s500/" sample_rootHOG 13 | #species_tree_address = in_folder + "/archive/lineage_tree_qfo.phyloxml" 14 | pickle_folder = working_folder + "/pickle_folder_all_collect/" 15 | # add warning when pickle folder is not empty 16 | output_xml_name = "out_27aug_6pm.xml" 17 | 18 | 19 | orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", 20 | "originVersion": "Nov 2021", "version": "0.3"}) # 21 | 22 | with open(working_folder + '/file_gene_id_name.pickle', 'rb') as handle: 23 | gene_id_name = dill_pickle.load(handle) 24 | # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name) 25 | 26 | for query_species_name, list_prots in gene_id_name.items(): 27 | 28 | species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "NCBITaxId": "1"}) 29 | database_xml = ET.SubElement(species_xml, "database", attrib={"name": "QFO database ", "version": "2020"}) 30 | genes_xml = ET.SubElement(database_xml, "genes") 31 | 32 | for (gene_idx_integer, query_prot_name) in list_prots: 33 | query_prot_name_pure = query_prot_name.split("||")[0].strip().split("|")[1] 34 | gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": query_prot_name_pure}) 35 | 36 | pickle_files_adress = listdir(pickle_folder) 37 | 38 | hogs_a_rhog_xml_all = [] 39 
| for pickle_file_adress in pickle_files_adress: 40 | with open(pickle_folder + pickle_file_adress, 'rb') as handle: 41 | hogs_a_rhog_xml_batch = dill_pickle.load(handle) # hogs_a_rhog_xml_batch is a list of hog objects. 42 | hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch) 43 | # hogs_a_rhog_xml_all is a list of hog objects. 44 | 45 | print("number of hogs in all batches is ", len(hogs_a_rhog_xml_all)) 46 | 47 | groups_xml = ET.SubElement(orthoxml_file, "groups") 48 | 49 | for hogs_a_rhog_xml in hogs_a_rhog_xml_all: 50 | groups_xml.append(hogs_a_rhog_xml) 51 | 52 | xml_str = minidom.parseString(ET.tostring(orthoxml_file)).toprettyxml(indent=" ") 53 | # print(xml_str[:-1000]) 54 | 55 | with open(working_folder +output_xml_name, "w") as file_xml: 56 | file_xml.write(xml_str) 57 | file_xml.close() 58 | 59 | print("orthoxml is written in "+ working_folder +output_xml_name) 60 | 61 | -------------------------------------------------------------------------------- /archive/fastOMA_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/archive/fastOMA_logo.png -------------------------------------------------------------------------------- /archive/test_curn.py: -------------------------------------------------------------------------------- 1 | 2 | from FastOMA.infer_roothogs import fastoma_infer_roothogs 3 | from FastOMA._wrappers import logger 4 | from FastOMA.infer_subhogs import fastoma_infer_subhogs 5 | 6 | 7 | # --low-so-detection --fragment-detection 8 | 9 | # --input-rhog-folder ./bb/ --parrallel True --species-tree species_tree.nwk 10 | 11 | #a=2 12 | #fastoma_infer_subhogs() 13 | # proteome --hogmap hogmaps --splice splice --out-rhog-folder out 14 | import sys 15 | 16 | folder="pycharm_projects/fastoma_test/" 17 | sys.argv.extend(['--proteomes', folder+"proteome"]) 18 |
sys.argv.extend(['--hogmap', folder+"hogmaps"]) 19 | sys.argv.extend(['--splice', folder+"splice"]) 20 | sys.argv.extend(['--out-rhog-folder', folder+"out"]) 21 | sys.argv.extend(['-vv']) 22 | fastoma_infer_roothogs() 23 | 24 | a=2 # a 25 | # 26 | # from FastOMA.zoo.hog import transform 27 | # 28 | # #from zoo.tree_utils import collapse, gene_species, transform, HOG_coverages 29 | # 30 | # import io 31 | # import lxml.etree 32 | # orthoxml_file = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/benchmark-webservice3/orthoxml/euk_omamer200.dev8_13oct.orthoxml" 33 | # 34 | # 35 | # orthxml_str = [] 36 | # with open(orthoxml_file, "r") as f: 37 | # for i in f: 38 | # orthxml_str.append(i) 39 | # print(len(orthxml_str)) 40 | # dic_gene_integer={} 41 | # for line in orthxml_str: 42 | # if "gene id" in line: 43 | # found=False 44 | # gene_int= line.split("\"")[1] 45 | # gene_name = line.split("\"")[3] 46 | # dic_gene_integer[gene_int] = gene_name 47 | # 48 | # 49 | # 50 | # orthoxml_etree=lxml.etree.parse(orthoxml_file) 51 | # 52 | # pw_orthologs_integer = sorted(list(transform.iter_pairwise_relations(orthoxml_etree))) 53 | # # iter_pairwise_relations(obj, rel_type=None (def:'ortholog' , but possible to use 'paralog') 54 | # print(len(pw_orthologs_integer)) 55 | # print(pw_orthologs_integer[:2]) 56 | # pw_orthologs_gene =[] 57 | # for pair in pw_orthologs_integer: 58 | # pw_orthologs_gene.append((dic_gene_integer[pair[0]],dic_gene_integer[pair[1]])) 59 | # 60 | # 61 | # 62 | # print(len(pw_orthologs_gene)) 63 | # 64 | # output_file = open(orthoxml_file+"_pairs.tsv","w") 65 | # for pair in pw_orthologs_gene: 66 | # output_file.write(pair[0]+"\t"+pair[1]+"\n") 67 | # 68 | # output_file.close() 69 | 70 | 71 | # 72 | # 73 | # # orthoxml_handle= open(orthoxml_file,"r") 74 | # # orthoxml ="" 75 | # # for line in orthoxml_handle: 76 | # # orthoxml+=line 77 | # 78 | # 79 | # from xml.etree.ElementTree import XMLParser 80 | # 81 | # parser = XMLParser() 82 | # with 
open(orthoxml_file, 'rb') as xml: 83 | # for chunk in xml: 84 | # parser.feed(chunk) 85 | # parser.close() 86 | # 87 | # 88 | # lxml.etree.parse(oxml) 89 | # 90 | # orthoxm= lxml.etree.parse(orthoxml) 91 | # 92 | # # expected = [("1", "2"), ("1", "3"), ("1", "4"), ("1", "5"), ("1", "6"), 93 | # # ("2", "5"), ("2", "6"), ("3", "4"), ("3", "5"), ("3", "6"), 94 | # # ("4", "5"), ("4", "6"), ("5", "6")] 95 | # # self.assertEqual(expected, pw_orthologs) 96 | # 97 | # from xml.etree import ElementTree 98 | # tree = ElementTree.parse(orthoxml_file) 99 | # root = tree.getroot() 100 | -------------------------------------------------------------------------------- /conf/base.config: -------------------------------------------------------------------------------- 1 | /* 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | dessimozlab/FastOMA Nextflow base config file 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | A 'blank slate' config file, appropriate for general use on most high performance 6 | compute environments. Assumes that all software is installed and available on 7 | the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. 8 | ---------------------------------------------------------------------------------------- 9 | */ 10 | 11 | process { 12 | 13 | cpus = { check_max( 1 * task.attempt, 'cpus' ) } 14 | memory = { check_max( 6.GB * task.attempt, 'memory' ) } 15 | time = { check_max( 4.h * task.attempt, 'time' ) } 16 | shell = ['/bin/bash', '-euo', 'pipefail'] 17 | 18 | //errorStrategy = { task.exitStatus in (130..145) ? 
'retry' : 'finish' } 19 | errorStrategy = 'retry' 20 | maxRetries = 3 21 | 22 | withLabel:process_single { 23 | cpus = { check_max( 1 , 'cpus' ) } 24 | memory = { check_max( 12.GB * task.attempt, 'memory' ) } 25 | time = { check_max( 4.h * task.attempt, 'time' ) } 26 | } 27 | withLabel:process_low { 28 | cpus = { check_max( 2 * task.attempt, 'cpus' ) } 29 | memory = { check_max( 12.GB * task.attempt, 'memory' ) } 30 | time = { check_max( 4.h * task.attempt, 'time' ) } 31 | } 32 | withLabel:process_medium { 33 | cpus = { check_max( 6 * task.attempt, 'cpus' ) } 34 | memory = { check_max( 36.GB * task.attempt, 'memory' ) } 35 | time = { check_max( 8.h * task.attempt, 'time' ) } 36 | } 37 | withLabel:process_high { 38 | cpus = { check_max( 12 * task.attempt, 'cpus' ) } 39 | memory = { check_max( 72.GB * task.attempt, 'memory' ) } 40 | time = { check_max( 16.h * task.attempt, 'time' ) } 41 | } 42 | withLabel:process_long { 43 | time = { check_max( 20.h * task.attempt, 'time' ) } 44 | } 45 | withLabel:process_high_memory { 46 | memory = { check_max( 200.GB * task.attempt, 'memory' ) } 47 | } 48 | withLabel:error_ignore { 49 | errorStrategy = 'ignore' 50 | } 51 | withLabel:error_retry { 52 | errorStrategy = 'retry' 53 | maxRetries = 2 54 | } 55 | } -------------------------------------------------------------------------------- /environment-conda.yml: -------------------------------------------------------------------------------- 1 | name: fastoma-env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - omamer 8 | - mafft 9 | - fasttree 10 | - nextflow 11 | - papermill 12 | - seaborn 13 | - matplotlib 14 | - pyparsing 15 | - networkx 16 | - jupyter 17 | - mmseqs2 18 | - pip 19 | - pip: 20 | - .[report] 21 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // General configuration used in all profiles 2 | 
manifest { 3 | name = "dessimozlab/FastOMA" 4 | description = """FastOMA computes Hierarchical Orthologous Groups from proteomes.""" 5 | author = "Sina Majidian, Adrian Altenhoff" 6 | homePage = "https://omabrowser.org" 7 | mainScript = "FastOMA.nf" 8 | nextflowVersion = ">=22.10.4" 9 | defaultBranch = "main" 10 | doi = "10.1101/2024.01.29.577392" 11 | version = "0.3.5" 12 | } 13 | 14 | params { 15 | container_name = "dessimozlab/fastoma" 16 | container_version = "0.3.5" 17 | omamer_db = "https://omabrowser.org/All/LUCA.h5" 18 | debug_enabled = false 19 | help = false 20 | report = false 21 | write_msas = false 22 | write_genetrees = false 23 | filter_method = "col-row-threshold" 24 | filter_gap_ratio_row = 0.3 25 | filter_gap_ratio_col = 0.5 26 | nr_repr_per_hog = 5 27 | min_sequence_length = 40 28 | force_pairwise_ortholog_generation = false 29 | 30 | output_folder = "Output" 31 | statsdir = "${params.output_folder}/stats" 32 | 33 | // Max resource options 34 | // Defaults only, expecting to be overwritten 35 | max_memory = '128.GB' 36 | max_cpus = 24 37 | max_time = '120.h' 38 | } 39 | 40 | // Profiles configure nextflow depending on the environment (local, docker, singularity) 41 | profiles { 42 | 43 | docker { 44 | process { 45 | container = "$params.container_name:$params.container_version" 46 | } 47 | docker.enabled = true 48 | } 49 | 50 | singularity { 51 | process { 52 | container = "$params.container_name:$params.container_version" 53 | } 54 | singularity.enabled = true 55 | singularity.autoMounts = true 56 | } 57 | 58 | standard { 59 | process.executor = 'local' 60 | } 61 | 62 | slurm { 63 | process.executor = "slurm" 64 | time = 4.h 65 | } 66 | 67 | conda { 68 | process.conda = "${projectDir}/environment-conda.yml" 69 | conda.enabled = true 70 | conda.createTimeout = '3 h' 71 | } 72 | 73 | slurm_singularity { 74 | process { 75 | container = "$params.container_name:$params.container_version" 76 | executor = "slurm" 77 | time = 4.h 78 | memory = 20.GB 79 
| } 80 | singularity.enabled = true 81 | singularity.autoMounts = true 82 | } 83 | 84 | slurm_conda { 85 | process { 86 | conda = "${projectDir}/environment-conda.yml" 87 | executor = "slurm" 88 | time = 4.h 89 | memory = 20.GB 90 | } 91 | conda.enabled = true 92 | conda.createTimeout = '3 h' 93 | } 94 | } 95 | 96 | def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') 97 | timeline { 98 | enabled = params.report 99 | file = "${params.statsdir}/timeline_${trace_timestamp}.html" 100 | } 101 | report { 102 | enabled = params.report 103 | file = "${params.statsdir}/report_${trace_timestamp}.html" 104 | } 105 | trace { 106 | enabled = params.report 107 | file = "${params.statsdir}/trace_${trace_timestamp}.txt" 108 | } 109 | dag { 110 | enabled = params.report 111 | file = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html" 112 | } 113 | 114 | includeConfig "conf/base.config" 115 | 116 | // function to check maximum resources 117 | def check_max(obj, type) { 118 | if (type == 'memory') { 119 | try { 120 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 121 | return params.max_memory as nextflow.util.MemoryUnit 122 | else 123 | return obj 124 | } catch (all) { 125 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" 126 | return obj 127 | } 128 | } else if (type == 'time') { 129 | try { 130 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 131 | return params.max_time as nextflow.util.Duration 132 | else 133 | return obj 134 | } catch (all) { 135 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 136 | return obj 137 | } 138 | } else if (type == 'cpus') { 139 | try { 140 | return Math.min( obj, params.max_cpus as int ) 141 | } catch (all) { 142 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" 143 | return obj 144 | } 145 | } 146 | } 147 | 148 | -------------------------------------------------------------------------------- /nextflow_slurm.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | process.executor = "slurm" 6 | process.queue = "cpu" 7 | process.time = 10.h 8 | process.memory = 95.GB 9 | executor { 10 | name = 'slurm' 11 | queueSize = 550 12 | } 13 | 14 | errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'terminate' } 15 | 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "FastOMA" 7 | dynamic = ["version"] 8 | description = "FastOMA - a package to infer orthology information among proteomes" 9 | readme = "README.md" 10 | license = "MIT" 11 | requires-python = ">=3.8" 12 | authors = [ 13 | { name = "Sina Majidian", email = "sina.majidian@gmail.com" }, 14 | { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" } 15 | ] 16 | dependencies = [ 17 | "biopython ~=1.81", 18 | "DendroPy >=4.5,<=4.6.1", 19 | "ete3 ~=3.1", 20 | "lxml >=4.6,<6", 21 | "omamer ~=2.0", 22 | "pyham ~=1.1", 23 | "numpy <2", # temporary fix as pytables does not yet work with numpy 2.0 24 | "pyparsing", 25 | "networkx", 26 | ] 27 | 28 | [project.optional-dependencies] 29 | nextflow = [ 30 | "nextflow" 31 | ] 32 | report = [ 33 | "papermill", 34 | "jupyter", 35 | "matplotlib", 36 | "seaborn", 37 | ] 38 | 39 | 40 | [project.scripts] 41 | fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" 42 | fastoma-check-input = "FastOMA.check_input:fastoma_check_input" 43 | fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" 44 | fastoma-infer-roothogs =
"FastOMA.infer_roothogs:fastoma_infer_roothogs" 45 | fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" 46 | fastoma-helper = "FastOMA.helper_scripts:main" 47 | 48 | [project.urls] 49 | Homepage = "https://github.com/DessimozLab/FastOMA" 50 | 51 | [tool.hatch.version] 52 | path = "FastOMA/__init__.py" 53 | 54 | [tool.hatch.build.targets.sdist] 55 | include = [ 56 | "/FastOMA", 57 | ] 58 | 59 | [tool.hatch.envs.default] 60 | features = [ 61 | "report", 62 | ] 63 | -------------------------------------------------------------------------------- /testdata/README.md: -------------------------------------------------------------------------------- 1 | FastOMA test data 2 | ====== 3 | 4 | 5 | This repo contains a small dataset as the test example. 6 | 7 | 1- The proteome folder including three fasta files `AQUAE.fa`, `CHLTR.fa` and `MYCGE.fa` corresponding to three species. 8 | 9 | 2- A dummy species tree in Newick format. 10 | 11 | 3- You can download the omamer database as follows 12 | ``` 13 | cd gethog3/testdata 14 | wget https://omabrowser.org/All/Primates-v2.0.0.h5 # 105MB 15 | mv Primates-v2.0.0.h5 in_folder/omamerdb.h5 16 | ``` 17 | -------------------------------------------------------------------------------- /testdata/expected_output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/.DS_Store -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroups.tsv: -------------------------------------------------------------------------------- 1 | Group Protein 2 | OG_0000001 sp|P0CE13|G3P_CHLTR 3 | OG_0000001 sp|O67161|G3P_AQUAE 4 | OG_0000001 sp|P47543|G3P_MYCGE 5 | OG_0000002 sp|O67118|DNAK_AQUAE 6 | OG_0000002 sp|P47547|DNAK_MYCGE 7 | OG_0000002 sp|P17821|DNAK_CHLTR 8 | OG_0000003 sp|O67618|LEPA_AQUAE 9 | OG_0000003
sp|O84067|LEPA_CHLTR 10 | OG_0000004 sp|P0CD71|EFTU_CHLTR 11 | OG_0000004 sp|P13927|EFTU_MYCGE 12 | OG_0000004 sp|O66429|EFTU_AQUAE 13 | OG_0000005 sp|O84081|FOLD_CHLTR 14 | OG_0000005 sp|O67736|FOLD_AQUAE 15 | OG_0000006 sp|O84332|TPIS_CHLTR 16 | OG_0000006 sp|O66686|TPIS_AQUAE 17 | OG_0000007 sp|P0C0Z7|CH60_CHLTR 18 | OG_0000007 sp|O67943|CH60_AQUAE 19 | OG_0000008 sp|P47639|ATPB_MYCGE 20 | OG_0000008 sp|O67828|ATPB_AQUAE 21 | OG_0000009 sp|P47641|ATPA_MYCGE 22 | OG_0000009 sp|O66907|ATPA_AQUAE 23 | OG_0000010 sp|O66778|ENO_AQUAE 24 | OG_0000010 sp|O84591|ENO_CHLTR 25 | OG_0000011 sp|O84026|RF1_CHLTR 26 | OG_0000011 sp|O67032|RF1_AQUAE 27 | OG_0000011 sp|P47500|RF1_MYCGE 28 | OG_0000012 tr|O84829|O84829_CHLTR 29 | OG_0000012 sp|O67547|SUCD_AQUAE 30 | -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa: -------------------------------------------------------------------------------- 1 | >sp|P47543|G3P_MYCGE sp|P47543|G3P_MYCGE||MYCGE||1000000005 sp|P47543|G3P_MYCGE [MYCGE] 2 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK 3 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK 4 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM 5 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP 6 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT 7 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL 8 | >sp|P0CE13|G3P_CHLTR sp|P0CE13|G3P_CHLTR||CHLTR||1001000009 sp|P0CE13|G3P_CHLTR [CHLTR] 9 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS 10 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA 11 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT 12 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV 13 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA 14 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK 15 | >sp|O67161|G3P_AQUAE sp|O67161|G3P_AQUAE||AQUAE||1002000010 sp|O67161|G3P_AQUAE 
[AQUAE] 16 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA 17 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT 18 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA 19 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS 20 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA 21 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL 22 | -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000001.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000002.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000002.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000003.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000003.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000004.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000004.fa.gz 
-------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000005.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000005.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000006.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000006.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000007.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000007.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000008.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000008.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000009.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000009.fa.gz 
-------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000010.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000010.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000011.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000011.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/OrthologousGroupsFasta/OG_0000012.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/OrthologousGroupsFasta/OG_0000012.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGs.tsv: -------------------------------------------------------------------------------- 1 | RootHOG Protein OMAmerRootHOG 2 | HOG:0000001 sp|P0CE13|G3P_CHLTR HOG:E1027400 3 | HOG:0000001 sp|O67161|G3P_AQUAE HOG:E1027400 4 | HOG:0000001 sp|P47543|G3P_MYCGE HOG:E1027400 5 | HOG:0000002 sp|O67118|DNAK_AQUAE HOG:E0990770 6 | HOG:0000002 sp|P47547|DNAK_MYCGE HOG:E0990770 7 | HOG:0000002 sp|P17821|DNAK_CHLTR HOG:E0990770 8 | HOG:0000003 sp|O67618|LEPA_AQUAE HOG:E0990677 9 | HOG:0000003 sp|O84067|LEPA_CHLTR HOG:E0990677 10 | HOG:0000004 sp|P0CD71|EFTU_CHLTR HOG:E0990677 11 | HOG:0000004 sp|P13927|EFTU_MYCGE HOG:E0990677 12 | HOG:0000004 sp|O66429|EFTU_AQUAE HOG:E0990677 13 | HOG:0000005 sp|O84081|FOLD_CHLTR 
HOG:E1027325 14 | HOG:0000005 sp|O67736|FOLD_AQUAE HOG:E1027325 15 | HOG:0000006 sp|O84332|TPIS_CHLTR HOG:E1027829 16 | HOG:0000006 sp|O66686|TPIS_AQUAE HOG:E1027829 17 | HOG:0000007 sp|P0C0Z7|CH60_CHLTR HOG:E1027301 18 | HOG:0000007 sp|O67943|CH60_AQUAE HOG:E1027301 19 | HOG:0000008 sp|P47639|ATPB_MYCGE HOG:E0990823 20 | HOG:0000008 sp|O67828|ATPB_AQUAE HOG:E0990823 21 | HOG:0000009 sp|P47641|ATPA_MYCGE HOG:E0990823 22 | HOG:0000009 sp|O66907|ATPA_AQUAE HOG:E0990823 23 | HOG:0000010 sp|O66778|ENO_AQUAE HOG:E1027309 24 | HOG:0000010 sp|O84591|ENO_CHLTR HOG:E1027309 25 | HOG:0000011 sp|O84026|RF1_CHLTR HOG:E0990790 26 | HOG:0000011 sp|O67032|RF1_AQUAE HOG:E0990790 27 | HOG:0000011 sp|P47500|RF1_MYCGE HOG:E0990790 28 | HOG:0000012 tr|O84829|O84829_CHLTR HOG:E1027626 29 | HOG:0000012 sp|O67547|SUCD_AQUAE HOG:E1027626 30 | -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000001.fa: -------------------------------------------------------------------------------- 1 | >sp|P47543|G3P_MYCGE sp|P47543|G3P_MYCGE||MYCGE||1000000005 sp|P47543|G3P_MYCGE [MYCGE] 2 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK 3 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK 4 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM 5 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP 6 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT 7 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL 8 | >sp|P0CE13|G3P_CHLTR sp|P0CE13|G3P_CHLTR||CHLTR||1001000009 sp|P0CE13|G3P_CHLTR [CHLTR] 9 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS 10 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA 11 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT 12 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV 13 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA 14 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK 15 | >sp|O67161|G3P_AQUAE 
sp|O67161|G3P_AQUAE||AQUAE||1002000010 sp|O67161|G3P_AQUAE [AQUAE] 16 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA 17 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT 18 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA 19 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS 20 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA 21 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL 22 | -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000001.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000001.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000002.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000002.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000003.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000003.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000004.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000004.fa.gz 
-------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000005.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000005.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000006.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000006.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000007.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000007.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000008.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000008.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000009.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000009.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000010.fa.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000010.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000011.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000011.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/RootHOGsFasta/HOG0000012.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/FastOMA/bf6dcbaa8cf516ab6f6e074dba37eceb59a9b80e/testdata/expected_output/RootHOGsFasta/HOG0000012.fa.gz -------------------------------------------------------------------------------- /testdata/expected_output/orthologs.tsv: -------------------------------------------------------------------------------- 1 | sp|O67161|G3P_AQUAE sp|P0CE13|G3P_CHLTR 2 | sp|P47543|G3P_MYCGE sp|O67161|G3P_AQUAE 3 | sp|P47543|G3P_MYCGE sp|P0CE13|G3P_CHLTR 4 | sp|O67118|DNAK_AQUAE sp|P17821|DNAK_CHLTR 5 | sp|P47547|DNAK_MYCGE sp|P17821|DNAK_CHLTR 6 | sp|P47547|DNAK_MYCGE sp|O67118|DNAK_AQUAE 7 | sp|O67618|LEPA_AQUAE sp|O84067|LEPA_CHLTR 8 | sp|O66429|EFTU_AQUAE sp|P0CD71|EFTU_CHLTR 9 | sp|P13927|EFTU_MYCGE sp|P0CD71|EFTU_CHLTR 10 | sp|P13927|EFTU_MYCGE sp|O66429|EFTU_AQUAE 11 | sp|O67736|FOLD_AQUAE sp|O84081|FOLD_CHLTR 12 | sp|O66686|TPIS_AQUAE sp|O84332|TPIS_CHLTR 13 | sp|O67943|CH60_AQUAE sp|P0C0Z7|CH60_CHLTR 14 | sp|O67828|ATPB_AQUAE sp|P47639|ATPB_MYCGE 15 | sp|O66907|ATPA_AQUAE sp|P47641|ATPA_MYCGE 16 | sp|O66778|ENO_AQUAE sp|O84591|ENO_CHLTR 17 | sp|O67032|RF1_AQUAE sp|O84026|RF1_CHLTR 18 | sp|P47500|RF1_MYCGE 
sp|O84026|RF1_CHLTR 19 | sp|P47500|RF1_MYCGE sp|O67032|RF1_AQUAE 20 | sp|O67547|SUCD_AQUAE tr|O84829|O84829_CHLTR 21 | -------------------------------------------------------------------------------- /testdata/expected_output/phylostratigraphy.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Phylo.io 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 32 | 33 | 34 | 35 | 36 | 53 | 54 | 55 | 56 | 59 | 60 | 61 | 62 |
63 |
64 |
65 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /testdata/expected_output/species_tree_checked.nwk: -------------------------------------------------------------------------------- 1 | ((AQUAE:1,CHLTR:1)inter1:1,MYCGE:1)inter2:0; -------------------------------------------------------------------------------- /testdata/in_folder/proteome/AQUAE.fa: -------------------------------------------------------------------------------- 1 | >sp|O67618|LEPA_AQUAE 2 | MEQKNVRNFCIIAHVDHGKSTLADRLLEYTGAISEREKREQLLDTLDVERERGITVKMQA 3 | VRMFYKAKDGNTYKLHLIDTPGHVDFSYEVSRALAACEGALLLIDASQGIEAQTVANFWK 4 | AVEQDLVIIPVINKIDLPSADVDRVKKQIEEVLGLDPEEAILASAKEGIGIEEILEAIVN 5 | RIPPPKGDPQKPLKALIFDSYYDPYRGAVAFVRIFDGEVKPGDKIMLMSTGKEYEVTEVG 6 | AQTPKMTKFDKLSAGDVGYIAASIKDVRDIRIGDTITHAKNPTKEPVPGFQPAKPMVYAG 7 | IYPAEDTTYEELRDALEKYAINDAAIVYEPESSPALGMGFRVGFLGLLHMEIVQERLERE 8 | YGVKIITTAPNVIYRVKKKFTDEVIEVRNPMDFPDNAGLIEYVEEPFVLVTIITPKEYVG 9 | PIIQLCQEKRGIQKNMTYLDPNTVYLEYEMPLSEIIVDFHDKIKSISRGFASYDYEFIGY 10 | RPSDLIKLTVLINKKPVDALSFIVHADRAQKFARRVAEKLRETIPRQLFEVHIQVAKGGK 11 | VIASERIKPLRANVTAKCYGGDVTRKKKLLENQKEGKKRMKQFGKVQLPQEAFLSVLKVE 12 | >sp|O67032|RF1_AQUAE 13 | MLKEAYISRLDKLQEKYRKLQEELSKPEVIQDVEKYKKLSKELKELQEINELYERYKKAQ 14 | KELKEAKELLKSSDKDLRELAEEEVNRLTEEMKKLEEELKVHLVPKDPNDTKNVILEIRA 15 | GAGGEEAALFAADLFRMYQKYAEEKGWKVSILSSNKTGLGGYKEVIALIEGEGAYSRLKY 16 | ESGVHRVQRVPVTESSGRIHTSTATVAVLPEVDETDIKIKPEELKIETFRASGAGGQYVN 17 | TTETAVRITHIPTGIVVQCQDERSQFQNKQKALKILYAKLKDYYERKKQEEIAKERKEQV 18 | GTGERSEKIRTYNFPQNRVTDHRINLTLYKLQDVLEGKLDEIIDALRAKEIEKKLELVEK 19 | EG 20 | >sp|O66778|ENO_AQUAE 21 | MSRIKRVHGREVLDSRGNPTVEVEVELESGALGRAIVPSGASTGEREALELRDGDPKRYL 22 | GKGVLKAVDNVNGVIAKALVGLEPYNQREIDQILIELDGTENKSKLGANAILGTSMAVAR 23 | AAANELGIPLYEYLGGKFGYRLPVPLMNVINGGAHADNNLDIQEFMIVPVCGGAFREALR 24 | AGVETFHHLKKILKEKGYSTNVGDEGGFAPNLNSSEEALDILMQAIEKAGYKPGEDILLA 25 | LDVASSEFYENGVYKFEGKERSAEEMIEFYEKLIQKYPIISIEDPMSENDWEGWKEITKR 26 | 
LGDKVQLVGDDLFTTNPKILRKGIEEGVANAILVKLNQIGTVSETLDTVMLAKERNYSAI 27 | ISHRSGETEDTFISHLAVATNAGQIKTGSASRTDRIAKYNELLRIEERLGNGAVFWGREE 28 | FYRFTS 29 | >sp|O66429|EFTU_AQUAE 30 | MAKEKFERTKEHVNVGTIGHVDHGKSTLTSAITCVLAAGLVEGGKAKCFKYEEIDKAPEE 31 | KERGITINITHVEYETAKRHYAHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQT 32 | REHVLLARQVNVPYIVVFMNKCDMVDDEELLELVELEVRELLSKYEYPGDEVPVIRGSAL 33 | GALQELEQNSPGKWVESIKELLNAMDEYIPTPQREVDKPFLMPIEDVFSISGRGTVVTGR 34 | VERGVLRPGDEVEIVGLREEPLKTVATSIEMFRKVLDEALPGDNIGVLLRGVGKDDVERG 35 | QVLAQPGSVKAHKRFRAQVYVLSKEEGGRHTPFFVNYRPQFYFRTADVTGTVVKLPEGVE 36 | MVMPGDNVELEVELIAPVALEEGLRFAIREGGRTVGAGVVTKILD 37 | >sp|O67547|SUCD_AQUAE 38 | MAILVNKDTKVVVQGITGKEGSFHAKQCKEYGTQVVAGVTPGKGGMEVEGIPVFNTVKEA 39 | VKETGANCSLIFVPAPFAADAIVEALDAGIELVVCITEGIPVKDMMMVKDYMLKNYPNAK 40 | LVGPNCPGVITPGEAKVGIMPGHIFKRGKIGIVSRSGTLTYEAAYQLTKYGLGQSTAVGI 41 | GGDPVHGLTHRDVIEMFNKDPETEAILMIGEIGGTEEEEAAEYIEKEVDKPVFAYIAGIT 42 | APPGKRMGHAGAIIMGGKGTAKAKMEALEKAGAYVIENPAKIGETVAKILKVIELEEEER 43 | TSDAE 44 | >sp|O66686|TPIS_AQUAE 45 | MRRLIAANWKMNKTVKETEEYINTFLKFVEHPESREILICPPFTSLYVAGKMLQGTGVKL 46 | GAQNCHYEKRGAFTGEISIPMLQEVGCEYVIVGHSERRHIFGESDELIHKKIVACLEMGI 47 | RPILCVGEKKEEREAGMTFKVIETQIKLALTGVEEHTDKIDIAYEPVWAIGTGTPATPED 48 | AVEVHTFIRNLINQLNPKNEGKTRILYGGSVNPQNAKEFMKHEEINGLLVGTASLDPESF 49 | AKIVYSF 50 | >sp|O67828|ATPB_AQUAE 51 | MAEVIKGKVVQVIGPVVDVEFEGVKELPKIKDGLKTIRRAIDDRGNWFEEVLFMEVAQHI 52 | GEHRVRAIAMGPTDGLVRGQEVEYLGGPIKIPVGKEVLGRIFNVAGQPIDEQGPVEAKEY 53 | WPMFRNPPELVEQSTKVEILETGIKVIDLLQPIIKGGKVGLFGGAGVGKTVLMQELIHNI 54 | ARFHEGYSVVVGVGERTREGNDLWLEMKESGVLPYTVMVYGQMNEPPGVRFRVAHTGLTM 55 | AEYFRDVEGQDVLIFIDNIFRFVQAGAEVSTLLGRLPSAVGYQPTLNTDVGEVQERITST 56 | KKGSITAIQAVYVPADDITDPAPWSIFAHLDATTVLTRRLAELGIYPAIDPLESTSKYLA 57 | PEYVGEEHYEVAMEVKRILQRYKELQEIIAILGMEELSDEDKAIVNRARRIQKFLSQPFH 58 | VAEQFTGMPGKYVKLEDTIRSFKEVLTGKYDHLPENAFYMVGTIEDVIEKAKQMGAKV 59 | >sp|O67118|DNAK_AQUAE 60 | MAEKKEKIIGIDLGTTNSVVSVMMGDEAVVIQNQEGSRLTPSVVSWTKEKEILVGEPAKR 61 | RAILDPENTVYESKRFIGRKFEEVKEEAKRVSYKVVPDEKGDAAFDIPNAGKLVRPEEVG 62 
| AHVLRKLKEAAEAFLGEPVKKAVITVPAYFNERQRQATKDAGKIAGLEVVRILNEPTAAA 63 | MAYGLHKKDNVRILVYDFGGGTFDVSILEGGEGVIEVKVTAGDTHLGGANIDERIMDWLI 64 | EEFKKETGIDLRKDRTALQRLKEASEQAKKELSFKMETEINLPFITIDPNTNQPLHLQKK 65 | LTRARLEEMIKDIVDRTIDIVKQALEDAKLKPSDIDEVVLVGGSTRIPLVQQRIKEFFGK 66 | EPHKGLNPDEVVAMGAAIQAGVLAGEVKEIVLVDVTPLSLGVETYGGVMTVLIPRNTPIP 67 | VRKCEIFTTAHDYQTEVEIHVLQGERPLAKDNKSLAKFYLTGIPPAPRGVPKIEVCFDID 68 | ADGILHVTAKDLGTGKEQSVRVEISSGLTPEEIERIIKEAEEHAEEDRKKKELIEAKNQL 69 | DHLVYQLEKALKEAGDKVPADVKSEAEKVIEEAKKTIETATEIEQVKQVTEKVLQVSSKM 70 | GTTLYGEAGKQAGGGEKKDEGGEGEVEAKPVD 71 | >sp|O67736|FOLD_AQUAE 72 | MALILDGKSLSKKIREEIKKEVENFTSKGFRPPALAVILVGNDPASEIYVNNKRKACEKV 73 | GIKSLFYHLPQDVSEEKLLGLIYELNMNEEVDGILVQLPLPKHIDQTRVILSISPEKDVD 74 | GFHPENMGKLVAQIEDGFIPCTPLGIDILLKHYGIDVKGKDVTIVGAGFIVGRPLSLLML 75 | WRNATVSVCHIHTKDVKKFTKEADILISATGVPHLIKEDMIKEGAVVVDVGISRLNGKIV 76 | GDVDFERVKEKASAITPVPGGVGPMTVTALLLNTLKSYKRKFAHLISTTNP 77 | >sp|O67161|G3P_AQUAE 78 | MAIKVGINGFGRIGRSFFRASWGREEIEIVAINDLTDAKHLAHLLKYDSVHGIFKGSVEA 79 | KDDSIVVDGKEIKVFAQKDPSQIPWGDLGVDVVIEATGVFRDRENASKHLQGGAKKVIIT 80 | APAKNPDITVVLGVNEEKYNPKEHNIISNASCTTNCLAPCVKVLNEAFGVEKGYMVTVHA 81 | YTNDQRLLDLPHKDFRRARAAAINIVPTTTGAAKAIGEVIPELKGKLDGTARRVPVPDGS 82 | LIDLTVVVNKAPSSVEEVNEKFREAAQKYRESGKVYLKEILQYCEDPIVSTDIVGNPHSA 83 | IFDAPLTQVIDNLVHIAAWYDNEWGYSCRLRDLVIYLAERGL 84 | >sp|O67943|CH60_AQUAE 85 | MAAKAIIYNEEARAKLKAGVDKLANAVKVTLGPKGREVILGKNWGTPVVTKDGVTVAKEI 86 | ELKDKFENIGAQLVKEVASKTADVAGDGTTTATVLAQAIFHEGLRVAASGANVMEVKRGI 87 | DKAVKKIVEELKKLSKDVKERKEIEQVATISANNDPEIGKIIADAMEEVGKDGVITVEES 88 | KSAETTLEVVKGMQFDRGYLSPYFVTDPEKMECVLENPYILIYEKKITNVKELLPILEQV 89 | VRSGRPLLVIAEDVEGEALATLVVNHIKGVLKACAVKAPGFGQRRKDYLGDIAVLTGGQA 90 | ITEDLGIKLESVTLDMLGQAEKVVVDKEHTTIIGGKGDPEQIKARIEQIKRQIQETTSDY 91 | DREKLQERLAKLSGGVAIIRVGAATEAELKEKKYRVEDAVHATKAAVEEGIVPGGGVALV 92 | RASEALEDLKGDNHDQQLGIDIIKKAVRTPLKQIAYNAGYDGSVVLEKVIELGKEKGVSW 93 | GFNAATGEYVDMYEAGIIDPTKVVRTAIENAASVAGTMLTAEALIADLPEEKKKDITPTD 94 | MPELD 95 | >sp|O66907|ATPA_AQUAE 96 | 
MATLTYEEALEILRQQIKDFEPEAKMEEVGVVYYVGDGVARAYGLENVMAMEIVEFQGGQ 97 | QGIAFNLEEDNVGIIILGSETGIEEGHIVKRTGRILDAPVGEGLVGRVIDPLGNPLDGKG 98 | PIQFEYRSPVEKIAPGVVKRKPVHEPLQTGIKAIDAMIPIGRGQRELIIGDRATGKTTVA 99 | IDTILAQKNSDVYCIYVAVGQKRAAIARLIELLEREGAMEYTTVVVASASDPASLQYLAP 100 | FVGCTIGEYFRDNGKHALIIYDDLSKHAEAYRQLSLLMRRPPGREAYPGDVFYLHSRLLE 101 | RAAKLNDDLGAGSLTALPIIETKAGDVAAYIPTNVISITDGQIYLEADLFNKGIRPAINV 102 | GLSVSRVGGAAQIKAMKQVAGTLRLELAQFRELEAFVQFASELDKATQQQINRGLRLVEL 103 | LKQEPYNPIPVEKQIVLIYAGTHGYLDDIPVESVRKFEKELYAYLDNERPDILKEISEKK 104 | KLDEELEKKIKEALDAFKQKFVP 105 | -------------------------------------------------------------------------------- /testdata/in_folder/proteome/CHLTR.fa: -------------------------------------------------------------------------------- 1 | >sp|O84067|LEPA_CHLTR 2 | MKPYKIENIRNFSIIAHIDHGKSTIADRLLESTSTIEQREMREQLLDSMDLERERGITIK 3 | AHPVTMTYEYEGETYELNLIDTPGHVDFSYEVSRSLAACEGALLIVDAAQGVQAQSLANV 4 | YLALERDLEIIPVLNKIDLPAAQPEAIKKQIEEFIGLDTSNTIACSAKTGQGIPEILESI 5 | IRLVPPPKPPQETELKALIFDSHYDPYVGIMVYVRVISGEIKKGDRITFMATKGSSFEVL 6 | GIGAFLPEATLMEGSLRAGQVGYFIANLKKVKDVKIGDTVTTVKHPAKEPLEGFKEIKPV 7 | VFAGIYPIDSSDFDTLKDALGRLQLNDSALTIEQENSHSLGFGFRCGFLGLLHLEIIFER 8 | ISREFDLDIIATAPSVIYKVVLKNGKTLFIDNPTAYPDPALIEHMEEPWVHVNIITPQEY 9 | LSNIMSLCMDKRGICLKTDMLDQHRLVLSYELPLNEIVSDFNDKLKSVTKGYGSFDYRLG 10 | DYKKGAIIKLEILINDEAVDAFSCLVHRDKAESKGRSICEKLVDVIPPQLFKIPIQAAIN 11 | KKIIARETIRALAKNVTAKCYGGDITRKRKLWDKQKKGKKRMKEFGKVSIPNTAFVEVLK 12 | ME 13 | >sp|O84026|RF1_CHLTR 14 | MEIKVLECLKRLEEVEKQISDPNIFSNPKEYSSLSKEHARLSEIKNAHESLVATKKILQD 15 | DKLALSTEKDPEIVAMLEEGVLVGEEAVERLSKQLENLLIPPDPDDDLSVIMELRAGTGG 16 | DEAALFVGDCVRMYHLYAASKGWQCEVLSTSESDLGGYKEYVMGISGASVKRFLQYEAGT 17 | HRVQRVPETETQGRVHTSAVTVAVLPEPAEDDEEVFIDEKDLRIDTFRSSGAGGQHVNVT 18 | DSAVRITHIPSGVVVTCQDERSQHKNKAKAMRVLKARIRDAEVQKRAQEASAMRSAQVGS 19 | GDRSERIRTYNFPQNRVTDHRIGLTLYNLDRVMEGELDMITTALVTHVHRQLFGHEETA 20 | >sp|O84591|ENO_CHLTR 21 | MFDVVISDIEAREILDSRGYPTLCVKVITNTGTFGEACVPSGASTGIKEALELRDKDPKR 22 | 
YQGKGVLQAISNVEKVLMPALQGFSVFDQITADAIMIDADGTPNKEKLGANAILGVSLAL 23 | AKAAANTLQRPLYRYLGGSFSHVLPCPMMNLINGGMHATNGLQFQEFMIRPISAPSLTEA 24 | VRMGAEVFNALKKILQNRQLATGVGDEGGFAPNLASNAEALDLLLTAIETAGFTPREDIS 25 | LALDCAASSFYNTQDKTYDGKSYADQVGILAELCEHYPIDSIEDGLAEEDFEGWKLLSET 26 | LGDRVQLVGDDLFVTNSALIAEGIAQGLANAVLIKPNQIGTLTETAEAIRLATIQGYATI 27 | LSHRSGETEDTTIADLAVAFNTGQIKTGSLSRSERIAKYNRLMAIEEEMGPEALFQDSNP 28 | FSKA 29 | >sp|P0CD71|EFTU_CHLTR 30 | MSKETFQRNKPHINIGTIGHVDHGKTTLTAAITRALSGDGLADFRDYSSIDNTPEEKARG 31 | ITINASHVEYETANRHYAHVDCPGHADYVKNMITGAAQMDGAILVVSATDGAMPQTKEHI 32 | LLARQVGVPYIVVFLNKIDMISEEDAELVDLVEMELVELLEEKGYKGCPIIRGSALKALE 33 | GDAAYIEKVRELMQAVDDNIPTPEREIDKPFLMPIEDVFSISGRGTVVTGRIERGIVKVS 34 | DKVQLVGLRDTKETIVTGVEMFRKELPEGRAGENVGLLLRGIGKNDVERGMVVCLPNSVK 35 | PHTQFKCAVYVLQKEEGGRHKPFFTGYRPQFFFRTTDVTGVVTLPEGIEMVMPGDNVEFE 36 | VQLISPVALEEGMRFAIREGGRTIGAGTISKIIA 37 | >tr|O84829|O84829_CHLTR 38 | MLELLSKDLPIITQGITGKAGSFHTTQCVAYGSNFVGGVTPGKGGSQFLDLPIFDSVLEA 39 | KQATGCRASMIFVPPPFAAEAIFEAEDAGIELIVCITEGIPIKDMLEVASLMEKSASSLI 40 | GPNCPGVIKPGVCKIGIMPGYIHLPGKVGVVSRSGTLTYEAVWQLTQRKIGQSVCIGIGG 41 | DPLNGTSFIDALQEFEKDSQTEAVLMIGEIGGSAEEEAADWTRQHSSKPVIAFIAGATAP 42 | KGKRMGHAGAIISGKSGDAFSKQEALRQAGVTVVESLALIGEAVASVLKPR 43 | >sp|O84332|TPIS_CHLTR 44 | MFTDKETHRKPFPTWAHLLHSEPSKQFVFGNWKMNKTLTEAQTFLKSFISSDILSNPQII 45 | TGIIPPFTLLSACQQAVSDSPIFLGAQTTHEADSGAFTGEISAPMLKDIGVDFVLIGHSE 46 | RRHIFHEQNPVLAEKAAAAIHSGMIPVLCIGETLEEQESGATQDILLNQLTTGLSKLPEQ 47 | ASFILAYEPVWAIGTGKVAHPDLVQETHAFCRKTIASLFSKDIAERTPILYGGSVKADNA 48 | RSLSLCPDVNGLLVGGASLSSENFLSIIQQIDIP 49 | >sp|P17821|DNAK_CHLTR 50 | MSEKRKSNKIIGIDLGTTNSCVSVMEGGQPKVIASSEGTRTTPSIVAFKGGETLVGIPAK 51 | RQAVTNPEKTLASTKRFIGRKFSEVESEIKTVPYKVAPNSKGDAVFDVEQKLYTPEEIGA 52 | QILMKMKETAEAYLGETVTEAVITVPAYFNDSQRASTKDAGRIAGLDVKRIIPEPTAAAL 53 | AYGIDKEGDKKIAVFDLGGGTFDISILEIGDGVFEVLSTNGDTHLGGDDFDGVIINWMLD 54 | EFKKQEGIDLSKDNMALQRLKDAAEKAKIELSGVSSTEINQPFITIDANGPKHLALTLTR 55 | AQFEHLASSLIERTKQPCAQALKDAKLSASDIDDVLLVGGMSRMPAVQAVVKEIFGKEPN 56 | 
KGVNPDEVVAIGAAIQGGVLGGEVKDVLLLDVIPLSLGIETLGGVMTPLVERNTTIPTQK 57 | KQIFSTAADNQPAVTIVVLQGERPMAKDNKEIGRFDLTDIPPAPRGHPQIEVTFDIDANG 58 | ILHVSAKDAASGREQKIRIEASSGLKEDEIQQMIRDAELHKEEDKQRKEASDVKNEADGM 59 | IFRAEKAVKDYHDKIPAELVKEIEEHIEKVRQAIKEDASTTAIKAASDELSTHMQKIGEA 60 | MQAQSASAAASSAANAQGGPNINSEDLKKHSFSTRPPAGGSASSTDNIEDADVEIVDKPE 61 | >sp|O84081|FOLD_CHLTR 62 | MLLKGAPAADHILATIKENIRACSKAPGLAVVLIGNNPASEIYVNMKIKRATDLGMVSKS 63 | YRKPSDATLSDILALIHQLNNDENIHGILVQLPLPKHLDAQAILSTITPDKDVDGLHPVN 64 | VGKLLLGETDGFIPCTPAGIVELCKYYEIPLHGKHVVILGRSNIVGKPLAALLMQRHADT 65 | NASVTLLHSQSEHLTEITRTADILISAIGVPLFVNKEMIAEKTVIMDVGTSRIPAANPKG 66 | YILVGDVDFNNVVPVCRAITPVPGGVGPMTVAMLMRNTWESFLRHTS 67 | >sp|P0CE13|G3P_CHLTR 68 | MRIVINGFGRIGRLVLRQILKRNSPIEVVAINDLVAGDLLTYLFKYDSTHGSFAPQATFS 69 | DGCLVMGERKVHFLAEKDVQKLPWKDLDVDVVVESTGLFVNRDDVAKHLDSGAKRVLITA 70 | PAKGDVPTFVMGVNHQQFDPADVIISNASCTTNCLAPLAKVLLDNFGIEEGLMTTVHAAT 71 | ATQSVVDGPSRKDWRGGRGAFQNIIPASTGAAKAVGLCLPELKGKLTGMAFRVPVADVSV 72 | VDLTVKLSSATTYEAICEAVKHAANTSMKNIMYYTEEAVVSSDFIGCEYSSVFDAQAGVA 73 | LNDRFFKLVAWYDNEIGYATRIVDLLEYVQENSK 74 | >sp|P0C0Z7|CH60_CHLTR 75 | MVAKNIKYNEEARKKIQKGVKTLAEAVKVTLGPKGRHVVIDKSFGSPQVTKDGVTVAKEV 76 | ELADKHENMGAQMVKEVASKTADKAGDGTTTATVLAEAIYTEGLRNVTAGANPMDLKRGI 77 | DKAVKVVVDQIRKISKPVQHHKEIAQVATISANNDAEIGNLIAEAMEKVGKNGSITVEEA 78 | KGFETVLDIVEGMNFNRGYLSSYFATNPETQECVLEDALVLIYDKKISGIKDFLPVLQQV 79 | AESGRPLLIIAEDIEGEALATLVVNRIRGGFRVCAVKAPGFGDRRKAMLEDIAILTGGQL 80 | ISEELGMKLENANLAMLGKAKKVIVSKEDTTIVEGMGEKEALEARCESIKKQIEDSSSDY 81 | DKEKLQERLAKLSGGVAVIRVGAATEIEMKEKKDRVDDAQHATIAAVEEGILPGGGTALI 82 | RCIPTLEAFLPMLTNEDEQIGARIVLKALSAPLKQIAANAGKEGAIIFQQVMSRSANEGY 83 | DALRDAYTDMLEAGILDPAKVTRSALESAASVAGLLLTTEALIAEIPEEKPAAAPAMPGA 84 | GMDY 85 | -------------------------------------------------------------------------------- /testdata/in_folder/proteome/MYCGE.fa: -------------------------------------------------------------------------------- 1 | >sp|P47500|RF1_MYCGE 2 | MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSFEQIKVINKELKHKQPLIVKFKELQKLVE 3 | 
NANEAEQILNNSSLKELHEEAKKELEKIKASLPSLEEEIKFLLLPVDENNQKNVIVEIRP 4 | AAGGDESCIFLSDLFNMYKNYCTSKNWTVELNEIIPASVGINFVSFAVNGTDVFAKLKFE 5 | SGVHRVQRVPLTEAKGRVHTSTVTVAVLPQLEEVEITINPSDLRIDTYRASGAGGQHVNR 6 | TESAVRITHLPTGIVVACQEGKSQFSNRDKAMKMLRAKLWENAQNKQLSTQADLRKSQVG 7 | SGERAEKIRTYNYPQNRITDHRIKLTINKLNTVILGDLDEIIEALQADEKKQQLEKFIS 8 | >sp|P13927|EFTU_MYCGE 9 | MAREKFDRSKPHVNVGTIGHIDHGKTTLTAAICTVLAKEGKSAATRYDEIDKAPEEKARG 10 | ITINSAHVEYSSDKRHYAHVDCPGHADYIKNMITGAAQMDGAILVVSATDSVMPQTREHI 11 | LLARQVGVPKMVVFLNKCDIASDEEVQELVAEEVRDLLTSYGFDGKNTPIIYGSALKALE 12 | GDPKWEAKIHDLIKAVDEWIPTPTREVDKPFLLAIEDTMTITGRGTVVTGRVERGELKVG 13 | QEVEIVGLKPIRKAVVTGIEMFKKELDSAMAGDNAGVLLRGVERKEVERGQVLAKPGSIK 14 | PHKKFKAEIYALKKEEGGRHTGFLNGYRPQFYFRTTDVTGSIALAENTEMVLPGDNASIT 15 | VELIAPIACEKGSKFSIREGGRTVGAGTVTEVLE 16 | >sp|P47639|ATPB_MYCGE 17 | MIKKENLTYGKVHQVIGPVVDVIFSESKQLPRVYDCLSVQLKKSELFLEATQLIGDDIVR 18 | CIALGPTEGLARNVKVTNYNHPIEVPVGKNVLGRMFNVLGEPIDGKEPLPKKPKLSIHRN 19 | PPAFDEQPNTVDIFETGIKVIDLLTPYVRGGKIGLFGGAGVGKTVLVQELIHNIAKEHSG 20 | LSVFAGVGERTREGNDLYYEMIQGGVIDKTVLVFGQMNEPPGARMRVALTALTMAEYFRD 21 | HDNQNVLLFIDNIFRFTQAGSEVSALLGRMPSAVGYQPTLAIEMGKLQERIASTKTGSIT 22 | SVQAIYVPADDLTDPAPATTFTHLDAKTVLDRNIAALGIFPAINPLESTSRLLDPSVVGI 23 | NHYKVALGVQNILQRFAELQDIIAILGIDELSDEDKIIVERARRIRNFLSQPFFVAEKFS 24 | GIAGKYVSLNDTVQSFKEILEGKHDHLPEQAFFYVGTIQEAVEKAKRLNQEFDKTK 25 | >sp|P47547|DNAK_MYCGE 26 | MSADNGLIIGIDLGTTNSCVSVMEGGRPVVLENPEGKRTTPSIVSYKNNEIIVGDAAKRQ 27 | MVTNPNTIVSIKRLMGTSNKVKVQNADGTTKELSPEQVSAQILSYLKDFAEKKIGKKISR 28 | AVITVPAYFNDAERNATKTAGKIAGLNVERIINEPTAAALAYGIDKASREMKVLVYDLGG 29 | GTFDVSLLDIAEGTFEVLATAGDNRLGGDDWDNKIIEYISAYIAKEHQGLNLSKDKMAMQ 30 | RLKEAAERAKIELSAQLETIISLPFLTVTQKGPVNVELKLTRAKFEELTKPLLERTRNPI 31 | SDVIKEAKIKPEEINEILLVGGSTRMPAVQKLVESMVPGKKPNRSINPDEVVAIGAAIQG 32 | GVLRGDVKDVLLLDVTPLTLSIETLGGVATPLIKRNTTIPVSKSQIFSTAQDNQESVDVV 33 | VCQGERPMSRDNKSLGRFNLGGIQPAPKGKPQIEITFSLDANGILNVKAKDLTTQKENSI 34 | TISDNGNLSEEEIQKMIRDAEANKERDNIIRERIELRNEGEGIVNTIKEILASPDAKNFP 35 | 
KEEKEKLEKLTGNIDAAIKANDYAKLKVEIENFKKWREEMAKKYNPTGEQGPQAK 36 | >sp|P47543|G3P_MYCGE 37 | MAAKNRTIKVAINGFGRIGRLVFRSLLSKANVEVVAINDLTQPEVLAHLLKYDSAHGELK 38 | RKITVKQNILQIDRKKVYVFSEKDPQNLPWDEHDIDVVIESTGRFVSEEGASLHLKAGAK 39 | RVIISAPAKEKTIRTVVYNVNHKTISSDDKIISAASCTTNCLAPLVHVLEKNFGIVYGTM 40 | LTVHAYTADQRLQDAPHNDLRRARAAAVNIVPTTTGAAKAIGLVVPEANGKLNGMSLRVP 41 | VLTGSIVELSVVLEKSPSVEQVNQAMKRFASASFKYCEDPIVSSDVVSSEYGSIFDSKLT 42 | NIVEVDGMKLYKVYAWYDNESSYVHQLVRVVSYCAKL 43 | >sp|P47641|ATPA_MYCGE 44 | MADKLNEYVALIKTEIKKYSKKIFNSEIGQVISVADGIAKVSGLENALLNELIQFENNIQ 45 | GIVLNLEQNTVGIALFGDYSSLREGSTAKRTHSVMKTPVGDVMLGRIVNALGEAIDGRGD 46 | IKATEYDQIEKIAPGVMKRKSVNQPLETGILTIDALFPIGKGQRELIVGDRQTGKTAIAI 47 | DTIINQKDKDVYCVYVAIGQKNSSVAQIVHQLEVNDSMKYTTVVCATASDSDSMVYLSPF 48 | TGITIAEYWLKKGKDVLIVFDDLSKHAVAYRTLSLLLKRPPGREAFPGDVFYLHSRLLER 49 | ACKLNDENGGGSITALPIIETQAGDISAYIPTNVISITDGQLFMVSSLFNAGQRPAIQIG 50 | LSVSRVGSAAQTKAIKQQTGSLKLELAQYSELDSFSQFGSDLDENTKKVLEHGKRVMEMI 51 | KQPNGKPYSQVHEALFLFAINKAFIKFIPVDEIAKFKQRITEEFNGSHPLFKELSNKKEF 52 | TEDLESKTKTAFKMLVKRFISTLTDYDITKFGSIEELN 53 | -------------------------------------------------------------------------------- /testdata/in_folder/species_tree.nwk: -------------------------------------------------------------------------------- 1 | ((AQUAE,CHLTR)inter1,MYCGE)inter2; -------------------------------------------------------------------------------- /tests/data/HOG_0890520.fa: -------------------------------------------------------------------------------- 1 | >sp|P47500|RF1_MYCGE||MYCGE||1000000000 sp|P47500|RF1_MYCGE 2 | MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSFEQIKVINKELKHKQPLIVKFKELQKLVE 3 | NANEAEQILNNSSLKELHEEAKKELEKIKASLPSLEEEIKFLLLPVDENNQKNVIVEIRP 4 | AAGGDESCIFLSDLFNMYKNYCTSKNWTVELNEIIPASVGINFVSFAVNGTDVFAKLKFE 5 | SGVHRVQRVPLTEAKGRVHTSTVTVAVLPQLEEVEITINPSDLRIDTYRASGAGGQHVNR 6 | TESAVRITHLPTGIVVACQEGKSQFSNRDKAMKMLRAKLWENAQNKQLSTQADLRKSQVG 7 | SGERAEKIRTYNYPQNRITDHRIKLTINKLNTVILGDLDEIIEALQADEKKQQLEKFIS 8 | >sp|O84026|RF1_CHLTR||CHLTR||1001000001 
sp|O84026|RF1_CHLTR 9 | MEIKVLECLKRLEEVEKQISDPNIFSNPKEYSSLSKEHARLSEIKNAHESLVATKKILQD 10 | DKLALSTEKDPEIVAMLEEGVLVGEEAVERLSKQLENLLIPPDPDDDLSVIMELRAGTGG 11 | DEAALFVGDCVRMYHLYAASKGWQCEVLSTSESDLGGYKEYVMGISGASVKRFLQYEAGT 12 | HRVQRVPETETQGRVHTSAVTVAVLPEPAEDDEEVFIDEKDLRIDTFRSSGAGGQHVNVT 13 | DSAVRITHIPSGVVVTCQDERSQHKNKAKAMRVLKARIRDAEVQKRAQEASAMRSAQVGS 14 | GDRSERIRTYNFPQNRVTDHRIGLTLYNLDRVMEGELDMITTALVTHVHRQLFGHEETA 15 | >sp|O67032|RF1_AQUAE||AQUAE||1002000001 sp|O67032|RF1_AQUAE 16 | MLKEAYISRLDKLQEKYRKLQEELSKPEVIQDVEKYKKLSKELKELQEINELYERYKKAQ 17 | KELKEAKELLKSSDKDLRELAEEEVNRLTEEMKKLEEELKVHLVPKDPNDTKNVILEIRA 18 | GAGGEEAALFAADLFRMYQKYAEEKGWKVSILSSNKTGLGGYKEVIALIEGEGAYSRLKY 19 | ESGVHRVQRVPVTESSGRIHTSTATVAVLPEVDETDIKIKPEELKIETFRASGAGGQYVN 20 | TTETAVRITHIPTGIVVQCQDERSQFQNKQKALKILYAKLKDYYERKKQEEIAKERKEQV 21 | GTGERSEKIRTYNFPQNRVTDHRINLTLYKLQDVLEGKLDEIIDALRAKEIEKKLELVEK 22 | EG 23 | -------------------------------------------------------------------------------- /tests/data/correct-msa.fa: -------------------------------------------------------------------------------- 1 | >HUMAN01350 | OMA1057741 | PBLD_HUMAN | [Homo sapiens] 2 | --MKLPIFIADAFTARAFRGNPAAVC----LLENELDEDMHQKIAREMNLSETAFIRKLH 3 | PTDNFAQSSCFGLRWFTPASEVPLCGHATLASAAVLFHKIK-NMNSTLTFVTLSGELRAR 4 | RAEDGIVLDLPLYPAHPQDFHEV-EDLI---KTAIGNTLVQDICYSPDTQKLLVRLSDVY 5 | NRSFLENLKVNTENLLQVENTGKVKGLILTLKGEPGGQTQAFDFYSRYFAPWVGVAEDPV 6 | TGSAHAVLSSYWSQHLGKKEMHAFQCSHRGGELGISLRPDGRVD---------------- 7 | ----IRGGAAVVLEGTLTA 8 | >YEAST02880 | OMA1057741 | YHI9_YEAST | [Saccharomyces cerevisiae (strain ATCC 204508 / S288c)] 9 | MTLMVPFKQVDVFTEKPFMGNPVAVINFLEIDENEVSQEELQAIANWTNLSETTFLFK-- 10 | PSD---KKYDYKLRIFTPRSELPFAGHPTIGSCKAFLEFTKNTTATSLVQECKIGAVPIT 11 | INEGLISFKAPM-----ADYESISSEMIADYEKAIGLKFIKPPALLHTGPEWIVALVEDA 12 | ETCF--NANPNFAMLAHQTKQNDHVGIILAGPKKEAAIKNSYEM--RAFAPVINVYEDPV 13 | CGSGSVALARYL------QEVYKFEKT-----TDITISEGGRLKRNGLMLASIKKEADNS 14 | TSYYIAGHATTVIDGKIKV 15 | >MOUSE00277 | OMA1057741 | K3W4L7 | [Mus musculus] 16 
| --MKLPIFIADAFTATAFRGNPAAVC----LLERTLEEDAHQQIAREMNLSETAFIRKLQ 17 | PTDSFTQSSRFGLRWFTPVSEVPLCGHATLASAAVLFHKIRNNRNSTLTFVTMSGELKAR 18 | RAEDGIVLDFPVYPTFPQDFHEV-EDLI---KAAIGDTLVQDIRYSTDTRKLLVRLSDSY 19 | DRSFLESLKVNTEPLPAIEKTGKVRGLILTVKGEPGGQTAPYDFYSRYFAPWVGIAEDPV 20 | TGSAHTVLSSYWSQQLRKKEMRAFQCSRRGGELDISLRPDGRVD---------------- 21 | ----IKGGAVIVLEGTLTA 22 | >PANTR00757 | OMA1057741 | A0A6D2W9P7 | [Pan troglodytes] 23 | --MKLPIFIADAFTARAFRGNPAAVC----LLENELDEDMHQKIAREMNLSETAFIRKLH 24 | PTDNFAQSSCFGLRWFTPASEVPLCGHATLASAAVLFHKIK-NMNSTLTFVTLSGELRAR 25 | RAEDGIVLDLPLYPAHPQDFHEV-EDLI---KTAIGNTLVQDICYSPDTRKLLVRLSDVY 26 | NRSFLENLKVNTENLLQVENTGKVKGLILTLKGEPGGQTQAFDFYSRYFAPWVGVAEDPV 27 | TGSAHAVLSSYWSQHLGKKEMHAFQCSRRGGELGISLRPDGRVD---------------- 28 | ----IRGCAAVVLEGTLTA 29 | -------------------------------------------------------------------------------- /tests/test_fasttree_wrapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | from Bio import AlignIO 5 | from FastOMA._wrappers import infer_gene_tree 6 | from FastOMA.zoo.wrappers import WrapperError 7 | import pathlib 8 | this_dir = pathlib.Path(__file__).parent 9 | 10 | 11 | class FastTreeTester(unittest.TestCase): 12 | def test_failing_tree_building_reports_error_from_fasttree(self): 13 | msa = AlignIO.read(this_dir / "data" / "failing-msa.fa", "fasta") 14 | with self.assertLogs("FastOMA", level="ERROR") as cm: 15 | with self.assertRaises(WrapperError): 16 | infer_gene_tree(msa) 17 | self.assertIn("Non-unique name", "\n".join(cm.output)) 18 | 19 | def test_treebuilding_with_correct_msa(self): 20 | msa = AlignIO.read(this_dir / "data" / "correct-msa.fa", "fasta") 21 | tree = infer_gene_tree(msa) 22 | self.assertIn("HUMAN01350", tree) -------------------------------------------------------------------------------- /tests/test_infer_subhog.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 
2 | from ete3 import Tree, TreeNode 3 | from Bio.Seq import Seq 4 | from Bio.SeqRecord import SeqRecord 5 | from argparse import Namespace 6 | from FastOMA._hog_class import HOG, Representative 7 | from FastOMA._infer_subhog import LevelHOGProcessor 8 | 9 | 10 | class TestLevelHogProcessor(TestCase): 11 | def setUp(self): 12 | genetree = Tree( 13 | '(((((G00100_SE001||SE001:153.567,G00100_SE008||SE008:153.567)1:39.499[&&NHX:evoltype=S],(G00100_SE006||SE006:173.507,G00100_SE007||SE007:173.507)1:19.5597[&&NHX:evoltype=S])1:14.0196[&&NHX:evoltype=S],(G00100_SE003||SE003:198.481,((((G00100_SE011||SE011:136.533,G00100_SE012||SE012:136.533)1:7.60673[&&NHX:evoltype=S],(G00100_SE010||SE010:36.1782,G00342_SE010||SE010:36.1782)1:107.961[&&NHX:evoltype=D])1:8.49419[&&NHX:evoltype=S],G00100_SE009||SE009:152.634)1:13.723[&&NHX:evoltype=S],(((G00186_SE004||SE004:143.819,(G00186_SE011||SE011:136.533,(G00186_SE012||SE012:116.411,G00242_SE012||SE012:116.411)1:20.1214[&&NHX:evoltype=D])1:7.28662[&&NHX:evoltype=S])1:0.32011[&&NHX:evoltype=S],(G00186_SE010||SE010:31.4887,G00350_SE010||SE010:31.4887)1:112.651[&&NHX:evoltype=D])1:8.49419[&&NHX:evoltype=S],G00186_SE009||SE009:152.634)1:13.723[&&NHX:evoltype=S])1:32.1245[&&NHX:evoltype=D])1:8.60492[&&NHX:evoltype=S])1:36.2336[&&NHX:evoltype=S],(((G00110_SE001||SE001:153.567,G00110_SE008||SE008:153.567)1:39.499[&&NHX:evoltype=S],(G00110_SE006||SE006:173.507,G00110_SE007||SE007:173.507)1:19.5597[&&NHX:evoltype=S])1:14.0196[&&NHX:evoltype=S],(G00110_SE003||SE003:198.481,(((G00110_SE004||SE004:143.819,(G00110_SE011||SE011:136.533,G00110_SE012||SE012:136.533)1:7.28662[&&NHX:evoltype=S])1:0.32011[&&NHX:evoltype=S],G00110_SE010||SE010:144.139)1:8.49419[&&NHX:evoltype=S],G00110_SE009||SE009:152.634)1:45.8474[&&NHX:evoltype=S])1:8.60492[&&NHX:evoltype=S])1:36.2336[&&NHX:evoltype=S])1:6.68041[&&NHX:evoltype=D],(G00100_SE002||SE002:119.545,(G00100_SE013||SE013:97.4899,(G00100_SE014||SE014:87.2367,G00100_SE015||SE015:87.2367)1:10.2532[&&NHX:evoltype=S
])1:22.055[&&NHX:evoltype=S])1:130.455[&&NHX:evoltype=S]);') 14 | sptree = Tree("dummy;") 15 | hogs = [HOG(SeqRecord(Seq("AAAAAA"), id=n.name), sptree, "test1") for n in genetree.iter_leaves()] 16 | conf = Namespace(msa_write=False, gene_trees_write=False, number_of_samples_per_hog=5, msa_filter_method="col-row-threshold", 17 | gap_ratio_row=0.3, gap_ratio_col=0.5, min_col_trim=400) 18 | self.genetree = genetree 19 | self.lp = LevelHOGProcessor(sptree, hogs, "test1", conf) 20 | 21 | def test_propose_representatives(self): 22 | rep = self.lp.find_most_divergent_representatives_from_genetree(self.genetree) 23 | self.assertEqual(len(rep), self.lp.conf.number_of_samples_per_hog) 24 | self.assertIn(rep[self.lp.conf.number_of_samples_per_hog-1].get_id(), ("G00100_SE013||SE013","G00100_SE015||SE015","G00100_SE014||SE014","G00100_SE002||SE002")) 25 | 26 | def test_reconcilation(self): 27 | exp = self.genetree.write(features=['evoltype']) 28 | self.lp.infer_reconciliation(genetree=self.genetree) 29 | self.assertEqual(exp, self.genetree.write(features=['evoltype'])) 30 | self.assertEqual(self.genetree.sos, 0) 31 | self.assertEqual(self.genetree.children[0].sos, 1) 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/test_roothog_example.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | class RootHOGExampleTestCase(TestCase): 4 | 5 | def setUpClass(cls): 6 | pass -------------------------------------------------------------------------------- /utils/filter_orthoxml_completeness.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ### How to use: python filter_orthoxml_completeness.py FastOMA_HOGs.orthoxml 0.3 5 | 6 | import sys 7 | import logging 8 | logging.basicConfig(level=logging.DEBUG) 9 | from FastOMA.zoo.hog import filter_orthoxml_file, HOGFilter 10 | 11 | print("started ") 12 | 13 | 
input_orthoxml_add = sys.argv[1] 14 | threshold_filt = float(sys.argv[2]) 15 | 16 | score_type = "CompletenessScore" 17 | 18 | 19 | output_name = input_orthoxml_add + "_filt_"+str(threshold_filt)+".orthoxml" 20 | with open(output_name, 'wb') as output_file: 21 | filt = HOGFilter(score_type, threshold_filt) 22 | filter_orthoxml_file(input_orthoxml_add, output_file, filt) 23 | 24 | print("we wrote the output in "+output_name) 25 | 26 | -------------------------------------------------------------------------------- /utils/find_unfinished_rhogs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import sys 5 | 6 | import os 7 | folder = sys.argv[1] #"/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/out_folder/" 8 | 9 | from os import listdir 10 | 11 | project_files = listdir(folder + "/rhogs_all/") 12 | rhogs = [] 13 | for file in project_files: 14 | file_name_split = file.split(".") 15 | if file_name_split[-1] == "fa": 16 | rhog_id = int(file_name_split[0].split("_")[1]) 17 | rhogs.append(rhog_id) 18 | 19 | print("number of rhogs is ", len(rhogs)) 20 | 21 | folder_pickle = folder + "/pickle_rhogs/" 22 | project_files = listdir(folder_pickle) 23 | pickles = [] 24 | for file in project_files: 25 | if os.path.getsize(folder_pickle + file) > 2: 26 | file_name_split = file.split(".") 27 | if file_name_split[-1] == "pickle": 28 | rhog_id = int(file_name_split[0].split("_")[1]) 29 | pickles.append(rhog_id) 30 | else: 31 | print("this file is empty", file) 32 | 33 | print("number of pickles is ", len(pickles)) 34 | 35 | no_pickle_list = set(rhogs) - set(pickles) 36 | 37 | print("number of rhogs not finished is ", len(no_pickle_list)) 38 | 39 | print("\n \n ", no_pickle_list) 40 | -------------------------------------------------------------------------------- /utils/orthoxml2OG.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | this code is for converting 
an OrthoXML file to a set of Fasta files as Ortholougous groups 4 | 5 | How to run: 6 | cd out_folder 7 | python orthoxml2OG.py output_hog_.orthoxml rhogs_all 8 | 9 | 10 | Output 11 | - Gene names per OG in maximal_og_prot.tsv 12 | - Fasta files in OGs_maximal 13 | """ 14 | 15 | 16 | from ete3 import Tree 17 | import sys 18 | import os 19 | from FastOMA.zoo.hog.convert import orthoxml_to_newick 20 | from Bio import SeqIO 21 | 22 | 23 | 24 | 25 | def max_og_tree(tree): 26 | for node in tree.traverse("preorder"): 27 | # for node in xml_tree.traverse(strategy="preorder", is_leaf_fn=lambda n: hasattr(n, "attriremoved") and n.attriremoved==True): 28 | if not node.is_leaf() and hasattr(node,"Ev") and node.Ev == 'duplication': # node.name[:3] == "dup" 29 | dup_node = node 30 | children = dup_node.get_children() 31 | list_num_species = [] 32 | for child in children: 33 | child_name_leaves = child.get_leaves() 34 | species_list = [] 35 | for leaf in child_name_leaves: 36 | name = leaf.name 37 | if name[:3] == "no_": 38 | name = leaf.name.split("_")[-1] 39 | if name in species_dic: 40 | species_name = species_dic[name] 41 | species_list.append(species_name) 42 | else: 43 | print("species not in the dic ",name) 44 | species_set = set(species_list) 45 | list_num_species.append(len(species_set)) 46 | index_max_species = list_num_species.index(max(list_num_species)) 47 | # if there are few children with identical number of species, the case would be not a polytomi but two children with one species 48 | # num_occurence = [1 for i in list_num_species if i == max(list_num_species)] 49 | # if len(num_occurence) > 1: 50 | # print("please check this case with the developer the tool. 
The tree has polytomy.") 51 | child_max_species = children[index_max_species] 52 | children_to_remove = [i for i in children if i != child_max_species] 53 | for child_to_remove in children_to_remove: 54 | for i in child_to_remove.get_leaves(): 55 | i.in_og = "no" 56 | 57 | 58 | og_prot_list = [] 59 | for node in tree.traverse("preorder"): 60 | if node.is_leaf(): 61 | if hasattr(node,"in_og") and node.in_og == "no": 62 | pass # print(node.name) 63 | else: 64 | og_prot_list.append(node.name) 65 | 66 | return og_prot_list 67 | 68 | 69 | 70 | input_orthoxml=sys.argv[1] # "out_folder/output_hog_.orthoxml" 71 | rhog_all_folder = sys.argv[2]+"/" # "out_folder/rhogs_all/" 72 | fasta_format = "fa" # of the rhogs_all 73 | 74 | 75 | output_file = "maximal_og_prot.tsv" 76 | 77 | 78 | trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True) # encode_levels_as_nhx=False, xref_tag="protId", 79 | print("We extracted "+str(len(trees))+" trees in NHX format from the input HOG orthoxml"+input_orthoxml) 80 | 81 | 82 | OGs = {} 83 | for hog_id, tree_string in trees.items(): 84 | 85 | tree = Tree(tree_string,format=1) 86 | og_prot_list = max_og_tree(tree) 87 | OGs[hog_id] = og_prot_list 88 | 89 | 90 | print("done") 91 | 92 | 93 | with open(output_file, 'w') as handle: 94 | for hog_id, og_prot_list in OGs.items(): 95 | line_text = str(hog_id)+"\t"+str(og_prot_list)+"\n" 96 | handle.write(line_text) 97 | handle.close() 98 | 99 | print("We wrote the protein families information in the file "+output_file) 100 | 101 | 102 | out_folder_ogs = "OGs_maximal/" 103 | os.makedirs(out_folder_ogs) 104 | 105 | print("start writing "+str(len(OGs))+" OGs as fasta files in folder " +out_folder_ogs ) 106 | for hog_id, og_prot_list in OGs.items(): #hog_id="HOG_0667494_sub10524" 107 | rhog_id = "_".join(hog_id.split("_")[:2]) 108 | 109 | rhogs_all_address = rhog_all_folder + rhog_id + "."+fasta_format 110 | rhogs_all_prots = list(SeqIO.parse(rhogs_all_address, "fasta")) 111 | 112 
"""
how to run
    python orthoxml2family.py my_hogs.orthoxml

- to convert orthoxml to rootHOG (protein families)

The result is written to ``families_prot.tsv`` in the current directory,
one family per line: the family index, a TAB, and the set of protein ids.
"""

import sys

from FastOMA.zoo.hog import extract_flat_groups_at_level

if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: python orthoxml2family.py my_hogs.orthoxml")

input_orthoxml = sys.argv[1]
output_file = "families_prot.tsv"

# toplevel_groups is a list of sets; each set holds the xref of every gene
# of one top-level group.
toplevel_groups = [
    {gene.xref for gene in group}
    for group in extract_flat_groups_at_level(input_orthoxml)
]

print("We extracted " + str(len(toplevel_groups))
      + " protein families from the input HOG orthoxml " + input_orthoxml)
if toplevel_groups:
    print("The first one contains " + str(len(toplevel_groups[0])) + " proteins.")
else:
    # An orthoxml without groups used to crash here on toplevel_groups[0].
    print("No protein family was found in the input.")

# The with-statement closes the file; no explicit close() is needed.
with open(output_file, "w") as handle:
    for family_idx, family in enumerate(toplevel_groups):
        handle.write(str(family_idx) + "\t" + str(family) + "\n")

print("We wrote the protein families information in the file " + output_file)
"""
how to run
    python orthoxml2newick.py my_hogs.orthoxml

Writes one tree file per HOG into the folder ``output_folder_trees``.
"""

import os
import sys

from FastOMA.zoo.hog.convert import orthoxml_to_newick

if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: python orthoxml2newick.py my_hogs.orthoxml")

input_orthoxml = sys.argv[1]
output_folder = "output_folder_trees"

# exist_ok: re-running the script must not crash because the folder is
# already there. Files of a previous run with the same tree id are
# overwritten; other files in the folder are left untouched.
os.makedirs(output_folder, exist_ok=True)

trees = orthoxml_to_newick(input_orthoxml)

print("We extracted " + str(len(trees)) + " trees from the input HOG orthoxml " + input_orthoxml)

# write them as files, one per HOG
for treeid_hog, tree in trees.items():
    tree_file_i = os.path.join(output_folder, "tree_" + str(treeid_hog) + ".nwk")
    # The with-statement closes the file; no explicit close() is needed.
    with open(tree_file_i, "w") as handle:
        handle.write(tree)

print("We wrote " + str(len(trees)) + " trees in nhx format from the input HOG orthoxml "
      + input_orthoxml + " in " + output_folder)
print("You can visualise each tree using https://beta.phylo.io/viewer/ as extended newick format.")
"""
how to run
    python orthoxml2pairs.py my_hogs.orthoxml

Writes all pairwise orthologs implied by the HOGs to
``<input file>_pairs.tsv``, one pair per line as two TAB-separated gene names.
"""

import sys

import lxml.etree

from FastOMA.zoo.hog import transform

if len(sys.argv) < 2:
    sys.exit("usage: python orthoxml2pairs.py my_hogs.orthoxml")

orthoxml_file = sys.argv[1]

# Parse the file once and reuse the tree for both the id mapping and the
# pairwise relations.
orthoxml_etree = lxml.etree.parse(orthoxml_file)

# Map the id used inside the groups (the "id" attribute of <gene>) to the
# gene name. Reading the attributes from the parsed tree does not depend on
# how the XML is laid out in lines or on the order of the attributes.
dic_gene_integer = {}
for gene in orthoxml_etree.iter("{*}gene"):
    gene_name = gene.get("protId") or gene.get("geneId")
    dic_gene_integer[gene.get("id")] = gene_name
print(len(dic_gene_integer))

# iter_pairwise_relations(obj, rel_type=None)
# (default: 'ortholog', but possible to use 'paralog')
pw_orthologs_integer = sorted(transform.iter_pairwise_relations(orthoxml_etree))
print(len(pw_orthologs_integer))
print(pw_orthologs_integer[:2])

pw_orthologs_gene = [
    (dic_gene_integer[pair[0]], dic_gene_integer[pair[1]])
    for pair in pw_orthologs_integer
]
print(len(pw_orthologs_gene))
print(pw_orthologs_gene[:2])

# The with-statement guarantees the output is flushed and closed even if a
# write fails.
with open(orthoxml_file + "_pairs.tsv", "w") as output_file:
    for first_gene, second_gene in pw_orthologs_gene:
        output_file.write(first_gene + "\t" + second_gene + "\n")
"""
how to run
    python pickle2orthoxml.py "no_header" "file_D0680685.pickle"

    python pickle2orthoxml.py "selected_genes" file_D0680685.pickle gene_id_dic_xml.pickle "species_tree_checked.nwk"
    # this will be slow. gene_id_dic_xml.pickle is in the output of infer_roothogs

In both modes the second argument is opened as a single pickle file that
holds a list of orthoxml elements.
"""

from datetime import datetime
from pathlib import Path
from xml.dom import minidom
import pickle
import sys
import xml.etree.ElementTree as ET

from FastOMA._utils_subhog import read_species_tree
from FastOMA.collect_subhogs import convert_speciestree_to_orthoxml_taxonomy
from FastOMA.collect_subhogs import iter_hogs
from FastOMA.collect_subhogs import update_hogids
from FastOMA.transformer import header_transformer

USAGE = ('usage: python pickle2orthoxml.py no_header FILE.pickle\n'
         '       python pickle2orthoxml.py selected_genes FILE.pickle '
         'gene_id_dic_xml.pickle species_tree_checked.nwk')

if len(sys.argv) < 3:
    sys.exit(USAGE)

mode = sys.argv[1]  # "no_header" or "selected_genes"

# NOTE(security): pickle.load can execute arbitrary code. Only run this
# script on pickle files produced by your own FastOMA run.

if mode == "no_header":

    input_pickle = sys.argv[2]  # "file_D0680685.pickle"
    with open(input_pickle, 'rb') as handle:
        orthoxml_file = pickle.load(handle)

    print(len(orthoxml_file))
    # Only the first element of the pickled list is written in this mode.
    xml_str = minidom.parseString(ET.tostring(orthoxml_file[0])).toprettyxml(indent=" ")

    with open(input_pickle + "_noheader.orthoxml", "w") as out_file:
        out_file.write(xml_str)

elif mode == "selected_genes":

    if len(sys.argv) < 5:
        sys.exit(USAGE)

    input_pickle = sys.argv[2]  # a pickle file with a list of orthoxml elements
    gene_id_pickle_file = sys.argv[3]  # generated in infer_roothogs.
    # available in out_folder/temp_output/gene_id_dic_xml.pickle
    # this keeps the gene name and the gene integer ID used in orthoxml.
    species_tree = sys.argv[4]  # "species_tree_checked.nwk"

    # Loaded once and reused below both for collecting the gene ids and for
    # filling the <groups> element.
    with open(input_pickle, 'rb') as handle:
        hogs_a_rhog_xml = pickle.load(handle)  # todo might have two elements inside?

    # Integer ids of all genes referenced by the HOGs. The ids are read from
    # the "id" attribute of every geneRef element, so they may have any
    # number of digits.
    gene_int_set = set()
    for hog_a_rhog_xml in hogs_a_rhog_xml:
        for element in hog_a_rhog_xml.iter():
            tag = element.tag
            if isinstance(tag, str) and tag.rsplit("}", 1)[-1] == "geneRef":
                gene_int_set.add(int(element.get("id")))

    fastoma_version = "0"
    orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/",
                                                   "origin": "FastOMA " + fastoma_version,
                                                   "originVersion": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                                   "version": "0.5"})

    with open(gene_id_pickle_file, 'rb') as handle:
        # gene_id_name[query_species_name] = list of (gene_idx_integer, query_prot_name)
        gene_id_name = pickle.load(handle)
    print("We read the gene_id_name dictionary with %d items" % len(gene_id_name))

    speciestree = read_species_tree(species_tree)
    taxonomy, name2taxid = convert_speciestree_to_orthoxml_taxonomy(speciestree)
    print("Now creating the header of orthoxml")

    id_transform_ = "noop"  # noop: No transformation, "UniProt": '>sp|P68250|1433B_BOVIN' --> P68250

    id_transformer = header_transformer(id_transform_)

    # #### create the header of orthoxml ####
    for query_species_name, list_prots in gene_id_name.items():
        genes_xml = None  # created lazily: species without a selected gene get no entry
        for (gene_idx_integer, query_prot_name) in list_prots:
            if gene_idx_integer not in gene_int_set:
                continue
            if genes_xml is None:
                species_xml = ET.SubElement(orthoxml_file, "species",
                                            attrib={"name": query_species_name,
                                                    "taxonId": str(name2taxid[query_species_name]),
                                                    "NCBITaxId": "0"})
                database_xml = ET.SubElement(species_xml, "database",
                                             attrib={"name": "database", "version": "2023"})
                genes_xml = ET.SubElement(database_xml, "genes")
            prot_id = id_transformer.transform(query_prot_name)
            ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})

    print("gene_xml is created.")
    # orthoxml_file.append(taxonomy)

    scores = ET.SubElement(orthoxml_file, "scores")
    ET.SubElement(scores, "scoreDef", {"id": "CompletenessScore",
                                       "desc": "Fraction of expected species with genes in the (Sub)HOG"})

    # #### create the groups of orthoxml ####
    groups_xml = ET.SubElement(orthoxml_file, "groups")

    for idx, hog_a_rhog_xml in enumerate(hogs_a_rhog_xml):
        fam = idx  # this could be improved
        groups_xml.append(update_hogids(fam, hog_a_rhog_xml, name2taxid))
    # for fam, hogs_a_rhog_xml in enumerate(iter_hogs(Path(pickle_folder)), start=1):
    #     groups_xml.append(update_hogids(fam, hogs_a_rhog_xml, name2taxid))
    print("converting the xml object to string.")

    output_xml_name = input_pickle + ".orthoxml"
    with open(output_xml_name, 'wb') as fh:
        ET.indent(orthoxml_file, space=' ', level=0)
        orthoxml = ET.ElementTree(orthoxml_file)
        orthoxml.write(fh, encoding="utf-8", xml_declaration=True)
    print("orthoxml is written in %s" % output_xml_name)

else:
    # An unknown mode used to end silently without writing anything.
    sys.exit("unknown mode '" + mode + "'\n" + USAGE)
"""Write one orthoxml file for every rootHOG pickle file of a FastOMA run.

how to run
    python write_orthoxml_per_rHOG.py [run_folder]

Reads ``<run_folder>/out_folder/gene_id_dic_xml.pickle`` and every file in
``<run_folder>/out_folder/pickle_rhogs_/``; writes
``<run_folder>/out_folder/orthoxml_out/<pickle file name>_.orthoxml``.
Without an argument the run folder below is used.
"""

import os
import pickle
import sys
import xml.etree.ElementTree as ET
from os import listdir
from xml.dom import minidom

DEFAULT_FOLDER = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_eukaryota/run_1june/"


def collect_gene_ids(hog_elements):
    """Return the set of integer ids of all geneRef elements inside the given HOGs."""
    gene_ids = set()
    for hog in hog_elements:
        for element in hog.iter():
            tag = element.tag
            if isinstance(tag, str) and tag.rsplit("}", 1)[-1] == "geneRef":
                gene_ids.add(int(element.get("id")))
    return gene_ids


def build_orthoxml(gene_id_name, gene_ids, hog_elements):
    """Build the orthoXML root element for one batch of HOGs.

    gene_id_name maps a species name to a list of
    (gene_idx_integer, query_prot_name) tuples. Only genes whose integer id
    is in gene_ids are listed, and a species is listed only if it has at
    least one such gene. The HOG elements are appended under <groups>.
    """
    orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA",
                                                   "originVersion": "Nov 2021", "version": "0.3"})

    for query_species_name, list_prots in gene_id_name.items():
        selected = [(gene_idx, prot_name) for (gene_idx, prot_name) in list_prots if gene_idx in gene_ids]
        if not selected:
            continue
        species_xml = ET.SubElement(orthoxml_file, "species",
                                    attrib={"name": query_species_name, "NCBITaxId": "1"})
        database_xml = ET.SubElement(species_xml, "database", attrib={"name": " database ", "version": "2020"})
        genes_xml = ET.SubElement(database_xml, "genes")
        for gene_idx_integer, query_prot_name in selected:
            ET.SubElement(genes_xml, "gene",
                          attrib={"id": str(gene_idx_integer), "protId": query_prot_name})

    groups_xml = ET.SubElement(orthoxml_file, "groups")
    for hog in hog_elements:
        groups_xml.append(hog)
    return orthoxml_file


def output_path(orthoxml_out_folder, pickle_file_name):
    """Return the orthoxml path that belongs to one pickle file."""
    return os.path.join(orthoxml_out_folder, pickle_file_name + "_.orthoxml")


def main(folder=DEFAULT_FOLDER):
    """Convert every pickle file of the run in *folder* to its own orthoxml file."""
    out_folder = os.path.join(folder, "out_folder")

    # NOTE(security): pickle.load can execute arbitrary code. Only run this
    # script on pickle files produced by your own FastOMA run.
    gene_id_pickle_file = os.path.join(out_folder, "gene_id_dic_xml.pickle")
    with open(gene_id_pickle_file, 'rb') as handle:
        # gene_id_name[query_species_name] = list of (gene_idx_integer, query_prot_name)
        gene_id_name = pickle.load(handle)
    print("gene_id_name read ", len(gene_id_name))

    pickle_folder = os.path.join(out_folder, "pickle_rhogs_")
    pickle_files_adress = listdir(pickle_folder)

    # Create the output folder if needed, so it no longer has to be made by hand.
    orthoxml_out_folder = os.path.join(out_folder, "orthoxml_out")
    os.makedirs(orthoxml_out_folder, exist_ok=True)

    for idx, pickle_file_adress in enumerate(pickle_files_adress):
        if idx % 100 == 0:
            print(idx)

        with open(os.path.join(pickle_folder, pickle_file_adress), 'rb') as handle:
            hogs_a_rhog_xml_batch = pickle.load(handle)  # a list of hog xml elements

        # A set makes the membership tests in build_orthoxml O(1).
        gene_ids = collect_gene_ids(hogs_a_rhog_xml_batch)
        print(len(gene_ids))

        orthoxml_file = build_orthoxml(gene_id_name, gene_ids, hogs_a_rhog_xml_batch)
        xml_str = minidom.parseString(ET.tostring(orthoxml_file)).toprettyxml(indent=" ")

        # Name the output after the CURRENT pickle file. Using the first file
        # name for every batch made all batches overwrite one output file.
        output_xml_name = output_path(orthoxml_out_folder, pickle_file_adress)
        with open(output_xml_name, "w") as file_xml:
            file_xml.write(xml_str)

    print("orthoxml files are written in " + orthoxml_out_folder)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_FOLDER)