├── .github
    └── workflows
    │   └── docker-image.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── archive
    ├── .idea
    │   ├── encodings.xml
    │   ├── inspectionProfiles
    │   │   └── profiles_settings.xml
    │   ├── libraries
    │   │   └── R_User_Library.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── other.xml
    │   ├── pore2tree.iml
    │   └── vcs.xml
    ├── TODO
    ├── down_py_script.sh
    ├── r2t_py_script.sh
    ├── requirements.txt
    ├── rm_py_script.sh
    ├── scripts
    │   ├── SraRunTable.txt
    │   ├── adjust_mapping_names.py
    │   ├── clean_fasta_cdna_cds.py
    │   ├── concat_alignments.py
    │   ├── down_py_script.sh
    │   ├── from_assemblies.py
    │   ├── get_alignment_similarity.py
    │   ├── get_computation_progress.py
    │   ├── get_highly_conserved_og_dna_hdf5.py
    │   ├── get_og_from_readmapping.py
    │   ├── get_reconstructed_seq_by_species.py
    │   ├── get_seq_completeness.py
    │   ├── get_topological_likelihoods.py
    │   ├── lsf_submit.py
    │   ├── lsf_submit_mouse.py
    │   ├── lsf_submit_paired.py
    │   ├── map2align.py
    │   ├── map2align_test.py
    │   ├── monitor_folder_size.py
    │   ├── orthogroups_fasta_to_marker_genes.py
    │   ├── orthogroups_fasta_to_marker_genes_by_groups.py
    │   ├── protein_converter.py
    │   ├── r2t_py_script.sh
    │   ├── relabel_msa.py
    │   ├── remove_species_from_alignment.py
    │   ├── rm_py_script.sh
    │   ├── sample_from_reads.py
    │   ├── sample_reads.py
    │   ├── sge_submit.py
    │   ├── sge_submit_paired.py
    │   ├── sge_submit_paired_comic.py
    │   ├── slurm_submit.py
    │   ├── subsample_nextstrain_covid_genomes_with_sra_accession.py
    │   ├── treecl
    │   │   └── select_alignments.py
    │   └── trim_alignment.py
    ├── set_marker_genes
    │   ├── bacteria_markergenes.zip
    │   └── mammalia_markergenes.zip
    ├── tests
    │   ├── info.log
    │   ├── input.log
    │   ├── test_aligner.py
    │   ├── test_og.py
    │   ├── test_ogset.py
    │   ├── test_reads.py
    │   ├── test_seqCompleteness.py
    │   └── test_use.py
    └── wiki_images
    │   ├── covid1.jpg
    │   ├── covid2.jpg
    │   ├── figure1.jpg
    │   ├── figure_1sp.jpg
    │   ├── oma_page_0.png
    │   ├── oma_page_1.png
    │   ├── oma_page_2.png
    │   ├── oma_page_3.png
    │   ├── oma_page_4.png
    │   ├── oma_page_5.png
    │   ├── oma_page_6.png
    │   ├── oma_page_7.png
    │   └── oma_page_8.png
├── bin
    └── read2tree
├── environment.yml
├── read2tree
    ├── Aligner.py
    ├── Analyzer.py
    ├── FastxReader.py
    ├── GuidedAssembler.py
    ├── Mapper.py
    ├── MultiProcessingLog.py
    ├── OGSet.py
    ├── Progress.py
    ├── Reads.py
    ├── ReferenceSet.py
    ├── TreeInference.py
    ├── __init__.py
    ├── _utils.py
    ├── file_utils
    │   ├── __init__.py
    │   └── context_managers.py
    ├── logging
    │   ├── log.yaml
    │   └── log.yaml.bak
    ├── main.py
    ├── parser
    │   ├── OMAOutputParser.py
    │   └── __init__.py
    ├── stats
    │   ├── Coverage.py
    │   ├── SeqCompleteness.py
    │   └── __init__.py
    ├── utils
    │   ├── __init__.py
    │   └── seq_utils.py
    └── wrappers
    │   ├── __init__.py
    │   ├── abstract_cli.py
    │   ├── aligners
    │       ├── __init__.py
    │       ├── base_aligner.py
    │       ├── mafft.py
    │       ├── muscle.py
    │       ├── probcons.py
    │       └── prographmsa.py
    │   ├── options.py
    │   ├── read_mappers
    │       ├── __init__.py
    │       ├── base_mapper.py
    │       ├── ngm.py
    │       ├── ngmlr.py
    │       └── parser.py
    │   └── treebuilders
    │       ├── __init__.py
    │       ├── base_treebuilder.py
    │       ├── fasttree.py
    │       ├── iqtree.py
    │       ├── parsers.py
    │       ├── phyml.py
    │       ├── raxml.py
    │       └── src
    │           └── pip-delete-this-directory.txt
├── setup.py
├── src
    └── pip-delete-this-directory.txt
└── tests
    ├── dna_ref.fa
    ├── marker_genes
        ├── OMAGroup_1001241.fa
        ├── OMAGroup_1008242.fa
        ├── OMAGroup_1065415.fa
        ├── OMAGroup_1121053.fa
        ├── OMAGroup_1125645.fa
        ├── OMAGroup_1133018.fa
        ├── OMAGroup_1151179.fa
        ├── OMAGroup_1163384.fa
        ├── OMAGroup_1171372.fa
        ├── OMAGroup_1188079.fa
        ├── OMAGroup_649157.fa
        ├── OMAGroup_649216.fa
        ├── OMAGroup_671579.fa
        ├── OMAGroup_681083.fa
        ├── OMAGroup_681195.fa
        ├── OMAGroup_683078.fa
        ├── OMAGroup_894224.fa
        ├── OMAGroup_898327.fa
        ├── OMAGroup_944789.fa
        └── OMAGroup_974829.fa
    ├── sample_1.fastq
    ├── sample_2.fastq
    ├── test_aligner.py
    ├── test_og.py
    ├── test_ogset.py
    ├── test_reads.py
    ├── test_seqCompleteness.py
    └── test_use.py


/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 |   release:
 9 |     types: [published]  # activity-type filter; the key is `types`, not `type`
10 | 
11 | env:
12 |   TEST_TAG: dessimozlab/read2tree:test
13 | 
14 | jobs:
15 | 
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |       -
22 |         name: Checkout
23 |         uses: actions/checkout@v2
24 |         with:
25 |           submodules: recursive
26 | 
27 |       -
28 |         name: Docker meta
29 |         id: meta
30 |         uses: docker/metadata-action@v4
31 |         with:
32 |           # list of Docker images to use as base name for tags
33 |           images: |
34 |             dessimozlab/read2tree
35 |           # generate Docker tags based on the following events/attributes
36 |           tags: |
37 |             type=schedule
38 |             type=ref,event=branch
39 |             type=ref,event=pr
40 |             type=semver,pattern={{version}}
41 |             type=semver,pattern={{major}}.{{minor}}
42 |             type=semver,pattern={{major}}
43 |             type=sha
44 |       -
45 |         name: Set up QEMU
46 |         uses: docker/setup-qemu-action@v2
47 |       -
48 |         name: Set up Docker Buildx
49 |         uses: docker/setup-buildx-action@v2
50 | 
51 |       -
52 |         name: Build and export to docker for testing
53 |         uses: docker/build-push-action@v3
54 |         with:
55 |           context: .
56 |           load: true
57 |           tags: ${{ env.TEST_TAG }}
58 |       -
59 |         name: Test
60 |         run: |
61 |           docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/dna_ref.fa --reads /reads/sample_1.fastq --output_path /out
62 |           if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
63 |       -
64 |         name: Login to DockerHub
65 |         if: github.event_name != 'pull_request' && github.event_name != 'push'
66 |         uses: docker/login-action@v2
67 |         with:
68 |           username: ${{ secrets.DOCKER_HUB_USERNAME }}
69 |           password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
70 | 
71 |       -
72 |         name: Build and push
73 |         uses: docker/build-push-action@v3
74 |         with:
75 |           context: .
76 |           platforms: linux/amd64
77 |           push: ${{ github.event_name != 'push' && github.event_name != 'pull_request' }}
78 |           tags: ${{ steps.meta.outputs.tags }}
79 |           labels: ${{ steps.meta.outputs.labels }}
80 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | build
 3 | dist
 4 | read2tree.egg-info
 5 | docs/*
 6 | tmp/*
 7 | .Rhistory
 8 | .RData
 9 | *pyc
10 | tmp
11 | **/.ropeproject
12 | **/__pycache__
13 | **/.ipynb_checkpoints
14 | .idea/workspace.xml
15 | .python-version
16 | .DS_Store
17 | tests/output
18 | tests/mplog.log
19 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # base image
 2 | FROM continuumio/miniconda3
 3 | 
 4 | LABEL software="read2tree"
 5 | 
 6 | 
 7 | WORKDIR /app
 8 | 
 9 | # Create the environment:
10 | COPY environment.yml .
11 | 
12 | RUN apt-get -qq update \
13 |     && apt-get install -y --no-install-recommends \
14 |         wget \
15 |     && rm -rf /var/lib/apt/lists/*
16 | 
17 | RUN conda env create -f environment.yml
18 | 
19 | # Make RUN commands use the new environment:
20 | SHELL ["conda", "run", "-n", "read2tree_env", "/bin/bash", "-c"]
21 | 
22 | # Fail the build early if these packages cannot be imported from the environment:
23 | RUN echo "Make sure numpy is installed:" \
24 |     && python -c "import numpy" \
25 |     && python -c "import ete3" \
26 |     && python -c "import pysam"
27 | 
28 | COPY . .
29 | RUN python setup.py install
30 | # key=value form; the space-separated `ENV key value` form is legacy syntax
31 | ENV PATH=/app/read2tree/bin:/opt/conda/envs/read2tree_env/bin:$PATH
32 | 
33 | WORKDIR /run
34 | 
35 | ENTRYPOINT ["read2tree"]
36 | # With no arguments the container prints the read2tree help text
37 | CMD ["-h"]
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 webfucktory
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/archive/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Encoding" addBOMForNewFiles="with NO BOM" />
4 | </project>


--------------------------------------------------------------------------------
/archive/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="PROJECT_PROFILE" value="Default" />
4 |     <option name="USE_PROJECT_PROFILE" value="false" />
5 |     <version value="1.0" />
6 |   </settings>
7 | </component>


--------------------------------------------------------------------------------
/archive/.idea/libraries/R_User_Library.xml:
--------------------------------------------------------------------------------
1 | <component name="libraryTable">
2 |   <library name="R User Library">
3 |     <CLASSES />
4 |     <SOURCES />
5 |   </library>
6 | </component>


--------------------------------------------------------------------------------
/archive/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (r2t)" project-jdk-type="Python SDK" />
4 |   <component name="PythonCompatibilityInspectionAdvertiser">
5 |     <option name="version" value="3" />
6 |   </component>
7 | </project>


--------------------------------------------------------------------------------
/archive/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/pore2tree.iml" filepath="$PROJECT_DIR$/.idea/pore2tree.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/archive/.idea/other.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="PySciProjectComponent">
4 |     <option name="PY_SCI_VIEW" value="true" />
5 |     <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
6 |   </component>
7 | </project>


--------------------------------------------------------------------------------
/archive/.idea/pore2tree.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="jdk" jdkName="Python 3.5 (r2t)" jdkType="Python SDK" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |     <orderEntry type="library" name="R User Library" level="project" />
 8 |     <orderEntry type="library" name="R Skeletons" level="application" />
 9 |   </component>
10 |   <component name="PyDocumentationSettings">
11 |     <option name="renderExternalDocumentation" value="true" />
12 |   </component>
13 |   <component name="TestRunnerService">
14 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
15 |   </component>
16 | </module>


--------------------------------------------------------------------------------
/archive/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/archive/TODO:
--------------------------------------------------------------------------------
1 | added here something to test windows bash
2 | * add better error handling when mapping doesn't work
3 | * tables needs hdf5 dependencies
4 | * some git repositories are difficult to access
5 | * each time the mapping function finishes it should check whether the currently running mapping is the
6 | last one and if so then once this is done it should delete all unnecessary files
7 | 


--------------------------------------------------------------------------------
/archive/down_py_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.o%J
 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.e%J
 4 | #BSUB -u david.dylus@unil.ch
 5 | #BSUB -J down_GLYSP
 6 | #BSUB -n 4
 7 | #BSUB -R "span[ptile=4]"
 8 | #BSUB -R "rusage[mem=4000]"
 9 | #BSUB -M 4000000
10 | srr=SRR3115005
11 | speciesid=GLYSP
12 | module add Utility/aspera_connect/3.7.4.147727
13 | source activate r2t
14 | mkdir -p "/scratch/beegfs/weekly/ddylus/avian/reads/$speciesid"
15 | echo "Created read $speciesid"  # double quotes so the variable is expanded
16 | cd "/scratch/beegfs/weekly/ddylus/avian/reads/$speciesid" || exit 1  # never download into the wrong directory
17 | ascp -v -QT -k1 -l100M -i /software/Utility/aspera_connect/3.7.4.147727/etc/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/${srr:0:6}/$srr/$srr.sra ./
18 | echo 'Finished download'
19 | parallel-fastq-dump -s *.sra -t 4 -O . --split-files --tmpdir .
20 | echo 'Finished getting fastq from sra and split files'
21 | mv -- *_1.fastq "${speciesid}_1.fq"
22 | mv -- *_2.fastq "${speciesid}_2.fq"
23 | echo 'Finished moving files'


--------------------------------------------------------------------------------
/archive/r2t_py_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.o%J
 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.e%J
 4 | #BSUB -u david.dylus@unil.ch
 5 | #BSUB -J r2t_GLYSP
 6 | #BSUB -n 4
 7 | #BSUB -R "span[ptile=4]"
 8 | #BSUB -R "rusage[mem=10000]"
 9 | #BSUB -M 10000000
10 | source activate r2t
11 | reads=/scratch/beegfs/weekly/ddylus/avian/reads/GLYSP
12 | cd /scratch/beegfs/weekly/ddylus/avian/r2t/ || exit 1  # do not run read2tree from an unexpected directory
13 | python -W ignore ~/opt/read2tree/bin/read2tree --standalone_path /scratch/beegfs/weekly/ddylus/avian/marker_genes/ --dna_reference /scratch/beegfs/weekly/ddylus/avian/eukaryotes.cdna.fa --reads "$reads/GLYSP_1.fq" "$reads/GLYSP_2.fq" --output_path /scratch/beegfs/weekly/ddylus/avian/r2t/ --single_mapping /scratch/beegfs/weekly/ddylus/avian/r2t/02_ref_dna/MELGA_OGs.fa --threads 4 --min_species 8


--------------------------------------------------------------------------------
/archive/requirements.txt:
--------------------------------------------------------------------------------
 1 | biopython
 2 | numpy>=1.13.3
 3 | scipy
 4 | lxml
 5 | pandas
 6 | Cython
 7 | ete3>=3.0.0b35
 8 | pyparsing>=2.1.10
 9 | pysam>=0.11.2.2
10 | six>=1.10.0
11 | requests>=2.13.0
12 | dendropy>=4.3.0
13 | tqdm>=4.19.1
14 | pyham
15 | pyyaml
16 | multiprocessing_logging
17 | 


--------------------------------------------------------------------------------
/archive/rm_py_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.o%J
 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.e%J
 4 | #BSUB -u david.dylus@unil.ch
 5 | #BSUB -J rm_GLYSP
 6 | #BSUB -n 1
 7 | #BSUB -R "span[ptile=1]"
 8 | #BSUB -R "rusage[mem=1000]"
 9 | #BSUB -M 1000000
10 | rm -r /scratch/beegfs/weekly/ddylus/avian/reads/GLYSP  # recursively delete the GLYSP reads directory


--------------------------------------------------------------------------------
/archive/scripts/adjust_mapping_names.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import shutil
 3 | import os
 4 | import getopt
 5 | import glob
 6 | 
 7 | from Bio import SeqIO
 8 | from tables import *
 9 | 
10 | def main():
11 | 
12 |     try:
13 |         opts, args = getopt.getopt(sys.argv[1:], "m:r:h", ["mapping_folder=", "reference_folder="])
14 |     except getopt.GetoptError as e:
15 |         print(str(e))
16 |         print('get_seq_completeness.py -m <mapping_folder>')
17 |         sys.exit(2)
18 | 
19 |     mapping_folder = None
20 | 
21 |     for opt, arg in opts:
22 |         if opt == '-h':
23 |             print('get_seq_completeness.py -m <mapping_folder>')
24 |             sys.exit()
25 |         elif opt in ("-m", "--mapping_folder"):
26 |             mapping_folder = arg
27 |             if mapping_folder[-1] is not "/":
28 |                 mapping_folder += "/"
29 |         else:
30 |             assert False, "unhandled option"
31 | 
32 |     if mapping_folder:
33 |         for file in glob.glob(mapping_folder + "/*.fa"):
34 |             if "_OGs" not in os.path.basename(file):
35 |                 species_name = os.path.basename(file).split("_")[0]
36 |                 new_file_name = species_name + "_OGs_consensus.fa"
37 |                 shutil.move(file, os.path.join(mapping_folder, new_file_name))
38 |         for file in glob.glob(mapping_folder + "/*cov.txt"):
39 |             if "_OGs" not in os.path.basename(file):
40 |                 species_name = os.path.basename(file).split("_")[0]
41 |                 new_file_name = species_name + "_OGs_cov.txt"
42 |                 shutil.move(file, os.path.join(mapping_folder, new_file_name))
43 | 
44 | if __name__ == "__main__":
45 |     main()
46 | 


--------------------------------------------------------------------------------
/archive/scripts/clean_fasta_cdna_cds.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from Bio import SeqIO
  3 | from Bio.Seq import Seq
  4 | import sys
  5 | from os import listdir
  6 | import os
  7 | 
  8 | 
  9 | 
 10 | 
 11 | def read_fasta_files(input_folder, format_input="fna"):
 12 | 
 13 |     files = listdir(input_folder)
 14 |     records_all = []
 15 |     file_names = [] 
 16 |     for file in files:
 17 |         sp_name = file.split(".")[:-1]
 18 |         if file.split(".")[-1] == format_input:
 19 |             file_names.append(file)
 20 |             records = list(SeqIO.parse(input_folder + file, "fasta"))
 21 |             records_all.append(records)        
 22 |         else:
 23 |             print("we are not reading the file "+str(input_folder+file)+" since extension is not faa.")
 24 |     if records_all:
 25 |         print("there are ", len(file_names), format_input, " files, and the first file has ", len(records_all[0]), "sequences in it.") 
 26 |     else:
 27 |         print("there is no  " +format_input, " files in ",input_folder) 
 28 |     return file_names, records_all
 29 | 
 30 | 
 31 | def create_five_letter(file_names, output_five_letter_tsv = "clean_five_letter_species.tsv"):
 32 |     
 33 |     fiveLetter_species_dic = {}
 34 |     countr = 0
 35 |     for file_name in file_names:
 36 |         fiveLetter_species = "s" + str(countr).zfill(4) 
 37 |         fiveLetter_species_dic[file_name] = fiveLetter_species
 38 |         countr += 1
 39 |     file_out = open(output_five_letter_tsv, "w")
 40 |     for species_name, fiveLetter in fiveLetter_species_dic.items():
 41 |         file_out.write(species_name + "\t" + fiveLetter + "\n")
 42 |     file_out.close()
 43 |     print("the five letter codes for each faa files are written in "+output_five_letter_tsv)
 44 | 
 45 |     return fiveLetter_species_dic
 46 | 
 47 | 
 48 | 
 49 | def clean_translate(records ,species_fivelet):
 50 |     
 51 |     records_nuc = []
 52 |     records_aa = []
 53 |     for record in records:
 54 |         sequence = record.seq
 55 |         remainder = len(sequence) % 3
 56 |         if remainder != 0:
 57 |             sequence +=Seq('N' * (3 - remainder)) 
 58 |             record.seq= sequence
 59 |     
 60 |         id_old = str(record.id).replace("_","").replace(".","")
 61 |         id_new=  species_fivelet + id_old
 62 |         
 63 |         nuc_seq= SeqIO.SeqRecord(sequence, id=id_new, description="cleaned for r2t", name = id_new)
 64 |         
 65 |         protein_seq = sequence.translate()
 66 |         protein_seq = SeqIO.SeqRecord(protein_seq, id=id_new, description="cleaned for r2t", name = id_new)
 67 |     
 68 |         
 69 |         records_nuc.append(nuc_seq)
 70 |         records_aa.append(protein_seq)
 71 |     
 72 |     print("the clean aa and nuc for "+species_fivelet+" is ready")
 73 |     
 74 |     return records_nuc, records_aa
 75 | 
 76 | 
 77 | 
 78 | 
 79 | 
 80 | if __name__ == '__main__':
 81 | 
 82 |     input_folder_fna = sys.argv[1] + "/"  # "myfolder/input_fna/" #
 83 |         
 84 |     file_names, records_all = read_fasta_files(input_folder_fna, "fna")
 85 |     fiveLetter_species_dic = create_five_letter(file_names)
 86 |     
 87 |     
 88 |     folder_aa= "clean_aa"
 89 |     
 90 |     
 91 |     if not os.path.exists(folder_aa):
 92 |         os.makedirs(folder_aa)
 93 |     else:
 94 |         print("ERROR the folder exists "+folder_aa +" better to remove it ")
 95 |     
 96 |     records_nuc_all_clean=[]
 97 |     for idx in range(len(file_names)):
 98 |         file_name = file_names[idx]
 99 |         records = records_all[idx]    
100 |         species_fivelet = fiveLetter_species_dic[file_name]
101 |     
102 |         records_nuc, records_aa = clean_translate(records ,species_fivelet)
103 |             
104 |         SeqIO.write(records_aa, folder_aa+"/"+species_fivelet+".fa", "fasta")
105 |         
106 |         records_nuc_all_clean += records_nuc # one big list 
107 |     
108 |     
109 |     SeqIO.write(records_nuc_all_clean, "dna_ref.fa", "fasta")
110 |     
111 |     print("we wrote "+str(len(file_names))+" faa fiels in the folder "+folder_aa+" and the nucluetide sequences all together in dna_ref.fa" )
112 | 
113 |     print("Now you can use the folder with OMA standalone" )
114 | 


--------------------------------------------------------------------------------
/archive/scripts/concat_alignments.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import getopt
 4 | import glob
 5 | from Bio import AlignIO, SeqIO
 6 | 
 7 | from zoo.seq_utils.utils import concatenate
 8 | 
 9 | <<<<<<< Updated upstream
10 | def concatenate_alignments(folder):
11 | =======
12 | def concatenate_alignments(folder, min_taxa=0):
13 | >>>>>>> Stashed changes
14 |     all_og_alignments = []
15 |     all_og_align_pos = {}
16 |     start = 0
17 |     for f in glob.glob(folder+'*.phy'):
18 |         used_ogs = 0
19 |         if os.path.getsize(f) > 0:
20 |             try:
21 |                 msa = AlignIO.read(f, "phylip-relaxed")
22 |             except ValueError:
23 |                 msa = AlignIO.read(f, "fasta")
24 |         #for record in msa:
25 |         #    record.id = record.id[0:5]
26 |         #msa[-1].id = "CANAL"
27 |             if len(msa) >= min_taxa:
28 |                 print(f)
29 |                 used_ogs =+ 1
30 |                 all_og_alignments.append(msa)
31 |             #all_og_align_pos[f] = [start, start + len(record.seq)]
32 |             #start = len(record.seq) + 1
33 |     con_alignment = concatenate(all_og_alignments)
34 |     print('OGs used: {}!'.format(used_ogs))
35 |     return con_alignment
36 | 
37 | 
38 | def main():
39 | 
40 |     try:
41 |       opts, args = getopt.getopt(sys.argv[1:], "f:m:o:", ["folder=", "min_taxa=", "out_file="])
42 |     except getopt.GetoptError as e:
43 |         print(str(e))
44 |         print('concat_alignments.py -f <folder> -m <min_taxa> -o <out_file>')
45 |         sys.exit(2)
46 | 
47 |     seq_folder = None
48 |     out_file = None
49 |     min_taxa = 0
50 | 
51 |     for opt, arg in opts:
52 |         if opt == '-h':
53 |             print('concat_alignments.py -f <folder> -m <min_taxa> -o <out_folder> -d')
54 |             sys.exit()
55 |         elif opt in ("-f", "--folder"):
56 |             seq_folder = arg
57 |         elif opt in ("-o", "--out_file"):
58 |             out_file = arg
59 |         elif opt in ("-m", "--min_taxa"):
60 |             min_taxa = int(arg)
61 |         else:
62 |             assert False, "unhandled option"
63 | 
64 | 
65 | 
66 |     if seq_folder[-1] is not "/":
67 |         seq_folder += "/"
68 |     
69 |     if min_taxa > 0:
70 |       out_file = out_file+"_"+str(min_taxa)+".phy"
71 | 
72 |     alignment = concatenate_alignments(seq_folder, min_taxa=min_taxa)
73 |     if alignment is not None:
74 |         align_output = open(out_file, "w")
75 |         AlignIO.write(alignment, align_output, "phylip-relaxed")
76 |         align_output.close()
77 | 
78 | if __name__ == "__main__":
79 |     main()
80 | 


--------------------------------------------------------------------------------
/archive/scripts/down_py_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.o%J
 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.e%J
 4 | #BSUB -u david.dylus@unil.ch
 5 | #BSUB -J down_GLYSP
 6 | #BSUB -n 1
 7 | #BSUB -R "span[ptile=1]"
 8 | #BSUB -R "rusage[mem=2000]"
 9 | #BSUB -M 2000000
10 | srr=SRR3115005
11 | speciesid=GLYSP
12 | module add Utility/aspera_connect/3.7.4.147727
13 | module add UHTS/Analysis/sratoolkit/2.8.2.1
14 | source activate r2t
15 | mkdir /scratch/beegfs/weekly/ddylus/avian/reads/$speciesid
16 | echo 'Created read $speciesid'
17 | cd /scratch/beegfs/weekly/ddylus/avian/reads/$speciesid
18 | ascp -v -QT -k1 -l100M -i /software/Utility/aspera_connect/3.7.4.147727/etc/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/${srr:0:6}/$srr/$srr.sra ./
19 | echo 'Finished download'
20 | fastq-dump --split-files --gzip $srr.sra
21 | echo 'Finished getting fastq from sra and split files'
22 | mv *\_1.* $speciesid\_1.fq.gz
23 | mv *\_2.* $speciesid\_2.fq.gz
24 | echo 'Finished moving files'


--------------------------------------------------------------------------------
/archive/scripts/get_highly_conserved_og_dna_hdf5.py:
--------------------------------------------------------------------------------
  1 | from tables import *
  2 | from Bio import SeqIO
  3 | from Bio.Seq import Seq
  4 | from Bio.SeqRecord import SeqRecord
  5 | from pyoma.browser import db
  6 | import familyanalyzer as fa
  7 | 
  8 | # Parameters: a group must cover at least MIN_SPECIES species; a non-zero DUP_RATIO enables the HOG filter below
  9 | MIN_SPECIES = 20
 10 | DUP_RATIO = 0
 11 | DIR = '/Users/daviddylus/Research/read2tree/reference_datasets/Dataset1/Output/'
 12 | 
 13 | # Input files: the two OrthoXML files under DIR and the HDF5 database file
 14 | hog_XML = DIR+'HierarchicalGroups.orthoxml'
 15 | og_XML = DIR+'OrthologousGroups.orthoxml'
 16 | h5file = open_file("/Volumes/Untitled/OmaServer.h5", mode="r")  # NOTE(review): never closed in this script
 17 | 
 18 | genomeTab = h5file.root.Genome
 19 | dbObj = db.Database(h5file)
 20 | omaIdObj = db.OmaIdMapper(dbObj)
 21 | # Optional step, skipped while DUP_RATIO is 0: collect the HOGs that pass both thresholds
 22 | if DUP_RATIO != 0:
 23 |   hog_op = fa.OrthoXMLParser(hog_XML)
 24 |   gene_family_xml_nodes_hog = hog_op.getToplevelGroups()
 25 |   # keep families with at least MIN_SPECIES species and a genes-per-species ratio of at most DUP_RATIO
 26 |   hog_families_X = {}
 27 |   for i, family in enumerate(gene_family_xml_nodes_hog):
 28 |     family_id = family.get('id')
 29 |     genes_per_hog = [val for sublist in hog_op.getGenesPerSpeciesInFam(family).values() for val in sublist]
 30 |     species_per_hog = hog_op.getGenesPerSpeciesInFam(family).keys()
 31 |     duplication_ratio = float(len(genes_per_hog)) / float(len(species_per_hog))
 32 |     if len(species_per_hog) >= MIN_SPECIES and duplication_ratio <= DUP_RATIO:
 33 |       hog_families_X[family_id] = genes_per_hog
 34 | 
 35 |   print(len(hog_families_X))
 36 | 
 37 | # Keep every OG that covers at least MIN_SPECIES species (family id -> list of its genes)
 38 | og_op = fa.OrthoXMLParser(og_XML)
 39 | gene_family_xml_nodes_og = og_op.getToplevelGroups()
 40 | og_families_X = {}
 41 | for i, family in enumerate(gene_family_xml_nodes_og):
 42 |     family_id = family.get('id')
 43 |     genes_per_og = [val for sublist in og_op.getGenesPerSpeciesInFam(family).values() for val in sublist]
 44 |     species_per_og = og_op.getGenesPerSpeciesInFam(family).keys()
 45 |     if len(species_per_og) >= MIN_SPECIES:
 46 |       og_families_X[family_id] = genes_per_og
 47 | 
 48 | print(len(og_families_X))
 49 | # Build family_map (OG id -> entry numbers) and entries_map_omaids (entry number -> full protId string)
 50 | if DUP_RATIO != 0:
 51 |   family_map = {}
 52 |   entries_map_omaids = {}
 53 |   cpt = 0
 54 |   for og in og_families_X:
 55 |     cpt += 1
 56 |     if cpt % 10 == 0:
 57 |       print("{} on {}".format(cpt, len(og_families_X)))
 58 |     a = og_families_X[og]
 59 |     for hog in hog_families_X:
 60 |       b = hog_families_X[hog]
 61 |       if len(set(a).intersection(b)) == 30:  # NOTE(review): hard-coded 30 shared genes; confirm whether it should depend on MIN_SPECIES
 62 |         oma_ids_full = [og_op.mapGeneToXRef(val, 'protId') for val in og_families_X[og]]
 63 |         oma_ids = [og_op.mapGeneToXRef(val, 'protId').split(' | ')[0] for val in og_families_X[og]]
 64 |         entries = [omaIdObj.omaid_to_entry_nr(val) for val in oma_ids]
 65 |         for oma_id in oma_ids_full:
 66 |           entries_map_omaids[omaIdObj.omaid_to_entry_nr(oma_id.split(' | ')[0])] = oma_id
 67 |         family_map[og] = entries
 68 |         break
 69 |   print(len(entries_map_omaids))
 70 | else:
 71 |   family_map = {}
 72 |   entries_map_omaids = {}
 73 |   cpt = 0
 74 |   for og in og_families_X:
 75 |     cpt += 1
 76 |     if cpt % 1000 == 0:
 77 |       print(og_op.mapGeneToXRef(og_families_X[og][0], 'protId').split(' | ')[0])
 78 |       print("{} on {}".format(cpt, len(og_families_X)))
 79 |     oma_ids_full = [og_op.mapGeneToXRef(val, 'protId') for val in og_families_X[og] if og_op.mapGeneToXRef(val, 'protId')]
 80 |     oma_ids = [val.split(' | ')[0] for val in oma_ids_full]
 81 |     entries = [omaIdObj.omaid_to_entry_nr(val) for val in oma_ids if omaIdObj.omaid_to_entry_nr(val)]
 82 |     print(entries)
 83 |     for oma_id in oma_ids_full:
 84 |       entries_map_omaids[omaIdObj.omaid_to_entry_nr(oma_id.split(' | ')[0])] = oma_id
 85 |     family_map[og] = entries
 86 |   print(len(entries_map_omaids))
 87 | 
 88 | # Invert family_map: entry number -> OG id
 89 | family_map_invert = {}
 90 | for key in family_map:
 91 |   for val in family_map[key]:
 92 |     family_map_invert[val]=key
 93 | 
 94 | print(len(family_map_invert))
 95 | # One cDNA record per entry, with the id '<protId string>| OG<OG id>'
 96 | records = []
 97 | for key in family_map_invert:
 98 |   new_id = entries_map_omaids[key] + '| OG' + family_map_invert[key]
 99 |   record = SeqRecord(Seq(dbObj.get_cdna(key)), id=new_id, description="")
100 |   records.append(record)
101 | # Write all records to dataset2.fasta in the current working directory
102 | with open("dataset2.fasta", "w") as output_handle:
103 |   SeqIO.write(records, output_handle, "fasta")
104 | 
105 | 


--------------------------------------------------------------------------------
/archive/scripts/get_og_from_readmapping.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import getopt
 3 | from Bio import SeqIO
 4 | from Bio.SeqIO import FastaIO
 5 | 
 6 | def get_ogs(mapped_reads, og_data):
 7 |     og_data_names = [record.name for record in og_data]
 8 |     og_data_ogs = [record.description.split("| ")[-1] for record in og_data]
 9 |     list_of_ogs = {}
10 |     for record in mapped_reads:
11 |         if record.name in og_data_names:
12 |             og_index = og_data_names.index(record.name)
13 |             og_name = og_data[og_index].description.split("| ")[-1]
14 |             indices = [i for i, x in enumerate(og_data_ogs) if x == og_name]
15 |             seq_to_write = [og_data[i] for i in indices]
16 |             record.seq = record.seq.upper()
17 |             record.id = "SRR400661_" + record.id
18 |             record.name = "SRR400661_" + record.name
19 |             record.description = "SRR400661_" + record.description
20 |             seq_to_write.append(record)
21 |             list_of_ogs[og_name] = seq_to_write
22 |     return list_of_ogs
23 | 
24 | 
25 | 
def main():
    """
    Command-line entry point: write one FASTA file per orthologous group that
    received a mapped read.

    Options:
        -r / --mapped_reads  FASTA file with the mapped reads
        -d / --ref_data      FASTA file with the reference OG sequences
        -o / --out_folder    folder receiving one ``<OG>.fasta`` per group
        -h / --help          print usage and exit
    """
    usage = 'get_og_from_readmapping.py -r <mapped_reads> -d <ref_data> -o <out_folder>'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:d:o:h",
                                   ["mapped_reads=", "ref_data=", "out_folder=", "help"])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapped_reads = None
    ref_data = None
    out_folder = None

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-r", "--mapped_reads"):
            mapped_reads = arg
        elif opt in ("-d", "--ref_data"):
            ref_data = arg
        elif opt in ("-o", "--out_folder"):
            out_folder = arg
        else:
            assert False, "unhandled option"

    # All three arguments are mandatory; fail with the usage text instead of
    # an obscure error on None further down.
    if not mapped_reads or not ref_data or not out_folder:
        print(usage)
        sys.exit(2)

    read_mappings = list(SeqIO.parse(mapped_reads, "fasta"))
    og_data = list(SeqIO.parse(ref_data, "fasta"))

    # Compare by value: identity ("is not") on str literals is unreliable.
    if not out_folder.endswith("/"):
        out_folder += "/"

    list_of_ogs = get_ogs(read_mappings, og_data)
    for og in list_of_ogs:
        file_name = out_folder + og + ".fasta"
        # Context manager guarantees the file is flushed and closed.
        with open(file_name, "w") as handle:
            fasta_out = FastaIO.FastaWriter(handle, wrap=None)
            fasta_out.write_file(list_of_ogs[og])


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/archive/scripts/get_reconstructed_seq_by_species.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import getopt
 4 | import glob
 5 | import pandas as pd
 6 | 
 7 | from Bio import SeqIO
 8 | from tables import *
 9 | from Bio.SeqIO.FastaIO import FastaWriter
10 | 
11 | 
def read_seq_records(folder):
    """
    Load every ``*.fa`` file found directly in ``folder``.

    :param folder: directory that is searched for ``*.fa`` files
    :return: dict keyed by the part of the file name before the first ``_``;
             each value is a dict mapping record id -> record for that file
    """
    records_by_species = {}
    pattern = os.path.join(folder, "*.fa")
    for fasta_path in glob.glob(pattern):
        species = os.path.basename(fasta_path).split("_")[0]
        by_id = {}
        for rec in SeqIO.parse(fasta_path, "fasta"):
            by_id[rec.id] = rec
        records_by_species[species] = by_id
    return records_by_species
18 | 
def read_sc_file(file):
    """
    Read a CSV file with ``gene_id`` and ``og`` columns and build one
    identifier per row.

    :param file: path to the CSV file
    :return: list of strings of the form ``<gene_id>_<og>_<og>``, in row order
    """
    table = pd.read_csv(file)
    identifiers = []
    for gene_id, og in zip(table['gene_id'], table['og']):
        # The OG label is appended twice, exactly as the consumers expect.
        identifiers.append(gene_id + "_" + og + "_" + og)
    return identifiers
22 | 
23 | 
def main():
    """
    Command-line entry point: pick the sequences listed in the completeness
    file out of the mapping folder and write them to ``<name>_consensus.fa``
    in the current working directory.

    Options:
        -m / --mapping_folder  folder with the per-species ``*.fa`` files
        -s / --sc_file         CSV file with ``gene_id`` and ``og`` columns
        -h                     print usage and exit
    """
    usage = 'get_reconstructed_seq_by_species.py -m <mapping_folder> -s <sc_file>'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:s:h", ["mapping_folder=", "sc_file="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    sc_file = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # Compare by value: "is not" tests identity, not string equality.
            if not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-s", "--sc_file"):
            sc_file = arg
        else:
            assert False, "unhandled option"

    # Both arguments are mandatory; show usage instead of crashing on None.
    if not mapping_folder or not sc_file:
        print(usage)
        sys.exit(2)

    all_records = read_seq_records(mapping_folder)
    # The first five characters of an identifier select the species entry.
    selected_seq = [all_records[idx[0:5]][idx] for idx in read_sc_file(sc_file)]
    print(selected_seq)
    file_name = mapping_folder.split("03_mapping_")[-1].split("/")[0]+"_consensus.fa"
    with open(file_name, "w") as handleF:
        writer = FastaWriter(handleF, wrap=None)
        writer.write_file(selected_seq)

if __name__ == "__main__":
    main()
60 | 


--------------------------------------------------------------------------------
/archive/scripts/get_seq_completeness.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import getopt
 4 | import glob
 5 | 
 6 | from Bio import SeqIO
 7 | from tables import *
 8 | from read2tree.stats.SeqCompleteness import SeqCompleteness
 9 | 
10 | 
def read_seq_records(file):
    """
    Parse a FASTA file completely.

    :param file: path (or handle) passed on to ``SeqIO.parse``
    :return: list with every record of the file, in file order
    """
    parsed = SeqIO.parse(file, "fasta")
    return [record for record in parsed]
13 | 
14 | 
def main():
    """
    Command-line entry point: for every reference ``*.fa`` file that has a
    matching ``<basename>_consensus.fa`` in the mapping folder, compute the
    sequence completeness and write it to ``<species>_OGs_sc.txt`` in the
    mapping folder.

    Options:
        -m / --mapping_folder    folder with the ``*_consensus.fa`` files
        -r / --reference_folder  folder with the reference ``*.fa`` files
        -h                       print usage and exit
    """
    usage = 'get_seq_completeness.py -m <mapping_folder> -r <reference_folder>'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:r:h", ["mapping_folder=", "reference_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    reference_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # Compare by value: "is not" tests identity, not string equality.
            if not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-r", "--reference_folder"):
            reference_folder = arg
            if not reference_folder.endswith("/"):
                reference_folder += "/"
        else:
            assert False, "unhandled option"

    # Previously a missing argument made the script exit silently.
    if not reference_folder or not mapping_folder:
        print(usage)
        sys.exit(2)

    for file in glob.glob(os.path.join(reference_folder, "*.fa")):
        species = os.path.basename(file).split("_")[0]
        print(species)
        ref_records = read_seq_records(file)
        mapping_file = os.path.join(mapping_folder, os.path.basename(file).split(".")[0]+"_consensus.fa")
        # References without a consensus file are skipped.
        if os.path.exists(mapping_file):
            map_records = read_seq_records(mapping_file)
            seqC = SeqCompleteness(ref_records)
            seqC.get_seq_completeness(map_records)
            seqC.write_seq_completeness(os.path.join(mapping_folder, species + "_OGs_sc.txt"))

if __name__ == "__main__":
    main()
55 | 


--------------------------------------------------------------------------------
/archive/scripts/map2align_test.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | import getopt
  4 | import glob
  5 | 
  6 | from Bio import SeqIO
  7 | from zoo.wrappers.treebuilders import Fasttree
  8 | from tables import *
  9 | from Bio import AlignIO
 10 | from zoo.wrappers.aligners import Mafft
 11 | from Bio.SeqIO import FastaIO
 12 | 
 13 | 
 14 | from zoo.seq_utils.utils import concatenate
 15 | 
 16 | 
 17 | def get_coverage(og):
 18 |     return (len(og[-1].seq)-og[-1].seq.count('X'))/len(og[-1].seq)
 19 | 
 20 | def perform_mapping(DIR_MAPPING, FILE_OGS):
 21 |     og_dict = {}
 22 |     '''read in og with aa seq'''
 23 |     og = list(SeqIO.parse(FILE_OGS, "fasta"))
 24 |     for record in og:
 25 |         key = record.description.split(" | ")[-1]
 26 |         if key in og_dict:
 27 |             ids = [rec.id for rec in og_dict[key]]
 28 |             if record.id not in ids:
 29 |                 og_dict[key].append(record)
 30 |         else:
 31 |             og_dict[key] = []
 32 |             og_dict[key].append(record)
 33 | 
 34 | 
 35 |     # parse the mapped reads to ogs to dictionary
 36 |     all_dict = {}
 37 |     for file in glob.glob(DIR_MAPPING + "*.fa"):
 38 |         og_name = file.split("_")[-1].split(".")[0]
 39 |         og = og_dict[og_name]
 40 | 
 41 |         # change ids to species names
 42 |         for i, record in enumerate(og):
 43 |             s = record.id[0:5]
 44 |             record.id = s
 45 |         all_dict[og_name] = og
 46 | 
 47 |     OG_OUT = DIR_MAPPING + 'origin_og/'
 48 |     if not os.path.exists(OG_OUT):
 49 |         os.makedirs(OG_OUT)
 50 | 
 51 |     for key, item in all_dict.items():
 52 |         file_name = OG_OUT + key + ".fa"
 53 |         fasta_out = FastaIO.FastaWriter(open(file_name, "w"), wrap=None)
 54 |         fasta_out.write_file(item)
 55 | 
 56 |     print("FINISHED PARSING OGs!")
 57 |     return all_dict
 58 | 
 59 | def read_alignments(folder):
 60 |     align_list = []
 61 |     for filename in glob.glob(folder+"*.phy"):
 62 |         # input_handle = open(filename, "rU")
 63 |         align_list.append(AlignIO.read(filename, "phylip-relaxed"))
 64 |     print("FINISHED READING ALIGNMENTS!")
 65 |     return align_list
 66 | 
 67 | def perform_alignment(all_dict, DIR_MAPPING):
 68 |     align_dict = {}
 69 |     align_list = []
 70 |     counter = 0
 71 |     for key, value in all_dict.items():
 72 |         mafft_wrapper = Mafft(value, datatype="PROTEIN")
 73 |         mafft_wrapper.options.options['--localpair'].set_value(True)
 74 |         mafft_wrapper.options.options['--maxiterate'].set_value(1000)
 75 |         alignment = mafft_wrapper()
 76 |         align_dict[key] = alignment
 77 |         align_list.append(alignment)
 78 |         counter += 1
 79 |         if counter % 50 == 0:
 80 |             print('{} of {} alignments done'.format(counter, len(all_dict)))
 81 | 
 82 |     ALIGN_OUT = DIR_MAPPING + 'origin_align/'
 83 | 
 84 |     if not os.path.exists(ALIGN_OUT):
 85 |         os.makedirs(ALIGN_OUT)
 86 |     print("WRITING ALIGNMENT FILES INTO: {}!".format(ALIGN_OUT))
 87 |     for key, value in align_dict.items():
 88 |         output_handle = open(ALIGN_OUT + key + ".phy", "w")
 89 |         AlignIO.write(value, output_handle, "phylip")
 90 |     print("FINISHED ALIGNMENTS!")
 91 |     return align_list
 92 | 
 93 | def concatenate_alignment(align_list, DIR_MAPPING):
 94 |     ALIGN_OUT = DIR_MAPPING + 'origin_align/'
 95 |     concat_align = concatenate(align_list)
 96 | 
 97 |     output_handle = open(ALIGN_OUT + "CONCAT.phy", "w")
 98 |     AlignIO.write(concat_align, output_handle, "phylip")
 99 |     print("FINISHED CONCATINATION!")
100 |     return concat_align
101 | 
def build_tree(concat_align, DIR_MAPPING):
    """
    Infer a tree from the concatenated alignment with the Fasttree wrapper
    and store it in ``<DIR_MAPPING>original_tree.nwk``.

    :param concat_align: alignment handed to ``Fasttree``
    :param DIR_MAPPING: folder prefix for the output file
    :return: the tree as returned by the wrapper
    """
    tree = Fasttree(concat_align, datatype="PROTEIN")()
    print("FINISHED TREE INFERENCE!")
    tree_path = DIR_MAPPING+"original_tree.nwk"
    with open(tree_path, "w") as newick_out:
        # The wrapper's result is written followed by a terminating ';'.
        newick_out.write("{};".format(tree))
    print("Resulting tree: {}".format(tree))
    return tree
111 | 
def main():
    """
    Command-line entry point: align the original OG sequences (or read
    existing alignments), concatenate them and build a tree.

    Options:
        -m / --mapping_folder    folder with the mapping files; also receives
                                 all outputs (always required)
        -o / --ortholog_file     FASTA file with the OG sequences (required
                                 unless -a is given)
        -a / --alignmnet_folder  folder with existing ``*.phy`` alignments
                                 (the misspelled long name is kept for
                                 compatibility)
        -h                       print usage and exit
    """
    usage = 'map2align_test.py -m <mapping_folder> -o <ortholog_file> -a <alignmnet_folder>'

    try:
        # "-t" used to be accepted here but had no handler, so passing it
        # always ended in the "unhandled option" assertion.
        opts, args = getopt.getopt(sys.argv[1:], "m:o:a:h", ["mapping_folder=", "ortholog_file=", "alignmnet_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    ortholog_file = None
    alignment_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # Compare by value: "is not" tests identity, not string equality.
            if not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-a", "--alignmnet_folder"):
            alignment_folder = arg
            if not alignment_folder.endswith("/"):
                alignment_folder += "/"
        elif opt in ("-o", "--ortholog_file"):
            # Must match the registered long option name "ortholog_file".
            ortholog_file = arg
        else:
            assert False, "unhandled option"

    # The mapping folder is the output location in every mode; the ortholog
    # file is only needed when alignments have to be computed.
    if not mapping_folder or (not alignment_folder and not ortholog_file):
        print(usage)
        sys.exit(2)

    if alignment_folder:
        align = read_alignments(alignment_folder)
    else:
        mapping = perform_mapping(mapping_folder, ortholog_file)
        align = perform_alignment(mapping, mapping_folder)

    concatenation = concatenate_alignment(align, mapping_folder)
    build_tree(concatenation, mapping_folder)


if __name__ == "__main__":
    main()
155 | 


--------------------------------------------------------------------------------
/archive/scripts/monitor_folder_size.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | import pandas as pd
 4 | import subprocess
 5 | 
 6 | 
 7 | def output_shell(line):
 8 |     """
 9 |     Save output of shell line that has pipes
10 |     taken from: https://stackoverflow.com/questions/7389662/link-several-popen-commands-with-pipes
11 |     :param line:
12 |     :return:
13 |     """
14 |     try:
15 |         shell_command = subprocess.Popen(line, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
16 |     except OSError:
17 |         return None
18 |     except ValueError:
19 |         return None
20 | 
21 |     (output, err) = shell_command.communicate()
22 |     shell_command.wait()
23 |     if shell_command.returncode != 0:
24 |         print("Shell command failed to execute")
25 |         return None
26 | 
27 |     return output
28 | 
def du(path):
    """
    Return the first field printed by ``du -sh <path>`` as text.

    :param path: file or directory handed to ``du``
    :return: first whitespace-separated field of the output, decoded as UTF-8
    """
    raw_output = subprocess.check_output(['du', '-sh', path])
    size_field = raw_output.split()[0]
    return size_field.decode('utf-8')
31 | 
def bjobs():
    """
    Count the lines of the ``bjobs`` output that contain ``RUN``.

    :return: the count as a plain decimal string (e.g. ``"3"``), so that it
             can be written to the CSV directly. ``"0"`` is returned when the
             pipeline reports failure: ``grep -c`` exits with status 1 when
             nothing matches, which ``output_shell`` turns into ``None``.
    """
    raw = output_shell("bjobs | grep -c 'RUN'")
    if raw is None:
        return "0"
    # output_shell yields bytes such as b'3\n'; str() on that would put
    # "b'3\\n'" into the CSV.
    return raw.decode('utf-8').strip()
34 | 
35 | 
if __name__ == "__main__":
    # Monitored path: first command-line argument, current directory otherwise.
    path = sys.argv[1] if len(sys.argv) > 1 else '.'

    time_interval = 10  # seconds between two samples
    current_time = 0    # elapsed time, counted in sleep intervals

    # Samples are appended so that an earlier log is preserved. The loop runs
    # until interrupted (Ctrl-C); the "with" block closes the file while the
    # KeyboardInterrupt propagates, as before.
    with open('./monitoring.csv', 'a') as file:
        # Write the header only into an empty file, otherwise every restart
        # would insert a second header in the middle of the data.
        file.seek(0, 2)
        if file.tell() == 0:
            file.write('current_time,folder_size,num_bjobs\n')
        while True:
            to_write = str(current_time)+','+str(du(path))+','+str(bjobs())+'\n'
            file.write(to_write)
            # Flush so the log can be followed while the monitor is running
            # and nothing is lost if the process is killed.
            file.flush()
            current_time += time_interval
            time.sleep(time_interval)


--------------------------------------------------------------------------------
/archive/scripts/orthogroups_fasta_to_marker_genes.py:
--------------------------------------------------------------------------------
  1 | from Bio.SeqIO.FastaIO import FastaWriter
  2 | from Bio import SeqIO
  3 | import tqdm, os, glob, re
  4 | from xml.dom import minidom
  5 | 
  6 | 
  7 | 
  8 | def _find_index_substring(ids, search_string, tmp_list):
  9 |     best_index = None
 10 |     max_occurence = 0
 11 |     tmp_ids = [re.sub(r'\..*', '', tmp) for tmp in tmp_list]
 12 |     use_ids = [re.sub(r'\W+', '', tmp_id) for tmp_id in tmp_ids]
 13 |     index = [i for i, s in enumerate(ids) if search_string in s]
 14 |     for i in index:
 15 |         string_occurence = len([k for k in use_ids if k in ids[i]])
 16 |         if string_occurence > max_occurence:
 17 |             best_index = i
 18 |             max_occurence = string_occurence
 19 |     if best_index:
 20 |         return best_index
 21 |     else:
 22 |         return None
 23 | 
 24 | 
 25 | def _get_all_ids(f_orthoxml):
 26 |     all_prot_ids = []
 27 |     xmldoc = minidom.parse(f_orthoxml)
 28 |     itemlist = xmldoc.getElementsByTagName('gene')
 29 |     print(" --- loading all protids ---")
 30 |     for s in tqdm.tqdm(itemlist):
 31 |         tmp = s.attributes['protId'].value
 32 |         all_prot_ids.append(tmp)
 33 |     return all_prot_ids
 34 | 
 35 | 
 36 | def _write(file, value):
 37 |     """
 38 |     Write output to fasta file
 39 |             :param file: file and location of outputfile
 40 |             :param value:
 41 |             :return:
 42 |     """
 43 |     handle = open(file, "w")
 44 |     writer = FastaWriter(handle, wrap=None)
 45 |     writer.write_file(value)
 46 |     handle.close()
 47 | 
 48 | 
 49 | def _get_species_id(record):
 50 |     if '[' in record.description and ']' in record.description:
 51 |         return record.description[record.description.find(
 52 |             "[")+1:record.description.find("]")]
 53 |     else:
 54 |         return record.id[0:5]
 55 | 
 56 | def run(orthogroups_fasta_folder, orthogroups_xml, output_path, min_species):
 57 |     if not os.path.exists(output_path):
 58 |         os.makedirs(output_path)
 59 |     all_prot_ids = _get_all_ids(orthogroups_xml)
 60 |     for f in tqdm.tqdm(glob.glob(os.path.join(orthogroups_fasta_folder, '*.fa'))):
 61 |         records = list(SeqIO.parse(f, 'fasta'))
 62 |         if len(records) >= min_species:
 63 |             for rec in records:
 64 |                 sp_id = _get_species_id(rec)
 65 |                 tmp_lst = rec.description.split()
 66 |                 if sp_id not in tmp_lst[0]:
 67 |                     tmp = tmp_lst[-2]
 68 |                     tmp_id = re.sub(r'\..*', '', tmp)
 69 |                     use_id = re.sub(r'\W+', '', tmp_id)
 70 |                     new_id = _find_index_substring(all_prot_ids, use_id, tmp_lst)
 71 |                     if new_id:
 72 |                         rec.id = all_prot_ids[new_id]
 73 |                         new_description = rec.description.split()[-1]
 74 |                         rec.description = new_description
 75 |                         rec.name = ''
 76 |             output_file = os.path.join(output_path,
 77 |                                        os.path.basename(f))
 78 |             _write(output_file, records)
 79 | 
 80 | 
 81 | if __name__ == "__main__":
 82 |     import argparse
 83 |     parser = argparse.ArgumentParser(
 84 |         description="""Transform OrthogroupsFasta into marker_genes""")
 85 |     parser.add_argument('--oxml', default=None,
 86 |                         help='[Default is none] Remove species present '
 87 |                         'in data set after mapping step completed to '
 88 |                         'build OGs. Input is comma separated list '
 89 |                         'without spaces, e.g. XXX,YYY,AAA.')
 90 |     parser.add_argument('--ofolder', default='marker_genes', required=True,
 91 |                         help='[Default is current directory] Path to '
 92 |                         'output directory.')
 93 |     parser.add_argument('--ofasta', default='.', required=True,
 94 |                         help='[Default is current directory] Path to '
 95 |                         'output directory.')
 96 |     parser.add_argument('--min_species', type=int, default=None,
 97 |                             help='Min number of species in selected '
 98 |                                  'orthologous groups. If not selected it will be '
 99 |                                  'estimated such that around 1000 OGs '
100 |                                  'are available.')
101 | 
102 |     conf = parser.parse_args()
103 | 
104 |     run(conf.ofasta, conf.oxml, conf.ofolder, conf.min_species)
105 | 


--------------------------------------------------------------------------------
/archive/scripts/orthogroups_fasta_to_marker_genes_by_groups.py:
--------------------------------------------------------------------------------
 1 | from Bio.SeqIO.FastaIO import FastaWriter
 2 | from Bio import SeqIO
 3 | import tqdm, os, glob
 4 | 
 5 | def _oma_replace(row):
 6 |     if 'OMA0000' in row:
 7 |         return 'OMA0000'
 8 |     elif 'OMA000' in row:
 9 |         return 'OMA000'
10 |     elif 'OMA00' in row:
11 |         return 'OMA00'
12 |     elif 'OMA0' in row:
13 |         return 'OMA0'
14 |     elif 'OMA' in row:
15 |         return 'OMA'
16 | 
17 | 
def _get_all_ids(orthogroups_txt):
    """
    Parse a tab-separated orthogroups table.

    The first column is the group id; its OMA prefix (as detected by
    ``_oma_replace``) is replaced by ``OG``. Every further column is one
    member: its first five characters are the species code and everything
    from the seventh character on is the new label.

    :param orthogroups_txt: path to the table; lines containing '#' and
                            blank lines are ignored
    :return: dict mapping group id -> {species code: label}
    """
    og_dic = {}
    with open(orthogroups_txt) as f:
        for l in f:
            if '#' in l:
                continue
            r = l.rstrip("\n").split("\t")
            # A blank line (typically the last one) has no group id; it used
            # to crash in str.replace(None, ...).
            if not r[0]:
                continue
            prefix = _oma_replace(r[0])
            # Ids without an OMA prefix are kept unchanged.
            tmp = r[0].replace(prefix, 'OG') if prefix else r[0]
            og_dic[tmp] = {i[0:5]: i[6:] for i in r[1:]}
    return og_dic
31 | 
32 | 
def _write(file, value):
    """
    Write output to fasta file
            :param file: file and location of outputfile
            :param value: sequence records handed to ``FastaWriter.write_file``
            :return: None
    """
    # The context manager closes the file even if writing raises.
    with open(file, "w") as handle:
        writer = FastaWriter(handle, wrap=None)
        writer.write_file(value)
44 | 
45 | 
46 | def _get_species_id(record):
47 |     if '[' in record.description and ']' in record.description:
48 |         return record.description[record.description.find(
49 |             "[")+1:record.description.find("]")]
50 |     else:
51 |         return record.id[0:5]
52 | 
def run(orthogroups_fasta_folder, og_dic, output_path, min_species):
    """
    Relabel the records of every sufficiently large ``*.fa`` file with the
    labels of ``og_dic`` and store the result under the same file name in
    ``output_path``.

    :param orthogroups_fasta_folder: folder searched for ``*.fa`` files
    :param og_dic: dict mapping group id (file name up to the first '.') ->
                   {species code: label}
    :param output_path: output folder, created when missing
    :param min_species: minimum number of records a file must contain to be
                        written; ``None`` disables the filter
    :raises KeyError: if a file or a species has no entry in ``og_dic``
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # None is the argparse default; comparing an int with None raises
    # TypeError on Python 3.
    threshold = 0 if min_species is None else min_species
    for f in tqdm.tqdm(glob.glob(os.path.join(orthogroups_fasta_folder, '*.fa'))):
        new_name_dic = og_dic[os.path.basename(f).split(".")[0]]
        records = list(SeqIO.parse(f, 'fasta'))
        if len(records) < threshold:
            continue
        for rec in records:
            sp_id = _get_species_id(rec)
            label = new_name_dic[sp_id]
            # First token of the label is the id, the remainder (plus the
            # species code in brackets) becomes the description.
            new_id = label.split()[0]
            rec.id = new_id
            rec.description = label.replace(new_id, "") + " [" + sp_id + "]"
        output_file = os.path.join(output_path, os.path.basename(f))
        _write(output_file, records)
68 | 
69 | 
if __name__ == "__main__":
    # Command-line interface: relabel the orthogroup FASTA files of a folder
    # using a tab-separated orthogroups table.
    import argparse
    parser = argparse.ArgumentParser(
        description="""Transform OrthogroupsFasta into marker_genes""")
    # The table is opened unconditionally, so it has to be provided.
    parser.add_argument('--ogroups', default=None, required=True,
                        help='Path to the tab-separated orthogroups table '
                        '(group id followed by its members).')
    parser.add_argument('--ofolder', default='marker_genes', required=True,
                        help='Path to the output directory (created if it '
                        'does not exist).')
    parser.add_argument('--ofasta', default='.', required=True,
                        help='Path to the input directory containing the '
                        'orthogroup *.fa files.')
    # Default 0 instead of None: the value is compared with a record count.
    parser.add_argument('--min_species', type=int, default=0,
                            help='Min number of sequences in selected '
                                 'orthologous groups. Defaults to 0, i.e. '
                                 'every group is kept.')

    conf = parser.parse_args()
    og_dic = _get_all_ids(conf.ogroups)

    run(conf.ofasta, og_dic, conf.ofolder, conf.min_species)
95 | 


--------------------------------------------------------------------------------
/archive/scripts/protein_converter.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from Bio import SeqIO
 3 | 
 4 | # Get input and output file paths from command-line arguments
 5 | # Daniel Paiva Agustinho
 6 | input_file = sys.argv[1]
 7 | output_file = sys.argv[2]
 8 | 
 9 | with open(input_file, "r") as input_handle:
10 |     with open(output_file, "w") as output_handle:
11 |         for record in SeqIO.parse(input_handle, "fasta"):
12 |             protein_seq = record.seq.translate()
13 | 
14 |             # Extract the entire original header
15 |             original_header = record.description
16 | 
17 |             # Create a new sequence record with the original header
18 |             protein_seq = SeqIO.SeqRecord(
19 |                 protein_seq, id=record.id, description=original_header
20 |             )
21 |             SeqIO.write(protein_seq, output_handle, "fasta")
22 | print("done",str(output_file))
23 | 


--------------------------------------------------------------------------------
/archive/scripts/r2t_py_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.o%J
 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.e%J
 4 | #BSUB -u david.dylus@unil.ch
 5 | #BSUB -J r2t_GLYSP
 6 | #BSUB -n 1
 7 | #BSUB -R "span[ptile=1]"
 8 | #BSUB -R "rusage[mem=4000]"
 9 | #BSUB -M 4000000
10 | source activate r2t
11 | reads=/scratch/beegfs/weekly/ddylus/avian/reads/GLYSP
12 | cd /scratch/beegfs/weekly/ddylus/avian/r2t/
13 | python -W ignore /scratch/beegfs/monthly/ddylus/opt/read2tree/bin/read2tree --standalone_path /scratch/beegfs/weekly/ddylus/avian/marker_genes/ --dna_reference /scratch/beegfs/weekly/ddylus/avian/eukaryotes.cdna.fa --reads $reads/GLYSP_1.fq.gz $reads/GLYSP_2.fq.gz --output_path /scratch/beegfs/weekly/ddylus/avian/r2t/ --single_mapping /scratch/beegfs/weekly/ddylus/avian/r2t/02_ref_dna/MELGA_OGs.fa --threads 4 --min_species 8


--------------------------------------------------------------------------------
/archive/scripts/relabel_msa.py:
--------------------------------------------------------------------------------
 1 | import Bio.AlignIO
 2 | import csv
 3 | 
 4 | 
 5 | def load_oma_species(fn):
 6 |     with open(fn, 'rt') as fh:
 7 |         reader = csv.reader((l for l in fh if not l.startswith('#')), dialect="excel-tab")
 8 |         mapping = {row[0]: row[2].replace(' ','_') + "__" + row[1] for row in reader}
 9 |     return mapping
10 | 
11 | 
def load_nextstrain_metadata(fn):
    """
    Read a tab-separated metadata table (with header row) and build a
    relabelling map keyed by the ``sra_accession`` column.

    The label is ``<sra_accession>__<strain>__<Nextstrain_clade><date>``:
    spaces in strain and clade become '_', round brackets in the clade
    become square brackets, and the date follows the clade directly,
    without a separator.

    :param fn: path to the metadata file
    :return: dict mapping accession -> new label
    """
    mapping = {}
    with open(fn, 'rt') as fh:
        for row in csv.DictReader(fh, dialect="excel-tab"):
            accession = row['sra_accession']
            strain = row['strain'].replace(' ','_')
            clade = row['Nextstrain_clade'].replace(' ','_').replace('(','[').replace(')',']')
            mapping[accession] = accession + "__" + strain + "__" + clade + row['date']
    return mapping
18 | 
19 | 
def update_msa_ids(msa_path, new_path, mapping, format="phylip-relaxed"):
    """
    Rename the records of an alignment and write it back in the same format.

    :param msa_path: alignment to read
    :param new_path: destination of the relabelled alignment
    :param mapping: dict old id -> new id; ids without an entry are kept
    :param format: alignment format name used for reading and writing
    """
    alignment = Bio.AlignIO.read(msa_path, format=format)
    for record in alignment:
        old_id = record.id
        record.id = mapping.get(old_id, old_id)
    Bio.AlignIO.write(alignment, new_path, format=format)
25 | 
26 | 
if __name__ == "__main__":
    # Command-line interface: relabel the sequence ids of one alignment with
    # the OMA species table and/or the nextstrain metadata.
    import argparse
    parser = argparse.ArgumentParser(description="update labels of sequence ids")
    parser.add_argument('--oma-map', help="path to the oma-species.txt file to change 5letter codes with scientific names")
    parser.add_argument('--nextstrain', help="path to the nextstrain metadata file with the sra accessions")
    parser.add_argument('--msa-format', help="format of the msa. if not set, it will be guessed based on file extension")
    parser.add_argument('--out', required=True, help="Path to the output filename")
    parser.add_argument('msa', help="Path to the input msa filename")
    conf = parser.parse_args()

    # Entries loaded later override earlier ones with the same key.
    mapping = {}
    if conf.oma_map:
        mapping.update(load_oma_species(conf.oma_map))
    if conf.nextstrain:
        mapping.update(load_nextstrain_metadata(conf.nextstrain))

    # Guess the format from the extension when it was not given.
    if conf.msa_format is None:
        if conf.msa.endswith('.phy'):
            conf.msa_format = "phylip-relaxed"
        else:
            conf.msa_format = "fasta"
    update_msa_ids(conf.msa, conf.out, mapping, format=conf.msa_format)
46 | 
47 | 


--------------------------------------------------------------------------------
/archive/scripts/remove_species_from_alignment.py:
--------------------------------------------------------------------------------
 1 | from Bio import AlignIO
 2 | from Bio.Align import MultipleSeqAlignment
 3 | from Bio.Alphabet import IUPAC, Gapped
 4 | 
 5 | 
 6 | def get_alignment(file, species_list):
 7 |     keep_species = []
 8 |     alignment = AlignIO.read(file, 'phylip-relaxed')
 9 |     for i, record in enumerate(alignment):
10 |         if record.id not in species_list:
11 |             keep_species.append(record)
12 |     return MultipleSeqAlignment(keep_species, Gapped(IUPAC.protein, "-"))
13 | 
14 | 
def write_alignment(output, alignment):
    """
    Write an alignment in relaxed PHYLIP format.

    :param output: destination handed to ``AlignIO.write``
    :param alignment: alignment to write
    """
    output_format = 'phylip-relaxed'
    AlignIO.write(alignment, output, output_format)
17 | 
18 | 
if __name__ == "__main__":
    # Command-line interface: remove the given species from one alignment.
    import argparse
    parser = argparse.ArgumentParser(
        description="""Remove species from given alignment.""")
    parser.add_argument('-s', '--remove_species', default=None,
                        help='[Default is none] Species to remove from the '
                        'alignment. Input is comma separated list '
                        'without spaces, e.g. XXX,YYY,AAA.')
    parser.add_argument('-o', '--output', default='.', required=True,
                        help='Path to the output alignment file '
                        '(relaxed PHYLIP).')
    parser.add_argument('-i', '--input', default='.', required=True,
                        help='Path to the input alignment file '
                        '(relaxed PHYLIP).')

    conf = parser.parse_args()

    new_alignment = get_alignment(conf.input, conf.remove_species)
    write_alignment(conf.output, new_alignment)
39 | 


--------------------------------------------------------------------------------
/archive/scripts/rm_py_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.o%J
 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.e%J
 4 | #BSUB -u david.dylus@unil.ch
 5 | #BSUB -J rm_GLYSP
 6 | #BSUB -n 1
 7 | #BSUB -R "span[ptile=1]"
 8 | #BSUB -R "rusage[mem=1000]"
 9 | #BSUB -M 1000000
10 | rm -r /scratch/beegfs/weekly/ddylus/avian/reads/GLYSP


--------------------------------------------------------------------------------
/archive/scripts/sample_from_reads.py:
--------------------------------------------------------------------------------
 1 | #from __future__ import division
 2 | import random
 3 | import argparse
 4 | import sys
 5 | # bp length of mouse transcriptome in OMA: 37.914.531
 6 | # bp length of CANVA genome 2.5Mpb
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | parser.add_argument("-i", "--input", nargs='+', default=None, help="input FASTQ filename")
10 | parser.add_argument("-out", "--output", help="output FASTQ filename")
11 | parser.add_argument("-f", "--fraction", type=float, help="fraction of reads to sample")
12 | parser.add_argument("-n", "--number", type=int, help="number of reads to sample")
13 | parser.add_argument("-s", "--sample", type=int, help="number of output files to write", default=1)
14 | args = parser.parse_args()
15 | 
16 | if args.fraction and args.number:
17 |    sys.exit("give either a fraction or a number, not both")
18 | 
19 | if not args.fraction and not args.number:
20 |    sys.exit("you must give either a fraction or a number")
21 | 
22 | print("counting records....")
23 | with open(args.input[0]) as input:
24 |     num_lines = sum([1 for line in input])
25 | total_records = int(num_lines / 4)
26 | 
27 | if args.fraction:
28 |     args.number = int(total_records * args.fraction)
29 | 
30 | print("sampling " + str(args.number) + " out of " + str(total_records) + " records")
31 | 
32 | output_sequence_sets = []
33 | output_file_left = []
34 | if len(args.input) > 1:
35 |     output_file_right = []
36 | for i in range(args.sample):
37 |     output_sequence_sets.append(set(random.sample(range(total_records + 1), args.number)))
38 |     #output_file = args.input[0].split("/")[-1].split(".")[0]
39 |     output_file = args.output
40 |     output_file_left.append(open(output_file + "_0_" + str(i) + ".fq", "w"))
41 |     if len(args.input) > 1:
42 |         output_file_right.append(open(output_file + "_1_" + str(i) + ".fq", "w"))
43 | 
44 | initial_length = 0
45 | sampling_length = 0
46 | 
47 | record_number = 0
48 | with open(args.input[0]) as read_input:
49 |     for line1 in read_input:
50 |         line2 = read_input.readline()
51 |         initial_length += len(line2)
52 |         line3 = read_input.readline()
53 |         line4 = read_input.readline()
54 |         for i, output in enumerate(output_file_left):
55 |             if record_number in output_sequence_sets[i]:
56 |                     output.write(line1)
57 |                     output.write(line2)
58 |                     output.write(line3)
59 |                     output.write(line4)
60 |                     sampling_length += len(line2)
61 |         record_number += 1
62 | 
63 | if len(args.input) > 1:
64 |     record_number = 0
65 |     with open(args.input[1]) as read_input:
66 |         for line1 in read_input:
67 |             line2 = read_input.readline()
68 |             line3 = read_input.readline()
69 |             line4 = read_input.readline()
70 |             for i, output in enumerate(output_file_right):
71 |                 if record_number in output_sequence_sets[i]:
72 |                         output.write(line1)
73 |                         output.write(line2)
74 |                         output.write(line3)
75 |                         output.write(line4)
76 |                         sampling_length += len(line2)
77 |             record_number += 1
78 | 
79 | 
80 | #output[0].close()
81 | # if len(args.input) > 1:
82 | #     output[1].close()
83 | print("The mean length of all reads is {} and the mean length of the subsampled reads is {}".format(initial_length/total_records, sampling_length/args.number))
84 | print("The sum length of all reads is {} and the sum length of the subsampled reads is {}".format(initial_length, sampling_length))
85 | print("done!")
86 | 


--------------------------------------------------------------------------------
/archive/scripts/subsample_nextstrain_covid_genomes_with_sra_accession.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import lzma
 3 | import csv
 4 | import random
 5 | 
 6 | 
 7 | def get_sra_datasets(fn):
 8 |     with lzma.open(fn, "rt", newline="") as fh:
 9 |         reader = csv.DictReader(fh, dialect="excel-tab")
10 |         for row in reader:
11 |             if row["sra_accession"] not in ('', '?'):
12 |                 yield row
13 | 
14 | 
def subsample(metafile, nr_per_clade):
    """Randomly pick up to `nr_per_clade` rows per Nextstrain clade.

    Only rows returned by get_sra_datasets() are considered, rows with an
    empty "Nextstrain_clade" are ignored, and the size of every clade is
    printed. Returns the picked rows as one list, clades in sorted order.
    """
    def clade_of(row):
        return row["Nextstrain_clade"]

    rows = list(get_sra_datasets(metafile))
    rows.sort(key=clade_of)  # groupby needs equal keys to be adjacent

    picked = []
    for clade, group in itertools.groupby(rows, key=clade_of):
        if clade != "":
            members = list(group)
            print(f"{clade}: {len(members)}")
            how_many = min(nr_per_clade, len(members))
            picked.extend(random.sample(members, how_many))
    return picked
25 | 
def write(outfn, sub):
    """Write the rows in `sub` (a list of dicts) to `outfn` as a tab separated file.

    The header is taken from the keys of the first row. An empty `sub`
    produces an empty file instead of failing on `sub[0]`.
    """
    # newline="" lets the csv module control line endings, as its documentation requires.
    with open(outfn, 'w', newline="") as fout:
        if not sub:
            return
        w = csv.DictWriter(fout, fieldnames=sub[0].keys(), dialect="excel-tab")
        w.writeheader()
        w.writerows(sub)
31 | 
if __name__ == "__main__":
    # Command line entry point: subsample the metadata file per clade and
    # store the selected rows in the requested output file.
    import argparse

    cli = argparse.ArgumentParser(
        description="subsample nextstrain samples from all clades that contain sra accession ids"
    )
    cli.add_argument("--out", required=True, help="path to output file")
    cli.add_argument(
        "--nr-per-clade",
        default=2,
        type=int,
        help="number of samples to use per nextstrain clade. [default: 2]",
    )
    cli.add_argument(
        "metafile",
        help="metadata.tsv.xz file from nextstrain, e.g. https://data.nextstrain.org/files/ncov/open/global/metadata.tsv.xz",
    )
    options = cli.parse_args()

    write(options.out, subsample(options.metafile, options.nr_per_clade))
42 | 


--------------------------------------------------------------------------------
/archive/scripts/treecl/select_alignments.py:
--------------------------------------------------------------------------------
 1 | from Bio import AlignIO
 2 | import tqdm, os, glob
 3 | 
 4 | def run(afolder, ofolder, min_species):
 5 |     if not os.path.exists(ofolder):
 6 |         os.makedirs(ofolder)
 7 |     for f in tqdm.tqdm(glob.glob(os.path.join(afolder, '*.fa'))):
 8 |         if os.path.getsize(f) > 0:
 9 |             try:
10 |                 msa = AlignIO.read(f, "phylip-relaxed")
11 |             except ValueError:
12 |                 msa = AlignIO.read(f, "fasta")
13 |         if len(msa) >= min_species:
14 |             align_output = open(os.path.join(ofolder, os.path.basename(f).split(".")[0]+".phy"), "w")
15 |             AlignIO.write(msa, align_output, "phylip-relaxed")
16 |             align_output.close()
17 | 
18 | 
if __name__ == "__main__":
    # Command line entry point for run().
    import argparse
    parser = argparse.ArgumentParser(
        description="""Select the alignments that contain at least a given
        number of sequences and write them in relaxed PHYLIP format""")
    parser.add_argument('--afolder', required=True,
                        help='Folder that contains the alignments '
                        '(*.fa files).')
    # Optional, so that the documented default is actually reachable.
    parser.add_argument('--ofolder', default='alignments_selected',
                        help='[Default is alignments_selected] Path to '
                        'output directory.')
    parser.add_argument('--min_species', type=int, default=0,
                            help='Min number of species in selected '
                                 'alignments. ')

    conf = parser.parse_args()

    run(conf.afolder, conf.ofolder, conf.min_species)
36 | 


--------------------------------------------------------------------------------
/archive/scripts/trim_alignment.py:
--------------------------------------------------------------------------------
 1 | import Bio.AlignIO
 2 | import Bio.Align
 3 | import collections
 4 | import math
 5 | 
 6 | def load_msa(fn):
 7 |     if fn.endswith('.phy'):
 8 |         format = 'phylip-relaxed'
 9 |     elif fn.endswith('.fa'):
10 |         format = 'fasta'
11 |     else:
12 |         raise UnkownFormatError('unknown format for '+fn)
13 |     with open(fn, 'rt') as fh:
14 |         msa = next(Bio.AlignIO.parse(fn, format))
15 |     return msa
16 | 
17 | 
def write_msa(fn, msa):
    """Store the alignment `msa` in the file `fn` using relaxed PHYLIP format."""
    out_format = 'phylip-relaxed'
    with open(fn, 'wt') as handle:
        Bio.AlignIO.write(msa, handle, out_format)
21 | 
22 | 
def count_nucs(data):
    """Return how many elements of `data` are one of 'A', 'T', 'C', 'G' or 'N'.

    The comparison is case sensitive; any other symbol (gaps, lower case
    letters, ...) is not counted.
    """
    tally = collections.Counter(data)
    total = 0
    for nucleotide in 'ATCGN':
        total += tally[nucleotide]
    return total
27 | 
def trim(msa, min_residue):
    """Keep only the alignment columns with at least `min_residue` counted nucleotides.

    Args:
        msa: alignment supporting `get_alignment_length()`, column access
            `msa[:, i]`, column slicing `msa[:, a:b]` and `+` concatenation.
        min_residue: minimum value of count_nucs() a column needs to be kept.

    Returns:
        Tuple (keep, trimmed): the list of kept column indices and the
        alignment restricted to those columns. If no column qualifies,
        `trimmed` is the zero-width slice `msa[:, 0:0]`.
    """
    keep = [col for col in range(msa.get_alignment_length())
            if count_nucs(msa[:, col]) >= min_residue]
    print(len(keep))
    if not keep:
        # Nothing passed the filter; indexing keep[0] would raise IndexError.
        return keep, msa[:, 0:0]

    # Merge neighbouring columns into half-open runs so that one slice and
    # one concatenation is done per run instead of per column.
    runs = []
    start = previous = keep[0]
    for col in keep[1:]:
        if col != previous + 1:
            runs.append((start, previous + 1))
            start = col
        previous = col
    runs.append((start, previous + 1))

    first_begin, first_end = runs[0]
    trimmed = msa[:, first_begin:first_end]
    for begin, end in runs[1:]:
        trimmed = trimmed + msa[:, begin:end]
    return keep, trimmed
38 | 
def filter_taxa(msa, min_residue):
    """Return a new alignment with only the records that have more than `min_residue` counted nucleotides.

    Note that the comparison is strict: a record with exactly `min_residue`
    counted nucleotides is dropped.
    """
    survivors = (taxon for taxon in msa if count_nucs(taxon) > min_residue)
    return Bio.Align.MultipleSeqAlignment(survivors)
42 | 
class UnknownFormatError(Exception):
    """Error for an alignment file whose format cannot be determined.

    NOTE(review): presumably the exception load_msa() is meant to raise for
    unsupported file extensions - confirm.
    """
45 | 
46 | 
if __name__ == "__main__":
    # Command line entry point: load the alignment, drop sparse columns,
    # then drop taxa with too few residues and write the result.
    import argparse
    parser = argparse.ArgumentParser(description="sample part of the alignment that contains enough data, and throw out species which have too little data")
    parser.add_argument('alignment', help="path to multiple sequence alignment")
    parser.add_argument('--min-per-col', type=int, help="Min nr of taxa that need to have a nuc at a column to be included. Defaults to ceil(nr_taxa*0.3)")
    parser.add_argument('--min-res-per-species', default=400, type=int, help="Minimum number of residues for a taxon in the trimmed alignment to not be kicked out. Defaults to 400")
    parser.add_argument('--out', help="Outfile of trimmed alignment")
    conf = parser.parse_args()

    msa = load_msa(conf.alignment)
    # Defaults that depend on the input: 30% of the taxa per column and an
    # output file next to the input.
    if conf.min_per_col is None:
        conf.min_per_col = math.ceil(0.3*len(msa))
    if conf.out is None:
        conf.out = conf.alignment+".trimmed"

    print("Loaded MSA ({}x{}). Filter cols with less than {} residues"
          .format(len(msa), msa.get_alignment_length(), conf.min_per_col))
    keep, trimmed_msa = trim(msa, conf.min_per_col)
    print("  after filtering columns: {}x{}".format(len(trimmed_msa), trimmed_msa.get_alignment_length()))
    filtered = filter_taxa(trimmed_msa, conf.min_res_per_species)
    print("  after filtering taxa: {}x{}".format(len(filtered), filtered.get_alignment_length()))
    write_msa(conf.out, filtered)
69 | 


--------------------------------------------------------------------------------
/archive/set_marker_genes/bacteria_markergenes.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/set_marker_genes/bacteria_markergenes.zip


--------------------------------------------------------------------------------
/archive/set_marker_genes/mammalia_markergenes.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/set_marker_genes/mammalia_markergenes.zip


--------------------------------------------------------------------------------
/archive/tests/info.log:
--------------------------------------------------------------------------------
1 | 2018-11-19 08:44:51,173:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
2 | 2018-11-19 08:44:51,180:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
3 | 2018-11-19 08:44:51,183:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
4 | 2018-11-19 08:48:03,835:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
5 | 2018-11-19 08:48:03,839:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
6 | 2018-11-19 08:48:03,842:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
7 | 


--------------------------------------------------------------------------------
/archive/tests/input.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/tests/input.log


--------------------------------------------------------------------------------
/archive/tests/test_aligner.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | import gzip
 4 | import argparse
 5 | from Bio import SeqIO
 6 | from read2tree.Reads import Reads
 7 | from read2tree.FastxReader import FastxReader
 8 | dirname = os.path.dirname(__file__)
 9 | 
10 | 
class ReadTest(unittest.TestCase):
    """Helper scaffold for aligner tests; it defines no test methods yet."""

    def setup_reads_paired(self, sampling=False):
        """Parse a fixed paired-end command line and build an Aligner from it.

        `sampling` is accepted but not used.
        """
        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument('--standalone_path', default='.',
                                help='[Default is current directory] Path to '
                                     'oma standalone directory.')

        arg_parser.add_argument('--reads', nargs='+', default=None,
                                help='Reads to be mapped to reference. If paired '
                                'end add separated by space.')

        arg_parser.add_argument('--read_type', default='short',
                                help='[Default is short reads] Type of reads to '
                                'use for mapping. Either ngm for short reads or '
                                'ngmlr for long will be used.')

        arg_parser.add_argument('--dna_reference', default='',
                                help='Reference file that contains nucleotide '
                                'sequences (fasta, hdf5). If not given it will use'
                                'the RESTapi and retrieve sequences '
                                'from http://omabrowser.org directly. '
                                'NOTE: internet connection required!')

        arg_parser.add_argument('--keep_all_ogs', action='store_true',
                                help='Keep all orthologs after addition of '
                                'mapped seq, which means also the groups that '
                                'have no mapped sequence. Otherwise only groups '
                                'are used that have the mapped sequence for '
                                'alignment and tree inference.')

        arg_parser.add_argument('-r', '--reference', action='store_true',
                                help='Just generate the reference dataset for '
                                'mapping.')

        arg_parser.add_argument('--remove_species_ogs', default=None,
                                help='[Default is none] Remove species present '
                                'in data set after mapping step completed to '
                                'build OGs. Input is comma separated list '
                                'without spaces, e.g. XXX,YYY,AAA.')

        arg_parser.add_argument('-s', '--species_name', default=None,
                                help='[Default is name of read] Name of species '
                                     'for mapped sequence.')

        arg_parser.add_argument('--output_path', default='.', required=True,
                                help='[Default is current directory] Path to '
                                'output directory.')

        argv = ['--standalone_path', 'tests/data/marker_genes/',
                '--dna_reference', 'tests/data/dna.fa', '--reads',
                'tests/data/mapper/test3/test_1b.fq',
                'tests/data/mapper/test3/test_2b.fq',
                '--output_path', 'tests/data/output', '--read_type',
                'short', '--keep_all_ogs', '--reference',
                '--remove_species_ogs', 'CIOIN', '--species_name', 'ass']

        args = arg_parser.parse_args(argv)
        # `return alignments = ...` was a syntax error that made the whole
        # module unimportable; build the object first, then return it.
        # NOTE(review): neither `Aligner` nor `ogset` is imported or defined
        # in this module, so calling this helper still raises NameError until
        # they are provided.
        alignments = Aligner(args, ogset.ogs, load=True)
        return alignments
71 | 


--------------------------------------------------------------------------------
/archive/tests/test_og.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | from Bio import SeqIO
 4 | from read2tree.OGSet import OG
 5 | 
 6 | dirname = os.path.dirname(__file__)
 7 | 
 8 | 
 9 | class OGTest(unittest.TestCase):
10 | 
11 |     def setup(self):
12 |         aa = list(SeqIO.parse('data/OG4.aa', format='fasta'))
13 |         dna = list(SeqIO.parse('data/OG4.dna', format='fasta'))
14 |         og = OG()
15 |         og.aa = aa
16 |         og.dna = dna
17 |         return og
18 | 
19 |     def test_init(self):
20 |         og = self.setup()
21 |         self.assertEqual(og.dna[0].id, 'MOUSE21964_OG4')
22 | 
23 |     def test_get_og_dict(self):
24 |         og = self.setup()
25 |         dna_dict = og._get_og_dict(og)
26 |         self.assertEqual(dna_dict['MOUSE21964'].name, 'MOUSE21964_OG4')
27 | 
28 |     def test_remove_species_records(self):
29 |         og = self.setup()
30 |         og_wo_mouse = og.remove_species_records('MOUSE')
31 |         self.assertEqual(len(og_wo_mouse[0]), 4)
32 |         self.assertEqual(len(og_wo_mouse[1]), 4)
33 | 
34 |     def test_get_species_id(self):
35 |         og = self.setup()
36 |         dna = og.dna[0]
37 |         aa = og.aa[0]
38 |         self.assertEqual(og._get_species_id(dna), 'MOUSE')
39 |         self.assertEqual(og._get_species_id(aa), 'MOUSE')
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     unittest.main()
44 | 


--------------------------------------------------------------------------------
/archive/tests/test_ogset.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | from read2tree import OGSet
 4 | 
 5 | API_URL = 'http://omabrowser.org/api'
 6 | 
 7 | class OGSetTest(unittest.TestCase):
 8 |     def setUp(self):
 9 |         arg_parser = argparse.ArgumentParser(prog='read2tree')
10 | 
11 |         arg_parser.add_argument('--reads', nargs='+', default=None,
12 |                                 help='Reads to be mapped to reference. If paired end '
13 |                                      'add separated by space.')
14 |         arg_parser.add_argument('--read_split_length', type=int, default=400,
15 |                                 help='Set read split length.')
16 |         arg_parser.add_argument('--read_split_overlap', type=int, default=50,
17 |                                 help='Set read split length overlap.')
18 |         arg_parser.add_argument('-s', '--species_name', default=None,
19 |                                 help='[Default is name of read] Name of species '
20 |                                      'for mapped sequence.')
21 | 
22 |         argv = ['--reads', 'tests/data/reads/test.fq']
23 | 
24 |         args = arg_parser.parse_args(argv)
25 |         return OGSet(args)
26 | 
27 |     def test_OGSet(self):
28 |         raise NotImplementedError
29 | 
30 |     def test_marker_genes_input(self):
31 |         raise NotImplementedError
32 | 
33 |     def test_omastandalone_input(self):
34 |         raise NotImplementedError
35 | 
36 |     def test_output_folder_structure(self):
37 |         raise NotImplementedError
38 | 
39 |     def test_species_removal(self):
40 |         raise NotImplementedError
41 | 
42 |     def test_species_removal_after_mapping(self):
43 |         raise NotImplementedError
44 | 
45 |     def test_rest_api_connection(self):
46 |         OGSet._read
47 | 
48 |     def test_rest_api_dna_downlaod(self):
49 |         raise NotImplementedError
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     unittest.main()
54 | 


--------------------------------------------------------------------------------
/archive/tests/test_reads.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import os
  3 | import gzip
  4 | import argparse
  5 | from Bio import SeqIO
  6 | from read2tree.Reads import Reads
  7 | from read2tree.FastxReader import FastxReader
  8 | from read2tree.main import parse_args
  9 | from read2tree._utils import exe_name
 10 | dirname = os.path.dirname(__file__)
 11 | 
 12 | 
 13 | class ReadTest(unittest.TestCase):
 14 | 
 15 |     def setup_long_reads(self, split=False):
 16 |         if split:
 17 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz', '--split_reads',
 18 |                     '--split_overlap', '50', '--split_len', '400', '--sample_reads', '--coverage', '10',
 19 |                     '--genome_len', '1000']
 20 |         else:
 21 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz']
 22 | 
 23 |         args = parse_args(argv, exe_name(), '')
 24 |         # args = arg_parser.parse_args(argv)
 25 |         return Reads(args)
 26 | 
 27 |     def setup_reads_paired(self, sampling=False):
 28 | 
 29 |         if sampling:
 30 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz',
 31 |                     'data/reads/test_2a.fq.gz', '--sample_reads', '--coverage', '10', '--genome_len', '1000']
 32 |         else:
 33 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz',
 34 |                     'data/reads/test_2a.fq.gz']
 35 |         args = parse_args(argv, exe_name(), '')
 36 |         return Reads(args)
 37 | 
 38 |     def test_split(self):
 39 |         test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGT'
 40 |         expected = ['ACGTTTTTTG', 'GAAGAGTTAG', 'AGATTTTTAG', 'AGAGGAGGGG',
 41 |                     'GAGGAGGGGT']
 42 |         reads = self.setup_long_reads()
 43 |         # obtained = reads._split_len(test_seq, 10)
 44 |         obtained = reads._split_len_overlap(test_seq, 10, 0)
 45 |         self.assertEqual(expected, obtained)
 46 | 
 47 |     def test_splitOverlap(self):
 48 |         test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGTTT'
 49 |         expected = ['ACGTTTTTTG', 'TTTTGGAAGA', 'GAAGAGTTAG', 'GTTAGAGATT',
 50 |                     'AGATTTTTAG', 'TTTAGAGAGG', 'AGAGGAGGGG', 'GGAGGGGTTT']
 51 |         reads = self.setup_long_reads()
 52 |         obtained = reads._split_len_overlap(test_seq, 10, 5)
 53 |         # print(reads._split_len_overlap('TTTTTAGAGAGGAGGGGTTT', 10, 5))
 54 |         self.assertEqual(expected, obtained)
 55 | 
 56 |     def test_get_4_line_fastq_string(self):
 57 |         reads = self.setup_long_reads()
 58 |         expected = '@SRR00001 length=16\nACGTTTGGGAAGGTTT\n+SRR00001 ' \
 59 |                    'length=16\n????????????????\n'
 60 |         read_id = 'SRR00001'
 61 |         seq = 'ACGTTTGGGAAGGTTT'
 62 |         qual = '????????????????'
 63 |         name = reads._get_4_line_fastq_string(read_id, seq, qual, x=0)
 64 |         self.assertEqual(name, expected)
 65 | 
 66 |     def test_read_num_split(self):
 67 |         reads = self.setup_long_reads(split=True)
 68 |         num_reads = reads._get_num_reads('data/reads/test.fq.gz')
 69 |         self.assertEqual(num_reads, 18)
 70 | 
 71 |     def test_read_len_split(self):
 72 |         reads = self.setup_long_reads(split=True)
 73 |         len_reads = reads._get_read_len('data/reads/test.fq.gz',1000)
 74 |         self.assertEqual(len_reads, 400)
 75 | 
 76 |     def test_read_num_paired(self):
 77 |         reads = self.setup_reads_paired()
 78 |         num_reads = reads._get_num_reads('data/reads/test_1a.fq.gz')
 79 |         self.assertEqual(num_reads, 1000)
 80 | 
 81 |     def test_read_len_paired(self):
 82 |         reads = self.setup_reads_paired()
 83 |         num_reads = reads._get_read_len('data/reads/test_1a.fq.gz', 1000)
 84 |         self.assertEqual(num_reads, 151.0)
 85 | 
 86 |     def test_read_num_by_coverage_paired(self):
 87 |         reads = self.setup_reads_paired(sampling=True)
 88 |         num_reads = reads._get_num_reads_by_coverage(
 89 |             'data/reads/test_1a.fq.gz', 1000)
 90 |         self.assertEqual(num_reads, 34)
 91 | 
 92 |     def test_read_num_by_coverage_split(self):
 93 |         reads = self.setup_long_reads(split=True)
 94 |         num_reads = reads._get_num_reads_by_coverage(['data/reads/test.fq.gz'],1000)
 95 |         self.assertEqual(num_reads, 25)
 96 | 
 97 |     def test_read_vec_paired(self):
 98 |         reads = self.setup_reads_paired(sampling=True)
 99 |         num_reads = reads._get_vector_random_reads(
100 |             'data/reads/test_1a.fq.gz')
101 |         self.assertEqual(len(num_reads), 34)
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     unittest.main()
106 | 


--------------------------------------------------------------------------------
/archive/tests/test_use.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import argparse
 3 | import warnings
 4 | warnings.filterwarnings('ignore')
 5 | from read2tree.Progress import Progress
 6 | from read2tree.stats.Coverage import Coverage
 7 | from read2tree.stats.SeqCompleteness import SeqCompleteness
 8 | import os
 9 | 
class Use(unittest.TestCase):
    """Placeholder test case; the individual tests are not written yet."""

    # Each stub needs a body, otherwise the module does not even compile.
    # Skipping reports the missing tests without failing the run.

    def test_OGSet(self):
        self.skipTest('not implemented')

    def test_write_progress(self):
        self.skipTest('not implemented')

    def test_read_progress(self):
        self.skipTest('not implemented')


if __name__ == "__main__":
    unittest.main()
21 | 


--------------------------------------------------------------------------------
/archive/wiki_images/covid1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/covid1.jpg


--------------------------------------------------------------------------------
/archive/wiki_images/covid2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/covid2.jpg


--------------------------------------------------------------------------------
/archive/wiki_images/figure1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/figure1.jpg


--------------------------------------------------------------------------------
/archive/wiki_images/figure_1sp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/figure_1sp.jpg


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_0.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_1.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_2.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_3.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_4.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_5.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_6.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_7.png


--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_8.png


--------------------------------------------------------------------------------
/bin/read2tree:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python -W ignore 
 2 | '''
 3 |     Wrapper to enable the user to call the installed hogprop without the '.py'
 4 |     ending.
 5 | 
 6 |     -- Alex Warwick Vesztrocy, June 2016
 7 | '''
 8 | from read2tree.main import main
 9 | from read2tree._utils import exe_name
10 | import sys
11 | 
12 | 
13 | if __name__ == '__main__':
14 |     desc = 'read2tree is a pipeline allowing to use read data in combination with ' \
15 |            'an OMA standalone output run to produce high quality trees. '
16 |     main(sys.argv[1:], exe_name=exe_name(), desc=desc)
17 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: read2tree_env
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - python=3.9
 7 |   - numpy
 8 |   - biopython
 9 |   - ete3
10 |   - lxml
11 |   - tqdm
12 |   - scipy
13 |   - pyparsing
14 |   - requests
15 |   - natsort
16 |   - pyyaml
17 |   - filelock
18 |   - dendropy
19 |   - mafft
20 |   - iqtree
21 |   - ngmlr
22 |   - nextgenmap
23 |   - samtools
24 |   - filelock
25 |   - pyham
26 |   - pysam


--------------------------------------------------------------------------------
/read2tree/Analyzer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 |     This file contains definitions of a class which surrounds possible alignment methods
 4 | 
 5 |     -- David Dylus, July--XXX 2017
 6 | '''
 7 | import os
 8 | from Bio import AlignIO
 9 | import re
10 | 
class Analyzer(object):
    """
    Collects summary statistics (read coverage, sequence completeness) for a
    run. Several getters are placeholders that raise NotImplementedError.
    """

    def __init__(self, args, og_set=None):
        """
        :param args: argument namespace; ``args.reads`` (a path, a
            space-separated string of paths, or a sequence of paths) and
            ``args.gt_length`` are read here
        :param og_set: accepted for interface compatibility; not used
        """
        print('--- Alignment of OGs ---')
        self.args = args
        self.cov = {}
        self.seq_completeness = {}

        self._genome_or_transcriptome_length = args.gt_length

        reads = args.reads
        if isinstance(reads, str) and " " in reads:
            reads = reads.rstrip().split(" ")
        self._reads = reads

        # Decide by type, not by len() == 2: a two-character path string used
        # to be mistaken for a read pair, and a list holding anything other
        # than two files crashed on ``list.split``.
        if isinstance(self._reads, str):
            first_read = self._reads
        else:
            first_read = self._reads[0]
        self._species_name = first_read.split("/")[-1].split(".")[0]

        self.treeStats = {}
        self.alignmentStats = {}

    # def __call__(self, *args, **kwargs):
    #     raise NotImplementedError

    def _get_coverage_reads(self, args):
        """
        Estimate sequencing coverage from the first read file.

        The read length is taken from a ``length=<n>`` field in the first
        header line when present, otherwise from the first sequence line.
        The file is assumed to hold four lines per record.

        :param args: namespace whose ``reads`` is a path string
            (space-separated for several files) or a sequence of paths
        :return: coverage = records * read length * number of read files
            / genome or transcriptome length
        """
        if isinstance(args.reads, str):
            reads = args.reads.split()
        else:
            reads = list(args.reads)

        with open(reads[0]) as handle:
            header = handle.readline()
            first_seq = handle.readline()
            # Count the two lines already consumed as well; the old code
            # dropped the header line from the total.
            num_lines = bool(header) + bool(first_seq) + sum(1 for _ in handle)

        read_length = len(first_seq.strip())
        if "length=" in header:
            try:
                # Must be an int: the raw text made the product below a
                # string repetition and the division a TypeError.
                read_length = int(header.split("length=")[-1].split()[0])
            except (IndexError, ValueError):
                pass  # malformed field: keep the measured sequence length

        total_records = num_lines // 4
        coverage = (total_records * read_length * len(reads)) / self._genome_or_transcriptome_length
        return coverage

    def _get_number_results(self):
        raise NotImplementedError

    def _get_rf_dist(self, ref_tree):
        raise NotImplementedError

    def _get_length_align(self):
        raise NotImplementedError

    def _get_num_OGs(self):
        raise NotImplementedError

    def _get_mean_ACGT(self, args):
        """
        For every ``05_*`` folder below ``args.output``, print the folder and
        the mean fraction of non-'X' characters (gaps removed) over all
        records in its ``*.phy`` alignments whose id contains the first five
        characters of the species name.

        :param args: namespace providing ``output``
        """
        import glob
        for folder in glob.iglob(args.output + '/05_*', recursive=True):
            print(folder)
            all_coverages = []

            for file in glob.iglob(folder + '/*.phy'):
                align = AlignIO.read(file, "phylip-relaxed")
                for record in align:
                    if self._species_name[0:5] in record.id:
                        seq = str(record.seq).replace('-', '')
                        if not seq:
                            # gap-only sequence: nothing to measure
                            continue
                        xx = seq.count("X")
                        aa = len(seq) - xx
                        all_coverages.append(aa / len(seq))
            if all_coverages:
                # guard: an empty folder used to end in ZeroDivisionError
                print(sum(all_coverages) / len(all_coverages))

    def _get_branch_length_mapped_seq(self):
        raise NotImplementedError

    def write_to_csv(self):
        raise NotImplementedError


--------------------------------------------------------------------------------
/read2tree/FastxReader.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | import logging
 3 | import gzip
 4 | import mimetypes
 5 | # from memory_profiler import memory_usage
 6 | 
 7 | class FastxReader(object):
 8 | 
 9 |     def __init__(self, file):
10 | 
11 |         self._file = file
12 |         guessed_type = mimetypes.guess_type(file)[1]
13 |         if guessed_type:
14 |             if 'gzip' in guessed_type:
15 |                 self._file_handle = 'gzip'
16 |         else:
17 |             self._file_handle = 'txt'
18 | 
19 |     def open_fastx(self):
20 |         if self._file_handle in 'gzip':
21 |             return gzip.open(self._file, 'rt')
22 |         else:
23 |             return open(self._file, 'rt')
24 | 
25 |     def readfq_id(self, file_handle):
26 |         for l in file_handle:
27 |             name = l.rstrip()
28 |             seq = next(file_handle).rstrip()
29 |             tmp = next(file_handle).rstrip()
30 |             qual = next(file_handle).rstrip()
31 |             yield name.split(' ')[0]
32 | 
33 |     def readfq(self, file_handle):
34 |         for l in file_handle:
35 |             name = l.rstrip()
36 |             seq = next(file_handle).rstrip()
37 |             tmp = next(file_handle).rstrip()
38 |             qual = next(file_handle).rstrip()
39 |             yield name, seq, qual
40 | 
41 |     def readfa(self, file_handle):
42 |         for l in file_handle:
43 |             name = l.rstrip()
44 |             seq = next(file_handle).rstrip()
45 |             yield name, seq
46 | 
47 |     def readfx(self, file_handle):
48 |         for l in file_handle:
49 |             name = l.rstrip()
50 |             seq = next(file_handle).rstrip()
51 |             if '@' in name[0]:
52 |                 tmp = next(file_handle).rstrip()
53 |                 qual = next(file_handle).rstrip()
54 |             elif '>' in name[0]:
55 |                 qual = None
56 |             yield name, seq, qual
57 | 
58 |     # def readfx(self, file_handle):  # this is a generator function
59 |     #     '''
60 |     #     This function was copy and pasted from https://github.com/lh3/readfq
61 |     #     Readfq is a fast implementation of a read iterator and provides a
62 |     #     massive spead up compared to regular
63 |     #     implementations
64 |     #     :param file_handle: is a filehandle
65 |     #     :return: name, seq, quality
66 |     #     '''
67 |     #     last = None  # this is a buffer keeping the last unprocessed line
68 |     #     while True:  # mimic closure; is it a bad idea?
69 |     #         if not last:  # the first record or a record following a fastq
70 |     #             for l in file_handle:  # search for the start of the next record
71 |     #                 if l[0] in '>@':  # fasta/q header line
72 |     #                     last = l[:-1]  # save this line
73 |     #                     break
74 |     #         if not last:
75 |     #             break
76 |     #         name, seqs, last = last, [], None
77 |     #         for l in file_handle:  # read the sequence
78 |     #             if l[0] in '@+>':
79 |     #                 last = l[:-1]
80 |     #                 break
81 |     #             seqs.append(l[:-1])
82 |     #         if not last or last[0] != '+':  # this is a fasta record
83 |     #             yield name, ''.join(seqs), None  # yield a fasta record
84 |     #             if not last:
85 |     #                 break
86 |     #         else:  # this is a fastq record
87 |     #             seq, leng, seqs = ''.join(seqs), 0, []
88 |     #             for l in file_handle:  # read the quality
89 |     #                 seqs.append(l[:-1])
90 |     #                 leng += len(l) - 1
91 |     #                 if leng >= len(seq):  # have read enough quality
92 |     #                     last = None
93 |     #                     yield name, seq, ''.join(seqs)  # yield a fastq record
94 |     #                     break
95 |     #             if last:  # reach EOF before reading enough quality
96 |     #                 yield name, seq, None  # yield a fasta record instead
97 |     #                 break
98 | 


--------------------------------------------------------------------------------
/read2tree/GuidedAssembler.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 |     This file contains definitions of a class which surrounds possible alignment methods
 4 | 
 5 |     -- David Dylus, July--XXX 2017
 6 | '''
 7 | import logging
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
class Aligner(object):
    """Holds the arguments and alignments used for guided assembly."""

    def __init__(self, args=None, alignments=None):
        """
        :param args: argument namespace, stored unchanged
        :param alignments: object exposing ``placement_dic``; may be None
        """
        self.args = args
        self.alignments = alignments
        # ``alignments`` defaults to None, so it must not be dereferenced
        # unconditionally (that raised AttributeError for Aligner()).
        self.placement_dic = alignments.placement_dic if alignments is not None else None
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/read2tree/MultiProcessingLog.py:
--------------------------------------------------------------------------------
 1 | # taken from https://gist.github.com/JesseBuesking/10674086
 2 | 
 3 | from logging.handlers import RotatingFileHandler
 4 | import multiprocessing, threading, logging, sys, traceback
 5 | #import os
 6 | 
 7 | 
 8 | class MultiProcessingLog(logging.Handler):
 9 |     def __init__(self, name, mode, maxsize, rotate):
10 |         logging.Handler.__init__(self)
11 | 
12 |         self._handler = RotatingFileHandler(name, mode, maxsize, rotate)
13 |         self.queue = multiprocessing.Queue(-1)
14 | 
15 |         t = threading.Thread(target=self.receive)
16 |         t.daemon = True
17 |         t.start()
18 | 
19 |     def setFormatter(self, fmt):
20 |         logging.Handler.setFormatter(self, fmt)
21 |         self._handler.setFormatter(fmt)
22 | 
23 |     def receive(self):
24 |         while True:
25 |             try:
26 |                 record = self.queue.get()
27 |                 self._handler.emit(record)
28 |                 #print('received on pid {}'.format(os.getpid()))
29 |             except (KeyboardInterrupt, SystemExit):
30 |                 raise
31 |             except EOFError:
32 |                 break
33 |             except:
34 |                 traceback.print_exc(file=sys.stderr)
35 | 
36 |     def send(self, s):
37 |         self.queue.put_nowait(s)
38 | 
39 |     def _format_record(self, record):
40 |         # ensure that exc_info and args have been stringified. Removes any
41 |         # chance of unpickleable things inside and possibly reduces message size
42 |         # sent over the pipe
43 |         if record.args:
44 |             record.msg = record.msg % record.args
45 |             record.args = None
46 |         if record.exc_info:
47 |             dummy = self.format(record)
48 |             record.exc_info = None
49 | 
50 |         return record
51 | 
52 |     def emit(self, record):
53 |         try:
54 |             s = self._format_record(record)
55 |             self.send(s)
56 |         except (KeyboardInterrupt, SystemExit):
57 |             raise
58 |         except:
59 |             self.handleError(record)
60 | 
61 |     def close(self):
62 |         self._handler.close()
63 |         logging.Handler.close(self)


--------------------------------------------------------------------------------
/read2tree/ReferenceSet.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 |     This file contains definitions of a class which allows to create
  4 |     the reference orthologous groups with their DNA sequences.
  5 | 
  6 |     -- David Dylus, July--XXX 2017
  7 | '''
  8 | 
  9 | import os
 10 | import glob
 11 | import logging
 12 | import time
 13 | from tqdm import tqdm
 14 | from Bio import SeqIO
 15 | from Bio.SeqIO.FastaIO import FastaWriter
 16 | 
 17 | from read2tree.Progress import Progress
 18 | 
 19 | 
 20 | class ReferenceSet(object):
 21 |     '''
 22 |     Structure for reference
 23 |     '''
 24 | 
 25 |     def __init__(self, args, og_set=None, load=True, progress=None):
 26 |         """
 27 | 
 28 |         :param args: list of arguments from command line
 29 |         :param og_set: set of OGs used to obtain reference DNA sequences
 30 |         :param load: set to True when reference loaded from folder/file of list of arguments
 31 |         """
 32 |         self.ref = {}
 33 |         self.load = load
 34 |         self.args = args
 35 |         self.progress = progress
 36 | 
 37 |         self.logger = logging.getLogger(__name__)
 38 |         self._species_name = self.args.species_name
 39 | 
 40 |         if load is False:
 41 |             self.ref = self._load_records_folder()
 42 |         elif og_set is not None and load is True:
 43 |             self.ref = self._generate_reference(og_set)
 44 |             self.write()
 45 |             # self.progress.set_status('ref')
 46 | 
 47 |         # if args.remove_species:
 48 |         #     self.ref = self._remove_species()
 49 | 
 50 |     def _read_fasta(self, ref_file):
 51 |         '''
 52 | 
 53 |         :param ref_file: file that contains all the DNA sequences from the oma database
 54 |         :return:
 55 |         '''
 56 |         print('--- Reading DNA reference into memory ---')
 57 |         return SeqIO.index(ref_file, "fasta")
 58 | 
 59 |     def _load_records_folder(self):
 60 |         """
 61 |         Parse species with their dna sequences from folder
 62 |         :return:
 63 |         """
 64 |         ref_dict = {}
 65 |         print('--- Generating reference for mapping from folder ---')
 66 |         ref_dna = os.path.join(self.args.output_path, '02_ref_dna')
 67 |         for file in tqdm(glob.glob(os.path.join(ref_dna, "*.fa")), desc="Re-loading references for mapping from folder", unit=" species"):
 68 |             species_name = file.split("/")[-1].split("_")[0]
 69 |             ref_dict[species_name] = Reference()
 70 |             ref_dict[species_name].dna = list(SeqIO.parse(file, 'fasta'))
 71 | 
 72 |         return ref_dict
 73 | 
 74 |     def _generate_reference(self, og_set):
 75 |         '''
 76 |         Split records into dictionary with keys being species and the values the corresponded sequence records
 77 |         '''
 78 |         print('--- Generating reference for mapping ---')
 79 |         start = time.time()
 80 |         ref_set = {}
 81 |         for name, og in tqdm(og_set.items(), desc="Loading records", unit=" record"):
 82 |             for record in og.aa:
 83 |                 species = record.id[0:5]
 84 |                 record.id = record.id  # +"_"+name
 85 |                 if species in ref_set.keys():
 86 |                     ref_set[species].aa.append(record)
 87 |                 else:
 88 |                     ref_set[species] = Reference()
 89 |                     ref_set[species].aa.append(record)
 90 | 
 91 |             for record in og.dna:
 92 |                 species = record.id[0:5]
 93 |                 record.id = record.id  # + "_" + name
 94 |                 if species in ref_set.keys():
 95 |                     ref_set[species].dna.append(record)
 96 |                 else:
 97 |                     ref_set[species] = Reference()
 98 |                     ref_set[species].dna.append(record)
 99 |         end = time.time()
100 |         elapsed_time = end - start
101 |         self.logger.info('{}: Extracted {} reference species form {} ogs took {}'
102 |                        .format(self._species_name, len(ref_set.keys()),
103 |                        len(og_set.keys()), elapsed_time))
104 |         return ref_set
105 | 
106 |     def write(self):
107 |         '''
108 |         Write for each species all the DNA sequences into separate fasta files
109 |         :param output_folder: folder where files should be stored
110 |         '''
111 |         out_dna = os.path.join(self.args.output_path, '02_ref_dna')
112 |         if not os.path.exists(out_dna):
113 |             os.makedirs(out_dna)
114 |         for key, value in self.ref.items():
115 |             if value.dna:  # only write if not empty
116 |                 value.write_dna(key, out_dna)
117 | 
118 |     def _remove_species(self):
119 |         raise NotImplementedError
120 | 
121 | 
class Reference(object):
    """Amino-acid and DNA sequence records belonging to one species."""

    def __init__(self, args=None):
        """
        :param args: optional argument namespace, stored unchanged
        """
        self.args = args
        self.aa = []
        self.dna = []

    def write_aa(self, species, output_folder):
        """
        Write the amino-acid records to ``<output_folder>/<species>_OGs.fa``.

        :param species: species name used as file-name prefix
        :param output_folder: existing folder to write into
        """
        # ``with`` closes the file even when the writer raises.
        with open(os.path.join(output_folder, species + '_OGs.fa'), "w") as handle:
            writer = FastaWriter(handle, wrap=None)
            writer.write_file(self.aa)

    def write_dna(self, species, output_folder):
        """
        Write the DNA records to ``<output_folder>/<species>_OGs.fa``.

        :param species: species name used as file-name prefix
        :param output_folder: existing folder to write into
        """
        # ``with`` closes the file even when the writer raises.
        with open(os.path.join(output_folder, species + '_OGs.fa'), "w") as handle:
            writer = FastaWriter(handle, wrap=None)
            writer.write_file(self.dna)
140 | 


--------------------------------------------------------------------------------
/read2tree/TreeInference.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 |     This file contains definitions of a class which surrounds the wrappers to build trees given a set of command line arguments.
 4 | 
 5 |     -- David Dylus, July--XXX 2017
 6 | '''
 7 | import os
 8 | import time
 9 | import logging
10 | from read2tree.wrappers.treebuilders import Fasttree, Iqtree
11 | from read2tree.wrappers.treebuilders.base_treebuilder import DataType
12 | 
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | 
class TreeInference(object):
    """Runs IQ-TREE on a concatenated alignment and stores the resulting tree."""

    def __init__(self, args, concat_alignment=None):
        """
        :param args: argument namespace; ``reads``, ``species_name``,
            ``output_path`` and ``threads`` are read by this class
        :param concat_alignment: alignment to infer the tree from; when None
            no inference is run and ``self.tree`` stays None
        """
        print('--- Tree inference ---')

        self.args = args

        self.elapsed_time = 0

        reads = self.args.reads
        if reads:
            is_pair = len(reads) == 2
            self._reads = reads if is_pair else reads[0]
            name_source = self._reads[0] if is_pair else self._reads
            self._species_name = name_source.split("/")[-1].split(".")[0]

        # an explicit species name wins over the one derived from the reads
        if self.args.species_name:
            self._species_name = self.args.species_name

        if not (self.args.reads or self.args.species_name):
            self._species_name = 'merge'

        self.tree = None
        if concat_alignment is not None:
            self.tree = self._infer_tree(concat_alignment)

    def _infer_tree(self, concat_alignment):
        """
        Infer a tree with the Iqtree wrapper (model 'LG', ``args.threads``
        threads), write it to ``<output_path>/tree_<species>.nwk`` and log
        the elapsed time.

        :param concat_alignment: alignment passed to the Iqtree wrapper
        :return: the object returned by the wrapper call
        """
        started = time.time()
        output_folder = self.args.output_path
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        # Fasttree alternative, currently disabled:
        #   fasttree_wrapper = Fasttree(concat_alignment, datatype=DataType.PROTEIN)
        #   tree = fasttree_wrapper()
        iqtree_wrapper = Iqtree(concat_alignment, datatype=DataType.PROTEIN)
        wrapper_options = iqtree_wrapper.options.options
        wrapper_options['-m'].set_value('LG')
        wrapper_options['-nt'].set_value(self.args.threads)
        tree = iqtree_wrapper()
        tree_path = os.path.join(output_folder, "tree_" + self._species_name + ".nwk")
        with open(tree_path, "w") as text_file:
            text_file.write("{}".format(tree))
        self.tree = "{}".format(tree)
        self.elapsed_time = time.time() - started
        logger.info('{}: Tree inference took {}.'.format(self._species_name,
                                                         self.elapsed_time))

        return tree
64 | 


--------------------------------------------------------------------------------
/read2tree/__init__.py:
--------------------------------------------------------------------------------
 1 | from datetime import date
 2 | import logging
 3 | import logging.config
 4 | import yaml
 5 | import os
 6 | from pkg_resources import resource_string
# Attach a NullHandler to the package logger.
logging.getLogger(__name__).addHandler(logging.NullHandler())

__version__ = '0.1.5'
# The end year of the copyright string is computed from today's date at import time.
__copyright__ = 'read2tree (C) 2017-{:d} David Dylus' \
                .format(date.today().year)

# path = './log.yaml'
# if os.path.exists(path):
#     with open(path, 'rt') as f:
#         config = yaml.load(f.read())
#     logging.config.dictConfig(config)

# Read the logging configuration shipped inside the package (logging/log.yaml).
conf = resource_string(__name__, 'logging/log.yaml')

# NOTE(review): FullLoader is applied to a file bundled with the package;
# yaml.safe_load would presumably be sufficient for this config - confirm.
D = yaml.load(conf, Loader=yaml.FullLoader)
D.setdefault('version', 1)
# Side effect of importing read2tree: the logging configuration is applied here.
logging.config.dictConfig(D)
# del D
25 | 


--------------------------------------------------------------------------------
/read2tree/_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 |     Utilities for parsing the annotations files.
  4 | 
  5 |     -- Alex Warwick Vesztrocy - March--June 2016
  6 | '''
  7 | import bz2
  8 | import gzip
  9 | import os
 10 | import sys
 11 | 
 12 | 
 13 | # File opening. This is based on the example on SO here:
 14 | # http://stackoverflow.com/a/26986344
 15 | fmagic = {b'\x1f\x8b\x08': gzip.open,
 16 |           b'\x42\x5a\x68': bz2.BZ2File}
 17 | 
 18 | 
 19 | def auto_open(fn, *args):
 20 |     '''
 21 |         Opens files based on their "magic bytes". Supports bz2 and gzip. If it
 22 |         finds neither of these, presumption is it is a standard, uncompressed
 23 |         file.
 24 |     '''
 25 |     if os.path.isfile(fn) and os.stat(fn).st_size > 0:
 26 |         with open(fn, 'rb') as fp:
 27 |             fs = fp.read(max([len(x) for x in fmagic]))
 28 |         for (magic, _open) in fmagic.items():
 29 |             if fs.startswith(magic):
 30 |                 return _open(fn, *args)
 31 |     else:
 32 |         if fn.endswith('gz'):
 33 |             return gzip.open(fn, *args)
 34 |         elif fn.endswith('bz2'):
 35 |             return bz2.BZ2File(fn, *args)
 36 | 
 37 |     return open(fn, *args)
 38 | 
 39 | 
 40 | def exe_name():
 41 |     '''
 42 |         Return the executable's basename, for inclusion in the help (with the
 43 |         help of argparse).
 44 |     '''
 45 |     return os.path.basename(sys.argv[0])
 46 | 
 47 | 
 48 | class LazyProperty(object):
 49 |     '''
 50 |         Decorator to evaluate a property only on access.
 51 | 
 52 |         Compute the attribute value and caches it in the instance.
 53 |         Python Cookbook (Denis Otkidach)
 54 |         http://stackoverflow.com/users/168352/denis-otkidach
 55 |         This decorator allows you to create a property which can be computed
 56 |         once and accessed many times.
 57 | 
 58 |         (Include from pyoma.browser.models - Adrian Altenhoff)
 59 |     '''
 60 |     def __init__(self, method, name=None):
 61 |         # record the unbound-method and the name
 62 |         self.method = method
 63 |         self.name = name or method.__name__
 64 |         self.__doc__ = method.__doc__
 65 | 
 66 |     def __get__(self, inst, cls):
 67 |         if inst is None:
 68 |             return self
 69 |         # compute, cache and return the instance's attribute value
 70 |         result = self.method(inst)
 71 |         # setattr redefines the instance's attribute so this doesn't get called
 72 |         # again
 73 |         setattr(inst, self.name, result)
 74 |         return result
 75 | 
 76 | 
 77 | def get_job_id():
 78 |     '''
 79 |         Gets job ID.
 80 |     '''
 81 |     if 'JOB_ID' in os.environ:
 82 |         # SGE
 83 |         return int(os.environ['JOB_ID'])
 84 |     elif 'LSB_JOBID' in os.environ:
 85 |         # LSF
 86 |         return int(os.environ['LSB_JOBID'])
 87 |     elif 'PBS_JOBID' in os.environ:
 88 |         # PBS / Torque
 89 |         return int(os.environ['PBS_JOBID'])
 90 |     elif 'SLURM_ARRAY_JOB_ID' in os.environ:
 91 |         # Slurm
 92 |         return int(os.environ['SLURM_ARRAY_JOB_ID'])
 93 |     else:
 94 |         # No parallelism detected.
 95 |         return None
 96 | 
 97 | 
 98 | def get_worker_id():
 99 |     '''
100 |         Gets worker ID from the array ID in the job handler.
101 |         number of workers.
102 |     '''
103 |     try:
104 |         if 'SGE_TASK_ID' in os.environ:
105 |             # SGE
106 |             return int(os.environ['SGE_TASK_ID'])
107 |         elif 'LSB_JOBINDEX' in os.environ:
108 |             return int(os.environ['LSB_JOBINDEX'])
109 |         elif 'PBS_ARRAYID' in os.environ:
110 |             # PBS / Torque
111 |             return int(os.environ['PBS_ARRAYID'])
112 |         elif 'SLURM_ARRAY_TASK_ID' in os.environ:
113 |             # Slurm
114 |             return int(os.environ['SLURM_ARRAY_TASK_ID'])
115 |     except ValueError:
116 |         # int() to base10 error
117 |         pass
118 | 
119 |     # No parallelism detected.
120 |     return None
121 | 
122 | 
def check_array_ids(args):
    '''
        Checks the IDs added to args for array jobs. Raises errors if not setup
        correctly.

        :param args: namespace providing ``job_id``, ``worker_id`` and ``array``
        :raises RuntimeError: when the job or worker id is missing, or when
            the worker id lies outside 1..array
    '''
    # The missing-id check has to come first: comparing a None worker id with
    # the array size raised TypeError, so this message was never reachable.
    if args.job_id is None or args.worker_id is None:
        raise RuntimeError('User requested read2tree to run as job array. '
                           'Can\'t find job ID ({}) or array ID ({}).'
                           .format(args.job_id, args.worker_id))
    if args.worker_id > args.array or args.worker_id < 1:
        raise RuntimeError('Recognised: worker ID {} and array size {}. '
                           'Worker IDs should run from 1-N (N is array size'
                           ').'.format(args.worker_id, args.array))
136 | 


--------------------------------------------------------------------------------
/read2tree/file_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .context_managers import *
2 | 


--------------------------------------------------------------------------------
/read2tree/file_utils/context_managers.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import tempfile
  4 | 
  5 | __all__ = ['TempFile', 'TempDir', 'ChDir', 'MkDir', 'NonDeletingTempDir']
  6 | 
  7 | 
  8 | class TempFile(object):
  9 |     """ 
 10 |     Context manager for working with a temporary file
 11 |     that automatically cleans up.
 12 | 
 13 |     Usage:
 14 | 
 15 |     with TempFile() as tmp:
 16 |         # In scope, tmp exists on the disk
 17 |         # Do some work with tmp, e.g. tmp.write('something')
 18 | 
 19 |     # Out of scope, tmp is deleted
 20 | 
 21 |     with TempFile('local_temp_space') as tmp:
 22 |         # tmp is created in the directory 'local_temp_space'
 23 |         # The specified directory must exist, or an error is thrown
 24 | 
 25 |     """
 26 | 
 27 |     def __init__(self, dir_=None):
 28 |         if dir_ is not None and not os.path.exists(dir_):
 29 |             raise IOError('Directory "{}"" does not exist'.format(dir_))
 30 |         self.dir = dir_
 31 | 
 32 |     def __enter__(self):
 33 |         self._fd, self._wrapped_tmp = tempfile.mkstemp(dir=self.dir)
 34 |         return os.path.abspath(self._wrapped_tmp)
 35 | 
 36 |     def __exit__(self, type, value, tb):
 37 |         os.close(self._fd)
 38 |         os.remove(self._wrapped_tmp)
 39 | 
 40 | 
 41 | class TempDir(object):
 42 |     """
 43 |     Context manager for working with a temporary file
 44 |     that automatically cleans up.
 45 | 
 46 |     Usage:
 47 | 
 48 |     with TempDir() as tmpd:
 49 |         # In scope, tmpd exists on the disk
 50 |         # Do some work with tmpd ...
 51 | 
 52 |     # Out of scope, tmpd is deleted along with all its content
 53 | 
 54 |     Can be nested with TempFile, e.g.
 55 | 
 56 |     with TempDir() as tmpd, TempFile(tmpd) as tmpf:
 57 |         # tempfile tmpf is created inside temporary directory tmpd
 58 |     # On exit, everything is deleted
 59 | 
 60 |     """
 61 | 
 62 |     def __enter__(self):
 63 |         self._wrapped_tmpdir = tempfile.mkdtemp()
 64 |         return os.path.abspath(self._wrapped_tmpdir)
 65 | 
 66 |     def __exit__(self, type, value, tb):
 67 |         shutil.rmtree(self._wrapped_tmpdir)
 68 | 
 69 | 
 70 | class NonDeletingTempDir(TempDir):
 71 |     def __exit__(self, tpye, value, tb):
 72 |         pass
 73 | 
 74 | 
 75 | class ChDir(object):
 76 |     """
 77 |     Context manager to switch to a working directory,
 78 |     and return to the current directory (like 'Dir.chdir do' block in Ruby)
 79 | 
 80 |     Usage:
 81 | 
 82 |     with TempDir() as dir, ChDir(dir):
 83 |         # Do some work in the working temp directory 'dir'
 84 | 
 85 |     # Exit 'dir'
 86 |     """
 87 | 
 88 |     def __init__(self, working_dir):
 89 |         if not os.path.exists(working_dir):
 90 |             raise IOError('Directory "{}"" does not exist'.format(working_dir))
 91 |         self._cdir = os.getcwd()
 92 |         self._wdir = working_dir
 93 | 
 94 |     def __enter__(self):
 95 |         os.chdir(self._wdir)
 96 | 
 97 |     def __exit__(self, type, value, tb):
 98 |         os.chdir(self._cdir)
 99 | 
100 | 
101 | class MkDir(ChDir):
102 |     """
103 |     Context manager to create and switch to a working directory,
104 |     then return to the current directory.
105 | 
106 |     Usage:
107 | 
108 |     with TempDir() as dir, MkDir(dir):
109 |         # Do some work in the working temp directory 'dir'
110 | 
111 |     # Exit 'dir'
112 |     """
113 | 
114 |     def __init__(self, working_dir):
115 |         if not os.path.exists(working_dir):
116 |             try:
117 |                 os.makedirs(working_dir)
118 |             except OSError as e:
119 |                 if e.errno != 17:
120 |                     raise
121 |                 pass  # path was created by another thread / process
122 |                 # this is a race condition, but probably benign
123 | 
124 |     def __enter__(self):
125 |         pass
126 | 
127 |     def __exit__(self, type, value, tb):
128 |         pass
129 | 


--------------------------------------------------------------------------------
/read2tree/logging/log.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: 1
 3 | disable_existing_loggers: False
 4 | formatters:
 5 |     simple:
 6 |         format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 7 | handlers:
 8 |     console:
 9 |         class: logging.StreamHandler
10 |         level: INFO
11 |         formatter: simple
12 |         stream: ext://sys.stdout
13 |     mplog:
14 |         class: read2tree.MultiProcessingLog.MultiProcessingLog
15 |         level: DEBUG
16 |         formatter: simple
17 |         name: mplog.log
18 |         mode: a
19 |         maxsize: 1024
20 |         rotate: 0
21 | root:
22 |     level: DEBUG
23 |     handlers: [console, mplog]


--------------------------------------------------------------------------------
/read2tree/logging/log.yaml.bak:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: 1
 3 | disable_existing_loggers: False
 4 | formatters:
 5 |     simple:
 6 |         format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 7 | handlers:
 8 |     console:
 9 |         class: logging.StreamHandler
10 |         level: INFO
11 |         formatter: simple
12 |         stream: ext://sys.stdout
13 |     mplog:
14 |         class: read2tree.MultiProcessingLog.MultiProcessingLog
15 |         level: DEBUG
16 |         formatter: simple
17 |         name: mplog.log
18 |         mode: a
19 |         maxsize: 1024
20 |         rotate: 0
21 | root:
22 |     level: DEBUG
23 |     handlers: [console, mplog]


--------------------------------------------------------------------------------
/read2tree/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .OMAOutputParser import *


--------------------------------------------------------------------------------
/read2tree/stats/Coverage.py:
--------------------------------------------------------------------------------
 1 | import pysam
 2 | import numpy as np
 3 | 
 4 | 
 5 | class Coverage(object):
 6 | 
 7 |     def __init__(self, args):
 8 |         self.args = args
 9 |         self.coverage = {}
10 | 
11 |     def get_coverage_bam(self, file_name):
12 |         mybam = pysam.AlignmentFile(file_name, 'rb')
13 |         for ref in mybam.references:
14 |             self.coverage[self._get_clean_id(ref)] \
15 |                 = self._get_gene_coverage(mybam, ref)
16 | 
17 |     def _get_clean_id(self, id):
18 |         id = id.split(" ")[0]
19 |         id = id.split("_")
20 |         return id[0]+"_"+id[1]
21 | 
22 |     def add_coverage(self, ref, coverage):
23 |         self.coverage[ref] = coverage
24 | 
25 |     def write_coverage_bam(self, file_name):
26 |         out_text = ''
27 |         header = '#species,og,gene_id,coverage,std\n'
28 |         out_text += header
29 |         for key, value in self.coverage.items():
30 |             species = key[0:5]
31 |             og = key.split("_")[-1]
32 |             gene_id = key.split("_")[0]
33 |             coverage = value
34 |             line = species + "," + og + "," + gene_id + "," + \
35 |                 str(coverage[0]) + "," + str(coverage[1]) + "\n"
36 |             out_text += line
37 | 
38 |         with open(file_name, "w") as myfile:
39 |             myfile.write(out_text)
40 | 
41 |     def read_coverage_from_file(self, file_name):
42 |         raise NotImplementedError
43 | 
44 |     def _get_gene_coverage(self, mybam, ref):
45 |         """
46 | 
47 |         :param mybam: bam_file object from pysam
48 |         :param ref: the gene_id reference to pileup the the number of reads per column
49 |         :return: average coverage per gene
50 |         """
51 |         column_coverage = []
52 |         for pileupcolumn in mybam.pileup(ref, 0, 100000):
53 |             if pileupcolumn.n >= self.args.min_cons_coverage:
54 |                 column_coverage.append(pileupcolumn.n)
55 |         np_column_coverage = np.array(column_coverage)
56 |         return [np.mean(np_column_coverage), np.std(np_column_coverage)]
57 | 


--------------------------------------------------------------------------------
/read2tree/stats/SeqCompleteness.py:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import numpy as np
  3 | 
  4 | 
  5 | class SeqCompleteness(object):
  6 | 
  7 |     def __init__(self, mapped_ref=None, tested_ref=None):
  8 |         self.seq_completeness = {}
  9 | 
 10 |         if mapped_ref:
 11 |             self.map_ref_records = self._get_og_dict(mapped_ref)
 12 |         else:
 13 |             self.map_ref_records = None
 14 | 
 15 |         if tested_ref:
 16 |             self.ref_records = self._get_og_dict(tested_ref)
 17 |         else:
 18 |             self.ref_records = None
 19 | 
 20 |     def get_seq_completeness(self, records):
 21 |         for record in records:
 22 |             self.seq_completeness[
 23 |                 record.id] = self._get_single_seq_completeness(record)
 24 | 
 25 |     def _get_single_seq_completeness(self, mapped_record, gene_code='dna'):
 26 |         """
 27 |         Calculate single sequence completeness using the number of dna or aa
 28 |         positions that are not n/X divided by either
 29 |         length of sequence or full length or reference
 30 |         :param mapped_record: sequence record that was produced by mapping
 31 |         :param gene_code: dna or aa
 32 |         :return: tuple with partial seq completeness computed using just the
 33 |             mapped_record itself and ref_seq_completeness computed
 34 |             using also t
 35 |         """
 36 | 
 37 |         map_ref_record = self.map_ref_records[self._get_og_id(mapped_record.id)]
 38 |         map_ref_seq = str(map_ref_record.seq).upper()
 39 |         map_seq = str(mapped_record.seq).upper()
 40 |         if self.ref_records and self._get_og_id(mapped_record.id) in \
 41 |                 self.ref_records.keys():
 42 |             ref_record = self.ref_records[self._get_og_id(mapped_record.id)]
 43 |             ref_seq = str(ref_record.seq).upper()
 44 |         else:
 45 |             ref_seq = map_ref_seq
 46 |         if gene_code == 'dna':
 47 |             ref_seq_len = len(ref_seq)
 48 |             map_seq_len = len(map_ref_seq)
 49 |             non_n_len = len(map_ref_seq) - str(map_seq).count('N')
 50 |             map_seq_completeness = non_n_len / map_seq_len
 51 |             ref_seq_completeness = non_n_len / ref_seq_len
 52 |         elif gene_code == 'aa':
 53 |             ref_seq_len = len(ref_seq)
 54 |             map_seq_len = len(map_seq)
 55 |             non_n_len = len(map_seq) - str(map_seq).count('X')
 56 |             map_seq_completeness = non_n_len / map_seq_len
 57 |             ref_seq_completeness = non_n_len / ref_seq_len
 58 |         return [map_seq_completeness, ref_seq_completeness,
 59 |                 non_n_len, map_seq_len, ref_seq_len]
 60 | 
 61 |     def _get_og_dict(self, ref_og):
 62 |         dna_dict = {}
 63 |         for record in ref_og:
 64 |             if '_' in record.id:
 65 |                 split_id = record.id.split("_")
 66 |                 tmp = split_id[0]+"_"+split_id[1]
 67 |                 record.id = tmp
 68 |                 og_id = split_id[1]
 69 | 
 70 |             dna_dict[og_id] = record
 71 |         return dna_dict
 72 | 
 73 |     def _get_og_id(self, id):
 74 |         split_id = id.split("_")
 75 |         # return split_id[0]+"_"+split_id[1]
 76 |         return split_id[1]
 77 | 
 78 |     def _get_gene_id(self, id):
 79 |         split_id = id.split("_")
 80 |         return split_id[0]
 81 | 
 82 |     def add_seq_completeness(self, ref, seq_completeness):
 83 |         self.seq_completeness[ref] = seq_completeness
 84 | 
 85 |     def write_seq_completeness(self, file_name):
 86 |         out_text = ''
 87 |         header = '#species,og,gene_id,map_seq_completeness,' \
 88 |                  'ref_seq_completeness,inferred_len,given_len,ref_len\n'
 89 |         out_text += header
 90 |         for key, value in self.seq_completeness.items():
 91 |             species = key[0:5]
 92 |             og = key.split("_")[-1]
 93 |             gene_id = key.split("_")[0]
 94 |             seq_completeness = value
 95 |             line = species + "," + og + "," + gene_id + "," + \
 96 |                 str(seq_completeness[0]) + "," + str(seq_completeness[1]) + \
 97 |                 "," + str(seq_completeness[2]) + "," + \
 98 |                 str(seq_completeness[3]) + "," + \
 99 |                 str(seq_completeness[4]) + "\n"
100 |             out_text += line
101 | 
102 |         with open(file_name, "w") as myfile:
103 |             myfile.write(out_text)
104 | 


--------------------------------------------------------------------------------
/read2tree/stats/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/read2tree/stats/__init__.py


--------------------------------------------------------------------------------
/read2tree/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .seq_utils import *
2 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/__init__.py:
--------------------------------------------------------------------------------
class WrapperError(Exception):
    """Exception type raised by the wrapper classes of this package."""
3 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/__init__.py:
--------------------------------------------------------------------------------
1 | from .mafft import Mafft
2 | from .muscle import Muscle
3 | from .prographmsa import ProGraphMSA
4 | from .probcons import ProbCons
5 | from .base_aligner import AlignmentInput, DataType, WrapperError


--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/base_aligner.py:
--------------------------------------------------------------------------------
  1 | import os, types, itertools
  2 | from abc import ABCMeta, abstractmethod
  3 | from enum import Enum
  4 | from Bio import AlignIO, SeqIO
  5 | from Bio.Align import MultipleSeqAlignment
  6 | from read2tree.utils.seq_utils import is_dna
  7 | 
  8 | 
  9 | from read2tree.wrappers import WrapperError
 10 | 
 11 | 
 12 | AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
 13 | DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
 14 | 
 15 | class Aligner(object):
 16 |     """
 17 |     Base class for wrappers of Multiple Sequence Aligner software
 18 | 
 19 |     The wrapper is written as a callable class.
 20 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 21 |     execution times and other metadata, as well as perform the task.
 22 | 
 23 |     This is a base implementation to be extended. The important parts are
 24 |     __init__ (does the setup) and __call__ (does the work). All
 25 |     else are helper methods.
 26 | 
 27 |     :Example:
 28 | 
 29 |     ::
 30 | 
 31 |         callable_wrapper = ConcreteAligner(aln)
 32 |         result = callable_wrapper()
 33 |         time_taken = callable_wrapper.elapsed_time
 34 |         result_again = callable_wrapper.result
 35 | 
 36 |     """
 37 |     __metaclass__ = ABCMeta
 38 | 
 39 |     def __init__(self, input_, datatype=DataType.UNKNOWN, binary=None):
 40 |         """
 41 |         Should work the same whether you're working with a Biopython object or a file
 42 |         but the implementation differs, e.g. a Biopython object will need
 43 |         to be written temporarily to disk for the Aligner to work on it.
 44 | 
 45 |         :param input_: can be either a filename or a biopython multiple
 46 |             sequence alignment (a collection of :class:`Bio.SeqRecord.SeqRecord`)
 47 | 
 48 |         :param binary: is the alignment's executable file, or None. If set to
 49 |             None, it is assumed to be found in the PATH.
 50 | 
 51 |         :param datatype: means is it DNA or protein?
 52 |         """
 53 |         self.input_type = identify_input(input_) # Figure out what it is - file or object
 54 | 
 55 |         if datatype == DataType.UNKNOWN:
 56 |             #dup, input_ = itertools.tee(input_)
 57 |             self.datatype = guess_datatype(input_, from_filename=self.input_type==AlignmentInput.FILENAME)
 58 |             if self.input_type == AlignmentInput.OBJECT:
 59 |                 dup, input_ = itertools.tee(input_)
 60 |                 self.datatype = guess_datatype(dup, False)
 61 |             else:
 62 |                 self.datatype = guess_datatype(input_, True)
 63 |         else:
 64 |             self.datatype = datatype
 65 | 
 66 |         self.input = input_  # store it
 67 |         self.elapsed_time = None
 68 |         self.stdout = None
 69 |         self.stderr = None
 70 |         try:
 71 |             self.cli = self._init_cli(binary)
 72 |         except IOError as err:
 73 |             raise WrapperError('Error searching for binary: {}'.format(err))
 74 |         # End setup
 75 | 
 76 |     @abstractmethod
 77 |     def __call__(self, *args, **kwargs):
 78 |         """
 79 |         How to call the underlying aligner
 80 |         """
 81 |         pass
 82 | 
 83 |     @abstractmethod
 84 |     def _init_cli(self, binary):
 85 |         pass
 86 | 
 87 | import logging
 88 | logger = logging.getLogger()
 89 | 
 90 | def guess_datatype(alignment, from_filename=False):
 91 |     logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
 92 |     if from_filename:
 93 |         try:
 94 |             alignment = SeqIO.parse(alignment, 'fasta')
 95 |         except:
 96 |             alignment = SeqIO.parse(alignment, 'phylip-relaxed')
 97 |     return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
 98 | 
 99 | 
def identify_input(alignment):
    """
    Work out if we're dealing with an in-memory alignment object
    (return AlignmentInput.OBJECT), the path of an existing file
    (return AlignmentInput.FILENAME), or invalid input (raise ValueError).

    :param alignment: a MultipleSeqAlignment, generator or list of records,
        or a file name
    :raises ValueError: if ``alignment`` is none of the above; previously
        such input fell through and None was returned silently
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory collection of records
        return AlignmentInput.OBJECT

    if isinstance(alignment, str) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # `alignment` is some other thing we can't handle
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
117 | 
118 | 
119 | # TODO: Break the identify_input function into two parts - one to work out the datatype, one to work out whether
120 | # this is a file or an object
121 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/muscle.py:
--------------------------------------------------------------------------------
  1 | import tempfile
  2 | import time
  3 | from Bio import AlignIO, SeqIO
  4 | from six import StringIO
  5 | from ..abstract_cli import AbstractCLI
  6 | from .base_aligner import Aligner, AlignmentInput, DataType
  7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, TreeInputOption, OptionSet
  8 | 
  9 | 
 10 | class MuscleCLI(AbstractCLI):
 11 |     """
 12 |     Muscle low-level command line interface
 13 | 
 14 |     example:
 15 |     muscle_cli = MuscleCLI()
 16 |     process = muscle_cli(cmd='muscle args...')
 17 |     stdout = muscle_cli.get_stdout()
 18 |     """
 19 |     @property
 20 |     def _default_exe(self):
 21 |         return 'muscle'
 22 | 
 23 |     # def _set_help(self):
 24 |     #     self(help=True, wait=True)
 25 |     #     self._help = self.get_stdout()
 26 | 
 27 | def set_default_dna_options(aligner):
 28 |     """
 29 |     Dummy function as sensible default already provided by mafft --auto
 30 |     """
 31 |     aligner.options = get_default_options()
 32 | 
 33 | 
 34 | def set_default_protein_options(aligner):
 35 |     """
 36 |     Dummy function as sensible default already provided by mafft --auto
 37 |     """
 38 |     aligner.options = get_default_options()
 39 | 
 40 | class Muscle(Aligner):
 41 |     """
 42 |     Convenient wrapper for Muscle multiple sequence aligner
 43 | 
 44 |     The wrapper is written as a callable class.
 45 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 46 |     execution times and other metadata, as well as perform the task.
 47 | 
 48 |     This is a basic implementation that can be extended. The important parts are
 49 |     __init__ (does the setup) and __call__ (does the work). All
 50 |     else are helper methods.
 51 | 
 52 |     :Example:
 53 | 
 54 |     ::
 55 | 
 56 |         callable_wrapper = Muscle(aln)
 57 |         result = callable_wrapper()
 58 |         time_taken = callable_wrapper.elapsed_time
 59 |         result_again = callable_wrapper.result
 60 |     """
 61 | 
 62 |     def __init__(self, input_, *args, **kwargs):
 63 |         super(Muscle, self).__init__(input_, *args, **kwargs)
 64 |         self.options = get_default_options()
 65 | 
 66 |         if self.datatype == DataType.DNA:
 67 |             set_default_dna_options(self)
 68 |         else:
 69 |             set_default_protein_options(self)
 70 | 
 71 |     def __call__(self, *args, **kwargs):
 72 |         """
 73 |         Anything to do with calling Muscle should go here.
 74 |         If any extra arguments need to be passed they can
 75 |         be specified (listed as *args and **kwargs for now).
 76 |         """
 77 |         start = time.time() # time the execution
 78 | 
 79 |         if self.input_type == AlignmentInput.OBJECT: # different operation depending on what it is
 80 |             with tempfile.NamedTemporaryFile(mode="wt") as filehandle:
 81 |                 SeqIO.write(self.input, filehandle, 'fasta')
 82 |                 filehandle.seek(0)
 83 |                 output, error = self._call(filehandle.name, *args, **kwargs)
 84 |         else:
 85 |             output, error = self._call(self.input, *args, **kwargs)
 86 | 
 87 |         self.result = self._read_result(output) # store result
 88 |         self.stdout = output
 89 |         self.stderr = error
 90 | 
 91 |         end = time.time()
 92 |         self.elapsed_time = end - start
 93 |         return self.result
 94 |         # End call
 95 | 
 96 |     # Any other accessory methods 
 97 |     def _call(self, filename, *args, **kwargs):
 98 |         """
 99 |         Call underlying low level _MuscleCLI wrapper. 
100 |         Options are passed via *args and **kwargs
101 |         [This only covers the simplest automatic
102 |          case]
103 |         """
104 |         self.cli('{} -in {}'.format(self.command(), filename),
105 |                 wait=True)
106 |         return self.cli.get_stdout(), self.cli.get_stderr()
107 | 
108 |     def command(self):
109 |         return str(self.options)
110 | 
111 |     def _read_result(self, output):
112 |         """
113 |         Read back the result.
114 |         """
115 |         fileobj = StringIO(output)
116 |         return AlignIO.read(fileobj, 'fasta')
117 | 
118 |     def _init_cli(self, binary):
119 |         return MuscleCLI(executable=binary)
120 | 
121 | 
def get_default_options():
    """Build a new OptionSet with the Muscle options, all created with active=False."""
    # Find diagonals (faster for similar sequences)
    diags = FlagOption('-diags', False, active=False)
    # Maximum number of iterations (integer, default 16)
    max_iterations = IntegerOption('-maxiters', 16, active=False)
    # Maximum time to iterate in hours (default no limit)
    max_hours = FloatOption('-maxhours', 0.0, active=False)
    # Not enabled: TreeInputOption('-usetree', '', active=False)

    return OptionSet([diags, max_iterations, max_hours])
137 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/probcons.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from Bio import AlignIO, SeqIO
  3 | from six import StringIO
  4 | from ..abstract_cli import AbstractCLI
  5 | from .base_aligner import Aligner, AlignmentInput, DataType
  6 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
  7 | import tempfile
  8 | 
  9 | 
 10 | class ProbConsCLI(AbstractCLI):
 11 |     """
 12 |     ProbCons low-level command line interface
 13 | 
 14 |     :Example:
 15 | 
 16 |     ::
 17 | 
 18 |         probcons_cli = _ProbConsCLI()
 19 |         process = mafft_cli(cmd='mafft args...')
 20 |         stdout = mafft_cli.get_stdout()
 21 |     """
 22 |     @property
 23 |     def _default_exe(self):
 24 |         return 'probcons'
 25 | 
 26 |     # def _set_help(self):
 27 |     #     self(help=True, wait=True)
 28 |     #     self._help = self.get_stdout()
 29 | 
 30 | 
 31 | def set_default_dna_options(aligner):
 32 |     """
 33 |     Dummy function as sensible default already provided by mafft --auto
 34 |     """
 35 |     aligner.options = get_default_options()
 36 | 
 37 | 
 38 | def set_default_protein_options(aligner):
 39 |     """
 40 |     Dummy function as sensible default already provided by mafft --auto
 41 |     """
 42 |     aligner.options = get_default_options()
 43 | 
 44 | 
 45 | class ProbCons(Aligner):
 46 |     """
 47 |     Convenient wrapper for ProbCons multiple sequence aligner
 48 | 
 49 |     The wrapper is written as a callable class.
 50 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 51 |     execution times and other metadata, as well as perform the task.
 52 | 
 53 |     This is a basic implementation that can be extended. The important parts are
 54 |     __init__ (does the setup) and __call__ (does the work). All
 55 |     else are helper methods.
 56 | 
 57 |     :Example:
 58 | 
 59 |     ::
 60 | 
 61 |         callable_wrapper = ProbCons(aln)
 62 |         result = callable_wrapper()
 63 |         time_taken = callable_wrapper.elapsed_time
 64 |         result_again = callable_wrapper.result
 65 | 
 66 | 
 67 |     .. note:: There exists an ipython notebook on how to work with wrappers,
 68 |          including dealing with non-default parameters.
 69 |     """
 70 | 
 71 |     def __init__(self, input_, *args, **kwargs):
 72 |         super(ProbCons, self).__init__(input_, *args, **kwargs)
 73 |         self.options = get_default_options()
 74 |         if self.datatype == DataType.DNA:
 75 |             set_default_dna_options(self)
 76 |         else:
 77 |             set_default_protein_options(self)
 78 | 
 79 |     def __call__(self, *args, **kwargs):
 80 |         """
 81 |         Anything to do with calling Mafft should go here.
 82 |         If any extra arguments need to be passed they can
 83 |         be specified (listed as *args and **kwargs for now).
 84 |         """
 85 |         start = time.time()  # time the execution
 86 |         
 87 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 88 |             with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
 89 |                 SeqIO.write(self.input, filehandle, 'fasta')
 90 |                 filehandle.seek(0)
 91 |                 output, error = self._call(filehandle.name, *args, **kwargs)
 92 |                 
 93 |         else:
 94 |             output, error = self._call(self.input, *args, **kwargs)
 95 | 
 96 |         self.result = self._read_result(output) # store result
 97 |         self.stdout = output
 98 |         self.stderr = error
 99 | 
100 |         end = time.time()
101 |         self.elapsed_time = end - start
102 |         return self.result
103 |         # End call
104 | 
105 |     # Any other accessory methods 
106 |     def _call(self, filename, *args, **kwargs):
107 |         """
108 |         Call underlying low level _Mafft wrapper. 
109 |         Options are passed via *args and **kwargs
110 |         [This only covers the simplest automatic
111 |          case]
112 |         """
113 |         self.cli('{} {}'.format(self.command(), filename),
114 |                  wait=True)
115 |         return self.cli.get_stdout(), self.cli.get_stderr()
116 | 
117 |     def command(self):
118 |         return str(self.options)
119 | 
120 |     def _read_result(self, output):
121 |         """
122 |         Read back the result.
123 |         """
124 |         fileobj = StringIO(output)
125 |         return AlignIO.read(fileobj, 'fasta')
126 | 
127 |     def _init_cli(self, binary):
128 |         return ProbConsCLI(executable=binary)
129 | 
130 | 
def get_default_options():
    """Build a new OptionSet with the ProbCons options, all created with active=False."""
    # use CLUSTALW output format instead of MFA
    clustalw = FlagOption('-clustalw', False, active=False)
    # use 0 <= REPS <= 5 (default: 2) passes of consistency transformation
    # NOTE(review): the value stored here is 0 while the comment above names
    # 2 as the tool's default - confirm which one is intended.
    consistency = IntegerOption('-c', 0, active=False)
    # use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement
    refinement = IntegerOption('-ir', 100, active=False)
    # use 0 <= REPS <= 20 (default: 0) rounds of pretraining
    pretraining = IntegerOption('-pre', 0, active=False)
    # generate all-pairs pairwise alignments
    pairs = FlagOption('-pairs', False, active=False)
    # use Viterbi algorithm to generate all pairs (automatically enables -pairs)
    viterbi = FlagOption('-viterbi', False, active=False)
    # write annotation for multiple alignment to FILENAME
    annotation = StringOption('-annot', '', active=False)
    # print sequences in alignment order rather than input order (default: off)
    alignment_order = FlagOption('-a', False, active=False)

    return OptionSet([clustalw, consistency, refinement, pretraining,
                      pairs, viterbi, annotation, alignment_order])
160 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/prographmsa.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from Bio import AlignIO, SeqIO
  3 | import tempfile
  4 | from six import StringIO
  5 | from ..abstract_cli import AbstractCLI
  6 | from .base_aligner import Aligner, AlignmentInput, DataType
  7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
  8 | 
  9 | 
 10 | class ProGraphMSACLI(AbstractCLI):
 11 |     """
 12 |     PrographMSA low-level command line interface
 13 | 
 14 |     :Example:
 15 | 
 16 |     ::
 17 | 
 18 |         prograph_cli = ProGraphMSACLI()
 19 |         process = prograph_cli(cmd='mafft args...')
 20 |         stdout = prograph_cli.get_stdout()
 21 |     """
 22 | 
 23 |     @property
 24 |     def _default_exe(self):
 25 |         return 'ProGraphMSA'
 26 | 
 27 | 
 28 | def set_default_dna_options(aligner):
 29 |     """
 30 |     Dummy function as sensible default already provided by mafft --auto
 31 |     """
 32 |     aligner.options = get_default_options()
 33 | 
 34 | 
 35 | def set_default_protein_options(aligner):
 36 |     """
 37 |     Dummy function as sensible default already provided by mafft --auto
 38 |     """
 39 |     aligner.options = get_default_options()
 40 | 
 41 | 
 42 | class ProGraphMSA(Aligner):
 43 |     """
 44 |     Convenient wrapper for ProGraphMSA multiple sequence aligner
 45 | 
 46 |     The wrapper is written as a callable class.
 47 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 48 |     execution times and other metadata, as well as perform the task.
 49 | 
 50 |     This is a basic implementation that can be extended. The important parts are
 51 |     __init__ (does the setup) and __call__ (does the work). All
 52 |     else are helper methods.
 53 | 
 54 |     :Example:
 55 | 
 56 |     ::
 57 | 
 58 |         callable_wrapper = Mafft(aln)
 59 |         result = callable_wrapper()
 60 |         time_taken = callable_wrapper.elapsed_time
 61 |         result_again = callable_wrapper.result
 62 |     """
 63 | 
 64 |     def __init__(self, input_, *args, **kwargs):
 65 |         super(ProGraphMSA, self).__init__(input_, *args, **kwargs)
 66 |         self.options = get_default_options()
 67 |         if self.datatype == DataType.DNA:
 68 |             set_default_dna_options(self)
 69 |         else:
 70 |             set_default_protein_options(self)
 71 | 
 72 |     def __call__(self, *args, **kwargs):
 73 |         """
 74 |         Anything to do with calling ProGraphMSA should go here.
 75 |         If any extra arguments need to be passed they can
 76 |         be specified (listed as *args and **kwargs for now).
 77 |         """
 78 |         start = time.time()  # time the execution
 79 | 
 80 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 81 |             with tempfile.NamedTemporaryFile(mode="wt") as fh:
 82 |                 SeqIO.write(self.input, fh, 'fasta')
 83 |                 fh.seek(0)
 84 |                 output, error = self._call(fh.name, *args, **kwargs)
 85 | 
 86 |         else:
 87 |             output, error = self._call(self.input, *args, **kwargs)
 88 | 
 89 |         self.result = self._read_result(output)  # store result
 90 |         self.stdout = output
 91 |         self.stderr = error
 92 | 
 93 |         end = time.time()
 94 |         self.elapsed_time = end - start
 95 |         return self.result
 96 |         # End call
 97 | 
 98 |     # Any other accessory methods
 99 |     def _call(self, filename, *args, **kwargs):
100 |         """
101 |         Call underlying low level ProGraphMSA wrapper.
102 |         Options are passed via *args and **kwargs
103 |         [This only covers the simplest automatic
104 |          case]
105 |         """
106 |         self.cli('{} {}'.format(self.command(), filename),
107 |                  wait=True)
108 |         return self.cli.get_stdout(), self.cli.get_stderr()
109 | 
110 |     def command(self):
111 |         return str(self.options)
112 | 
113 |     def _read_result(self, output):
114 |         """
115 |         Read back the result.
116 |         """
117 |         fileobj = StringIO(output)
118 |         return AlignIO.read(fileobj, 'fasta')
119 | 
120 |     def _init_cli(self, binary):
121 |         return ProGraphMSACLI(executable=binary)
122 | 
123 | 
def get_default_options():
    """
    Build a new OptionSet with the ProGraphMSA options; only ``--fasta`` is
    created with active=True.
    """
    # output fasta format (instead of stockholm), better because no tree output is produced
    fasta_output = FlagOption('--fasta', True, active=True)
    # output all ancestral sequences
    ancestral_seqs = FlagOption('--ancestral_seqs', False, active=False)
    # output sequences in input order (default: tree order)
    input_order = FlagOption('--input_order', False, active=False)
    # output all intermediate guide trees
    all_trees = FlagOption('--all_trees', False, active=False)
    # use ML distances with gap
    ml_distances_gap = FlagOption('--mldist_gap', False, active=False)
    # use ML distances
    ml_distances = FlagOption('--mldist', False, active=False)
    # use of guide tree
    guide_tree = StringOption('--tree', '', active=False)

    return OptionSet([fasta_output, ancestral_seqs, input_order, all_trees,
                      ml_distances_gap, ml_distances, guide_tree])
150 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/read_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | from .ngm import NGM
2 | from .ngmlr import NGMLR


--------------------------------------------------------------------------------
/read2tree/wrappers/read_mappers/base_mapper.py:
--------------------------------------------------------------------------------
  1 | import os, types
  2 | from abc import ABCMeta, abstractmethod
  3 | from enum import Enum
  4 | from Bio.SeqRecord import SeqRecord
  5 | from read2tree.wrappers import WrapperError
  6 | 
  7 | import logging
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | ReferenceInput = Enum('ReferenceInput', 'OBJECT STRING FILENAME')
 11 | ReadInput = Enum('ReadInput', 'OBJECT STRING FILENAME')
 12 | 
 13 | class ReadMapper(object):
 14 |     """
 15 |     Base class for wrappers of read mapping software
 16 | 
 17 |     The wrapper is written as a callable class.
 18 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 19 |     execution times and other metadata, as well as perform the task.
 20 | 
 21 |     This is a base implementation to be extended. The important parts are
 22 |     __init__ (does the setup) and __call__ (does the work). All
 23 |     else are helper methods.
 24 | 
 25 |     :Example:
 26 | 
 27 |     ::
 28 | 
 29 |         callable_wrapper = ConcreteAligner(aln)
 30 |         result = callable_wrapper()
 31 |         time_taken = callable_wrapper.elapsed_time
 32 |         result_again = callable_wrapper.result
 33 |     """
 34 |     __metaclass__ = ABCMeta
 35 | 
 36 |     def __init__(self, reference=None, reads=None, tmp_folder=None, binary=None):
 37 |         """
 38 |         ..note::  TODO: this documentation is not correct. it needs to be updateted.
 39 | 
 40 |         Should work the same whether you're working with a Biopython object or a file
 41 |             but the implementation differs, e.g. a Biopython object will need
 42 |             to be written temporarily to disk for the Aligner to work on it.
 43 | 
 44 |         alignment is one of 4 things:
 45 |             a filename
 46 |             a Biopython MSA
 47 |             a list of Seq objects
 48 |             anything else (throw an exception)
 49 | 
 50 |         binary is the alignment's executable file, or None
 51 |         """
 52 |         if reference is not None:
 53 |             self.ref_input_type = identify_reference(reference)  # Figure out what it is - file or object
 54 |             self.ref_input = reference  # store it
 55 |         else:
 56 |             self.ref_input_type = None
 57 |             self.ref_input = None
 58 | 
 59 |         if reads is not None:
 60 |             self.read_input_type = identify_reads(reads)  # Figure out what it is - file or object
 61 |             self.read_input = reads  # store it
 62 |         else:
 63 |             self.read_input_type = None
 64 |             self.read_input = None
 65 | 
 66 |         if tmp_folder is not None:
 67 |             self.tmp_folder = tmp_folder
 68 |         else:
 69 |             self.tmp_folder = "./" # set to current folder
 70 | 
 71 |         self.elapsed_time = None
 72 |         self.stdout = None
 73 |         self.stderr = None
 74 |         try:
 75 |             self.cli = self._init_cli(binary)
 76 |         except IOError as err:
 77 |             raise WrapperError('Error searching for binary: {}'.format(err))
 78 |             # End setup
 79 | 
 80 |     @abstractmethod
 81 |     def __call__(self, *args, **kwargs):
 82 |         """
 83 |         How to call the underlying aligner
 84 |         """
 85 |         pass
 86 | 
 87 |     @abstractmethod
 88 |     def _init_cli(self, binary):
 89 |         """
 90 |         Set up the command-line interface to the wrapped software
 91 |         :param binary: filename of executable binary file
 92 |         :return: concrete CLI type inheriting from AbstractCLI
 93 |         """
 94 |         pass
 95 | 
 96 | def identify_reference(sequence):
 97 |     """
 98 |     Work out if we're dealing with a fasta (return True), a file
 99 |     (return False), or invalid input (raise error)
100 | 
101 |     :param alignment: either an Biopython MultipleSequenceAlignment or
102 |         a filename pointing to an existing msa file.
103 |     """
104 |     try:
105 |         if isinstance(sequence, (SeqRecord, types.GeneratorType, list)):
106 |             # `sequence` is a Biopython MultipleSequenceAlignment
107 |             return ReferenceInput.OBJECT
108 |         if isinstance(sequence, str) and not os.path.exists(sequence):
109 |             return ReferenceInput.STRING
110 |         elif isinstance(sequence, str) and os.path.exists(sequence):
111 |             # `sequence` is a filepath
112 |             return ReferenceInput.FILENAME
113 | 
114 |     except:
115 |         # `sequence` is some other thing we can't handle
116 |         raise ValueError('{} is not an sequence object or a valid filename'.format(sequence))
117 | 
118 | 
def identify_reads(reads):
    """
    Classify the reads input handed to a read mapper.

    When `reads` is a list, only its first element is inspected.

    :param reads: a SeqRecord, generator, str, or a list of such items. A str
        naming an existing path is a filename; any other str is treated as a
        literal string input.
    :return: ReadInput.OBJECT, ReadInput.FILENAME or ReadInput.STRING
    :raises ValueError: if `reads` is an empty list or none of the supported
        kinds
    """
    if isinstance(reads, list):
        if not reads:
            # reads[0] used to fail with a bare IndexError here
            raise ValueError('an empty list of reads cannot be classified')
        read = reads[0]
    else:
        read = reads

    if isinstance(read, (SeqRecord, types.GeneratorType, list)):
        return ReadInput.OBJECT

    if isinstance(read, str):
        if os.path.exists(read):
            # `read` is a filepath
            return ReadInput.FILENAME
        return ReadInput.STRING

    # The original message formatted the undefined name `sequence`; the
    # offending argument is `reads`.
    raise ValueError('{} is not an sequence object or a valid filename'.format(reads))
145 | 
146 | 
147 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/read_mappers/parser.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import pysam
 3 | from pyparsing import Suppress, SkipTo, Word, Regex, Literal, OneOrMore, \
 4 |     Group, LineEnd, CharsNotIn, nums, alphanums, ParseException
 5 | 
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | logger.addHandler(logging.StreamHandler())
 9 | 
10 | FLOAT = Word(nums + '.-').setParseAction(lambda x: float(x[0]))
11 | INT = Word(nums).setParseAction(lambda x: int(x[0]))
12 | WORD = Word(alphanums + '_-%. ')
13 | SPACEDWORD = Word(alphanums+' _')
14 | 
15 | 
class NGMParser(object):
    """
    Parser for the final summary line of an NGM run, for example:

    [MAIN] Done (15778 reads mapped (4.14%), 365184 reads not mapped, 15778 lines written)(elapsed: 73.131973s)
    """

    def __init__(self):
        self.READS_MAPPED = Literal('[MAIN] Done (')
        self.TOTAL_READS = Regex(r'\[MAIN\] Done \(\d+ reads mapped \(\d+\.\d+\%\), ')
        self.MAPPING_TIME = Literal('elapsed: ')
        # Each grammar skips ahead to its marker and keeps the number after it.
        self.rm = Suppress(SkipTo(self.READS_MAPPED)) + Suppress(self.READS_MAPPED) + INT
        # NOTE(review): in the example line above, the integer following
        # "reads mapped (x%), " is the count of reads *not* mapped, so the
        # value reported as `total_reads` looks like the unmapped count -
        # confirm what downstream code expects before renaming or summing.
        self.tr = Suppress(SkipTo(self.TOTAL_READS)) + Suppress(self.TOTAL_READS) + INT
        self.mt = Suppress(SkipTo(self.MAPPING_TIME)) + Suppress(self.MAPPING_TIME) + FLOAT

    def parse(self, stdout):
        """
        Extract the three numbers from the mapper output.

        :param stdout: text captured from the mapper run
        :return: tuple (reads_mapped, total_reads, mapping_time), or None if
            the summary line could not be parsed (the failure is logged)
        """
        try:
            reads_mapped = self.rm.parseString(stdout).asList()[0]
            total_reads = self.tr.parseString(stdout).asList()[0]
            mapping_time = self.mt.parseString(stdout).asList()[0]
        except ParseException as err:
            print(stdout)
            logger.error(err)
            return None
        return reads_mapped, total_reads, mapping_time

    def to_dict(self, file, stdout):
        """
        Bundle the parsed numbers with the opened SAM file.

        :param file: path of the SAM file, opened with pysam in mode "r"
        :param stdout: text captured from the mapper run
        :return: dict with keys 'file', 'reads_mapped', 'total_reads',
            'mapping_time' and 'sam'; the three numbers are None when the
            output could not be parsed
        """
        parsed = self.parse(stdout)
        if parsed is None:
            # parse() signals failure by returning None. Unpacking that used
            # to raise TypeError, which the old `except UnboundLocalError`
            # could not catch.
            reads_mapped = total_reads = mapping_time = None
        else:
            reads_mapped, total_reads, mapping_time = parsed

        # The AlignmentFile is handed to the caller still open.
        samfile = pysam.AlignmentFile(file, "r")
        result = {'file': file,
                  'reads_mapped': reads_mapped,
                  'total_reads': total_reads,
                  'mapping_time': mapping_time,
                  'sam': samfile}
        return result
56 | 
57 | 
class NGMLRParser(object):
    """
    Parser for NGMLR progress and summary output, for example:

        Processed: 75400 (0.00), R/S: 60.15, RL: 7675, Time: 3.00 11.00 10.07, Align: 1.00, 310, 3.04
        Done (77 reads mapped (0.10%), 75323 reads not mapped, 75402 lines written)(elapsed: 20m, 0 r/s)
    """

    def __init__(self):
        self.TOTAL_MAPPED_READS = Literal('Done (')
        self.TOTAL_READS = Literal('Processed: ')
        # Each grammar skips ahead to its marker and keeps the number after it.
        self.tmr = Suppress(SkipTo(self.TOTAL_MAPPED_READS)) + \
            Suppress(self.TOTAL_MAPPED_READS) + FLOAT
        # NOTE(review): SkipTo stops at the first 'Processed: ' in the text;
        # if the mapper prints several progress lines, this may not be the
        # final count - confirm against real output.
        self.tr = Suppress(SkipTo(self.TOTAL_READS)) + Suppress(self.TOTAL_READS) + FLOAT

    def parse(self, stdout):
        """
        Extract the mapped and processed read counts.

        :param stdout: text captured from the mapper run
        :return: tuple (total_mapped_reads, total_reads); an entry is None
            when it could not be parsed (the failure is logged)
        """
        # Pre-set both values so a ParseException no longer ends in an
        # UnboundLocalError at the return statement.
        total_mapped_reads = None
        total_reads = None
        try:
            total_mapped_reads = self.tmr.parseString(stdout).asList()[0]
            total_reads = self.tr.parseString(stdout).asList()[0]
        except ParseException as err:
            logger.error(err)

        return total_mapped_reads, total_reads

    def to_dict(self, file, stdout):
        """
        Bundle the parsed numbers with the opened SAM file.

        :param file: path of the SAM file, opened with pysam in mode "r"
        :param stdout: text captured from the mapper run
        :return: dict with keys 'file', 'reads_mapped', 'total_reads', 'sam'
        """
        total_mapped_reads, total_reads = self.parse(stdout)
        # The AlignmentFile is handed to the caller still open.
        samfile = pysam.AlignmentFile(file, "r")
        result = {'file': file,
                  'reads_mapped': total_mapped_reads,
                  'total_reads': total_reads,
                  'sam': samfile}

        return result
93 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/__init__.py:
--------------------------------------------------------------------------------
1 | from .phyml import Phyml
2 | from .raxml import Raxml
3 | from .iqtree import Iqtree
4 | from .fasttree import Fasttree
5 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/base_treebuilder.py:
--------------------------------------------------------------------------------
  1 | import os, types, itertools
  2 | from abc import ABCMeta, abstractmethod
  3 | from enum import Enum
  4 | from Bio import AlignIO, SeqIO
  5 | from Bio.Align import MultipleSeqAlignment
  6 | from read2tree.utils.seq_utils import is_dna
  7 | 
  8 | from read2tree.wrappers import WrapperError
  9 | from read2tree.wrappers.aligners.base_aligner import identify_input
 10 | 
 11 | import logging
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
 15 | DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
 16 | 
 17 | 
 18 | class TreeBuilder(object):
 19 |     """
 20 |     Base class for wrappers of tree building software
 21 | 
 22 |     The wrapper is written as a callable class.
 23 |     This can hold data (state) to do with the operation it performs, so it can keep results,
 24 |     execution times and other metadata, as well as perform the task.
 25 | 
 26 |     This is a base implementation to be extended. The important parts are
 27 |     __init__ (does the setup) and __call__ (does the work). All
 28 |     else are helper methods.
 29 | 
 30 |     :Example:
 31 | 
 32 |     ::
 33 | 
 34 |         callable_wrapper = ConcreteAligner(aln)
 35 |         result = callable_wrapper()
 36 |         time_taken = callable_wrapper.elapsed_time
 37 |         result_again = callable_wrapper.result
 38 |     """
 39 |     __metaclass__ = ABCMeta
 40 | 
 41 |     def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
 42 |         """
 43 |         ..note::  TODO: this documentation is not correct. it needs to be updateted.
 44 | 
 45 |         Should work the same whether you're working with a Biopython object or a file
 46 |             but the implementation differs, e.g. a Biopython object will need
 47 |             to be written temporarily to disk for the Aligner to work on it.
 48 | 
 49 |         alignment is one of 4 things:
 50 |             a filename
 51 |             a Biopython MSA
 52 |             a list of Seq objects
 53 |             anything else (throw an exception)
 54 | 
 55 |         binary is the alignment's executable file, or None
 56 |         """
 57 | 
 58 |         if alignment is not None:
 59 |             self.input_type = identify_input(alignment)  # Figure out what it is - file or object
 60 |             if datatype == DataType.UNKNOWN:
 61 |                 # dup, input_ = itertools.tee(input_)
 62 |                 self.datatype = guess_datatype(alignment, from_filename=self.input_type == AlignmentInput.FILENAME)
 63 |             else:
 64 |                 self.datatype = datatype
 65 | 
 66 |             self.input = alignment  # store it
 67 |         else:
 68 |             self.input_type = None
 69 |             self.input = None
 70 | 
 71 | 
 72 |         self.elapsed_time = None
 73 |         self.stdout = None
 74 |         self.stderr = None
 75 |         try:
 76 |             self.cli = self._init_cli(binary)
 77 |         except IOError as err:
 78 |             raise WrapperError('Error searching for binary: {}'.format(err))
 79 |             # End setup
 80 | 
 81 |     @abstractmethod
 82 |     def __call__(self, *args, **kwargs):
 83 |         """
 84 |         How to call the underlying aligner
 85 |         """
 86 |         pass
 87 | 
 88 |     @abstractmethod
 89 |     def _init_cli(self, binary):
 90 |         """
 91 |         Set up the command-line interface to the wrapped software
 92 |         :param binary: filename of executable binary file
 93 |         :return: concrete CLI type inheriting from AbstractCLI
 94 |         """
 95 |         pass
 96 | 
 97 | 
 98 | def guess_datatype(alignment, from_filename=False):
 99 |     logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
100 |     if from_filename:
101 |         try:
102 |             alignment = list(SeqIO.parse(alignment, 'fasta'))
103 |         except:
104 |             alignment = list(SeqIO.parse(alignment, 'phylip-relaxed'))
105 |     return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
106 | 
107 | 
def identify_input(alignment):
    """
    Classify the alignment input handed to a tree builder.

    :param alignment: a Biopython MultipleSeqAlignment, generator or list
        (treated as an in-memory object), or a str naming an existing file.
    :return: AlignmentInput.OBJECT or AlignmentInput.FILENAME
    :raises ValueError: if `alignment` is neither of the above
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is an in-memory object
        return AlignmentInput.OBJECT

    if isinstance(alignment, str) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # Previously this raise sat in an `except` that never fired, so
    # unsupported input silently produced None instead of an error.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/fasttree.py:
--------------------------------------------------------------------------------
  1 | # Author: Ivana Pilizota
  2 | # Date: 1 November 2016
  3 | 
  4 | import logging
  5 | import os
  6 | import time
  7 | 
  8 | from Bio import SeqIO
  9 | from pyparsing import ParseException
 10 | import tempfile
 11 | 
 12 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
 13 | from .parsers import FasttreeParser
 14 | 
 15 | from ..abstract_cli import AbstractCLI
 16 | from ..options import OptionSet, StringOption, IntegerOption
 17 | from ...file_utils import TempFile, TempDir
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | logger.addHandler(logging.StreamHandler())
 21 | logger.setLevel(logging.INFO)
 22 | 
 23 | 
 24 | class FasttreeCLI(AbstractCLI):
 25 |     @property
 26 |     def _default_exe(self):
 27 |         return 'FastTree'
 28 | 
 29 | 
 30 | def set_default_dna_options(treebuilder):
 31 |     """
 32 |     Dummy function as sensible default
 33 |     """
 34 |     treebuilder.options = get_default_options()
 35 | 
 36 | 
 37 | def set_default_protein_options(treebuilder):
 38 |     """
 39 |     Dummy function as sensible default
 40 |     """
 41 |     treebuilder.options = get_default_options()
 42 | 
 43 | 
 44 | class Fasttree(TreeBuilder):
 45 | 
 46 |     def __init__(self, alignment, *args, **kwargs):
 47 |         self.options = get_default_options()
 48 |         super(Fasttree, self).__init__(alignment=alignment, *args, **kwargs)
 49 |         if self.input is not None:
 50 |             if self.datatype == DataType.DNA:
 51 |                 set_default_dna_options(self)
 52 |             else:
 53 |                 set_default_protein_options(self)
 54 | 
 55 |     def __call__(self, *args, **kwargs):
 56 |         """
 57 |         Sets up temporary output file location and calls FastTree using _call() function.
 58 |         Writes temporary input file if we're working with SeqIO object
 59 |         Saves the stdout and stderr and returns
 60 |         """
 61 |         start = time.time()  # time the execution
 62 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 63 |             with tempfile.NamedTemporaryFile(mode='wt') as fh:
 64 |                 SeqIO.write(self.input, fh, 'phylip-relaxed') # default interleaved
 65 |                 fh.seek(0)
 66 |                 output, error = self._call(fh.name, *args, **kwargs)
 67 |                 self.result = self._read_result(output, error)  # store result
 68 |         else:
 69 |             filename = os.path.abspath(self.input)
 70 |             output, error = self._call(filename, *args, **kwargs)
 71 |             self.result = self._read_result(output, error)  # store result
 72 | 
 73 |         end = time.time()
 74 |         self.elapsed_time = end - start
 75 |         return self.result["tree"]
 76 |         # End call
 77 | 
 78 |     # Any other accessory methods
 79 |     def _call(self, filename, *args, **kwargs):
 80 |         """
 81 |         Call underlying low level FastTree wrapper.
 82 |         Options are passed via *args and **kwargs
 83 |         [This only covers the simplest automatic
 84 |          case]
 85 |         """
 86 |         #hard code tmp_output as the output name since we don't save it anyway
 87 |         #self.cli('{} -log {log_output} {seqfile} > {tmp_path}'.format(self.command(), tmp_path=os.path.join(tmpd,'tmp_output'), log_output=logfile, seqfile=filename), wait=True)
 88 |         self.cli('{} {seq_file}'.format(self.command(), seq_file=filename), wait=True)
 89 | 
 90 |         return (self.cli.get_stdout(), self.cli.get_stderr())
 91 | 
 92 |     def command(self):
 93 |         return str(self.options)
 94 | 
 95 |     def _read_result(self, stdout, stderr):
 96 |         """
 97 |         Read back the result.
 98 |         """
 99 |         parser = FasttreeParser()
100 | 
101 |         try:
102 |             parser.parse(tree=stdout, other=stderr)
103 |             result = parser.to_dict()
104 |         except IOError as ioerr:
105 |             logger.error('Error reading results')
106 |             result = None
107 |         except ParseException as parseerr:
108 |             logger.error('Other parse error', parseerr)
109 |             result = None
110 | 
111 |         return result
112 | 
113 |     def _init_cli(self, binary):
114 |         return FasttreeCLI(executable=binary)
115 | 
116 | 
def get_default_options():
    """
    Assemble the stock FastTree option set.

    Every option is declared but inactive.

    :return: an OptionSet holding the options listed below, in this order
    """
    nucleotide = StringOption('-nt', active=False)  # treat the alignment as nucleotide (protein otherwise)
    wag_model = StringOption('-wag', active=False)  # WAG model for protein alignments
    gtr_model = StringOption('-gtr', active=False)  # GTR model for nucleotide alignments
    gamma = StringOption('-gamma', active=False)  # FastTree's -gamma switch
    rate_categories = IntegerOption('-cat', 20, active=False)  # number of rate categories of sites
    start_tree = StringOption('-intree', '', active=False)  # starting tree
    fastest = StringOption('-fastest', active=False)  # faster neighbor joining, lower memory (for >50,000 sequences)
    ml_nni_rounds = IntegerOption('-mlnni', 0, active=False)  # rounds of maximum-likelihood NNIs

    defaults = [
        nucleotide,
        wag_model,
        gtr_model,
        gamma,
        rate_categories,
        start_tree,
        fastest,
        ml_nni_rounds,
    ]
    return OptionSet(defaults)
147 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/iqtree.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import logging
  4 | import tempfile
  5 | from pyparsing import ParseException
  6 | 
  7 | from Bio import SeqIO
  8 | from .parsers import IqtreeParser
  9 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
 10 | 
 11 | from ..abstract_cli import AbstractCLI
 12 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
 13 | 
 14 | from ...file_utils import TempFile,TempDir
 15 | 
 16 | logger = logging.getLogger(__name__)
 17 | logger.addHandler(logging.StreamHandler())
 18 | logger.setLevel(logging.INFO)
 19 | 
 20 | 
 21 | class IqtreeCLI(AbstractCLI):
 22 |     @property
 23 |     def _default_exe(self):
 24 |         return 'iqtree'
 25 | 
 26 | 
 27 | def set_default_dna_options(treebuilder):
 28 |     """
 29 |     Dummy function as sensible default
 30 |     """
 31 |     treebuilder.options = get_default_options()
 32 | 
 33 | 
 34 | def set_default_protein_options(treebuilder):
 35 |     """
 36 |     Dummy function as sensible default
 37 |     """
 38 |     treebuilder.options = get_default_options()
 39 | 
 40 | 
 41 | class Iqtree(TreeBuilder):
 42 | 
 43 |     def __init__(self, alignment, *args, **kwargs):
 44 |         self.options = get_default_options()
 45 |         super(Iqtree, self).__init__(alignment=alignment, *args, **kwargs)
 46 |         if self.input is not None:
 47 |             if self.datatype == DataType.DNA:
 48 |                 set_default_dna_options(self)
 49 |             else:
 50 |                 set_default_protein_options(self)
 51 | 
 52 |     def __call__(self, *args, **kwargs):
 53 |         """
 54 |         Sets up temporary output file location and calls iqtree using _call() function.
 55 |         Writes temporary input file if we're working with SeqIO object
 56 |         Saves the stdout and stderr and returns
 57 |         """
 58 |         start = time.time()  # time the execution
 59 |         if "TMPDIR" in os.environ:
 60 |             tmp_output_folder = tempfile.TemporaryDirectory(prefix='iqtree', dir=os.environ.get("TMPDIR"))
 61 |         else:
 62 |             tmp_output_folder = tempfile.TemporaryDirectory(prefix='iqtree_')
 63 |         tmpd = tmp_output_folder.name
 64 |         if self.input_type is AlignmentInput.OBJECT:  # different operation depending on what it is
 65 |             filename = os.path.join(tmpd,'tmp_output.phy')
 66 |             SeqIO.write(self.input, filename, 'phylip-relaxed')  # default interleaved
 67 |             output, error = self._call(filename, tmpd, *args, **kwargs)
 68 |         elif self.input_type is AlignmentInput.FILENAME:
 69 |             filename = self.input
 70 |             output, error = self._call(filename, tmpd, *args, **kwargs)
 71 |         else:
 72 |             output, error = self._call(None, tmpd, *args, **kwargs)
 73 |         self.result = self._read_result(tmpd)  # store result
 74 | 
 75 |         self.stdout = output
 76 |         self.stderr = error
 77 | 
 78 |         end = time.time()
 79 |         self.elapsed_time = end - start
 80 |         return self.result
 81 |         # End call
 82 | 
 83 |     # Any other accessory methods
 84 |     def _call(self, filename, tmpd, *args, **kwargs):
 85 |         """
 86 |         Call underlying low level _iqtree wrapper.
 87 |         Options are passed via *args and **kwargs
 88 |         [This only covers the simplest automatic
 89 |          case]
 90 |         """
 91 |         self.cli('{} -pre {tmp_path} -s {seqfile}'.format(self.command(),
 92 |                                                           tmp_path=os.path.join(tmpd, 'tmp_output'),
 93 |                                                           seqfile=filename),
 94 |                  wait=True)
 95 |         return self.cli.get_stdout(), self.cli.get_stderr()
 96 | 
 97 |     def command(self):
 98 |         return str(self.options)
 99 | 
100 |     def _read_result(self, tmpd):
101 |         """
102 |         Read back the result.
103 |         """
104 |         expected_outfiles = [os.path.join(tmpd, 'tmp_output.treefile')]
105 | 
106 |         parser = IqtreeParser()
107 | 
108 |         try:
109 |             result = parser.to_dict(*expected_outfiles)
110 | 
111 |         except IOError as ioerr:
112 |             logger.error('Error reading results')
113 |             result = None
114 |         except ParseException as parseerr:
115 |             logger.error('Other parse error', parseerr)
116 |             result = None
117 | 
118 |         return result["tree"]
119 | 
120 |     def _init_cli(self, binary):
121 |         return IqtreeCLI(executable=binary)
122 | 
123 | 
def get_default_options():
    """
    Assemble the stock iqtree option set.

    ``-nt 1`` and ``-mem 4G`` are active; the remaining options are declared
    but inactive.

    :return: an OptionSet holding the options listed below, in this order
    """
    threads = IntegerOption('-nt', 1, active=True)  # number of threads
    model = StringOption('-m', '', active=False)  # model for either DNA or AA alignment
    # A sequential-format flag is intentionally left out:
    # FlagOption('-q', False, active=False)
    memory_cap = StringOption('-mem', '4G', active=True)  # limit memory needs to 4G
    seed = IntegerOption('-seed', 12345, active=False)  # seed 12345
    ultrafast_bootstrap = IntegerOption('-bb', 0, active=False)  # ultrafast bootstrap (>=1000)
    sh_alrt = IntegerOption('-alrt', 0, active=False)  # SH-like approximate likelihood ratio test (SH-aLRT)
    bootstrap = IntegerOption('-b', 0, active=False)  # bootstrap + ML tree + consensus tree (>=100)

    defaults = [
        threads,
        model,
        memory_cap,
        seed,
        ultrafast_bootstrap,
        sh_alrt,
        bootstrap,
    ]
    return OptionSet(defaults)
150 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/phyml.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import tempfile
  4 | import logging
  5 | from pyparsing import ParseException
  6 | from Bio import AlignIO, SeqIO
  7 | 
  8 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
  9 | from .parsers import PhymlParser
 10 | 
 11 | from ..abstract_cli import AbstractCLI
 12 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
 13 | 
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | logger.addHandler(logging.StreamHandler())
 17 | logger.setLevel(logging.INFO)
 18 | 
 19 | 
 20 | class PhymlCLI(AbstractCLI):
 21 |     @property
 22 |     def _default_exe(self):
 23 |         return 'phyml'
 24 | 
 25 | 
 26 | def set_default_dna_options(treebuilder):
 27 |     """
 28 |     Dummy function as sensible default
 29 |     """
 30 |     treebuilder.options = get_default_options()
 31 | 
 32 | 
 33 | def set_default_protein_options(treebuilder):
 34 |     """
 35 |     Dummy function as sensible default
 36 |     """
 37 |     treebuilder.options = get_default_options()
 38 | 
 39 | 
 40 | class Phyml(TreeBuilder):
 41 |     """ Phyml tree reconstruction
 42 | 
 43 |     This wrapper can be called to reconstruct a phylogenetic tree
 44 |     using PhyML.
 45 |     """
 46 | 
 47 |     def __init__(self, alignment, *args, **kwargs):
 48 |         """
 49 |         :param alignment: input multiple sequence alignment. This can be either
 50 |             a filename or an biopython SeqRecord collection.
 51 |         """
 52 |         super(Phyml, self).__init__(alignment, *args, **kwargs)
 53 |         self.options = get_default_options()
 54 |         if self.datatype == DataType.DNA:
 55 |             set_default_dna_options(self)
 56 |         else:
 57 |             set_default_protein_options(self)
 58 | 
 59 |     def __call__(self, *args, **kwargs):
 60 |         """
 61 |         Anything to do with calling Mafft should go here.
 62 |         If any extra arguments need to be passed they can
 63 |         be specified (listed as *args and **kwargs for now).
 64 |         """
 65 |         start = time.time()  # time the execution
 66 | 
 67 |         if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
 68 |             with tempfile.NamedTemporaryFile(mode='wt') as fh:
 69 |                 SeqIO.write(self.input, fh, 'phylip-relaxed') # default interleaved
 70 |                 fh.seek(0)
 71 |                 output, error = self._call(fh.name, *args, **kwargs)
 72 |                 self.result = self._read_result(fh.name)  # store result
 73 |         else:
 74 |             path = os.path.dirname(self.input)
 75 |             filename = os.path.basename(self.input)
 76 |             os.chdir(path)  # some operations done because phyml can not deal with large filenames that are caused due to a large path
 77 |             output, error = self._call(filename, *args, **kwargs)
 78 |             self.result = self._read_result(filename)  # store result
 79 | 
 80 |         self.stdout = output
 81 |         self.stderr = error
 82 | 
 83 |         end = time.time()
 84 |         self.elapsed_time = end - start
 85 |         return self.result["tree"]
 86 |         # End call
 87 | 
 88 |     # Any other accessory methods
 89 |     def _call(self, filename, *args, **kwargs):
 90 |         """
 91 |         Call underlying low level _Phyml wrapper.
 92 |         Options are passed via *args and **kwargs
 93 |         [This only covers the simplest automatic
 94 |          case]
 95 |         """
 96 |         self.cli('{} -i {}'.format(self.command(), filename),
 97 |                  wait=True)
 98 |         return self.cli.get_stdout(), self.cli.get_stderr()
 99 | 
100 |     def command(self):
101 |         return str(self.options)
102 | 
103 |     def _read_result(self, output):
104 |         """
105 |         Read back the result.
106 |         """
107 | 
108 |         #TODO: change the output dictionary into a better format
109 |         expected_outfiles = ['{}_phyml_stats'.format(output), '{}_phyml_tree'.format(output)]
110 |         parser = PhymlParser()
111 | 
112 |         # Phyml outputs two outfiles, a stats file and a tree file.
113 |         # Sometimes it appends .txt, sometimes not. Seems to be platform-specific.
114 |         # Here we assume they are without .txt, but if we can't find them, try
115 |         # looking for the .txt onees instead
116 |         try:
117 |             # Check if these are the .txt style outfiles
118 |             if not os.path.exists(expected_outfiles[0]):
119 |                 expected_outfiles = [x + '.txt' for x in expected_outfiles]
120 |             result = parser.to_dict(*expected_outfiles)
121 | 
122 |         except IOError as ioerr:
123 |             logger.error('Error reading results')
124 |             result = None
125 |         except ParseException as parseerr:
126 |             logger.error('Other parse error', parseerr)
127 |             result = None
128 | 
129 |         return result
130 | 
131 |     def _init_cli(self, binary):
132 |         return PhymlCLI(executable=binary)
133 | 
134 | 
def get_default_options():
    """
    Build the default option set for the PhyML wrapper.

    Only the datatype option (``-d aa``) is created active; the model,
    sequential-format, bootstrap and topology-search options are present
    but inactive.
    """
    # Algorithm

    # Datatype: nt or aa
    datatype = StringOption('-d', 'aa', active=True)

    # Substitution model for either a DNA or an AA alignment
    model = StringOption('-m', '', active=False)

    # When set, the input is assumed to be in sequential format
    sequential = FlagOption('-q', False, active=False)

    # Bootstrap value
    bootstrap = IntegerOption('-b', 0, active=False)

    # Tree topology search operation
    topology_search = StringOption('-s', 'NNI', active=False)

    return OptionSet([datatype, model, sequential, bootstrap, topology_search])
154 | 


--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/src/pip-delete-this-directory.txt:
--------------------------------------------------------------------------------
1 | This file is placed here by pip to indicate the source was put
2 | here by pip.
3 | 
4 | Once this package is successfully installed this source code will be
5 | deleted (unless you remove this file).
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | name = 'read2tree'
 4 | 
 5 | __version__ = None
 6 | with open('{:s}/__init__.py'.format(name), 'rt') as fp:
 7 |     for line in fp:
 8 |         if line.startswith('__version__'):
 9 |             exec(line.rstrip())
10 | 
11 | # conda install -c conda-forge biopython numpy Cython ete3 lxml tqdm scipy pyparsing requests natsort pyyaml
12 | # conda install -c bioconda dendropy 
13 | requirements = ["numpy", "biopython", "ete3", "lxml", "tqdm", "scipy", 
14 |                 "pyparsing", "requests", "natsort", "pyyaml", "dendropy", 
15 |                 "pysam", "pyham", "filelock"]
16 | 
17 | with open("README.md", "r", encoding="utf-8") as fh:
18 |     long_description = fh.read()
19 | 
20 | setup(
21 |     name=name,
22 |     version=__version__,
23 |     author='David Dylus and Fritz Sedlaczek',
24 |     author_email='daviddylus@gmail.com',
25 |     description='read2tree allows to build high quality phylogenetic trees '
26 |                 'using reads and a reference set of orthologous groups '
27 |                 '(DNA + Protein).',
28 |     long_description=long_description,
29 |     long_description_content_type="text/markdown",
30 |     url="https://github.com/dessimozlab/read2tree",
31 |     packages=find_packages(".", exclude=["archive"]),
32 |     include_package_data=True,
33 |     package_data={
34 |           'read2tree': ['logging/log.yaml']
35 |       },
36 |     install_requires=requirements,
37 |     classifiers=[
38 |         "Programming Language :: Python :: 3",
39 |         "Environment :: Console",
40 |         "License :: OSI Approved :: MIT License",
41 |     ],
42 |     scripts=['bin/read2tree'],
43 |     python_requires=">=3.5",
44 | )
45 | 


--------------------------------------------------------------------------------
/src/pip-delete-this-directory.txt:
--------------------------------------------------------------------------------
1 | This file is placed here by pip to indicate the source was put
2 | here by pip.
3 | 
4 | Once this package is successfully installed this source code will be
5 | deleted (unless you remove this file).
6 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1001241.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00784 | OMA1001241 | ML06054a | [Mnemiopsis leidyi]
 2 | MFRNPKIIYSWPYGQHFCKYLRRNASFGEVHPLFESLIAGNRAALARAITLSESTLERHKQESAHLMSSVLKHNLQNRSL
 3 | RIGISGPPGAGKSTFIEAIGLHITELNNKLAVLAVDPSSTRSGGSLLADKTRMQQLSVEKLAYIRPSPNRGHLGGVARAT
 4 | NAAIQLCEAGGYNVIIVETVGAGQSEIAVANMTDIFVLLVPPGSGDELQGIKKGIVEVADMILVTKADGNLKTAARLVKT
 5 | EYSRALRLLRNHDDTSWKPFVQTVSSISGKGISDAWSDMLEFHQEMISTGKYQDRRKKQRVTWLWDHVQDELLEHLRKDT
 6 | LNAKFQEKLEADVRNGVILPSTAAQKLLNLFLKGKDNIS
 7 | 
 8 | >HUMAN77595 | OMA1001241 | Q495G5 | [Homo sapiens]
 9 | MPMLLPHPHQHFLKGLLRAPFRCYHFIFHSSTHLGSGIPCAQPFNSLGLHCTKWMLLSDGLKRKLCVQTTLKDHTEGLSD
10 | KEQRFVDKLYTGLIQGQRACLAEAITLVESTHSRKKELAQVLLQKVLLYHREQEQSNKGKPLAFRVGLSGPPGAGKSTFI
11 | EYFGKMLTERGHKLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGAGYDIIL
12 | IETVGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVAVTKSDGDLIVPARRIQAEYVSALKLLRKRSQVW
13 | KPKVIRISARSGEGISEMWDKMKDFQDLMLASGELTAKRRKQQKVWMWNLIQESVLEHFRTHPTVREQIPLLEQKVLIGA
14 | LSPGLAADFLLKALKAETNKIHPV
15 | 
16 | >RATNO14529 | OMA1001241 | D3ZNY3 | [Rattus norvegicus]
17 | MTIPTLLLSPYRRLLTRLSRVPSPQLLHSSLPTLHPRDALPNSFGHHCSKRVLLSDGFRRTLCIRATLKDHTEGLSDKEQ
18 | RFVDRLYMGLVQGQRACLAEAITLVESTHTRKKELAQVLLQRVLAHQRERELQNHGKPFTFRVGLSGPPGAGKSTFIECF
19 | GKMLTERGHRLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTSGTLGGVTRTTNEAIVLCEGGGYDIILIET
20 | VGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVVITKSDGDLVVPARRIQAEYVSALKLLRRRSEVWRPK
21 | VIRISARSGEGITEMWDIMREFQHRMLASGELAAKRQTQHKVWMWNLIQENVLEHFKTHPSIREQIPLMEREVLSGALSP
22 | GRAADLLLKAFKSRH
23 | 
24 | >GORGO31007 | OMA1001241 | G3QJC8 | [Gorilla gorilla gorilla]
25 | MPMLLPHPHQHFLKGLLRAPFRCYHFIFHSSTHLGSGIPCAQPFNSLGLHCTKWMLLSDGLKRKLCVQTTLKDHTEGLSD
26 | KEQRFVDKLYTGLIQGQRACLAEAITLVESTHSRKKELAQVLLQKVLLYHREQEQSNKGKPLAFRVGLSGPPGAGKSTFI
27 | EYFGKMLTERGHKLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGAGYDIIL
28 | IETVGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVAVTKSDGDLIVPARRIQAEYVSALKLLRKRSQVW
29 | KPKVIRISARSGEGISEMWDKMKDFQDLMLASGELTAKRRKQQKVWMWNLIQESVLEHFRTHPTVREQIPLLEQKVLIGA
30 | LSPGLAADFLLKTNKIHPV
31 | 
32 | >XENLA00784 | OMA1001241 | XELAEV_18005522mg | [Xenopus laevis]
33 | MQGITLCCIKTIAHPVSRYFTRNIVSLVKPAQSLGTVSESCKRKTDSFIKLFRTRLCISAVTHQDADILTEKEKRLLNNL
34 | YTGLIRGQRACLAEAITLVESTHSRKREMAQVLLHMVLSHHREQEKLNSGKPLAFRVGLSGPPGAGKSTFIEIFGKMLTE
35 | EGHKVAVLAVDPSSSTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGSGYNIILVETVGVGQS
36 | EFAVADMVDMFVLLLPPAGGDELQVMRISARTGEGIQELWNKLLEFQSNMLTSGELIGKRRSQQRVWMWNLIQENVLLYF
37 | RNHPAVKDQIPVLEERVRTGTLSPGLAADMLLKAFSKSS
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1008242.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00922 | OMA1008242 | ML11532a | [Mnemiopsis leidyi]
 2 | MTDFDKLPSFKALEILAEKAKSVQLKDLFANDPNRFSKYSQAIEIDELKLLVDFSKNKIDEDIFGELLKLVKDAQVEEMR
 3 | DKMFKGEPINFTEQRAVLHIALRNRSNNPILVDGQDVTPKVNQVLEKMKIFADNLRNGTWKGVTGKAITDVVNIGIGGSD
 4 | LGPLMVTEALKSYRGNGPDVHFVSNIDGTHIATVLEKVNFESTLFIIASKTFGTLETLTNARTAREWFIKKSGDPSGVAK
 5 | HFIALSTNAKLVSEFGIDTANMFEFWDWVGGRYSLWSAIGMSIMCHIGSDNFIKLLEGAHAMDNHFTSAPVEKNIPIILA
 6 | VLGVWYNNFLGAQTHALLPYDQYMHRFAAYFQQGDMESNGKGVSREGTRVKYSTGPIVWGEPGTNGQHAFYQLIHQGTKL
 7 | IPCDFIMPVQSLNPIGDHHEILTANFLAQTAALMTGRGNEEARKELSSMSAEDQDRLSIYKEFTGDRPTNSILFTKLTPA
 8 | MLGALIVMYEHKIFVQGVLWNINSFDQMGVELGKKLALKIQPLLKDDNNVDSEDSSTNGLINFIKANRK
 9 | 
10 | >HUMAN42527 | OMA1008242 | G6PI_HUMAN | [Homo sapiens]
11 | MAALTRDPQFQKLQQWYREHRSELNLRRLFDANKDRFNHFSLTLNTNHGHILVDYSKNLVTEDVMRMLVDLAKSRGVEAA
12 | RERMFNGEKINYTEGRAVLHVALRNRSNTPILVDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYTGKTITDVINIGIGGS
13 | DLGPLMVTEALKPYSSGGPRVWYVSNIDGTHIAKTLAQLNPESSLFIIASKTFTTQETITNAETAKEWFLQAAKDPSAVA
14 | KHFVALSTNTTKVKEFGIDPQNMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEQLLSGAHWMDQHFRTTPLEKNAPVLL
15 | ALLGIWYINCFGCETHAMLPYDQYLHRFAAYFQQGDMESNGKYITKSGTRVDHQTGPIVWGEPGTNGQHAFYQLIHQGTK
16 | MIPCDFLIPVQTQHPIRKGLHHKILLANFLAQTEALMRGKSTEEARKELQAAGKSPEDLERLLPHKVFEGNRPTNSIVFT
17 | KLTPFMLGALVAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSAQVTSHDASTNGLINFIKQQREARVQ
18 | 
19 | >RATNO16818 | OMA1008242 | G6PI_RAT | [Rattus norvegicus]
20 | MAALTRNPEFQKLLEWHRANSANLKLRELFEADPERFNHFSLNLNTNHGHILLDYSKNLVNKEVLHMLVDLAKSRGVEAA
21 | RDNMFSGLKINSTEDRAVLHVALRNRSNRSIMMDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYTGKAITDIINIGIGGS
22 | DLGPLMVTEALKPYSKGGPRVWFVSNIDGTHIAKTLANLNPESSLFIIASKTFTTQETITNAETAKEWFLQAAKDPSAVA
23 | KHFVALSTNTDKVKEFGIDPKNMFEFWDWVGGRYSLWSAIGLSIALHVGFDHFEQLLSGAHWMDQHFMKTPLDKNAPVLL
24 | ALLGIWYINFYGCETHAMLPYDQYMHRFAAYFQQGDMESNGKYITKSGARVDYQTGPIVWGEPGTNGQHAFYQLIHQGTK
25 | MIPCDFLIPVQTQHPIRNGLHHKILLANFLAQTEALMKGKSPEEARKELQAAGKSPEELEKLLPHKVFEGNRPTNSIVFT
26 | KLTPFILGALIAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSSAVTSHDSSTNGLIGFIKLQRDTKID
27 | 
28 | >GORGO15800 | OMA1008242 | A0A2I2YE48 | [Gorilla gorilla gorilla]
29 | TSGQRPAKRRRKSPAMAALTRDPQFQKLQQWYREHGSELNLRRLFDANKDRFNHFSLTLNTNHGHILVDYSKNLVTEDVM
30 | RMLVDLAKSRGVEAARERMFNGEKINYTEGRAVLHVALRNRSNTPILVDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYT
31 | GKTITDVINIGIGGSDLGPLMVTEALKPYSSGGPRVWYVSNIDGTHIAKTLAQLNPESSLFIIASKTFTTQETITNAETA
32 | KEWFLQAAKDPSAVAKHFVALSTNTTKVKEFGIDPQNMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEQLLSGAHWMDQ
33 | HFRTTPLEKNAPVLLALLGIWYINCFGCETHAMLPYDQYLHRFAAYFQQGDMESNGKYITKSGTRVDHQTGPIVWGEPGT
34 | NGQHAFYQLIHQGTKMIPCDFLIPVQTQHPIRKGLHHKILLANFLAQTEALMRGKSTEEARKELQAAGKSPEDLERLLPH
35 | KVFEGNRPTNSIVFTKLTPFMLGALVAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSAQVTSHDASTNGL
36 | INFIKQQREARVQ
37 | 
38 | >XENLA17790 | OMA1008242 | A0A1L8GL32 | [Xenopus laevis]
39 | MALSCDPVYQKLSQWYEAHHGSLNMRQMFEADKDRFSKFSKKLATDDGDILLDYSKNLVNEEVLKLLIELAHSRGVESAR
40 | QKMFSAEKINFTENRAVLHIALRNRSNTPITLEGKDVMPEVNAVLEKMKAFCQKVRSGDWKGYTGKAITDVINIGIGGSD
41 | LGPLMVTESLKPYSKGGPRVWFVSNIDGTHIAKTLAELNPETSLFIIASKTFTTQETITNAETAKEWFLTSAKDASAVAK
42 | HFVALSTNAPKVKDFGIDTANMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEKLLAGAHWMDNHFNKTPLENNVPVLLA
43 | MLGIWYTNFYGCETHALLPYDQYMHRFAAYFQQGDMESNGKYITKTGARVNYSTGPVVWGEPGTNGQHAFYQLIHQGTRK
44 | IPCDFLIPAQTQHPIRNGLHHKILLSNFLAQTEALMKGKSTEEAKKELQASGLTGDALEKLLPHKVFEGNRPTNSIVFTK
45 | LNPFILGALIAMYEHKIFVQGVVWDINSYDQWGVELGKQLAKKIEPELESDATITSHDSSTNGLIDFIKKHRG
46 | 
47 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1065415.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00913 | OMA1065415 | ML14561a | [Mnemiopsis leidyi]
 2 | MLRDPETVHPLDECKTWPEIRDKLRLWRKENVRCSDQIVELGEYALKHYQTNLGREKWAVFEQVCVAALDLCPAKMKLVN
 3 | TCIKELAEQFPSSLRVSMLEGLKYEYLKKWDDALEMYEDMIEYEPTFPAPYKRKVAILKAQNKISDAVNDLNRYLNTFSC
 4 | DHESWLELSDIYISNQNYKQALFCVEELLLQYPHNHLYHQRYADILFTIGGKDNLELSCKYYCKAAELNPGNVRALFGIQ
 5 | LASSTLSSIGKLSSKAKSDNQSLAAWASDMIEDFYKSQKTSKNLIIEVAGVLDKLSLK
 6 | 
 7 | >HUMAN95181 | OMA1065415 | EMC2_HUMAN | [Homo sapiens]
 8 | MAKVSELYDVTWEEMRDKMRKWREENSRNSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELR
 9 | RQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWHE
10 | LAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHIA
11 | SNPKASAKTKKDNMKYASWAASQINRAYQFAGRSKKETKYSLKAVEDMLETLQITQS
12 | 
13 | >RATNO39215 | OMA1065415 | EMC2_RAT | [Rattus norvegicus]
14 | MAKVTERYDVTWEEMRDKMRKWREENSRNSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELR
15 | RQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWHE
16 | LAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHIA
17 | SNPKASAKMKKDNIRYAGWAANQINRAYQFAGRSKKETKSSLKAVEDMLETLQITQS
18 | 
19 | >GORGO40150 | OMA1065415 | G3S2S4 | [Gorilla gorilla gorilla]
20 | MKYSSSHTLYCLKEEMRDKMRKWREENSRNSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQEL
21 | RRQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWH
22 | ELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHI
23 | ASNPKASAKTKKDNMKYASWAASQINRAYQFAGRSKKETKYSLKAVEDMLETLQITQS
24 | 
25 | >XENLA27199 | OMA1065415 | EMC2A_XENLA | [Xenopus laevis]
26 | MSKVSDLFDVTWEDMRDKMKTWREENYRNSEHVIEVGEELINEHASKLGDDIWIIYEQVMIAALDCGRDDIAMSCLQELR
27 | RQFPGSHRVKRLTGLRFEAMERYDDALQIYDRILQDDPTNTAARKRKIAIRKAQGRNSEAIRELNEYLEQFVGDQEAWHE
28 | LAELYINELDYAKAAFCLEELILTNPHNHFYYQQFAEVKYTQGGLENLELSRKYFSQALKLNNHNMRALFGLYISSVHIA
29 | SNPKASAKMKKDNVKYATWAASQIKKAYQLAGRTMTDTQTSLKAVEDMLETLQITQS
30 | 
31 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1121053.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00419 | OMA1121053 | ML26071a | [Mnemiopsis leidyi]
 2 | MEKAVLLAALLIATAGAASVQVSDPAKCTLCQAVVTELKVVMEDKDTKDFLAVLQTFICENVPIEDCNNWVSGELAQLDS
 3 | LVEGLDPNQACSSLALCAVHTSPLLSSIQCDFCEFLGDEVVKRVLTNATIDEVVTAAETICSELPFGSNECNALVKEYGH
 4 | YYLELLVGSIDVAQLCSEVGLCSEQVREMVLNSRLFQILQRGLKDDEGCKACVDGMDVIKEVLSSKDTLDLLHIAVHEIC
 5 | GLVSVTGCELIADTALDQIIEKLLPMFVPETVCQQIGACPALTAQDVFSPATVGDDSPLCTGCHDLLGEVKKVANDPETK
 6 | QINKDLAPVLCEVLSIPFCQSLISKFLEGALEKAQNLDVDETCVSLKACEAADEVVENWKDTCSECAMIADLILKELQDP
 7 | SVQQEIESVVDELCSVLPISDCKETLHSYLVMIESLIAGMNGKTLCGYIGLCSSKMSPMKKATGVTEITKLDFTPSKVGD
 8 | TCSECTMIAGEVISLLENGTIDSLIKEAISELCTVLPISDCEATIDGYFDEIVALLKNLDGKTLCSLVGLC
 9 | 
10 | >HUMAN01568 | OMA1121053 | ENSG00000197746.14 | [Homo sapiens]
11 | MYALFLLASLLGAALAGPVLGLKECTRGSAVWCQNVKTASDCGAVKHCLQTVWNKPTVKSLPCDICKDVVTAAGDMLKDN
12 | ATEEEILVYLEKTCDWLPKPNMSASCKEIVDSYLPVILDIIKGEMSRPGEVCSALNLCESLQKHLAELNHQKQLESNKIP
13 | ELDMTEVVAPFMANIPLLLYPQDGPRSKPQPKDNGDVCQDCIQMVTDIQTAVRTNSTFVQALVEHVKEECDRLGPGMADI
14 | CKNYISQYSEIAIQMMMHMSLQQPKEICALVGFCDEVKEMPMQTLVPAKVASKNVIPALELVEPIKKHEVPAKSDVYCEV
15 | CEFLVKEVTKLIDNNKTEKEILDAFDKMCSKLPKSLSEECQEVVDTYGSSILSILLEEVSPELVCSMLHLCSGTRLPALT
16 | VHVTQPKDGGFCEVCKKLVGYLDRNLEKNSTKQEILAALEKGCSFLPDPYQKQCDQFVAEYEPVLIEILVEVMDPSFVCL
17 | KIGACPSAHKPLLGTEKCIWGPSYWCQNTETAAQCNAVEHCKRHVWN
18 | 
19 | >RATNO22029 | OMA1121053 | A0A8I6ASQ4 | [Rattus norvegicus]
20 | MYALALLASLLVTALTSPVQDPKICSGGSAVVCRDVKTAVDCRAVKHCQQMVWSKPTAKSLPCDICKTVVTEAGNLLKDN
21 | ATEEEILHYLEKTCAWIHDSSLSASCKEVVDSYLPVILDMIKGEMSNPGEVCSALNLCQSLQEYLAEQNQRQLESNKIPE
22 | VDLARVVAPFMSNIPLLLYPQDRPRSQPQPKANEDVCQDCMKLVTDIQTAVRTNSSFVQGLVDHVKEDCDRLGPGVSDIC
23 | KNYVDQYSEVAVQMMMHMQDQQPKEICVMVGFCDEVKRVPMRTLVPATEAIKNILPALELTDPYEDVIQAQNVIFCQVCQ
24 | LVMRKLSELIINNATEELLIKGLSKACSLLPAPASTKCQEVLVTFGPSLLDVLMHEVNPNFLCGVISLCSANPNLVGTLE
25 | QPAAAIVSALPKEPAPPKQPEEPKQSALRAHVPPQKNGGFCEVCKKLVIYLEHNLEKNSTKEEILAALEKGCSFLPDPYQ
26 | KQCDEFVAEYEPLLLEILVEVMDPSFVCSKIGVCPSAYKLLLGTEKCVWGPGYWCQNMETAARCNAVDHCKRHVWN
27 | 
28 | >GORGO01692 | OMA1121053 | G3S0G3 | [Gorilla gorilla gorilla]
29 | MYALFLLASLLGAALAGPVLGLKECTRGSAVWCQNVKTASDCGAVKHCLQTVWNKPTVKSLPCDICKDVVTAAGDMLKDN
30 | ATEEEILVYLEKTCDWLPKPNMSASCKEIVDSYLPVILDIIKGEMSRPGEVCSALNLCESLQKHLAELNHQKQLESNKIP
31 | ELDMTEVVAPFMANIPLLLYPQDGPRSKPQPKDNGDVCQDCIQMVTDIQTAVRTNSTFVQALVEHVKEECDRLGPGMADI
32 | CKNYISQYSEIAIQMMMHMQPKEICALVGFCDEVKEMPMQTLVPAKVASKNVIPALELVEPIKKHEVPAKSDVYCEVCEF
33 | LVKEVTKLIDNNKTEKEILDAFDKMCSKLPKSLSEECQEVVDTYGSSILSILLEEVSPELVCSMLHLCSGTRLPALTVHV
34 | TQPKDGGFCEVCKKLVGYLDRNLEKNSTKQEILAALEKGCSFLPDPYQKQCDQFVAEYEPVLIEILVEVMDPSFVCLKIG
35 | ACPSAHKPLLGTEKCVWGPSYWCQNTETAAQCNAVEHCKRHVWN
36 | 
37 | >XENLA29771 | OMA1121053 | XELAEV_18034910mg | [Xenopus laevis]
38 | MKKFAVLVCALAVVAATPLFGTEQCAKGPEVWCENVRTASQCGAVKHCQQNVWNKPTVKSMPCDFCKEVVTVLGNYLKDN
39 | ITQDEIKQYLNKVCDFIPDPGLASTCKQEVSDYFTIVLNLLEQELSNPGVLCSSLGLCTSLQRHLASLKQPTQLLTNEIP
40 | DVDAAKLVYPYIVNIPQLLYPQEKTLKEPKTGDICNDCTKLVSDVQDALRSNSSFSKKLVDHFLQECNLLDPAIAEMCKS
41 | YINQYSDIAIQVLLQMQPKQLCGMAGFCDQEKSTPLQNIIPAKSLIPAVKVQPAVKITKNPLPGNNVLCEVCELMVSQLE
42 | KLLDNNRTRENIKHGLEKVCKLLPSQYTQKCEDMIEEYSDALIELLEQEANPQAICTALGYCSGSKNLKIVKISAEKAAA
43 | GDYCAVCKMLMRYVDELLEKNATEIRIKAFLGRICNFLPDSMQNECSALVNEYEPLFIQLLLEALDPSFICIKVNLCQNK
44 | KVLLGTEKCMWGPSYWCKDMETAANCNALEHCRRHVWN
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1125645.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00647 | OMA1125645 | ML13582a | [Mnemiopsis leidyi]
 2 | MVFYFKTVVNGREYMIYMGRDKMENEDLLRWGFPEDVWFHVDKLSSAHVYLRLNKGEGVADIPKELVDQCCQLVKANSIQ
 3 | GCKLANVDIVYTPYPNLKKTGDMVAGQVGFHKNKKVVKVNVEKNNEVWKKLEKTREEREVDLQEERNRREREEQEELKKQ
 4 | KKLQREMEKLQIEKEKKEREMKSYKLMNADPEKCTSNQFDNESDAERELVDDFM
 5 | 
 6 | >HUMAN93157 | OMA1125645 | CCD25_HUMAN | [Homo sapiens]
 7 | MVFYFTSSSVNSSAYTIYMGKDKYENEDLIKHGWPEDIWFHVDKLSSAHVYLRLHKGENIEDIPKEVLMDCAHLVKANSI
 8 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKVERFPDLAAEKECRDREERNEKK
 9 | AQIQEMKKREKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM
10 | 
11 | >RATNO10219 | OMA1125645 | D4AAU6 | [Rattus norvegicus]
12 | MVFYFTSSSVNSSTYTIYMGKDKYENEDLIKYGWPEDVWFHVDKLSSAHVYLRLQKGEKIEDIPKEVLMDCAHLVKANSI
13 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKLEKFPDLAAEKEGRDREERNEKK
14 | AQIQEMKRKEKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM
15 | 
16 | >GORGO40924 | OMA1125645 | G3R2K5 | [Gorilla gorilla gorilla]
17 | MVFYFTSSSVNSSAYTIYMGKDKYENEDLIKHGWPEDIWFHVDKLSSAHVYLRLHKGENIEDIPKEVLMDCAHLVKANSI
18 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKVERFPDLAAEKECRDREERNEKK
19 | AQIQEMKKREKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM
20 | 
21 | >XENLA23340 | OMA1125645 | Q7T0Y7 | [Xenopus laevis]
22 | MVFYFTSDVISPAYTIYMGKDKYENEDLIKYGWPEDIWFHVDKLSSAHVYLRLQKGQTIEDIPKEVLLDCVQLVKANSIQ
23 | GCKMNNLNVVYTPWANLKKTADMDVGQIGFYRQKDVKTMSVEKVNKIVNRLEKTKDERFPDLAAEKEARDREERNEKKAQ
24 | IQEIKKKEKDEMKKKKEMEELRSYSSLMKSENMSSNQDGNDSDDFM
25 | 
26 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1133018.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00906 | OMA1133018 | ML18772a | [Mnemiopsis leidyi]
 2 | MTITSERRDQVLLGPLPPSFLRLETRTTDGETVTTVTADPAVGRQVELPPQGAGAPNNGAGAPNNGGGAPNNGGQPRPPP
 3 | QHYTAHPVQPYVPQAGLSITIAQAVLNKSYTLIGSMDPYVRLKVGHNTYETFTHAGADKTPCWNKVYHCPLPNTHSVRTV
 4 | SVEIFDEKALTDDQRIAYAKISVPQSVFEGHTLDEWFPLSGKLGEAKEGSINLIISYTTMPMITLPYQRIHYPYVGAAPQ
 5 | HFTPRPPPQISEADVTSIKDMFPAVDKEVIRTVLESKHGNVESAVAAILQIMEGEQGGQQ
 6 | 
 7 | >HUMAN03808 | OMA1133018 | TOLIP_HUMAN | [Homo sapiens]
 8 | MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYGGAVGTVGRLNITVVQAKLAKNYGMTRMDPYCRLR
 9 | LGYAVYETPTAHNGAKNPRWNKVIHCTVPPGVDSFYLEIFDERAFSMDDRIAWTHITIPESLRQGKVEDKWYSLSGRQGD
10 | DKEGMINLVMSYALLPAAMVMPPQPVVLMPTVYQQGVGYVPITGMPAVCSPGMVPVALPPAAVNAQPRCSEEDLKAIQDM
11 | FPNMDQEVIRSVLEAQRGNKDAAINSLLQMGEEP
12 | 
13 | >RATNO18182 | OMA1133018 | TOLIP_RAT | [Rattus norvegicus]
14 | MATTVSTQRGPVYIGELPQDFLRITPTQQQQQIQLDAQAAQQLQYGGAVGTVGRLSITVVQAKLAKNYGMTRMDPYCRLR
15 | LGYAVYETPTAHNGAKNPRWNKVIQCTVPPGVDSFYLEIFDERAFSMDDRIAWTHITIPESLKQGQVEDEWYSLSGRQGD
16 | DKEGMINLVMSYTSLPAAMMMPPQPVVLMPTVYQQGVGYVPIAGMPAVCSPGMVPMAMPPPAVAPQPRCNEEDLKAIQDM
17 | FPNMDREVIRSVLEAQRGNKDAAINSLLQMGEES
18 | 
19 | >GORGO04229 | OMA1133018 | G3R9G9 | [Gorilla gorilla gorilla]
20 | MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYGGAVGTVGRLNITVVQAKLAKNYGMTRMDPYCRLR
21 | LGYAVYETPTAHNGAKNPRWNKVIHCTVPPGVDSFYLEIFDELLPAAMVMPPQPVVLMPTVYQQGVGYVPITGMPAVCSP
22 | GMVPVALPPAAVNAQPRCSEEDLKAIQDMFPNMDQEVIRSVLEAQRGNKDAAINSLLQMGEEP
23 | 
24 | >XENLA19417 | OMA1133018 | TOIPA_XENLA | [Xenopus laevis]
25 | MATSISTQRGQVFIGELPQDFLRIAPTQQQQQIQLDAQAAQQLQYSGVMGTMGRLSITVVQAKLAKNYGMTRMDPYCRIR
26 | LGYAVYETPTAHNGAKNPRWNKVIQCTIPPGVDSFYLEIFDERAFSMDDRIAWTHITIPETLKEGKHVDEWFSLSGRQGD
27 | DKEGMINLVMSYTSVPAMMPAQPVVLMPTVYQQGVGYVPIAGPVYNPGMPMIASPPAVNPQHQTQEVDIQSIKDMFPTID
28 | PEVIRSVLEAQGGNRDAAINSLLQMVEDS
29 | 
30 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1151179.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00706 | OMA1151179 | ML10761a | [Mnemiopsis leidyi]
 2 | MALSRFYIPCHALVKLAPQTRTAVTSVVLERLEQKKKEALLGGGQHRIDAQHKKGKLTARERIEVLLDEGSFVEWDQLVE
 3 | HDCIDWGMDKTHFAGDGVVTGTGTVNGRQVFLFSQDFTVFGGSLSAAYASKICKIMDHAEMVGAPLLGLNDSGGARIQEG
 4 | VASLGGYGDIFLRNVLLSGVVPQISLIMGPCAGGAVYSPAITDFTFMVKGTSHMFITGPDVVKQVTNETVTQEELGGSAA
 5 | HCSTSGCAAGACENDVHLLLQTRRLLEFLPSNNQEKSPVRPCSDPAEREIPALDNIVPNSPISPYDIKHIVEFLVDEGDF
 6 | FEIMPDYAKNIVVGFARMNGETVGIVGNQPLVAAGCLDINASVKGARFVRFCDSFNIPLIILEDVPGFLPGTQQEHGGII
 7 | KHGAKLLYALAEATVPKLTVITRKAYGGAYVVMNSKHIRADVNYAWPSSEIAVMGSKGAVAIICRGDPDLAKREEEYIDT
 8 | FANPFPTAKKGFVDDVIMPRDTRKRLCADLKWLRNKSQKNPWKKHGNIPL
 9 | 
10 | >HUMAN72443 | OMA1151179 | PCCB_HUMAN | [Homo sapiens]
11 | MAAALRVAAVGARLSVLASGLRAAVRSLCSQATSVNERIENKRRTALLGGGQRRIDAQHKRGKLTARERISLLLDPGSFV
12 | ESDMFVEHRCADFGMAADKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGLND
13 | SGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNEDVT
14 | QEELGGAKTHTTMSGVAHRAFENDVDALCNLRDFFNYLPLSSQDPAPVRECHDPSDRLVPELDTIVPLESTKAYNMVDII
15 | HSVVDEREFFEIMPNYAKNIIVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPLITFVDVPGFLPG
16 | TAQEYGGIIRHGAKLLYAFAEATVPKVTVITRKAYGGAYDVMSSKHLCGDTNYAWPTAEIAVMGAKGAVEIIFKGHENVE
17 | AAQAEYIEKFANPFPAAVRGFVDDIIQPSSTRARICCDLDVLASKKVQRPWRKHANIPL
18 | 
19 | >RATNO41763 | OMA1151179 | Q68FZ8 | [Rattus norvegicus]
20 | MAAAIRIRAMAAGTRLRVLNCGLRTTIRSLCSQPVSVNERIENKRHAALLGGGQRRIDAQHKRGKLTARERISLLLDPGS
21 | FVESDMFVEHRCADFGMAAEKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGL
22 | NDSGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPEVVKSVTNED
23 | VTQEQLGGAKTHTTVSGVAHRAFDNDVDALCNLREFFNFLPLSNQDPAPIRECHDPSDRLVPELDTVVPLESSKAYNMLD
24 | IIHAVIDEREFFEIMPNYAKNIVIGFARMNGRTVGIVGNQPNVASGCLDINSSVKGARFVRFCDAFSIPLITFVDVPGFL
25 | PGTAQEYGGIIRHGAKLLYAFAEATVPKITVITRKAYGGAYDVMSSKHLLGDTNYAWPTAEIAVMGAKGAVEIIFKGHQD
26 | VEAAQAEYVEKFANPFPAAVRGFVDDIIQPSSTRARICCDLEVLASKKVHRPWRKHANVPL
27 | 
28 | >GORGO29347 | OMA1151179 | A0A2I2YT25 | [Gorilla gorilla gorilla]
29 | MAAALRVAAAGARLSVLASGLRAAVRSLCSQATSVNERIENKRRTALLGGGQRRIDAQHKRGKLTARERISLLLDPGSFV
30 | ESDMFVEHRCADFGMAADKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGLND
31 | SGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNEDVT
32 | QEELGGAKTHTTVSGVAHRAFENDVDALCNLRDFFNYLPLSSQDPAPVRECHDPSDRLVPELDTIVPLESTKAYNMVDII
33 | HSVVDEHEFFEIMPNYAKNIIVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPLITFVDVPGFLPG
34 | TAQEYGGIIRHGAKLLYAFAEATVPKVTVITRKAYGGAYDVMSSKHLCGDTNYAWPTAEIAVMGAKGAVEIIFKGHENVE
35 | AAQAEYIEKFANPFPAAVRGFVDDIIQPSSTRARICCDLDVLASKKVRVQEVFHQVVQSGHRDGRGAVRW
36 | 
37 | >XENLA22949 | OMA1151179 | Q52L44 | [Xenopus laevis]
38 | MAAVRSVSRFLAAVRGSGSVCGPRGLLRAYSVSHLSVPERIEKKRREALLGGGEQRIEAQHRRGKLTARERISLLLDPGS
39 | FAEYDMFVEHRCSDFGMEEDRNKYPGDSVVTGQGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAVMVGAPVIGL
40 | NDSGGARIQEGVESLAGYADIFLRNVLSSGVVPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNED
41 | VTQEDLGGAKTHTALSGVAHRAFENDIDALLNLREFFNFLPLSNKDSAPVRKCHDPSDRLVPGLDTVVPMESTKAYDMLD
42 | IIHSIIDEREFFEIMPNYAKNIVVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPIITFVDVPGFL
43 | PGTAQEYGGIIRHGAKLLFAFAEATVPKITVITRKAYGGAYDVMSSKHLRGDVNYAWPTAEVAVMGAKGAVQIIFRGKQN
44 | QAEAEEEYVEKFANPFPAAVRGFVDDIIQPSKTRMRICRDLEVLASKQQVNPWKKHANIPL
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1163384.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00619 | OMA1163384 | ML08751a | [Mnemiopsis leidyi]
 2 | MEKSELSKAWSIDPRERIKALSELASGSVSISHGIPIKRYYRSGVELERMAKVYEDENNLEKAFFLYMKYTTLFVECLPK
 3 | HPDYKSPQTSNERKVVRSKLKTIFDRAEFIKNNLTITYAGQHKKWIMEEQIRKAEAEQKRLEEEARIEAEAVAAKRAEME
 4 | RRETELALELEQIEKQLEETRTIAVKASEKPVVVPPPMHRQATYPSLPVETPAKQSNTSSFNEAFNMRSPATPLAAPSLS
 5 | LPSAPSAPSAHIQIISDTGPFPTVDRSTKPAAPQIDRSTKPATLAASDMFAEMMTQDSQRAVIIPSSLPDKFLSVCLDNT
 6 | QKNVETCGILAAKLTANNFTITHVILPKQRGTPDSCQTLAEEELFEYQDKLDLITVGWIHTHPTQSAFLSSVDLHTHCSY
 7 | QLMLREAIAIVCAPKHNRLFTVILKELLEDSRSFNGDPPHLTLNHGFPVIRLSRPKKIA
 8 | 
 9 | >HUMAN63278 | OMA1163384 | STABP_HUMAN | [Homo sapiens]
10 | MSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY
11 | KSAVIPEKKDTVKKLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQQELEKEKQRVAQQKQQQLEQE
12 | QFHAFEEMIRNQELEKERLKIVQEFGKVDPGLGGPLVPDLEKPSLDVFPTLTVSSIQPSDCHTTVRPAKPPVVDRSLKPG
13 | ALSNSESIPTIDGLRHVVVPGRLCPQFLQLASANTARGVETCGILCGKLMRNEFTITHVLIPKQSAGSDYCNTENEEELF
14 | LIQDQQGLITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESVAIVCSPKFQETGFFKLTDHGLEEISSCRQKGFHPHSK
15 | DPPLFCSCSHVTVVDRAVTITDLR
16 | 
17 | >RATNO29630 | OMA1163384 | STABP_RAT | [Rattus norvegicus]
18 | MSDHADVSLPPQDRVRILSQLGSAVELNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY
19 | KSAIIPEKKDAVKKLKNVAFPKAEELKTELLKRYTKEYEQYKERKKKEEEELARNIAIQQELEKEKQRVAQQKQKQLEQE
20 | QFHAFEKMIQKQELEKERLKIVQEFGKVDPGPCGPLLPDLEKPCVDVAPSSPFSPTQTSDCNTTLRPAKPPVVDRSLKPG
21 | ALSVIENVPTIEGLRHIVVPRNLCSEFLQLASANTAKGIETCGVLCGKLMRNEFTITHVLIPRQNGGPDYCHTENEEEIF
22 | FMQDDLGLLTLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESIAIVCSPKFQETGFFKLTDYGLQEISTCRQKGFHPHGR
23 | DPPLFCDCSHVTVKDRIVTITDLR
24 | 
25 | >GORGO24976 | OMA1163384 | G3RXN2 | [Gorilla gorilla gorilla]
26 | MSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY
27 | KSAVIPEKKDTVKKLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQQELEKEKQRVAQQKQQQLEQE
28 | QFHAFEEMIRNQELEKERLKIVQEFGKVDPGLGGPLVPNLEKPSLDVFPTSTVSSIQPSDCHTTVRPAKPPVVDRSLKPG
29 | ALSNSESIPTIDGLRHVVVPGRLCPQFLQLASANTARGVETCGILCGKLMRNEFTITHVLIPKQSAGSDYCNTENEEELF
30 | LIQDQQGLITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESVAIVCSPKFQETGFFKLTDHGLEEISSCRQKGFHPHSK
31 | DPPLFCSCSHVTVVDRAVTITDLR
32 | 
33 | >XENLA13786 | OMA1163384 | XELAEV_18018654mg | [Xenopus laevis]
34 | MPEHSDASLPPEERIRALVLKGTSVEVNDDIPPKRYYRSGVELIRMANVYSGEGSIENAFILYNKYITLFIEKLPKHRDY
35 | KTANVPEKKETLKKLKEIAFPKAEELKKELHKRYKKEYDEYSEKQRKEEEERARRLALQQQLDAEKQRVALLKQQQEQQE
36 | QVQAFEEMMRRKELEAERLRILHQFSKDEPEAEPLGSPLIPGVNEPPVTPLLPSYGTVQPHPPAVDRSLKPSSYGSNSSG
37 | VTSDGLRHVKIPRDVCCKFLQLSENNTQRGVETCGILCGKLMQNEFTVTHVIVPKQSGGPDYCNTESEEELFLIQDQQGL
38 | ITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESIAIVCSPKFQETGFFKLTDYGMKEIGECRQKGFHPHCKEPPLFSAG
39 | GHVSVTEQDVTMMDLR
40 | 
41 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1171372.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00942 | OMA1171372 | ML05061a | [Mnemiopsis leidyi]
 2 | MDTFTSLSTGCFVVKPSKRNERLAEFPQLYTEESKMSLLGRYEQYISSKKKFTDALNITLEDINKEVSSKIVNFINEGLK
 3 | ASVSRGIPTLTLKIGVNFKEFSPIYKLAAKEVSETFKIQCATVNSSQHSDMTAVLKDVFGQLLNSENDANKVLLKTLSMR
 4 | GLLDHVMTSNNCNGIVLFIPRFDILPSTIMDKLIDICSSRNQEVPFFFVLGLSTGIELSAEWMSSSAISQLNIETVTPPS
 5 | PTELLERVLFKSLLDTETSFKLSYRTFEVLLSRFSFSSYSLHDVMKTIDVALLSHSMHQPLFKHIEKTSSNIQFHANNLD
 6 | DEEKALLLQLPSVQKYIEKCVVSNKSLALQLLEGDADAIDNLYQQCTENYQVLFHSFKVLHSLIKNIPGSSLVGKPLELY
 7 | SFCLQGCVNEVKEVCIALKCFAMLQSSAFLERLQSAYQECQSVEKPCTKLQLLEKVLKVQIEEMKSILTETSKYELDCSD
 8 | KKSYSEKVKSIQQSFISRLENYFSTLTAPTSWPMHEIIWYSNHVELQNMLVGKCRTALHRGLTNPNLYLQGSSDITAKPD
 9 | LCHLYDLFQEHGKMISLHDWIQSFNALLGKKKISSETHAQFISAVSELHFMGYLKPTKRKTDHVAKLSLLGY
10 | 
11 | >HUMAN85723 | OMA1171372 | ORC3_HUMAN | [Homo sapiens]
12 | MATSSMSKGCFVFKPNSKKRKISLPIEDYFNKGKNEPEDSKLRFETYQLIWQQMKSENERLQEELNKNLFDNLIEFLQKS
13 | HSGFQKNSRDLGGQIKLREIPTAALVLGVNVTDHDLTFGSLTEALQNNVTPYVVSLQAKDCPDMKHFLQKLISQLMDCCV
14 | DIKSKEEESVHVTQRKTHYSMDSLSSWYMTVTQKTDPKMLSKKRTTSSQWQSPPVVVILKDMESFATKVLQDFIIISSQH
15 | LHEFPLILIFGIATSPIIIHRLLPHAVSSLLCIELFQSLSCKEHLTTVLDKLLLTTQFPFKINEKVLQVLTNIFLYHDFS
16 | VQNFIKGLQLSLLEHFYSQPLSVLCCNLPEAKRRINFLSNNQCENIRRLPSFRRYVEKQASEKQVALLTNERYLKEETQL
17 | LLENLHVYHMNYFLVLRCLHKFTSSLPKYPLGRQIRELYCTCLEKNIWDSEEYASVLQLLRMLAKDELMTILEKCFKVFK
18 | SYCENHLGSTAKRIEEFLAQFQSLDETKEEEDASGSQPKGLQKTDLYHLQKSLLEMKELRRSKKQTKFEVLRENVVNFID
19 | CLVREYLLPPETQPLHEVVYFSAAHALREHLNAAPRIALHTALNNPYYYLKNEALKSEEGCIPNIAPDICIAYKLHLECS
20 | RLINLVDWSEAFATVVTAAEKMDANSATSEEMNEIIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC
21 | 
22 | >RATNO32812 | OMA1171372 | F1LSH3 | [Rattus norvegicus]
23 | RHTGPRTMATSSVSKGCFVFKPDFKKRKISVPIEDYFNNEELDSEDSKLRFETYRLLWQRIKSETEQLQEGLNENLFDNL
24 | VDFLQKSHSELQKNSGNWGSQMRLREIPTAALILGVNVTDHDVIFRSLTETLHNNVTPYVVSLQAKDCPDVKHFLQKLTS
25 | ELIDCCVDRNSKEEKNDKALRRTSYSMDSLSSWYSTVAQKTGPKMTSKKRATCSQWQSPPVVLILKNMESFSTKVLQDFI
26 | IISSQHLHEFPLILIFGIATSPVIIHRLLPHSVSSLLCIELFQSLSCKEHLTVVLDKLLLTPQFPFKLSKKALQVLTNIF
27 | LYHDFSIQNFIKGLKLSLLEHFYSQPLSVLCCDLSEAKKRINVFSVNQCEKIRRLPSFRRYVENQPLEKQVALLTNETFL
28 | KEETQSLLEDLHVYHINYFLVLRCLHNFTSSLPKYPLGRQIRELYCTCLEKKIWDSEEYESALQLLRMLAKDELMGILEQ
29 | CVKVLNSSTEKQLSNTAQKIKGFLTQFQNLDADSKEEEDACGSQPKGLQKTDLYHLQKSLLEMKELRRTTKKPTKFEMLR
30 | ENVINFIDNLVRDYLLPPEGQPLHEVVYFSAANTLREHLNAAPRIALHTALNNPYYYLKNEALKSEEGCIPSVAPDICIA
31 | YKLHLECSRLINLVDWSEAFATVVTAAEKMDTNSTVSEEMSEIIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC
32 | 
33 | >GORGO37065 | OMA1171372 | G3S685 | [Gorilla gorilla gorilla]
34 | MSNQWEKDGLYNKGGFFKPNVIIYRLQEELNKNLFDNLIEFLQKSHSGFQKNSRDLGGQIKLREIPTAALVLGVNVTDHD
35 | LTFRSLTEALQNNVTPYVVSLQAKDCPDMKHFLQKLISQLMDCCVDIKSKEEESVHVTQRKTHYSMDSLSSWYMTVTQKT
36 | DPKMLSKKRTTCSQWQSPPVVVILKDMESFATKVLQDFIIISSQHLHEFPLILIFGIATSPIIIHRLLPHAVSSLLCIEL
37 | FQSLSCKEHLTTVLDKLLLTTQFPFKINEKVLQVLTNIFLYHDFSIQNFIKGLQLSLLEHFYSQPLSVLCCNLPEAKRRI
38 | NFLSNNQCENIRRLPSFRRYVEKQASEKQVALLTNERYLKEETQLLLENLHVYHMNYFLVLRCLHKFTSSLPKYPLGRQI
39 | RELYCTCLEKNIWDSEEYASVLQLLRMLAKDELMTILEKCFKVFKSYCENHLGSTAKRIEEFLAQFQSLDAETKEEEDAS
40 | GSQPKGLQKTDLYHLQKSLLEMKELRRSKKQTKFEVLRENVVNFIDCLVREYLLPPETQPLHEVVYFSAAHALREHLNAA
41 | PRIALHTALNNPYYYLKNEALKSEEGCIPNIAPDICIAYKLHLECSRLINLVDWSEAFATVVTAAEKMDANSATSEEMNE
42 | IIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC
43 | 
44 | >XENLA24336 | OMA1171372 | A0A1L8G315 | [Xenopus laevis]
45 | MTTSSVSKGCFVFKPSAKKKKTSLTVADYFNEGLRDSEDSKKRFESCQLLWQQMKSQTEQLQEEMNKRLFENLIGFLRKS
46 | HADFHNKKDDWSCRMRASEIPTAALVLGVNVTDHDLTFNSLSDILHETITPFVVLLQSKECTGIKQLLQKLLTQLMGNTV
47 | DIDLEEEEEQVTISQRKMNCTLASLSDWYKRATKKSASPKKKRSLTSTHWESPPVVVIFKDLESFTASVLQEFIVISSGY
48 | VQDLPLVLVFGIATSPMIIHRLLSHSVSSRLCIELFQSMSCTEHLATVVDQLLLTNHFPIKLSGRVMQVLITIFLYHDFS
49 | VQNFIKGLQLSVVEHFYTQPLSVLCCSLSESRKRIKNLSHAQCENIRHLSSFMSYVESQTPENQVNLLTNDRFLKEMTQE
50 | FLERLNSYHENFTPILRCLHHFTCILPKYPLGKQIREIYCACLEKKVWETEDYNSALPLLRMLAKDEIVATLQKCVAVLK
51 | PYSEKKLGNALEKLEEFLINFQSLEETTQNEEDEDTSPQKSLQKKTDLYQLQKKLLEMKETRRTKKPSRFELLRQDVVDF
52 | IDGLVREYLLPPEMLPLHEVVYFSAASTLRRHLNAAPRVALHTALNNPASYLKCLENEGGSISNAAPDICIAYKLHLECG
53 | RLINLYDWLEAFATVVHAAEGSESDSAQQVDDVTHARFIRAVSELELLGFVKPTKQKTDHVARLTWGGC
54 | 
55 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1188079.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00930 | OMA1188079 | ML01881a | [Mnemiopsis leidyi]
 2 | FLGMTQSSISSFFKPKRQLEDEDGKENKVSKMLKCSDDSVLKDWKISTSWEKCLSNELTKSYFTDISSFVAKERVSKTIY
 3 | PSHDEVFSWTHHCKLDDVKVVILGQDPYHGPNQAHGLCFSVKVGVPPPPSLKNIFKAIKKDLDKFEEPGHGYLVGWARQG
 4 | VLMLNAVLTVEKSKANSHKSKGWEKLTDHVIKYIGFHMKSCVFLLWGTPAMKKQSLINKTNHLVLTSGHPSPLSAHRGFF
 5 | DCKHFSKANEYLLKNKKDAIDWNRLPTE
 6 | 
 7 | >HUMAN15265 | OMA1188079 | UNG_HUMAN | [Homo sapiens]
 8 | MGVFCLGPWGLGRKLRTPGKGPLQLLSRLCGDHLQAIPAKKAPAGQEEPGTPPSSPLSAEQLDRIQRNKAAALLRLAARN
 9 | VPVGFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFTWTQMCDIKDVKVVILGQDPYHGPNQAHGLCFSV
10 | QRPVPPPPSLENIYKELSTDIEDFVHPGHGDLSGWAKQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVSWLNQNSNGL
11 | VFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKTNELLQKSGKKPIDWKEL
12 | 
13 | >RATNO05802 | OMA1188079 | A0A8J8YLI4 | [Rattus norvegicus]
14 | MIGQKTLYSFFSPTPTGKRTTRSPQPAPGSGVTAENSSDAAASPAKKARVEQDEPATPPSSPLSAEQLVRIQRNKAAALL
15 | RLAARNVPAGLGESWKQQLCGEFGKPYFVKLMGFVAEERKHHKVYPPPEQVFTWTQMCDIRDVKVVILGQDPYHGPNQAH
16 | GLCFSVQRPVPPPPSLENIFKELSTDIDGFVHPGHGDLSGWARQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVSWLN
17 | QNLNGLVFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKANELLQRSGKKPISWKEL
18 | 
19 | >GORGO06002 | OMA1188079 | G3RY62 | [Gorilla gorilla gorilla]
20 | MIGQKTLYSFFSPSPARKRHAPSPEPAVQGTGVAGVPEESGDAAAIPAKKAPAGQEEPGTPPSSPLSAEQLDRIQRNKAA
21 | ALLRLAARNVPVGFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFTWTQMCDIRDVKVVILGQDPYHGPN
22 | QAHGLCFSVQRPVPPPPSLENIYKELSTDIEDFVHPGHGDLSGWAKQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVS
23 | WLNQNSNGLVFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKTNELLQKSGKKPIDWKEL
24 | 
25 | >XENLA02694 | OMA1188079 | XELAEV_18007455mg | [Xenopus laevis]
26 | MIGQRTINSFFGAAVKKRAASTVWDGEDSCKAGETTPVKKSRPSNENDIPSAVSPPLSPEQLERMQRNKAAALQKLAARH
27 | APQGLGESWKQELLAEFAKPYFVKLSNFIAEERKKCTVYPPPEEVFTWTQMVDIKDVKVVILGQDPYHGPNQAHGLCFSV
28 | KKPVPPPPSLVNMYKELETDIEGFSRPGHGDLTGWAKQGVLLLNAVLTVRAHNANSHKDCGWEQFTDVVVSWLNKNMDGL
29 | VFMLWGAYAQKKGSNIDRKRHHVLQTVHPSPLSVHRGFFGCRHFSKTNAYLQGLGKKPIDWKAL
30 | 
31 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_649216.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00920 | OMA649216 | ML39325a | [Mnemiopsis leidyi]
 2 | MSLSRLGFQLFRRGTLSTVSPAQQLVSARLFNYDQSHDADNYLKFGLLGGLGLFVTAMCKEEAANEEPANHSEATQEEEE
 3 | EQKPKKKKKKGFGERKVMEYENRIREFSTPDKIFRYFATVRVEFENGKKEIFMTPKDFMRSITPGELQPSHLGLDLYRDV
 4 | PISKLLDHVDEEEGEQPEFLSRLAQHGLISFQDYIFLLTLLSTPKHDCEIAFKMFDLYGDGCVSYQEFLDTRSVLESRSS
 5 | MGKRHRDNIYSGNTINKDGHSALTKYFFGEDSAKKLTLDDFVVFMDGLKEDVFRMEFNKYDPVDGKITEQDFANLLLLHA
 6 | TLSNQAKSKFVRRVKKAYKNESQGITFDQFMTFNHFLDHLDDVEILVSVYFAAGMKFNKASLKQVAHVVADVELDSHIID
 7 | LVFTIFDDNGDELLSNREFISVLKERAHRGLEKPSDTGFVRLITALGACVASYVKGEEL
 8 | 
 9 | >HUMAN01609 | OMA649216 | MICU1_HUMAN | [Homo sapiens]
10 | MFRLNSLSALAELAVGSRWYHGGSQPIQIRRRLMMVAFLGASAVTASTGLLWKRAHAESPPCVDNLKSDIGDKGKNKDEG
11 | DVCNHEKKTADLAPHPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVISEPGEAEVFMTPEDFVRSITPNE
12 | KQPEHLGLDQYIIKRFDGKKISQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNGDGEVDM
13 | EEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFERHDPVDG
14 | RITERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKEGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLDKVTMQQVA
15 | RTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFALPKQ
16 | 
17 | >RATNO21729 | OMA649216 | A0A8I6A7K0 | [Rattus norvegicus]
18 | MFRLNALSALAELAMGSRWYHGTSQPTQTKRRLMLVAFLGASAVTASTGLLWKKAHAESPPSVNSKKTDAGDKGKSKDTR
19 | EVSSHEGSAADTAAEPYPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVINEPGETEVFMTPQDFVRSITP
20 | NEKQPEHLGLDQYIIKRFDGKEFWQTEKIAQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDL
21 | NGDGEVDMEEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEF
22 | ERHDPVDGRISERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKDGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLD
23 | KVTMQQVARTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFA
24 | LPK
25 | 
26 | >GORGO00558 | OMA649216 | A0A2I2YKG1 | [Gorilla gorilla gorilla]
27 | MFRLNSLSALAELAVGSRWYHGGSQPIQIRRRLMMVAFLGASAVTASTGLLWKRAHAESPPCVDNLKSDIGDKGKNKDEG
28 | DVCNHEKKTADLVPHPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVISEPGEAEVFMTPEDFVRSITPNE
29 | KQPEHLGLDQYIIKRFDGKDFWQTEKISQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNG
30 | DGEVDMEEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFER
31 | HDPVDGRITERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKEGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLDKV
32 | TMQQVARTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFALP
33 | KQ
34 | 
35 | >XENLA31939 | OMA649216 | A0A1L8FF34 | [Xenopus laevis]
36 | MFRLRFIPAVAGLAAVSRRYHGVANHTRSRRRLMMAAFVGATAVSASAGLLWKRANAEAQSSVKHNMREESSEKEKEPED
37 | TDQAVESSDEEQQQEGKKKKRVGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVIHESGESEVFMTPQDFVRSITPNEKQ
38 | PENLGLDQFIVKRYDGKKISQEREKFADEDSIFYSLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNGDGEVDMEE
39 | FEQVQSIIRSQTSMGMRHRDRSTTGNTLKTGFSSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFERQDPVDGRI
40 | TERQFGSMLLAYSGVQSKKLTHMLKQLKKRFKDAEGLTFEEVENFFTFLKNINDVDTALSFYHMAGASLDKVTMQQVART
41 | VAKVELSDHVCDVVFALFDCDGNGELSNKEFIAIMKQRLMRGLEKPKDMGFTRLMRAMWKCAQETAWDFAMPKQ
42 | 
43 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_681083.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00869 | OMA681083 | ML02403a | [Mnemiopsis leidyi]
 2 | MPMKLKFLFRASNKIKTKPSLNFLVMVKRLADPEVDENVEDIDSDFSDDDGASSVSDTGSVEETAQGKRLRLAKQYLDKL
 3 | ENEQLKSENDTEINRDLIAHRLQQDVLAEKGKLETRVGKRLCVLENKFTLKGHRLSPTCLAITDTHLFSGSKDGAIIKWD
 4 | LSTGKKLSVVKHDSKKQILALAASSDNVYLASGGQDKIIVLWDIESMTFVKCFRKHRGPITALTFQRNSHLLMSGSADRS
 5 | VNLWNCDDKLYIESLYGHQDMVADMDSFLQERVVTVGGHDKTLRLWKIQEESQLVFNGHKNTVLDCVSMLNEEHFVTGSQ
 6 | DNVLAVWHIKKKKPAITQLQAHAKGSWVSAVAGLKNTECFISGSNGGNVKVWACAENYRSMECIRSIEIIGTVNSIVISH
 7 | DNSCFALAVGQEPKMGRWWSDKAARNRVLVFPMAIEDEVNR
 8 | 
 9 | >HUMAN70334 | OMA681083 | U3IP2_HUMAN | [Homo sapiens]
10 | MSATAAARKRGKPASGAGAGAGAGKRRRKADSAGDRGKSKGGGKMNEEISSDSESESLAPRKPEEEEEEELEETAQEKKL
11 | RLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPASADIRVLRGHQLSITCLVVTPDDSA
12 | IFSAAKDCSIIKWSVESGRKLHVIPRAKKGAEGKPPGHSSHVLCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTFTGH
13 | RDAVSGLAFRRGTHQLYSTSHDRSVKVWNVAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQLVF
14 | YGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLRGEPGLEQPFWISSVAALLNTDLVATGSHSSCV
15 | RLWQCGEGFRQLDLLCDIPLVGFINSLKFSSSGDFLVAGVGQEHRLGRWWRIKEARNSVCIIPLRRVPVPPAAGS
16 | 
17 | >RATNO41032 | OMA681083 | B0BND5 | [Rattus norvegicus]
18 | MNSMSTAVATRKRAKPAPGPGAAPVDGKRRRKVDSAASRGKSKGGGKMNEEISSDSESESLAPRKTEEEEEEELEETAQE
19 | KKLRLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPAPTDIRVLRGHQLSITCLVITPD
20 | DLAIFSAAKDCTIIKWSVETGRKLHVIPRAKKGTQGQPSGHSSHILCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTF
21 | TGHRDAVSGLAFRRGTHQLYSTSHDRSVKVWNAAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQ
22 | LVFYGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLHGEPGLEQPFWISSVAALLNTDLVATGSHN
23 | ACVRLWQCGEGFRQLDPLCDIPLVGFINSLKFSSGGDFLVAGVGQEHRLGRWWRIKEARNSVCIIPLRRVPVSPVAGS
24 | 
25 | >GORGO30225 | OMA681083 | G3R9Q2 | [Gorilla gorilla gorilla]
26 | MSATAAARKRGKPASGVGAGAGAGKRRRKADSAGDRGKSKGGGKMNEEISSDSESESLAPRKPEEEEEEELEETAQEKKL
27 | RLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPASADIRVLRGHQLSITCLVVTPDDSA
28 | IFSAAKDCTIIKWSVESGRKLRVIPRAKKGAEGKPPGHSSHVLCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTFTGH
29 | RDAVSGLAFRRGTHQLYSTSHDRSVKVWNVAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQLVF
30 | YGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLRGEPGLEQPFWISSVAALLNTDLVATGSHSSCV
31 | RLWQCGEGFRQLDLLCDIPLVGFINSLKFSSSGDFLVAGVGQEHRLGRWWRIKQARNSVCIIPLRRVPVPPAAGS
32 | 
33 | >XENLA20807 | OMA681083 | A0A1L8GHE5 | [Xenopus laevis]
34 | MSGLFIKKKSGVTPRRRRAEGNDAEATSQKKKKPKDTHLREEIESDSDTEIAPARTKPRQDEEDLDETAQEKKLRLAKEY
35 | LKQLQQQEEEQKEDEDQDTIANRLQEDVLEQRGRLQRPLAKELLPPEPSEIRLLRGHQGPITCLVISPDDSYLFSGSKDC
36 | SIIKWSVSDGKKIHKIPGGRKGTESTHVGHTGHVLCMALSSDGKYLASGDRNKLIFIWDPVTCQNLHKFQGHRDAVSGLS
37 | FQKGTHQLFSVSHDRSVKVWNVEENAYIETLFGHQDAITGLDSLSRERCVTVGGRDGTMRIWKIAEETQLVFSGHEGSID
38 | CVRLINEEHIVTGADDGSLALWTVGKKKPLTQMKCAHGSHGDAGLEQPYWISSIAAALNSDVVASGSHDGFVHVWRCGEG
39 | FRSLSPLFTVPVVGFVNSLQFSSSANFLVAGVGQEHRLGRWWRKKEAKNALCIIPFKRTLVLGS
40 | 
41 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_683078.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00595 | OMA683078 | ML13641a | [Mnemiopsis leidyi]
 2 | MENTVLRAKLIVLGDASVGKSSLVQVFHSDSQQGFPKAYSMTSDVQLQVKSVKIPDSPYTVELYVYDCAGQETFQPFISK
 3 | ILGSSALVLLVSDLTNQSSLSAAVKWFERARNANKDFKMQGALVGNKCDLDLRRAIKASEAEETAANLGIPYFECSAKEG
 4 | VQVDEPFYFLANCLYEQYIEQTQEFQNIADTV
 5 | 
 6 | >HUMAN60325 | OMA683078 | IFT27_HUMAN | [Homo sapiens]
 7 | MVKLAAKCILAGDPAVGKTALAQIFRSDGAHFQKSYTLTTGMDLVVKTVPVPDTGDSVELFIFDSAGKELFSEMLDKLWE
 8 | SPNVLCLVYDVTNEESFNNCSKWLEKARSQAPGISLPGVLVGNKTDLAGRRAVDSAEARAWALGQGLECFETSVKEMENF
 9 | EAPFHCLAKQFHQLYREKVEVFRALA
10 | 
11 | >RATNO39676 | OMA683078 | A0A8I5ZYK4 | [Rattus norvegicus]
12 | QVKMDNDFSSALASGDPAVGKTALVQMFRSDGTHFQKNYTLTTGVDLVVKTVPVLDTNDSVELFIFDSAGKELFSEMLDK
13 | LWENPNVLCLVYDVTNEQSFISCTKWLEKVRSQTPGISLPGVLVGTKTDLAGRQTVDSAQAQAWALSQGLEFFETSVKEM
14 | DNYEAPFHCLAKQFYQLYREKVDIFHTLV
15 | 
16 | >GORGO23896 | OMA683078 | G3R464 | [Gorilla gorilla gorilla]
17 | MRTKAFFFFFSDLTGDPAVGKTALAQIFRSDGAHFQKSYTLTTGMDLVVKTVPVPDTGDSVELFIFDSAGKELFSEMLDK
18 | LWESPNVLCLVYDVTNEESFNNCSKWLEKARSQAPGISLPGVLVGNKTDLAGRRAVDSAEARAWALGQGLECFETSVKEM
19 | ENLEAPFHCLAKQFHQLYREKVEVFRALA
20 | 
21 | >XENLA18783 | OMA683078 | A0A1L8GP42 | [Xenopus laevis]
22 | MVKLSAKCIVAGDTAVGKSTLVQLFRSDGSHFPKNYSMTATVEVSVKTVQIPDTGDSVELFLCDSPGKAIFYEMTEKLWD
23 | QPGALCLVFDVTNESSFSSCTKWLQRVRSKTLSPHLPGVLVGNKTDMAGLRAVEKGQAEEWAASNGLEYFETSAKELENF
24 | ERPFQALAKAFHHLYQERVEHFQSLV
25 | 
26 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_894224.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00417 | OMA894224 | ML36131a | [Mnemiopsis leidyi]
 2 | MGPKGMDKILVSMGQDGYPGDIQVTNDGATILRSIGVDNPAAKVLVNISKVQDDEVGDGTTSVTVLAAELLREAEQLVAK
 3 | KLHPQTIISGYRAALKVAVQVLTDTAIDNGKDNEAFKKDLMNIARTTLSSKILNQHKEHFAELAVNAVLRLKGSTDLELV
 4 | QILKKTGGSIEDSYLDEGFLLEKEIGHNQPKRIENARILVANTPMDTDKIKVFGSKVKVDSTAKVADIELAEKNKMKQKV
 5 | DKILSHDITCFINRQLIYDYPDQLFADAGIMAIEHADFDGIERLSKVLGAEIVSTFDQPDKVTLGSCKVIEEVILGEDKL
 6 | IKFSGVKQGEACTVVLRGATKMIVDEAERSLHDALCVLTQTVKETRTVFGGGCSEMRMARHVEELAARTPGKEALAIESF
 7 | ARALRQIPTIIADNGGYDSSQLVSELRAMHSQDELYMGLNMTTGEVGDMRELGITESFAVKHAVVNSAAEAAEMILRVDD
 8 | ILKATPRQRGGNDCM
 9 | 
10 | >HUMAN14228 | OMA894224 | TCPB_HUMAN | [Homo sapiens]
11 | MASLSLAPVNIFKAGADEERAETARLTSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGATILKNIGVDNPA
12 | AKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHGSDEVKFRQDLM
13 | NIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHIIKKLGGSLADSYLDEGFLLDKKIGVNQPKRIENAKILIA
14 | NTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGVMAIEHADFAGV
15 | ERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERSLHDALCVLAQT
16 | VKDSRTVYGGGCSEMLMAHAVTQLANRTPGKEAVAMESYAKALRMLPTIIADNAGYDSADLVAQLRAAHSEGNTTAGLDM
17 | REGTIGDMAILGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC
18 | 
19 | >RATNO37617 | OMA894224 | A0A8I6GLE7 | [Rattus norvegicus]
20 | MPSSGVSEHSNKASLSLAPVNIFKAGADEERAETARLSSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGAT
21 | ILKNIGVDNPAAKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHG
22 | SDEVKFWQDLMNIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHVIKKLGGSLADSYLDEGFLLDKKIGVNQP
23 | KRIENAKILIANTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGV
24 | MAIEHADFAGVERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERS
25 | LHDALCVLAQTVKDPRTVYGGGCSEMLMAHAVTMLASRTPGKEAVAMESFAKALRMLPTIIADNAGYDSADLVAQLRAAH
26 | SEGRITAGLDMKEGSIGDMAVLGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC
27 | 
28 | >GORGO04796 | OMA894224 | A0A2I2Y483 | [Gorilla gorilla gorilla]
29 | MASLSLAPVNIFKAGADEERAETARLTSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGATILKNIGVDNPA
30 | AKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHGSDEVKFRQDLM
31 | NIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHIIKKLGGSLADSYLDEGFLLDKKIGVNQPKRIENAKILIA
32 | NTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGVMAIEHADFAGV
33 | ERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERSLHDALCVLAQT
34 | VKDSRTVYGGGCSEMLMAHAVTQLANRTPGKEAVAMESYAKALRMLPTIIADNAGYDSADLVAQLRAAHSEGNTTAGLDM
35 | REGTIGDMAILGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC
36 | 
37 | >XENLA12675 | OMA894224 | A0A1L8GYM0 | [Xenopus laevis]
38 | MASLSLAPVNIFKAGADEEKAETARLSSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDSSVTVTNDGATILKAIGIDNPA
39 | AQVLVDMSKVQDDEVGDGTTSVTVLAAELLREAEILVAKKIHPQTIVSGWRQATQVAREALLKASMDHGNDEEKFCCDLM
40 | NIARTTLSSKLLTHHKDHFSKLAVEAVLRLKGSGNLEAIHLIKKLGGSLTESYLDEGFLLDKKIGVNQPKRIENAKILIA
41 | NTGMDTDKIKVFGSRVRVDSTAKVAEIELAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFAAAGVMAIEHADFAGV
42 | ERLALVTGGEIASTFDHPELVKLGTCKLIEEVMIGEDKLIHFSGVAMGEACTIVLRGATQQILDEAERSLHDALCVLAQT
43 | VKDTRTVYGGGCSEMLMAHAVTELANRTPGKESVAMESFAKALRMLPTIIADNAGYDSADLVSQLRAAHSEGKSTYGLDM
44 | KNGIIGDMGELGITESFQVKRQVLLSASEAAEVILRVDNIIKAAPRKRVPDHHPC
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_944789.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00946 | OMA944789 | ML24671a | [Mnemiopsis leidyi]
 2 | MIRGGKVAKRKSTTVLQEGTKKKCVNGAASSRQTSLQSHFKPLQPKQAPTTSSTGSKRRYVDSRTTSWEEVNKIRPELPT
 3 | PPFGEDSPAYSSQIPNSVVIPLPPLNFAPNDKIWAGLKQQEEKRQASKLGILNHPFIKQGARAILLDWLIEVSQLYCLKR
 4 | ETFYLSMDYIDRFISKRYDIKKEQLQLVGITALHMAAKLEEIYPPGLEKLSYITDNSCSKEAMWKMELEMMKALDWRLAA
 5 | LTVNTWLNLYLQIEYYRGTSCSTFQFLRGEYSQSDFVKIIQLIDLCSLDVKSVEYRPSMIAASALWLVVPSKLKEVTGYS
 6 | WDDLISCRHWMQPYAQVLKDQPAQQLKDFEDVEKKDRHHIQTHFKAIPLLHDVYELQESQPLTPDSDSDNENAEVAHYLT
 7 | PNSSTHSSPSSSTKHR
 8 | 
 9 | >HUMAN42399 | OMA944789 | CCNE1_HUMAN | [Homo sapiens]
10 | MPRERRERDAKERDTMKEDGGAEFSARSRKRKANVTVFLQDPDEEMAKIDRTARDQCGSQPWDNNAVCADPCSLIPTPDK
11 | EDDDRVYPNSTCKPRIIAPSRGSPLPVLSWANREEVWKIMLNKEKTYLRDQHFLEQHPLLQPKMRAILLDWLMEVCEVYK
12 | LHRETFYLAQDFFDRYMATQENVVKTLLQLIGISSLFIAAKLEEIYPPKLHQFAYVTDGACSGDEILTMELMIMKALKWR
13 | LSPLTIVSWLNVYMQVAYLNDLHEVLLPQYPQQIFIQIAELLDLCVLDVDCLEFPYGILAASALYHFSSSELMQKVSGYQ
14 | WCDIENCVKWMVPFAMVIRETGSSKLKHFRGVADEDAHNIQTHRDSLDLLDKARAKKAMLSEQNRASPLPSGLLTPPQSG
15 | KKQSSGPEMA
16 | 
17 | >RATNO19647 | OMA944789 | B1WC54 | [Rattus norvegicus]
18 | MPRERKERDSKDHSNMKEEGGSDLSVRSRKRKANVAVFLQDPDEEIAKIDKTVKSQDSSQPWDDDSACVDPCSFIPTPNK
19 | EEDNELEYPKTAFQPRKIRPPRASPLPVLNWGNREEVWRIMLNKEKTYLRDEHFLQRHPLLQARMRAVLLDWLMEVCEVY
20 | KLHRETFYLAQDFFDRYMASQQNIIKTLLQLIGISALFIASKLEEIYPPKLHQFAYVTDGACSGDEILTMELMMMKALKW
21 | RLSPLTIVSWLNVYVQVAYVNDTGEVLMPQYPQQVFVQIAELLDLCVLDVGCLEFPYGVLAASALYHFSSLELMQKVSGY
22 | QWCDIEKCVKWMVPFAMVIREMGSSKLKHFRGVPMEDSHNIQTHTNSLDLLDKAQAKKAILSEQNRISPPPSGVLTPPHS
23 | SKKQSSEQETE
24 | 
25 | >GORGO16559 | OMA944789 | G3QZF2 | [Gorilla gorilla gorilla]
26 | MPRERRERDAKERDTMKEDGGAEFSARSRKRKANVAVFLQDPDEEMAKIDRTARDQCGSQPWDNNAVCADPCSLIPTPDK
27 | EDDERVYPNSTCKPQIIAPSRGSPLPVLSWANREEVWKIMLNKEKTYLRDQHFLEQHPLLQPKMRAILLDWLMEVCEVYK
28 | LHRETFYLAQDFFDRYMATQENVVKTLLQLIGISSLFIAAKLEEIYPPKLHQFAYVTDGACSGDEILTMELMIMKALKWR
29 | LSPLTIVSWLNVYMQVAYLNDLHEVLLPQYPQQIFIQIAELLDLCVLDVDCLEFPYGILAASALYHFSSSELMQKVSGYQ
30 | WCDIENCVKWMVPFAMVIRETGSSKLKHFRGVADEDAHNIQTHRDSLDLLDKARAKKAMLSEQNRASPLPSGLLTPPQSG
31 | KKQSSGPEMA
32 | 
33 | >XENLA17827 | OMA944789 | CCNE1_XENLA | [Xenopus laevis]
34 | MPVISNPAVEKSTKDEGTASCSVRSRKRKADVAIFLQDPDETLDSLEMTKKKQYQDRGPWSNEMTCKSPHKLIPTPEKEE
35 | HEPNPTNYSHFASLRFSPVSVSPLPRLGWANQDDVWRNMLNKDRIYLRDKNFFQKHPQLQPNMRAILLDWLMEVCEVYKL
36 | HRETFYLAQDFFDRFMATQKNVIKSRLQLIGITSLFIAAKLEEIYPPKLHQFSFITDGACTEDEITRMELIIMKDLGWCL
37 | SPMTIVSWFNVFLQVAYIRELQQFLRPQFPQEIYIQIVQLLDLCVLDICCLEYPYGVLAASAMYHFSCPELVEKVSGFKV
38 | TELQGCIKWLVPFAMAIKEGGKSKLNFFKGVDIEDAHNIQTHSGCLELMEKVYINQALLEEQNRTSPIPTGVLTPPQSNK
39 | KQKSDRAD
40 | 
41 | 


--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_974829.fa:
--------------------------------------------------------------------------------
 1 | >MNELE00836 | OMA974829 | ML01593a | [Mnemiopsis leidyi]
 2 | MDLVQVGITTDELDRYAHDLIVQHAAYPAPLNYRGYPKSICTSVNNVLCHGIPNSRELQDGDIISIDVSIFYKGVFGDCC
 3 | STRVVGEGDSTAHKLAKVTRDSTLAAIETCKPGTRLSSVGNTISKYAKEAGLSICKEFIGHGIGSYFHGLPEVYHYANSH
 4 | GPTLRPGMVFTIEPILMEGRDTMAILADGWTAVSADSKRAAQFEHTILITDSEPEILSPHR
 5 | 
 6 | >HUMAN65562 | OMA974829 | MAP12_HUMAN | [Homo sapiens]
 7 | MAAPSGVHLLVRRGSHRIFSSPLNHIYLHKQSSSQQRRNFFFRRQRDISHSIVLPAAVSSAHPVPKHIKKPDYVTTGIVP
 8 | DWGDSIEVKNEDQIQGLHQACQLARHVLLLAGKSLKVDMTTEEIDALVHREIISHNAYPSPLGYGGFPKSVCTSVNNVLC
 9 | HGIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDECGKKLVEVARRCRDEAIAACRAGAPFSVIGNTISHITHQ
10 | NGFQVCPHFVGHGIGSYFHGHPEIWHHANDSDLPMEEGMAFTIEPIITEGSPEFKVLEDAWTVVSLDNQRSAQFEHTVLI
11 | TSRGAQILTKLPHEA
12 | 
13 | >RATNO27947 | OMA974829 | G3V670 | [Rattus norvegicus]
14 | MAAPIGVHLLVRGGCQRILSSPLHHLFLHKRAGSQQRRYFFWRQRDISHSVVSPAAVSPAHPVPEHIKKPDYVTTGIVPD
15 | WGDSIEVKNEDQIQGLREACRLARHVLLLAGKSLKVGMTTEEIDALVHREIIRRDAYPSPLGYGRFPKSVCTSVNNVLCH
16 | GIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDESGTKLVEVARACRDEAIAACRAGAPFSVIGNTISHITRQN
17 | GLQVCPHFVGHGIGSYFHGHPEIWHHANDNDLPMEERMAFTIEPIITEGSPEFKVLEDAWTVVSLDNRRSAQFEHTVLIT
18 | PRGVEILTKVPQEA
19 | 
20 | >GORGO26472 | OMA974829 | A0A2I2ZWI0 | [Gorilla gorilla gorilla]
21 | MAAPSGVHQLVRRGSHRIFSSPLNHIYLHKQSSSQQRRNFFFRRQRDISHSIVLPAAVSSAHPVPKHIKKPDYVTTGIVP
22 | DWGDSIEVKNEDQIQGLHQACQLARHVLLLAGKSLKVDMTTEEIDALVHREIISHNAYPSPLGYGGFPKSVCTSVNNVLC
23 | HGIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDECGKKLVEVARRCRDEAIAACRAGAPFSVIGNTISHITHQ
24 | NGFQVCPHFVGHGIGSYFHGHPEIWHHANDNDLPMEEGMAFTIEPIITEGSPEFKVLEDAWTVVSLDNQRSAQFEHTVLI
25 | TSRGAQILTKLPHEA
26 | 
27 | >XENLA39438 | OMA974829 | A0A1L8EW64 | [Xenopus laevis]
28 | MQVAGVISCALRRGSVTGCQRVFPLAVNHIYLHRQLNIQQRRYFFFRKQRSAAYDIVWPGTVSPAHPVPEHIMKPDYVTT
29 | GIVPDWGDYIEIKDEDQIQGLRQACQLARHILLMAGKSLKVGMTTEEIDALVHENIISWNAYPSPLGYGGFPKSVCTSVN
30 | NVVCHGIPDSRALQDGDIINIDVTVYFGGYHGDTSETFLVGNVDKCGRGLVKIARRCRDEAIAVCKPGAPFSSIGNTISR
31 | IAGENGFRVCPSFVGHGIGSFFHGHPEIWHHANNNDMPMEEGMAFTIEPIIMEGSPDFKILKDKWTAVSVDNKRSAQCEH
32 | TIVITSGGAEILTKLPQEE
33 | 
34 | 


--------------------------------------------------------------------------------
/tests/test_aligner.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | import gzip
 4 | import argparse
 5 | from Bio import SeqIO
 6 | from read2tree.Reads import Reads
 7 | from read2tree.FastxReader import FastxReader
 8 | dirname = os.path.dirname(__file__)
 9 | 
10 | 
11 | class ReadTest(unittest.TestCase):
12 | 
13 |     def setup_reads_paired(self, sampling=False):
14 |         arg_parser = argparse.ArgumentParser(prog='read2tree')
15 | 
16 |         arg_parser.add_argument('--standalone_path', default='.',
17 |                                 help='[Default is current directory] Path to '
18 |                                      'oma standalone directory.')
19 | 
20 |         arg_parser.add_argument('--reads', nargs='+', default=None,
21 |                                 help='Reads to be mapped to reference. If paired '
22 |                                 'end add separated by space.')
23 | 
24 |         arg_parser.add_argument('--read_type', default='short',
25 |                                 help='[Default is short reads] Type of reads to '
26 |                                 'use for mapping. Either ngm for short reads or '
27 |                                 'ngmlr for long will be used.')
28 | 
29 |         arg_parser.add_argument('--dna_reference', default='',
30 |                                 help='Reference file that contains nucleotide '
31 |                                 'sequences (fasta, hdf5). If not given it will use'
32 |                                 'the RESTapi and retrieve sequences '
33 |                                 'from http://omabrowser.org directly. '
34 |                                 'NOTE: internet connection required!')
35 |                                 
36 |         arg_parser.add_argument('--keep_all_ogs', action='store_true',
37 |                                 help='Keep all orthologs after addition of '
38 |                                 'mapped seq, which means also the groups that '
39 |                                 'have no mapped sequence. Otherwise only groups '
40 |                                 'are used that have the mapped sequence for '
41 |                                 'alignment and tree inference.')
42 | 
43 |         arg_parser.add_argument('-r', '--reference', action='store_true',
44 |                                 help='Just generate the reference dataset for '
45 |                                 'mapping.')
46 | 
47 |         arg_parser.add_argument('--remove_species_ogs', default=None,
48 |                                 help='[Default is none] Remove species present '
49 |                                 'in data set after mapping step completed to '
50 |                                 'build OGs. Input is comma separated list '
51 |                                 'without spaces, e.g. XXX,YYY,AAA.')
52 | 
53 |         arg_parser.add_argument('-s', '--species_name', default=None,
54 |                                 help='[Default is name of read] Name of species '
55 |                                      'for mapped sequence.')
56 | 
57 |         arg_parser.add_argument('--output_path', default='.', required=True,
58 |                                 help='[Default is current directory] Path to '
59 |                                 'output directory.')
60 | 
61 |         argv = ['--standalone_path', 'tests/data/marker_genes/',
62 |                 '--dna_reference', 'tests/data/dna.fa', '--reads',
63 |                 'tests/data/mapper/test3/test_1b.fq',
64 |                 'tests/data/mapper/test3/test_2b.fq',
65 |                 '--output_path', 'tests/data/output', '--read_type',
66 |                 'short', '--keep_all_ogs', '--reference',
67 |                 '--remove_species_ogs', 'CIOIN', '--species_name', 'ass']
68 | 
69 |         args = arg_parser.parse_args(argv)
70 |         return alignments = Aligner(args, ogset.ogs, load=True)
71 | 


--------------------------------------------------------------------------------
/tests/test_og.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | from Bio import SeqIO
 4 | from read2tree.OGSet import OG
 5 | 
 6 | dirname = os.path.dirname(__file__)
 7 | 
 8 | 
 9 | class OGTest(unittest.TestCase):
10 | 
11 |     def setup(self):
12 |         aa = list(SeqIO.parse('data/OG4.aa', format='fasta'))
13 |         dna = list(SeqIO.parse('data/OG4.dna', format='fasta'))
14 |         og = OG()
15 |         og.aa = aa
16 |         og.dna = dna
17 |         return og
18 | 
19 |     def test_init(self):
20 |         og = self.setup()
21 |         self.assertEqual(og.dna[0].id, 'MOUSE21964_OG4')
22 | 
23 |     def test_get_og_dict(self):
24 |         og = self.setup()
25 |         dna_dict = og._get_og_dict(og)
26 |         self.assertEqual(dna_dict['MOUSE21964'].name, 'MOUSE21964_OG4')
27 | 
28 |     def test_remove_species_records(self):
29 |         og = self.setup()
30 |         og_wo_mouse = og.remove_species_records('MOUSE')
31 |         self.assertEqual(len(og_wo_mouse[0]), 4)
32 |         self.assertEqual(len(og_wo_mouse[1]), 4)
33 | 
34 |     def test_get_species_id(self):
35 |         og = self.setup()
36 |         dna = og.dna[0]
37 |         aa = og.aa[0]
38 |         self.assertEqual(og._get_species_id(dna), 'MOUSE')
39 |         self.assertEqual(og._get_species_id(aa), 'MOUSE')
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     unittest.main()
44 | 


--------------------------------------------------------------------------------
/tests/test_ogset.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | from read2tree import OGSet
 4 | 
 5 | API_URL = 'http://omabrowser.org/api'
 6 | 
 7 | class OGSetTest(unittest.TestCase):
 8 |     def setUp(self):
 9 |         arg_parser = argparse.ArgumentParser(prog='read2tree')
10 | 
11 |         arg_parser.add_argument('--reads', nargs='+', default=None,
12 |                                 help='Reads to be mapped to reference. If paired end '
13 |                                      'add separated by space.')
14 |         arg_parser.add_argument('--read_split_length', type=int, default=400,
15 |                                 help='Set read split length.')
16 |         arg_parser.add_argument('--read_split_overlap', type=int, default=50,
17 |                                 help='Set read split length overlap.')
18 |         arg_parser.add_argument('-s', '--species_name', default=None,
19 |                                 help='[Default is name of read] Name of species '
20 |                                      'for mapped sequence.')
21 | 
22 |         argv = ['--reads', 'tests/data/reads/test.fq']
23 | 
24 |         args = arg_parser.parse_args(argv)
25 |         return OGSet(args)
26 | 
27 |     def test_OGSet(self):
28 |         raise NotImplementedError
29 | 
30 |     def test_marker_genes_input(self):
31 |         raise NotImplementedError
32 | 
33 |     def test_omastandalone_input(self):
34 |         raise NotImplementedError
35 | 
36 |     def test_output_folder_structure(self):
37 |         raise NotImplementedError
38 | 
39 |     def test_species_removal(self):
40 |         raise NotImplementedError
41 | 
42 |     def test_species_removal_after_mapping(self):
43 |         raise NotImplementedError
44 | 
45 |     def test_rest_api_connection(self):
46 |         OGSet._read
47 | 
48 |     def test_rest_api_dna_downlaod(self):
49 |         raise NotImplementedError
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     unittest.main()
54 | 


--------------------------------------------------------------------------------
/tests/test_reads.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import os
  3 | import gzip
  4 | import argparse
  5 | from Bio import SeqIO
  6 | from read2tree.Reads import Reads
  7 | from read2tree.FastxReader import FastxReader
  8 | from read2tree.main import parse_args
  9 | from read2tree._utils import exe_name
 10 | dirname = os.path.dirname(__file__)
 11 | 
 12 | 
 13 | class ReadTest(unittest.TestCase):
 14 | 
 15 |     def setup_long_reads(self, split=False):
 16 |         if split:
 17 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz', '--split_reads',
 18 |                     '--split_overlap', '50', '--split_len', '400', '--sample_reads', '--coverage', '10',
 19 |                     '--genome_len', '1000']
 20 |         else:
 21 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz']
 22 | 
 23 |         args = parse_args(argv, exe_name(), '')
 24 |         # args = arg_parser.parse_args(argv)
 25 |         return Reads(args)
 26 | 
 27 |     def setup_reads_paired(self, sampling=False):
 28 | 
 29 |         if sampling:
 30 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz',
 31 |                     'data/reads/test_2a.fq.gz', '--sample_reads', '--coverage', '10', '--genome_len', '1000']
 32 |         else:
 33 |             argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz',
 34 |                     'data/reads/test_2a.fq.gz']
 35 |         args = parse_args(argv, exe_name(), '')
 36 |         return Reads(args)
 37 | 
 38 |     def test_split(self):
 39 |         test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGT'
 40 |         expected = ['ACGTTTTTTG', 'GAAGAGTTAG', 'AGATTTTTAG', 'AGAGGAGGGG',
 41 |                     'GAGGAGGGGT']
 42 |         reads = self.setup_long_reads()
 43 |         # obtained = reads._split_len(test_seq, 10)
 44 |         obtained = reads._split_len_overlap(test_seq, 10, 0)
 45 |         self.assertEqual(expected, obtained)
 46 | 
 47 |     def test_splitOverlap(self):
 48 |         test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGTTT'
 49 |         expected = ['ACGTTTTTTG', 'TTTTGGAAGA', 'GAAGAGTTAG', 'GTTAGAGATT',
 50 |                     'AGATTTTTAG', 'TTTAGAGAGG', 'AGAGGAGGGG', 'GGAGGGGTTT']
 51 |         reads = self.setup_long_reads()
 52 |         obtained = reads._split_len_overlap(test_seq, 10, 5)
 53 |         # print(reads._split_len_overlap('TTTTTAGAGAGGAGGGGTTT', 10, 5))
 54 |         self.assertEqual(expected, obtained)
 55 | 
 56 |     def test_get_4_line_fastq_string(self):
 57 |         reads = self.setup_long_reads()
 58 |         expected = '@SRR00001 length=16\nACGTTTGGGAAGGTTT\n+SRR00001 ' \
 59 |                    'length=16\n????????????????\n'
 60 |         read_id = 'SRR00001'
 61 |         seq = 'ACGTTTGGGAAGGTTT'
 62 |         qual = '????????????????'
 63 |         name = reads._get_4_line_fastq_string(read_id, seq, qual, x=0)
 64 |         self.assertEqual(name, expected)
 65 | 
 66 |     def test_read_num_split(self):
 67 |         reads = self.setup_long_reads(split=True)
 68 |         num_reads = reads._get_num_reads('data/reads/test.fq.gz')
 69 |         self.assertEqual(num_reads, 18)
 70 | 
 71 |     def test_read_len_split(self):
 72 |         reads = self.setup_long_reads(split=True)
 73 |         len_reads = reads._get_read_len('data/reads/test.fq.gz',1000)
 74 |         self.assertEqual(len_reads, 400)
 75 | 
 76 |     def test_read_num_paired(self):
 77 |         reads = self.setup_reads_paired()
 78 |         num_reads = reads._get_num_reads('data/reads/test_1a.fq.gz')
 79 |         self.assertEqual(num_reads, 1000)
 80 | 
 81 |     def test_read_len_paired(self):
 82 |         reads = self.setup_reads_paired()
 83 |         num_reads = reads._get_read_len('data/reads/test_1a.fq.gz', 1000)
 84 |         self.assertEqual(num_reads, 151.0)
 85 | 
 86 |     def test_read_num_by_coverage_paired(self):
 87 |         reads = self.setup_reads_paired(sampling=True)
 88 |         num_reads = reads._get_num_reads_by_coverage(
 89 |             'data/reads/test_1a.fq.gz', 1000)
 90 |         self.assertEqual(num_reads, 34)
 91 | 
 92 |     def test_read_num_by_coverage_split(self):
 93 |         reads = self.setup_long_reads(split=True)
 94 |         num_reads = reads._get_num_reads_by_coverage(['data/reads/test.fq.gz'],1000)
 95 |         self.assertEqual(num_reads, 25)
 96 | 
 97 |     def test_read_vec_paired(self):
 98 |         reads = self.setup_reads_paired(sampling=True)
 99 |         num_reads = reads._get_vector_random_reads(
100 |             'data/reads/test_1a.fq.gz')
101 |         self.assertEqual(len(num_reads), 34)
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     unittest.main()
106 | 


--------------------------------------------------------------------------------
/tests/test_use.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import argparse
 3 | import warnings
 4 | warnings.filterwarnings('ignore')
 5 | from read2tree.Progress import Progress
 6 | from read2tree.stats.Coverage import Coverage
 7 | from read2tree.stats.SeqCompleteness import SeqCompleteness
 8 | import os
 9 | 
10 | class Use(unittest.TestCase):
11 | 
12 |     def test_OGSet(self):
13 | 
14 |     def test_write_progress(self):
15 | 
16 |     def test_read_progress(self):
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     unittest.main()
21 | 


--------------------------------------------------------------------------------