├── .github
└── workflows
│ └── docker-image.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── archive
├── .idea
│ ├── encodings.xml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── libraries
│ │ └── R_User_Library.xml
│ ├── misc.xml
│ ├── modules.xml
│ ├── other.xml
│ ├── pore2tree.iml
│ └── vcs.xml
├── TODO
├── down_py_script.sh
├── r2t_py_script.sh
├── requirements.txt
├── rm_py_script.sh
├── scripts
│ ├── SraRunTable.txt
│ ├── adjust_mapping_names.py
│ ├── clean_fasta_cdna_cds.py
│ ├── concat_alignments.py
│ ├── down_py_script.sh
│ ├── from_assemblies.py
│ ├── get_alignment_similarity.py
│ ├── get_computation_progress.py
│ ├── get_highly_conserved_og_dna_hdf5.py
│ ├── get_og_from_readmapping.py
│ ├── get_reconstructed_seq_by_species.py
│ ├── get_seq_completeness.py
│ ├── get_topological_likelihoods.py
│ ├── lsf_submit.py
│ ├── lsf_submit_mouse.py
│ ├── lsf_submit_paired.py
│ ├── map2align.py
│ ├── map2align_test.py
│ ├── monitor_folder_size.py
│ ├── orthogroups_fasta_to_marker_genes.py
│ ├── orthogroups_fasta_to_marker_genes_by_groups.py
│ ├── protein_converter.py
│ ├── r2t_py_script.sh
│ ├── relabel_msa.py
│ ├── remove_species_from_alignment.py
│ ├── rm_py_script.sh
│ ├── sample_from_reads.py
│ ├── sample_reads.py
│ ├── sge_submit.py
│ ├── sge_submit_paired.py
│ ├── sge_submit_paired_comic.py
│ ├── slurm_submit.py
│ ├── subsample_nextstrain_covid_genomes_with_sra_accession.py
│ ├── treecl
│ │ └── select_alignments.py
│ └── trim_alignment.py
├── set_marker_genes
│ ├── bacteria_markergenes.zip
│ └── mammalia_markergenes.zip
├── tests
│ ├── info.log
│ ├── input.log
│ ├── test_aligner.py
│ ├── test_og.py
│ ├── test_ogset.py
│ ├── test_reads.py
│ ├── test_seqCompleteness.py
│ └── test_use.py
└── wiki_images
│ ├── covid1.jpg
│ ├── covid2.jpg
│ ├── figure1.jpg
│ ├── figure_1sp.jpg
│ ├── oma_page_0.png
│ ├── oma_page_1.png
│ ├── oma_page_2.png
│ ├── oma_page_3.png
│ ├── oma_page_4.png
│ ├── oma_page_5.png
│ ├── oma_page_6.png
│ ├── oma_page_7.png
│ └── oma_page_8.png
├── bin
└── read2tree
├── environment.yml
├── read2tree
├── Aligner.py
├── Analyzer.py
├── FastxReader.py
├── GuidedAssembler.py
├── Mapper.py
├── MultiProcessingLog.py
├── OGSet.py
├── Progress.py
├── Reads.py
├── ReferenceSet.py
├── TreeInference.py
├── __init__.py
├── _utils.py
├── file_utils
│ ├── __init__.py
│ └── context_managers.py
├── logging
│ ├── log.yaml
│ └── log.yaml.bak
├── main.py
├── parser
│ ├── OMAOutputParser.py
│ └── __init__.py
├── stats
│ ├── Coverage.py
│ ├── SeqCompleteness.py
│ └── __init__.py
├── utils
│ ├── __init__.py
│ └── seq_utils.py
└── wrappers
│ ├── __init__.py
│ ├── abstract_cli.py
│ ├── aligners
│ ├── __init__.py
│ ├── base_aligner.py
│ ├── mafft.py
│ ├── muscle.py
│ ├── probcons.py
│ └── prographmsa.py
│ ├── options.py
│ ├── read_mappers
│ ├── __init__.py
│ ├── base_mapper.py
│ ├── ngm.py
│ ├── ngmlr.py
│ └── parser.py
│ └── treebuilders
│ ├── __init__.py
│ ├── base_treebuilder.py
│ ├── fasttree.py
│ ├── iqtree.py
│ ├── parsers.py
│ ├── phyml.py
│ ├── raxml.py
│ └── src
│ └── pip-delete-this-directory.txt
├── setup.py
├── src
└── pip-delete-this-directory.txt
└── tests
├── dna_ref.fa
├── marker_genes
├── OMAGroup_1001241.fa
├── OMAGroup_1008242.fa
├── OMAGroup_1065415.fa
├── OMAGroup_1121053.fa
├── OMAGroup_1125645.fa
├── OMAGroup_1133018.fa
├── OMAGroup_1151179.fa
├── OMAGroup_1163384.fa
├── OMAGroup_1171372.fa
├── OMAGroup_1188079.fa
├── OMAGroup_649157.fa
├── OMAGroup_649216.fa
├── OMAGroup_671579.fa
├── OMAGroup_681083.fa
├── OMAGroup_681195.fa
├── OMAGroup_683078.fa
├── OMAGroup_894224.fa
├── OMAGroup_898327.fa
├── OMAGroup_944789.fa
└── OMAGroup_974829.fa
├── sample_1.fastq
├── sample_2.fastq
├── test_aligner.py
├── test_og.py
├── test_ogset.py
├── test_reads.py
├── test_seqCompleteness.py
└── test_use.py
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
# Run CI for pushes and pull requests against main, and for published releases.
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  release:
    # The activity-type filter key is `types` (plural).
    types: [published]
10 |
11 | env:
12 | TEST_TAG: dessimozlab/read2tree:test
13 |
14 | jobs:
15 |
16 | build:
17 |
18 | runs-on: ubuntu-latest
19 |
20 | steps:
21 | -
22 | name: Checkout
23 | uses: actions/checkout@v2
24 | with:
25 | submodules: recursive
26 |
27 | -
28 | name: Docker meta
29 | id: meta
30 | uses: docker/metadata-action@v4
31 | with:
32 | # list of Docker images to use as base name for tags
33 | images: |
34 | dessimozlab/read2tree
35 | # generate Docker tags based on the following events/attributes
36 | tags: |
37 | type=schedule
38 | type=ref,event=branch
39 | type=ref,event=pr
40 | type=semver,pattern={{version}}
41 | type=semver,pattern={{major}}.{{minor}}
42 | type=semver,pattern={{major}}
43 | type=sha
44 | -
45 | name: Set up QEMU
46 | uses: docker/setup-qemu-action@v2
47 | -
48 | name: Set up Docker Buildx
49 | uses: docker/setup-buildx-action@v2
50 |
51 | -
52 | name: Build and export to docker for testing
53 | uses: docker/build-push-action@v3
54 | with:
55 | context: .
56 | load: true
57 | tags: ${{ env.TEST_TAG }}
58 | -
59 | name: Test
60 | run: |
61 | docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/dna_ref.fa --reads /reads/sample_1.fastq --output_path /out
62 | if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
63 | -
64 | name: Login to DockerHub
65 | if: github.event_name != 'pull_request' && github.event_name != 'push'
66 | uses: docker/login-action@v2
67 | with:
68 | username: ${{ secrets.DOCKER_HUB_USERNAME }}
69 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
70 |
71 | -
72 | name: Build and push
73 | uses: docker/build-push-action@v3
74 | with:
75 | context: .
76 | platforms: linux/amd64
77 | push: ${{ github.event_name != 'push' && github.event_name != 'pull_request' }}
78 | tags: ${{ steps.meta.outputs.tags }}
79 | labels: ${{ steps.meta.outputs.labels }}
80 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | build
3 | dist
4 | read2tree.egg-info
5 | docs/*
6 | tmp/*
7 | .Rhistory
8 | .RData
9 | *pyc
10 | tmp
11 | **/.ropeproject
12 | **/__pycache__
13 | **/.ipynb_checkpoints
14 | .idea/workspace.xml
15 | .python-version
16 | .DS_Store
17 | tests/output
18 | tests/mplog.log
19 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Base image that ships conda.
FROM continuumio/miniconda3

LABEL software="read2tree"

WORKDIR /app

# Copy the environment specification used to create the conda environment.
COPY environment.yml .

RUN apt-get -qq update \
    && apt-get install -y --no-install-recommends \
        wget \
    && rm -rf /var/lib/apt/lists/*

RUN conda env create -f environment.yml

# Make RUN commands use the new environment:
SHELL ["conda", "run", "-n", "read2tree_env", "/bin/bash", "-c"]

# Fail the build early if key packages cannot be imported in the environment.
RUN echo "Make sure numpy is installed:" \
    && python -c "import numpy" \
    && python -c "import ete3" \
    && python -c "import pysam"

COPY . .
RUN python setup.py install

# key=value form; the space-separated `ENV key value` form is legacy syntax.
ENV PATH=/app/read2tree/bin:/opt/conda/envs/read2tree_env/bin:$PATH

WORKDIR /run

ENTRYPOINT ["read2tree"]

CMD ["-h"]
38 |
39 |
40 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 webfucktory
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/archive/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/archive/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/archive/.idea/libraries/R_User_Library.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/archive/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/archive/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/archive/.idea/other.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/archive/.idea/pore2tree.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/archive/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/archive/TODO:
--------------------------------------------------------------------------------
1 | added here something to test windows bash
2 | * add better error handling when mapping doesn't work
3 | * tables needs hdf5 dependencies
4 | * some git repositories are difficult to access
5 | * each time the mapping function finishes it should check whether the currently running mapping is the
6 | last one and if so then once this is done it should delete all unnecessary files
7 |
--------------------------------------------------------------------------------
/archive/down_py_script.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.o%J
#BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.e%J
#BSUB -u david.dylus@unil.ch
#BSUB -J down_GLYSP
#BSUB -n 4
#BSUB -R "span[ptile=4]"
#BSUB -R "rusage[mem=4000]"
#BSUB -M 4000000
# Download one SRA run with Aspera, convert it to split FASTQ files and rename
# the files after the species identifier.
srr=SRR3115005
speciesid=GLYSP
reads_dir=/scratch/beegfs/weekly/ddylus/avian/reads/$speciesid
module add Utility/aspera_connect/3.7.4.147727
source activate r2t
# -p: do not fail when the directory already exists from an earlier attempt.
mkdir -p "$reads_dir"
# Double quotes so that $speciesid is expanded in the message.
echo "Created read $speciesid"
# Stop here rather than download and rename files in the wrong directory.
cd "$reads_dir" || exit 1
ascp -v -QT -k1 -l100M -i /software/Utility/aspera_connect/3.7.4.147727/etc/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/${srr:0:6}/$srr/$srr.sra ./
echo 'Finished download'
parallel-fastq-dump -s *.sra -t 4 -O . --split-files --tmpdir .
echo 'Finished getting fastq from sra and split files'
mv *_1.fastq "${speciesid}_1.fq"
mv *_2.fastq "${speciesid}_2.fq"
echo 'Finished moving files'
--------------------------------------------------------------------------------
/archive/r2t_py_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.o%J
3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.e%J
4 | #BSUB -u david.dylus@unil.ch
5 | #BSUB -J r2t_GLYSP
6 | #BSUB -n 4
7 | #BSUB -R "span[ptile=4]"
8 | #BSUB -R "rusage[mem=10000]"
9 | #BSUB -M 10000000
10 | source activate r2t
11 | reads=/scratch/beegfs/weekly/ddylus/avian/reads/GLYSP
12 | cd /scratch/beegfs/weekly/ddylus/avian/r2t/
13 | python -W ignore ~/opt/read2tree/bin/read2tree --standalone_path /scratch/beegfs/weekly/ddylus/avian/marker_genes/ --dna_reference /scratch/beegfs/weekly/ddylus/avian/eukaryotes.cdna.fa --reads $reads/GLYSP_1.fq $reads/GLYSP_2.fq --output_path /scratch/beegfs/weekly/ddylus/avian/r2t/ --single_mapping /scratch/beegfs/weekly/ddylus/avian/r2t/02_ref_dna/MELGA_OGs.fa --threads 4 --min_species 8
--------------------------------------------------------------------------------
/archive/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython
2 | numpy>=1.13.3
3 | scipy
4 | lxml
5 | pandas
6 | Cython
7 | ete3>=3.0.0b35
8 | pyparsing>=2.1.10
9 | pysam>=0.11.2.2
10 | six>=1.10.0
11 | requests>=2.13.0
12 | dendropy>=4.3.0
13 | tqdm>=4.19.1
14 | pyham
15 | pyyaml
16 | multiprocessing_logging
17 |
--------------------------------------------------------------------------------
/archive/rm_py_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.o%J
3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.e%J
4 | #BSUB -u david.dylus@unil.ch
5 | #BSUB -J rm_GLYSP
6 | #BSUB -n 1
7 | #BSUB -R "span[ptile=1]"
8 | #BSUB -R "rusage[mem=1000]"
9 | #BSUB -M 1000000
10 | rm -r /scratch/beegfs/weekly/ddylus/avian/reads/GLYSP
--------------------------------------------------------------------------------
/archive/scripts/adjust_mapping_names.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import shutil
3 | import os
4 | import getopt
5 | import glob
6 |
7 | from Bio import SeqIO
8 | from tables import *
9 |
def main():
    """Rename mapping outputs to the ``SPECIES_OGs_*`` naming scheme.

    Reads ``-m/--mapping_folder`` from the command line. Every ``*.fa`` file in
    that folder whose name lacks ``_OGs`` is renamed to
    ``SPECIES_OGs_consensus.fa`` and every ``*cov.txt`` file to
    ``SPECIES_OGs_cov.txt``, where SPECIES is the part of the file name before
    the first underscore. Exits with status 2 on an unknown option.
    """
    usage = 'adjust_mapping_names.py -m MAPPING_FOLDER'
    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:r:h", ["mapping_folder=", "reference_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # Compare by value; an empty argument is left alone so that it
            # cannot turn into "/" and trigger renames in the root directory.
            if mapping_folder and not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-r", "--reference_folder"):
            # Declared in the option spec but not needed for renaming.
            continue
        else:
            assert False, "unhandled option"

    if mapping_folder:
        renames = (("*.fa", "_OGs_consensus.fa"), ("*cov.txt", "_OGs_cov.txt"))
        for pattern, suffix in renames:
            for file in glob.glob(os.path.join(mapping_folder, pattern)):
                base = os.path.basename(file)
                if "_OGs" not in base:
                    species_name = base.split("_")[0]
                    shutil.move(file, os.path.join(mapping_folder, species_name + suffix))
43 |
44 | if __name__ == "__main__":
45 | main()
46 |
--------------------------------------------------------------------------------
/archive/scripts/clean_fasta_cdna_cds.py:
--------------------------------------------------------------------------------
1 |
2 | from Bio import SeqIO
3 | from Bio.Seq import Seq
4 | import sys
5 | from os import listdir
6 | import os
7 |
8 |
9 |
10 |
def read_fasta_files(input_folder, format_input="fna"):
    """Parse every FASTA file in *input_folder* that has the given extension.

    Args:
        input_folder: Directory containing the sequence files.
        format_input: File extension (without the dot) selecting the files.

    Returns:
        A ``(file_names, records_all)`` tuple: the selected file names and, in
        the same order, the list of records parsed from each of them.
    """
    file_names = []
    records_all = []
    # listdir() order is arbitrary; sorting keeps per-file numbering reproducible.
    for file in sorted(listdir(input_folder)):
        # os.path.join works whether or not input_folder ends with a separator.
        path = os.path.join(input_folder, file)
        if file.split(".")[-1] == format_input:
            file_names.append(file)
            records_all.append(list(SeqIO.parse(path, "fasta")))
        else:
            print("we are not reading the file " + str(path) + " since extension is not " + format_input + ".")
    if records_all:
        print("there are ", len(file_names), format_input, " files, and the first file has ", len(records_all[0]), "sequences in it.")
    else:
        print("there is no " + format_input, " files in ", input_folder)
    return file_names, records_all
29 |
30 |
def create_five_letter(file_names, output_five_letter_tsv = "clean_five_letter_species.tsv"):
    """Assign a code ``s0000``, ``s0001``, ... to each file name and save the mapping.

    Args:
        file_names: File names, in the order the codes are to be assigned.
        output_five_letter_tsv: Path of the tab-separated file to write, one
            ``file_name<TAB>code`` line per entry.

    Returns:
        Dict mapping each file name to its code.
    """
    fiveLetter_species_dic = {
        file_name: "s" + str(countr).zfill(4)
        for countr, file_name in enumerate(file_names)
    }
    # The context manager closes the file even if a write fails.
    with open(output_five_letter_tsv, "w") as file_out:
        for species_name, fiveLetter in fiveLetter_species_dic.items():
            file_out.write(species_name + "\t" + fiveLetter + "\n")
    print("the five letter codes for each faa files are written in "+output_five_letter_tsv)

    return fiveLetter_species_dic
46 |
47 |
48 |
def clean_translate(records ,species_fivelet):
    """Build cleaned nucleotide and protein records for one species.

    Sequences whose length is not a multiple of three are padded with ``N``
    (the padded sequence is also stored back on the input record). Each new id
    is *species_fivelet* followed by the old id with ``_`` and ``.`` removed.

    Returns:
        A ``(records_nuc, records_aa)`` tuple of lists in input order.
    """
    cleaned_nuc, cleaned_aa = [], []
    for rec in records:
        seq = rec.seq
        overhang = len(seq) % 3
        if overhang != 0:
            seq += Seq('N' * (3 - overhang))
            rec.seq = seq

        stripped_id = str(rec.id).replace("_", "").replace(".", "")
        new_id = species_fivelet + stripped_id

        cleaned_nuc.append(
            SeqIO.SeqRecord(seq, id=new_id, description="cleaned for r2t", name=new_id)
        )
        cleaned_aa.append(
            SeqIO.SeqRecord(seq.translate(), id=new_id, description="cleaned for r2t", name=new_id)
        )

    print("the clean aa and nuc for "+species_fivelet+" is ready")

    return cleaned_nuc, cleaned_aa
75 |
76 |
77 |
78 |
79 |
if __name__ == '__main__':
    # Usage: clean_fasta_cdna_cds.py INPUT_FOLDER
    # Reads every .fna file in INPUT_FOLDER, writes one translated FASTA per
    # species into clean_aa/ and all cleaned nucleotide records into dna_ref.fa.

    input_folder_fna = sys.argv[1] + "/"  # "myfolder/input_fna/" #

    file_names, records_all = read_fasta_files(input_folder_fna, "fna")
    fiveLetter_species_dic = create_five_letter(file_names)

    folder_aa = "clean_aa"

    if not os.path.exists(folder_aa):
        os.makedirs(folder_aa)
    else:
        # Only a warning: existing files in the folder are reused/overwritten.
        print("ERROR the folder exists "+folder_aa +" better to remove it ")

    records_nuc_all_clean = []
    for file_name, records in zip(file_names, records_all):
        species_fivelet = fiveLetter_species_dic[file_name]

        records_nuc, records_aa = clean_translate(records, species_fivelet)

        SeqIO.write(records_aa, folder_aa+"/"+species_fivelet+".fa", "fasta")

        records_nuc_all_clean += records_nuc  # one big list

    SeqIO.write(records_nuc_all_clean, "dna_ref.fa", "fasta")

    print("we wrote "+str(len(file_names))+" faa files in the folder "+folder_aa+" and the nucleotide sequences all together in dna_ref.fa")

    print("Now you can use the folder with OMA standalone")
114 |
--------------------------------------------------------------------------------
/archive/scripts/concat_alignments.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import getopt
4 | import glob
5 | from Bio import AlignIO, SeqIO
6 |
7 | from zoo.seq_utils.utils import concatenate
8 |
def concatenate_alignments(folder, min_taxa=0):
    """Concatenate the non-empty ``*.phy`` alignments found in *folder*.

    Args:
        folder: Directory holding the alignment files.
        min_taxa: Minimum number of sequences an alignment must contain to be
            included.

    Returns:
        The result of ``concatenate`` over the selected alignments, or ``None``
        when no alignment qualified.
    """
    all_og_alignments = []
    # os.path.join keeps the pattern correct with or without a trailing slash.
    for f in glob.glob(os.path.join(folder, '*.phy')):
        if os.path.getsize(f) == 0:
            continue
        try:
            msa = AlignIO.read(f, "phylip-relaxed")
        except ValueError:
            # Files that do not parse as relaxed PHYLIP are retried as FASTA.
            msa = AlignIO.read(f, "fasta")
        if len(msa) >= min_taxa:
            print(f)
            all_og_alignments.append(msa)
    # Count across all files (the old `=+ 1` inside the loop never accumulated).
    used_ogs = len(all_og_alignments)
    print('OGs used: {}!'.format(used_ogs))
    if not all_og_alignments:
        # The caller treats None as "nothing to write".
        return None
    return concatenate(all_og_alignments)
36 |
37 |
def main():
    """Command-line entry point: concatenate alignments and write the result.

    Options: ``-f/--folder`` (alignment folder), ``-o/--out_file`` (output
    path) and ``-m/--min_taxa`` (minimum sequences per alignment, default 0).
    When ``min_taxa`` is positive, ``_<min_taxa>.phy`` is appended to the
    output path. Exits with status 2 on bad or missing arguments.
    """
    usage = 'concat_alignments.py -f FOLDER -m MIN_TAXA -o OUT_FILE'
    try:
        # "h" is declared so that the -h branch below is reachable.
        opts, args = getopt.getopt(sys.argv[1:], "f:m:o:h", ["folder=", "min_taxa=", "out_file="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    seq_folder = None
    out_file = None
    min_taxa = 0

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-f", "--folder"):
            seq_folder = arg
        elif opt in ("-o", "--out_file"):
            out_file = arg
        elif opt in ("-m", "--min_taxa"):
            min_taxa = int(arg)
        else:
            assert False, "unhandled option"

    # Both paths are required; report that instead of failing on None below.
    if not seq_folder or not out_file:
        print(usage)
        sys.exit(2)

    if not seq_folder.endswith("/"):
        seq_folder += "/"

    if min_taxa > 0:
        out_file = out_file+"_"+str(min_taxa)+".phy"

    alignment = concatenate_alignments(seq_folder, min_taxa=min_taxa)
    if alignment is not None:
        with open(out_file, "w") as align_output:
            AlignIO.write(alignment, align_output, "phylip-relaxed")
77 |
78 | if __name__ == "__main__":
79 | main()
80 |
--------------------------------------------------------------------------------
/archive/scripts/down_py_script.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.o%J
#BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.e%J
#BSUB -u david.dylus@unil.ch
#BSUB -J down_GLYSP
#BSUB -n 1
#BSUB -R "span[ptile=1]"
#BSUB -R "rusage[mem=2000]"
#BSUB -M 2000000
# Download one SRA run with Aspera, dump it to gzipped split FASTQ files and
# rename the files after the species identifier.
srr=SRR3115005
speciesid=GLYSP
reads_dir=/scratch/beegfs/weekly/ddylus/avian/reads/$speciesid
module add Utility/aspera_connect/3.7.4.147727
module add UHTS/Analysis/sratoolkit/2.8.2.1
source activate r2t
# -p: do not fail when the directory already exists from an earlier attempt.
mkdir -p "$reads_dir"
# Double quotes so that $speciesid is expanded in the message.
echo "Created read $speciesid"
# Stop here rather than download and rename files in the wrong directory.
cd "$reads_dir" || exit 1
ascp -v -QT -k1 -l100M -i /software/Utility/aspera_connect/3.7.4.147727/etc/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/${srr:0:6}/$srr/$srr.sra ./
echo 'Finished download'
fastq-dump --split-files --gzip "$srr.sra"
echo 'Finished getting fastq from sra and split files'
mv *_1.* "${speciesid}_1.fq.gz"
mv *_2.* "${speciesid}_2.fq.gz"
echo 'Finished moving files'
--------------------------------------------------------------------------------
/archive/scripts/get_highly_conserved_og_dna_hdf5.py:
--------------------------------------------------------------------------------
1 | from tables import *
2 | from Bio import SeqIO
3 | from Bio.Seq import Seq
4 | from Bio.SeqRecord import SeqRecord
5 | from pyoma.browser import db
6 | import familyanalyzer as fa
7 |
8 | # parameters
9 | MIN_SPECIES = 20
10 | DUP_RATIO = 0
11 | DIR = '/Users/daviddylus/Research/read2tree/reference_datasets/Dataset1/Output/'
12 |
13 | # read in files
14 | hog_XML = DIR+'HierarchicalGroups.orthoxml'
15 | og_XML = DIR+'OrthologousGroups.orthoxml'
16 | h5file = open_file("/Volumes/Untitled/OmaServer.h5", mode="r")
17 |
18 | genomeTab = h5file.root.Genome
19 | dbObj = db.Database(h5file)
20 | omaIdObj = db.OmaIdMapper(dbObj)
21 |
22 | if DUP_RATIO != 0:
23 | hog_op = fa.OrthoXMLParser(hog_XML)
24 | gene_family_xml_nodes_hog = hog_op.getToplevelGroups()
25 | # select all the families with more than X species and duplication ratio smaller than Y
26 | hog_families_X = {}
27 | for i, family in enumerate(gene_family_xml_nodes_hog):
28 | family_id = family.get('id')
29 | genes_per_hog = [val for sublist in hog_op.getGenesPerSpeciesInFam(family).values() for val in sublist]
30 | species_per_hog = hog_op.getGenesPerSpeciesInFam(family).keys()
31 | duplication_ratio = float(len(genes_per_hog)) / float(len(species_per_hog))
32 | if len(species_per_hog) >= MIN_SPECIES and duplication_ratio <= DUP_RATIO:
33 | hog_families_X[family_id] = genes_per_hog
34 |
35 | print(len(hog_families_X))
36 |
37 |
38 | og_op = fa.OrthoXMLParser(og_XML)
39 | gene_family_xml_nodes_og = og_op.getToplevelGroups()
40 | og_families_X = {}
41 | for i, family in enumerate(gene_family_xml_nodes_og):
42 | family_id = family.get('id')
43 | genes_per_og = [val for sublist in og_op.getGenesPerSpeciesInFam(family).values() for val in sublist]
44 | species_per_og = og_op.getGenesPerSpeciesInFam(family).keys()
45 | if len(species_per_og) >= MIN_SPECIES:
46 | og_families_X[family_id] = genes_per_og
47 |
48 | print(len(og_families_X))
49 |
50 | if DUP_RATIO != 0:
51 | family_map = {}
52 | entries_map_omaids = {}
53 | cpt = 0
54 | for og in og_families_X:
55 | cpt += 1
56 | if cpt % 10 == 0:
57 | print("{} on {}".format(cpt, len(og_families_X)))
58 | a = og_families_X[og]
59 | for hog in hog_families_X:
60 | b = hog_families_X[hog]
61 | if len(set(a).intersection(b)) == 30:
62 | oma_ids_full = [og_op.mapGeneToXRef(val, 'protId') for val in og_families_X[og]]
63 | oma_ids = [og_op.mapGeneToXRef(val, 'protId').split(' | ')[0] for val in og_families_X[og]]
64 | entries = [omaIdObj.omaid_to_entry_nr(val) for val in oma_ids]
65 | for oma_id in oma_ids_full:
66 | entries_map_omaids[omaIdObj.omaid_to_entry_nr(oma_id.split(' | ')[0])] = oma_id
67 | family_map[og] = entries
68 | break
69 | print(len(entries_map_omaids))
70 | else:
71 | family_map = {}
72 | entries_map_omaids = {}
73 | cpt = 0
74 | for og in og_families_X:
75 | cpt += 1
76 | if cpt % 1000 == 0:
77 | print(og_op.mapGeneToXRef(og_families_X[og][0], 'protId').split(' | ')[0])
78 | print("{} on {}".format(cpt, len(og_families_X)))
79 | oma_ids_full = [og_op.mapGeneToXRef(val, 'protId') for val in og_families_X[og] if og_op.mapGeneToXRef(val, 'protId')]
80 | oma_ids = [val.split(' | ')[0] for val in oma_ids_full]
81 | entries = [omaIdObj.omaid_to_entry_nr(val) for val in oma_ids if omaIdObj.omaid_to_entry_nr(val)]
82 | print(entries)
83 | for oma_id in oma_ids_full:
84 | entries_map_omaids[omaIdObj.omaid_to_entry_nr(oma_id.split(' | ')[0])] = oma_id
85 | family_map[og] = entries
86 | print(len(entries_map_omaids))
87 |
88 |
89 | family_map_invert = {}
90 | for key in family_map:
91 | for val in family_map[key]:
92 | family_map_invert[val]=key
93 |
94 | print(len(family_map_invert))
95 |
96 | records = []
97 | for key in family_map_invert:
98 | new_id = entries_map_omaids[key] + '| OG' + family_map_invert[key]
99 | record = SeqRecord(Seq(dbObj.get_cdna(key)), id=new_id, description="")
100 | records.append(record)
101 |
102 | with open("dataset2.fasta", "w") as output_handle:
103 | SeqIO.write(records, output_handle, "fasta")
104 |
105 |
--------------------------------------------------------------------------------
/archive/scripts/get_og_from_readmapping.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import getopt
3 | from Bio import SeqIO
4 | from Bio.SeqIO import FastaIO
5 |
def get_ogs(mapped_reads, og_data, read_prefix="SRR400661_"):
    """Collect, per orthologous group, its reference records plus a mapped read.

    The OG label of a reference record is the text after the last ``"| "`` in
    its description. A mapped record is matched to the first reference record
    with the same ``name``.

    Args:
        mapped_reads: Records reconstructed from read mapping. Matching
            records are modified in place: the sequence is upper-cased and
            ``id``, ``name`` and ``description`` get *read_prefix* prepended.
        og_data: Reference records.
        read_prefix: Prefix marking mapped records; defaults to the run
            accession that used to be hard-coded.

    Returns:
        Dict mapping OG label to the list of its reference records followed by
        the mapped record. If several mapped records hit the same OG, the last
        one wins.
    """
    og_of = [record.description.split("| ")[-1] for record in og_data]
    # Index once up front instead of scanning the reference list per read.
    first_index = {}
    members = {}
    for i, record in enumerate(og_data):
        first_index.setdefault(record.name, i)
        members.setdefault(og_of[i], []).append(record)

    list_of_ogs = {}
    for record in mapped_reads:
        og_index = first_index.get(record.name)
        if og_index is None:
            continue
        og_name = og_of[og_index]
        seq_to_write = list(members[og_name])
        record.seq = record.seq.upper()
        record.id = read_prefix + record.id
        record.name = read_prefix + record.name
        record.description = read_prefix + record.description
        seq_to_write.append(record)
        list_of_ogs[og_name] = seq_to_write
    return list_of_ogs
23 |
24 |
25 |
def main():
    """Command-line entry point: write one FASTA file per orthologous group.

    Options: ``-r/--mapped_reads`` (FASTA of mapped sequences),
    ``-d/--ref_data`` (reference FASTA) and ``-o/--out_folder`` (output
    directory). Exits with status 2 on bad or missing arguments.
    """
    usage = 'get_og_from_readmapping.py -r MAPPED_READS -d REF_DATA -o OUT_FOLDER'
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:d:o:h", ["mapped_reads=", "ref_data=", "out_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapped_reads = None
    ref_data = None
    out_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        # Match the long option that is actually declared above.
        elif opt in ("-r", "--mapped_reads"):
            mapped_reads = arg
        elif opt in ("-d", "--ref_data"):
            ref_data = arg
        elif opt in ("-o", "--out_folder"):
            out_folder = arg
        else:
            assert False, "unhandled option"

    # All three paths are required; report that instead of failing on None.
    if not mapped_reads or not ref_data or not out_folder:
        print(usage)
        sys.exit(2)

    read_mappings = list(SeqIO.parse(mapped_reads, "fasta"))
    og_data = list(SeqIO.parse(ref_data, "fasta"))

    if not out_folder.endswith("/"):
        out_folder += "/"

    list_of_ogs = get_ogs(read_mappings, og_data)
    if list_of_ogs is not None:
        for og in list_of_ogs:
            file_name = out_folder + og + ".fasta"
            # Close each output file as soon as it has been written.
            with open(file_name, "w") as handle:
                fasta_out = FastaIO.FastaWriter(handle, wrap=None)
                fasta_out.write_file(list_of_ogs[og])
64 |
65 |
66 | if __name__ == "__main__":
67 | main()
--------------------------------------------------------------------------------
/archive/scripts/get_reconstructed_seq_by_species.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import getopt
4 | import glob
5 | import pandas as pd
6 |
7 | from Bio import SeqIO
8 | from tables import *
9 | from Bio.SeqIO.FastaIO import FastaWriter
10 |
11 |
def read_seq_records(folder):
    """Load every ``*.fa`` file in *folder* into a nested dict.

    Returns:
        Dict keyed by the part of each file name before the first underscore;
        each value maps record id to the parsed record.
    """
    by_species = {}
    pattern = os.path.join(folder, "*.fa")
    for path in glob.glob(pattern):
        species = os.path.basename(path).split("_")[0]
        by_id = {}
        for rec in SeqIO.parse(path, "fasta"):
            by_id[rec.id] = rec
        by_species[species] = by_id
    return by_species
18 |
def read_sc_file(file):
    """Read a CSV with ``gene_id`` and ``og`` columns and build one id per row.

    Each id is ``gene_id``, ``og`` and ``og`` again, joined by underscores.
    NOTE(review): the OG label appears twice in the id; confirm this matches
    the record ids in the mapping files.
    """
    table = pd.read_csv(file)
    ids = []
    for _, row in table.iterrows():
        gene = row['gene_id']
        og = row['og']
        ids.append(gene + "_" + og + "_" + og)
    return ids
22 |
23 |
def main():
    """Command-line entry point: write the selected reconstructed sequences.

    Options: ``-m/--mapping_folder`` (folder with per-species ``*.fa`` files)
    and ``-s/--sc_file`` (CSV listing the sequences to keep). The output file
    is named after the part of the mapping folder path following
    ``03_mapping_`` plus ``_consensus.fa`` and is written to the current
    directory. Exits with status 2 on bad or missing arguments.
    """
    usage = 'get_reconstructed_seq_by_species.py -m MAPPING_FOLDER -s SC_FILE'
    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:s:h", ["mapping_folder=", "sc_file="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    sc_file = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # Compare by value, not identity.
            if mapping_folder and not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-s", "--sc_file"):
            sc_file = arg
        else:
            assert False, "unhandled option"

    # Both inputs are required; report that instead of failing on None below.
    if not mapping_folder or not sc_file:
        print(usage)
        sys.exit(2)

    all_records = read_seq_records(mapping_folder)
    # The first five characters of an id select the species dictionary.
    selected_seq = [all_records[idx[0:5]][idx] for idx in read_sc_file(sc_file)]
    print(selected_seq)
    file_name = mapping_folder.split("03_mapping_")[-1].split("/")[0]+"_consensus.fa"
    with open(file_name, "w") as handleF:
        writer = FastaWriter(handleF, wrap=None)
        writer.write_file(selected_seq)
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/archive/scripts/get_seq_completeness.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import getopt
4 | import glob
5 |
6 | from Bio import SeqIO
7 | from tables import *
8 | from read2tree.stats.SeqCompleteness import SeqCompleteness
9 |
10 |
def read_seq_records(file):
    """Parse *file* as FASTA and return all of its records as a list."""
    records = SeqIO.parse(file, "fasta")
    return list(records)
13 |
14 |
def main():
    """Compute sequence completeness for every reference FASTA file.

    Command-line options:
      -m / --mapping_folder    folder holding the ``*_consensus.fa`` files
      -r / --reference_folder  folder holding the reference ``*.fa`` files
      -h                       print the usage line and exit

    For each reference file that has a matching consensus file, a
    ``<species>_OGs_sc.txt`` file is written into the mapping folder.
    Nothing is done unless both folders are given.
    """
    usage = 'get_seq_completeness.py -m  -r '
    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:r:h", ["mapping_folder=", "reference_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    reference_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # compare by value: "is not" on a string literal tests identity
            if not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-r", "--reference_folder"):
            reference_folder = arg
            if not reference_folder.endswith("/"):
                reference_folder += "/"
        else:
            assert False, "unhandled option"

    if reference_folder and mapping_folder:
        for file in glob.glob(os.path.join(reference_folder, "*.fa")):
            species = os.path.basename(file).split("_")[0]
            print(species)
            ref_records = read_seq_records(file)
            mapping_file = os.path.join(mapping_folder, os.path.basename(file).split(".")[0]+"_consensus.fa")
            # references without a consensus file are skipped
            if os.path.exists(mapping_file):
                map_records = read_seq_records(mapping_file)
                seqC = SeqCompleteness(ref_records)
                seqC.get_seq_completeness(map_records)
                seqC.write_seq_completeness(os.path.join(mapping_folder, species + "_OGs_sc.txt"))


if __name__ == "__main__":
    main()
55 |
--------------------------------------------------------------------------------
/archive/scripts/map2align_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import getopt
4 | import glob
5 |
6 | from Bio import SeqIO
7 | from zoo.wrappers.treebuilders import Fasttree
8 | from tables import *
9 | from Bio import AlignIO
10 | from zoo.wrappers.aligners import Mafft
11 | from Bio.SeqIO import FastaIO
12 |
13 |
14 | from zoo.seq_utils.utils import concatenate
15 |
16 |
def get_coverage(og):
    """Return the fraction of non-``X`` characters in the last record of *og*."""
    sequence = og[-1].seq
    total = len(sequence)
    unknown = sequence.count('X')
    return (total - unknown) / total
19 |
def perform_mapping(DIR_MAPPING, FILE_OGS):
    """Collect the orthologous groups that have a mapping file.

    Records of *FILE_OGS* are grouped by the last `` | ``-separated field
    of their description (one record per id and group). For every ``*.fa``
    file in *DIR_MAPPING* the matching group is selected, its record ids
    are cut to their first five characters, and the group is written to
    ``<DIR_MAPPING>origin_og/<group>.fa``.

    :param DIR_MAPPING: mapping folder, expected to end with ``/``
    :param FILE_OGS: FASTA file with the group sequences
    :return: dict mapping group name to its list of records
    :raises KeyError: if a mapping file names a group absent from *FILE_OGS*
    """
    og_dict = {}
    for record in SeqIO.parse(FILE_OGS, "fasta"):
        key = record.description.split(" | ")[-1]
        members = og_dict.setdefault(key, [])
        if all(record.id != rec.id for rec in members):
            members.append(record)

    # parse the mapped reads to ogs to dictionary
    all_dict = {}
    for file in glob.glob(DIR_MAPPING + "*.fa"):
        # use the file name only, so underscores or dots in the folder
        # path cannot leak into the group name
        og_name = os.path.basename(file).split("_")[-1].split(".")[0]
        og = og_dict[og_name]

        # change ids to species names
        for record in og:
            record.id = record.id[0:5]
        all_dict[og_name] = og

    OG_OUT = DIR_MAPPING + 'origin_og/'
    os.makedirs(OG_OUT, exist_ok=True)

    for key, item in all_dict.items():
        file_name = OG_OUT + key + ".fa"
        # the context manager closes every output file, even on error
        with open(file_name, "w") as handle:
            FastaIO.FastaWriter(handle, wrap=None).write_file(item)

    print("FINISHED PARSING OGs!")
    return all_dict
58 |
def read_alignments(folder):
    """Read every ``*.phy`` file whose path starts with *folder*.

    :param folder: path prefix; ``*.phy`` is appended to it without a separator
    :return: list of alignments parsed as relaxed PHYLIP
    """
    alignments = [AlignIO.read(path, "phylip-relaxed")
                  for path in glob.glob(folder + "*.phy")]
    print("FINISHED READING ALIGNMENTS!")
    return alignments
66 |
def perform_alignment(all_dict, DIR_MAPPING):
    """Align every group with MAFFT and write the alignments as PHYLIP.

    Each group is aligned with the ``--localpair`` option and
    ``--maxiterate`` set to 1000; the result is written to
    ``<DIR_MAPPING>origin_align/<group>.phy``.

    :param all_dict: dict mapping group name to its sequence records
    :param DIR_MAPPING: mapping folder, expected to end with ``/``
    :return: list of alignments in the iteration order of *all_dict*
    """
    align_dict = {}
    align_list = []
    for counter, (key, value) in enumerate(all_dict.items(), start=1):
        mafft_wrapper = Mafft(value, datatype="PROTEIN")
        mafft_wrapper.options.options['--localpair'].set_value(True)
        mafft_wrapper.options.options['--maxiterate'].set_value(1000)
        alignment = mafft_wrapper()
        align_dict[key] = alignment
        align_list.append(alignment)
        if counter % 50 == 0:
            print('{} of {} alignments done'.format(counter, len(all_dict)))

    ALIGN_OUT = DIR_MAPPING + 'origin_align/'
    os.makedirs(ALIGN_OUT, exist_ok=True)
    print("WRITING ALIGNMENT FILES INTO: {}!".format(ALIGN_OUT))
    for key, value in align_dict.items():
        # close each output file instead of leaking one handle per group
        with open(ALIGN_OUT + key + ".phy", "w") as output_handle:
            AlignIO.write(value, output_handle, "phylip")
    print("FINISHED ALIGNMENTS!")
    return align_list
92 |
def concatenate_alignment(align_list, DIR_MAPPING):
    """Concatenate the alignments and write them as one PHYLIP file.

    The result is written to ``<DIR_MAPPING>origin_align/CONCAT.phy``.

    :param align_list: alignments to concatenate
    :param DIR_MAPPING: mapping folder, expected to end with ``/``
    :return: the concatenated alignment
    """
    ALIGN_OUT = DIR_MAPPING + 'origin_align/'
    concat_align = concatenate(align_list)

    # the folder is missing when the alignments were read from disk rather
    # than produced by perform_alignment, so create it here as well
    os.makedirs(ALIGN_OUT, exist_ok=True)
    with open(ALIGN_OUT + "CONCAT.phy", "w") as output_handle:
        AlignIO.write(concat_align, output_handle, "phylip")
    print("FINISHED CONCATINATION!")
    return concat_align
101 |
def build_tree(concat_align, DIR_MAPPING):
    """Infer a tree with the Fasttree wrapper and save it.

    The tree, followed by ``;``, is written to
    ``<DIR_MAPPING>original_tree.nwk`` and also printed.

    :param concat_align: alignment passed to the Fasttree wrapper
    :param DIR_MAPPING: mapping folder, expected to end with ``/``
    :return: the value returned by the Fasttree wrapper
    """
    tree = Fasttree(concat_align, datatype="PROTEIN")()
    print("FINISHED TREE INFERENCE!")
    tree_path = DIR_MAPPING+"original_tree.nwk"
    with open(tree_path, "w") as text_file:
        newick = "{};".format(tree)
        text_file.write(newick)
    print("Resulting tree: {}".format(tree))
    return tree
111 |
def main():
    """Build a tree from the original orthologous group sequences.

    Command-line options:
      -m / --mapping_folder    mapping folder (required; outputs go here)
      -o / --ortholog_file     FASTA file with the group sequences
      -a / --alignmnet_folder  folder with existing ``*.phy`` alignments
      -h                       print the usage line and exit

    With ``-a`` the alignments are read from disk; otherwise the groups
    are parsed from ``-o`` and aligned. The alignments are concatenated
    and a tree is inferred. Exits with status 2 on a bad or incomplete
    command line.
    """
    usage = 'map2align_test.py -m  -o  -a '
    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:o:a:t:h", ["mapping_folder=", "ortholog_file=", "alignmnet_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    ortholog_file = None
    alignment_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
            # compare by value: "is not" on a string literal tests identity
            if not mapping_folder.endswith("/"):
                mapping_folder += "/"
        elif opt in ("-a", "--alignmnet_folder"):
            alignment_folder = arg
            if not alignment_folder.endswith("/"):
                alignment_folder += "/"
        elif opt in ("-o", "--ortholog_file", "--ortholog_folder"):
            # getopt is told about "--ortholog_file"; the old spelling is
            # still matched for completeness
            ortholog_file = arg
        else:
            assert False, "unhandled option"

    # the mapping folder is always needed; the ortholog file is needed
    # whenever no alignment folder is given
    if mapping_folder is None or (not alignment_folder and ortholog_file is None):
        print(usage)
        sys.exit(2)

    if alignment_folder:
        align = read_alignments(alignment_folder)
    else:
        mapping = perform_mapping(mapping_folder, ortholog_file)
        align = perform_alignment(mapping, mapping_folder)

    concatenation = concatenate_alignment(align, mapping_folder)
    build_tree(concatenation, mapping_folder)


if __name__ == "__main__":
    main()
155 |
--------------------------------------------------------------------------------
/archive/scripts/monitor_folder_size.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import pandas as pd
4 | import subprocess
5 |
6 |
def output_shell(line):
    """
    Run a shell line (pipes allowed) and return what it wrote to stdout
    taken from: https://stackoverflow.com/questions/7389662/link-several-popen-commands-with-pipes
    :param line: command line handed to the shell
    :return: captured stdout as bytes, or None if the process could not be
             started or finished with a non-zero return code
    """
    try:
        process = subprocess.Popen(line, shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
    except (OSError, ValueError):
        return None

    stdout_data, _stderr_data = process.communicate()
    process.wait()
    if process.returncode == 0:
        return stdout_data

    print("Shell command failed to execute")
    return None
28 |
def du(path):
    """Return the first field that ``du -sh`` prints for *path*, as text."""
    raw = subprocess.check_output(['du', '-sh', path])
    first_field = raw.split()[0]
    return first_field.decode('utf-8')
31 |
def bjobs():
    """Count the lines of the ``bjobs`` listing that contain ``RUN``.

    :return: whatever output_shell returns for the pipeline
    """
    count_running = "bjobs | grep -c 'RUN'"
    return output_shell(count_running)
34 |
35 |
if __name__ == "__main__":
    # Poll the folder size and the number of running jobs every
    # time_interval seconds and append one CSV row per poll to
    # ./monitoring.csv until the script is interrupted.
    path = sys.argv[1] if len(sys.argv) > 1 else '.'

    current_time = 0
    time_interval = 10

    with open('./monitoring.csv', 'a') as file:
        # append mode starts at the end of the file, so position 0 means the
        # file is new or empty; only then is the header written
        if file.tell() == 0:
            file.write('current_time,folder_size,num_bjobs\n')
        while True:
            folder_size = du(path)
            raw_jobs = bjobs()
            # bjobs() gives bytes (or None on failure); decode so the CSV
            # holds the plain count instead of a bytes repr
            number_jobs = raw_jobs.decode('utf-8').strip() if raw_jobs is not None else None
            file.write('{},{},{}\n'.format(current_time, folder_size, number_jobs))
            # flush so the rows are visible while the monitor is running
            file.flush()
            current_time += time_interval
            time.sleep(time_interval)
--------------------------------------------------------------------------------
/archive/scripts/orthogroups_fasta_to_marker_genes.py:
--------------------------------------------------------------------------------
1 | from Bio.SeqIO.FastaIO import FastaWriter
2 | from Bio import SeqIO
3 | import tqdm, os, glob, re
4 | from xml.dom import minidom
5 |
6 |
7 |
8 | def _find_index_substring(ids, search_string, tmp_list):
9 | best_index = None
10 | max_occurence = 0
11 | tmp_ids = [re.sub(r'\..*', '', tmp) for tmp in tmp_list]
12 | use_ids = [re.sub(r'\W+', '', tmp_id) for tmp_id in tmp_ids]
13 | index = [i for i, s in enumerate(ids) if search_string in s]
14 | for i in index:
15 | string_occurence = len([k for k in use_ids if k in ids[i]])
16 | if string_occurence > max_occurence:
17 | best_index = i
18 | max_occurence = string_occurence
19 | if best_index:
20 | return best_index
21 | else:
22 | return None
23 |
24 |
def _get_all_ids(f_orthoxml):
    """Collect the ``protId`` attribute of every ``gene`` element.

    :param f_orthoxml: XML file parsed with minidom
    :return: list of ``protId`` values in document order
    """
    genes = minidom.parse(f_orthoxml).getElementsByTagName('gene')
    print(" --- loading all protids ---")
    return [gene.attributes['protId'].value for gene in tqdm.tqdm(genes)]
34 |
35 |
def _write(file, value):
    """
    Write output to fasta file
    :param file: file and location of outputfile
    :param value: sequence records to write
    :return: None
    """
    # the context manager closes the file even if writing fails
    with open(file, "w") as handle:
        writer = FastaWriter(handle, wrap=None)
        writer.write_file(value)
47 |
48 |
49 | def _get_species_id(record):
50 | if '[' in record.description and ']' in record.description:
51 | return record.description[record.description.find(
52 | "[")+1:record.description.find("]")]
53 | else:
54 | return record.id[0:5]
55 |
def run(orthogroups_fasta_folder, orthogroups_xml, output_path, min_species):
    """Rewrite the group FASTA files with ids taken from the XML file.

    Every ``*.fa`` file of *orthogroups_fasta_folder* with at least
    *min_species* records is copied to *output_path*. Records whose first
    description token does not contain their species code get the best
    matching ``protId`` as id, the last description token as description
    and an empty name.

    :param orthogroups_fasta_folder: folder with the input ``*.fa`` files
    :param orthogroups_xml: XML file providing the ``protId`` values
    :param output_path: output folder, created if missing
    :param min_species: minimum number of records per file; None means
                        no minimum
    """
    os.makedirs(output_path, exist_ok=True)
    # comparing an int with None raises TypeError, so treat None as "no limit"
    if min_species is None:
        min_species = 0
    all_prot_ids = _get_all_ids(orthogroups_xml)
    for f in tqdm.tqdm(glob.glob(os.path.join(orthogroups_fasta_folder, '*.fa'))):
        records = list(SeqIO.parse(f, 'fasta'))
        if len(records) < min_species:
            continue
        for rec in records:
            sp_id = _get_species_id(rec)
            tmp_lst = rec.description.split()
            # the second-to-last token is needed, so skip shorter descriptions
            if len(tmp_lst) < 2 or sp_id in tmp_lst[0]:
                continue
            tmp_id = re.sub(r'\..*', '', tmp_lst[-2])
            use_id = re.sub(r'\W+', '', tmp_id)
            new_id = _find_index_substring(all_prot_ids, use_id, tmp_lst)
            # index 0 is a valid match, so test against None explicitly
            if new_id is not None:
                rec.id = all_prot_ids[new_id]
                rec.description = tmp_lst[-1]
                rec.name = ''
        output_file = os.path.join(output_path, os.path.basename(f))
        _write(output_file, records)
79 |
80 |
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to run().
    import argparse
    parser = argparse.ArgumentParser(
        description="""Transform OrthogroupsFasta into marker_genes""")
    parser.add_argument('--oxml', required=True,
                        help='Path to the XML file whose gene protId '
                        'values are used as new sequence ids.')
    parser.add_argument('--ofolder', default='marker_genes', required=True,
                        help='Path to the output directory; it is '
                        'created if it does not exist.')
    parser.add_argument('--ofasta', default='.', required=True,
                        help='Path to the folder with the input '
                        'orthogroup *.fa files.')
    # default 0 keeps every group; a None default cannot be compared to a count
    parser.add_argument('--min_species', type=int, default=0,
                        help='Min number of sequences a group needs to be '
                        'written. [Default is 0, keep all groups]')

    conf = parser.parse_args()

    run(conf.ofasta, conf.oxml, conf.ofolder, conf.min_species)
105 |
--------------------------------------------------------------------------------
/archive/scripts/orthogroups_fasta_to_marker_genes_by_groups.py:
--------------------------------------------------------------------------------
1 | from Bio.SeqIO.FastaIO import FastaWriter
2 | from Bio import SeqIO
3 | import tqdm, os, glob
4 |
5 | def _oma_replace(row):
6 | if 'OMA0000' in row:
7 | return 'OMA0000'
8 | elif 'OMA000' in row:
9 | return 'OMA000'
10 | elif 'OMA00' in row:
11 | return 'OMA00'
12 | elif 'OMA0' in row:
13 | return 'OMA0'
14 | elif 'OMA' in row:
15 | return 'OMA'
16 |
17 |
def _get_all_ids(orthogroups_txt):
    """Read the tab-separated groups file into a nested dictionary.

    Lines containing ``#`` and blank lines are skipped. The first column is
    the group name, with its ``OMA`` + zeros prefix replaced by ``OG``.
    Every further column is split into its first five characters (key) and
    the text from position six on (value).

    :param orthogroups_txt: path to the groups file
    :return: dict mapping group name to a dict of its members
    """
    og_dic = {}
    with open(orthogroups_txt) as f:
        for line in f:
            # blank lines carry no group and would break the prefix replacement
            if '#' in line or not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            prefix = _oma_replace(fields[0])
            # names without an OMA prefix are kept unchanged
            og_name = fields[0].replace(prefix, 'OG') if prefix else fields[0]
            og_dic[og_name] = {i[0:5]: i[6:] for i in fields[1:]}
    return og_dic
31 |
32 |
def _write(file, value):
    """
    Write output to fasta file
    :param file: file and location of outputfile
    :param value: sequence records to write
    :return: None
    """
    # the context manager closes the file even if writing fails
    with open(file, "w") as handle:
        FastaWriter(handle, wrap=None).write_file(value)
44 |
45 |
46 | def _get_species_id(record):
47 | if '[' in record.description and ']' in record.description:
48 | return record.description[record.description.find(
49 | "[")+1:record.description.find("]")]
50 | else:
51 | return record.id[0:5]
52 |
def run(orthogroups_fasta_folder, og_dic, output_path, min_species):
    """Rewrite the group FASTA files with ids taken from *og_dic*.

    Every ``*.fa`` file of *orthogroups_fasta_folder* with at least
    *min_species* records is copied to *output_path*. Each record gets the
    first word of its *og_dic* entry as id and the rest of the entry plus
    `` [<species>]`` as description.

    :param orthogroups_fasta_folder: folder with the input ``*.fa`` files
    :param og_dic: dict mapping group name (file name up to the first dot)
                   to a dict from species code to the new label
    :param output_path: output folder, created if missing
    :param min_species: minimum number of records per file; None means
                        no minimum
    :raises KeyError: if a file or a species has no entry in *og_dic*
    """
    os.makedirs(output_path, exist_ok=True)
    # comparing an int with None raises TypeError, so treat None as "no limit"
    if min_species is None:
        min_species = 0
    for f in tqdm.tqdm(glob.glob(os.path.join(orthogroups_fasta_folder, '*.fa'))):
        new_name_dic = og_dic[os.path.basename(f).split(".")[0]]
        records = list(SeqIO.parse(f, 'fasta'))
        if len(records) < min_species:
            continue
        for rec in records:
            sp_id = _get_species_id(rec)
            label = new_name_dic[sp_id]
            new_id = label.split()[0]
            rec.id = new_id
            rec.description = label.replace(new_id, "") + " [" + sp_id + "]"
        output_file = os.path.join(output_path, os.path.basename(f))
        _write(output_file, records)
68 |
69 |
if __name__ == "__main__":
    # Command-line entry point: load the groups table and hand it to run().
    import argparse
    parser = argparse.ArgumentParser(
        description="""Transform OrthogroupsFasta into marker_genes""")
    parser.add_argument('--ogroups', required=True,
                        help='Path to the tab-separated groups file that '
                        'provides the new sequence labels.')
    parser.add_argument('--ofolder', default='marker_genes', required=True,
                        help='Path to the output directory; it is '
                        'created if it does not exist.')
    parser.add_argument('--ofasta', default='.', required=True,
                        help='Path to the folder with the input '
                        'orthogroup *.fa files.')
    # default 0 keeps every group; a None default cannot be compared to a count
    parser.add_argument('--min_species', type=int, default=0,
                        help='Min number of sequences a group needs to be '
                        'written. [Default is 0, keep all groups]')

    conf = parser.parse_args()
    og_dic = _get_all_ids(conf.ogroups)

    run(conf.ofasta, og_dic, conf.ofolder, conf.min_species)
95 |
--------------------------------------------------------------------------------
/archive/scripts/protein_converter.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Bio import SeqIO
3 |
# Translate every record of a FASTA file and write the results as FASTA.
# Get input and output file paths from command-line arguments
# Daniel Paiva Agustinho
if len(sys.argv) < 3:
    # fail with a usage line instead of an IndexError
    sys.exit("usage: protein_converter.py INPUT_FASTA OUTPUT_FASTA")
input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
    for record in SeqIO.parse(input_handle, "fasta"):
        protein_seq = record.seq.translate()

        # Extract the entire original header
        original_header = record.description

        # Create a new sequence record with the original header
        protein_record = SeqIO.SeqRecord(
            protein_seq, id=record.id, description=original_header
        )
        SeqIO.write(protein_record, output_handle, "fasta")
# NOTE(review): the dump lost indentation; printed once after all records
# are written - confirm this was not meant to run per record
print("done",str(output_file))
23 |
--------------------------------------------------------------------------------
/archive/scripts/r2t_py_script.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.o%J
#BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.e%J
#BSUB -u david.dylus@unil.ch
#BSUB -J r2t_GLYSP
#BSUB -n 1
#BSUB -R "span[ptile=1]"
#BSUB -R "rusage[mem=4000]"
#BSUB -M 4000000
# LSF job: run read2tree on the paired GLYSP reads against the MELGA reference.
# NOTE(review): the job requests one slot (-n 1) but read2tree is started
# with --threads 4 - confirm which value is intended.
source activate r2t
reads=/scratch/beegfs/weekly/ddylus/avian/reads/GLYSP
# stop if the working directory is missing instead of running elsewhere
cd /scratch/beegfs/weekly/ddylus/avian/r2t/ || exit 1
python -W ignore /scratch/beegfs/monthly/ddylus/opt/read2tree/bin/read2tree --standalone_path /scratch/beegfs/weekly/ddylus/avian/marker_genes/ --dna_reference /scratch/beegfs/weekly/ddylus/avian/eukaryotes.cdna.fa --reads "$reads/GLYSP_1.fq.gz" "$reads/GLYSP_2.fq.gz" --output_path /scratch/beegfs/weekly/ddylus/avian/r2t/ --single_mapping /scratch/beegfs/weekly/ddylus/avian/r2t/02_ref_dna/MELGA_OGs.fa --threads 4 --min_species 8
--------------------------------------------------------------------------------
/archive/scripts/relabel_msa.py:
--------------------------------------------------------------------------------
1 | import Bio.AlignIO
2 | import csv
3 |
4 |
def load_oma_species(fn):
    """Map the first column of a tab-separated file to a readable label.

    Lines starting with ``#`` are ignored. The label is the third column
    with spaces turned into underscores, followed by ``__`` and the
    second column.

    :param fn: path to the tab-separated file
    :return: dict from first-column value to label
    """
    mapping = {}
    with open(fn, 'rt') as fh:
        data_lines = (line for line in fh if not line.startswith('#'))
        for row in csv.reader(data_lines, dialect="excel-tab"):
            mapping[row[0]] = row[2].replace(' ', '_') + "__" + row[1]
    return mapping
10 |
11 |
def load_nextstrain_metadata(fn):
    """Map each ``sra_accession`` of a metadata table to a readable label.

    The label is the accession, ``__``, the strain, ``__``, the clade and
    then the date appended directly. Spaces become underscores in strain
    and clade; parentheses in the clade become square brackets.

    :param fn: tab-separated file with a header row containing the columns
               ``sra_accession``, ``strain``, ``Nextstrain_clade``, ``date``
    :return: dict from accession to label
    """
    mapping = {}
    with open(fn, 'rt') as fh:
        for row in csv.DictReader(fh, dialect="excel-tab"):
            accession = row['sra_accession']
            strain = row['strain'].replace(' ', '_')
            clade = row['Nextstrain_clade'].replace(' ', '_').replace('(', '[').replace(')', ']')
            mapping[accession] = accession + "__" + strain + "__" + clade + row['date']
    return mapping
18 |
19 |
def update_msa_ids(msa_path, new_path, mapping, format="phylip-relaxed"):
    """Rewrite an alignment with relabelled record ids.

    :param msa_path: input alignment file
    :param new_path: output alignment file
    :param mapping: dict from old id to new id; ids without an entry are kept
    :param format: alignment format used for both reading and writing
    """
    alignment = Bio.AlignIO.read(msa_path, format=format)
    for record in alignment:
        old_id = record.id
        record.id = mapping.get(old_id, old_id)
    Bio.AlignIO.write(alignment, new_path, format=format)
25 |
26 |
if __name__ == "__main__":
    # Command-line entry point: collect the id mappings and relabel the MSA.
    import argparse
    parser = argparse.ArgumentParser(description="update labels of sequence ids")
    parser.add_argument('--oma-map', help="path to the oma-species.txt file to change 5letter codes with scientific names")
    parser.add_argument('--nextstrain', help="path to the nextstrain metadata file with the sra accessions")
    parser.add_argument('--msa-format', help="format of the msa. if not set, it will be guessed based on file extension")
    parser.add_argument('--out', required=True, help="Path to the output filename")
    parser.add_argument('msa', help="Path to the input msa filename")

    conf = parser.parse_args()

    # later sources overwrite earlier ones on identical keys
    mapping = {}
    for source, loader in ((conf.oma_map, load_oma_species),
                           (conf.nextstrain, load_nextstrain_metadata)):
        if source:
            mapping.update(loader(source))

    if conf.msa_format is None:
        if conf.msa.endswith('.phy'):
            conf.msa_format = "phylip-relaxed"
        else:
            conf.msa_format = "fasta"
    update_msa_ids(conf.msa, conf.out, mapping, format=conf.msa_format)
46 |
47 |
--------------------------------------------------------------------------------
/archive/scripts/remove_species_from_alignment.py:
--------------------------------------------------------------------------------
1 | from Bio import AlignIO
2 | from Bio.Align import MultipleSeqAlignment
3 | from Bio.Alphabet import IUPAC, Gapped
4 |
5 |
def get_alignment(file, species_list):
    """Read an alignment and drop the records of the given species.

    :param file: alignment file in relaxed PHYLIP format
    :param species_list: record ids to remove, either as a collection of
                         ids or as one comma-separated string; None
                         removes nothing
    :return: new alignment holding the remaining records
    """
    if species_list is None:
        remove = set()
    elif isinstance(species_list, str):
        # a plain string would be searched by substring, so split it into ids
        remove = {name for name in species_list.split(",") if name}
    else:
        remove = set(species_list)
    alignment = AlignIO.read(file, 'phylip-relaxed')
    keep_species = [record for record in alignment if record.id not in remove]
    # NOTE(review): Bio.Alphabet is gone from recent Biopython releases -
    # confirm which Biopython version this script targets
    return MultipleSeqAlignment(keep_species, Gapped(IUPAC.protein, "-"))
13 |
14 |
def write_alignment(output, alignment):
    """Write *alignment* to *output* in relaxed PHYLIP format."""
    out_format = 'phylip-relaxed'
    AlignIO.write(alignment, output, out_format)
17 |
18 |
if __name__ == "__main__":
    # Command-line entry point: filter one alignment file and write the result.
    import argparse
    parser = argparse.ArgumentParser(
        description="""Remove species from given alignment.""")
    parser.add_argument('-s', '--remove_species', default=None,
                        help='[Default is none] Species to remove from the '
                        'alignment. Input is comma separated list '
                        'without spaces, e.g. XXX,YYY,AAA.')
    parser.add_argument('-o', '--output', default='.', required=True,
                        help='Path to the output alignment file '
                        '(relaxed PHYLIP).')
    parser.add_argument('-i', '--input', default='.', required=True,
                        help='Path to the input alignment file '
                        '(relaxed PHYLIP).')

    conf = parser.parse_args()

    new_alignment = get_alignment(conf.input, conf.remove_species)
    write_alignment(conf.output, new_alignment)
39 |
--------------------------------------------------------------------------------
/archive/scripts/rm_py_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.o%J
3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.e%J
4 | #BSUB -u david.dylus@unil.ch
5 | #BSUB -J rm_GLYSP
6 | #BSUB -n 1
7 | #BSUB -R "span[ptile=1]"
8 | #BSUB -R "rusage[mem=1000]"
9 | #BSUB -M 1000000
10 | rm -r /scratch/beegfs/weekly/ddylus/avian/reads/GLYSP
--------------------------------------------------------------------------------
/archive/scripts/sample_from_reads.py:
--------------------------------------------------------------------------------
1 | #from __future__ import division
2 | import random
3 | import argparse
4 | import sys
5 | # bp length of mouse transcriptome in OMA: 37.914.531
6 | # bp length of CANVA genome 2.5Mpb
7 |
def _copy_sampled_records(fastq_path, outputs, index_sets):
    """Copy the selected four-line records of one FASTQ file.

    Record number k is written to outputs[i] when k is in index_sets[i].

    :return: tuple (summed length of all sequence lines, summed length of
             the sequence lines written); lengths include the line ending
    """
    all_length = 0
    sampled_length = 0
    record_number = 0
    with open(fastq_path) as read_input:
        for line1 in read_input:
            line2 = read_input.readline()
            line3 = read_input.readline()
            line4 = read_input.readline()
            all_length += len(line2)
            for i, output in enumerate(outputs):
                if record_number in index_sets[i]:
                    output.write(line1)
                    output.write(line2)
                    output.write(line3)
                    output.write(line4)
                    sampled_length += len(line2)
            record_number += 1
    return all_length, sampled_length


parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", nargs='+', default=None, help="input FASTQ filename")
parser.add_argument("-out", "--output", help="output FASTQ filename")
parser.add_argument("-f", "--fraction", type=float, help="fraction of reads to sample")
parser.add_argument("-n", "--number", type=int, help="number of reads to sample")
parser.add_argument("-s", "--sample", type=int, help="number of output files to write", default=1)
args = parser.parse_args()

if args.fraction and args.number:
    sys.exit("give either a fraction or a number, not both")

if not args.fraction and not args.number:
    sys.exit("you must give either a fraction or a number")

# both are used unconditionally below; fail early with a clear message
if not args.input:
    sys.exit("you must give at least one input FASTQ file")

if not args.output:
    sys.exit("you must give an output filename")

print("counting records....")
with open(args.input[0]) as fastq:
    num_lines = sum(1 for _ in fastq)
total_records = num_lines // 4

if total_records == 0:
    sys.exit("no FASTQ records found in " + args.input[0])

if args.fraction:
    args.number = int(total_records * args.fraction)

print("sampling " + str(args.number) + " out of " + str(total_records) + " records")

paired = len(args.input) > 1
output_sequence_sets = []
output_file_left = []
output_file_right = []
try:
    for i in range(args.sample):
        # records are numbered 0 .. total_records - 1; drawing from one more
        # index could select a record that does not exist
        output_sequence_sets.append(set(random.sample(range(total_records), args.number)))
        output_file = args.output
        output_file_left.append(open(output_file + "_0_" + str(i) + ".fq", "w"))
        if paired:
            output_file_right.append(open(output_file + "_1_" + str(i) + ".fq", "w"))

    initial_length, sampling_length = _copy_sampled_records(
        args.input[0], output_file_left, output_sequence_sets)

    if paired:
        # the same record numbers are taken from the second file
        _, right_length = _copy_sampled_records(
            args.input[1], output_file_right, output_sequence_sets)
        sampling_length += right_length
finally:
    # close every output so buffered records reach the disk
    for handle in output_file_left + output_file_right:
        handle.close()

mean_sampled = sampling_length/args.number if args.number else 0
print("The mean length of all reads is {} and the mean length of the subsampled reads is {}".format(initial_length/total_records, mean_sampled))
print("The sum length of all reads is {} and the sum length of the subsampled reads is {}".format(initial_length, sampling_length))
print("done!")
86 |
--------------------------------------------------------------------------------
/archive/scripts/subsample_nextstrain_covid_genomes_with_sra_accession.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import lzma
3 | import csv
4 | import random
5 |
6 |
def get_sra_datasets(fn):
    """Yield the rows of an xz-compressed TSV that have an SRA accession.

    :param fn: path to the xz-compressed, tab-separated file with a header
    :return: generator of row dicts whose ``sra_accession`` is neither
             empty nor ``?``
    """
    missing = ('', '?')
    with lzma.open(fn, "rt", newline="") as fh:
        for row in csv.DictReader(fh, dialect="excel-tab"):
            if row["sra_accession"] in missing:
                continue
            yield row
13 |
14 |
def subsample(metafile, nr_per_clade):
    """Randomly pick up to *nr_per_clade* rows for every clade.

    Rows come from get_sra_datasets and are grouped by their
    ``Nextstrain_clade`` value; rows with an empty clade are skipped. The
    size of every clade is printed.

    :param metafile: metadata file passed on to get_sra_datasets
    :param nr_per_clade: maximum number of rows drawn per clade
    :return: list of the selected rows
    """
    def clade_of(row):
        return row["Nextstrain_clade"]

    rows = sorted(get_sra_datasets(metafile), key=clade_of)
    picked = []
    for clade, group in itertools.groupby(rows, key=clade_of):
        if clade == "":
            continue
        members = list(group)
        print(f"{clade}: {len(members)}")
        take = min(nr_per_clade, len(members))
        picked.extend(random.sample(members, take))
    return picked
25 |
def write(outfn, sub):
    """Write the selected rows as a tab-separated file with a header.

    :param outfn: path of the output file
    :param sub: list of row dicts; the keys of the first row define the
                columns. An empty list produces an empty file.
    """
    # newline='' lets the csv module control the line endings itself
    with open(outfn, 'w', newline='') as fout:
        if not sub:
            # no row to take the column names from
            return
        w = csv.DictWriter(fout, fieldnames=list(sub[0].keys()), dialect="excel-tab")
        w.writeheader()
        w.writerows(sub)
31 |
if __name__ == "__main__":
    # Command-line entry point: subsample the metadata and write the subset.
    import argparse
    parser = argparse.ArgumentParser(
        description="subsample nextstrain samples from all clades that contain sra accession ids")
    parser.add_argument(
        "--out", required=True, help="path to output file")
    parser.add_argument(
        "--nr-per-clade", default=2, type=int,
        help="number of samples to use per nextstrain clade. [default: 2]")
    parser.add_argument(
        "metafile",
        help="metadata.tsv.xz file from nextstrain, e.g. https://data.nextstrain.org/files/ncov/open/global/metadata.tsv.xz")
    conf = parser.parse_args()

    write(conf.out, subsample(conf.metafile, conf.nr_per_clade))
42 |
--------------------------------------------------------------------------------
/archive/scripts/treecl/select_alignments.py:
--------------------------------------------------------------------------------
1 | from Bio import AlignIO
2 | import tqdm, os, glob
3 |
def run(afolder, ofolder, min_species):
    """Copy the alignments that hold enough sequences as relaxed PHYLIP.

    Every non-empty ``*.fa`` file of *afolder* is read as relaxed PHYLIP,
    falling back to FASTA when that raises ValueError. Alignments with at
    least *min_species* sequences are written to *ofolder* under the same
    base name with a ``.phy`` extension.

    :param afolder: folder with the input alignments
    :param ofolder: output folder, created if missing
    :param min_species: minimum number of sequences per alignment
    """
    os.makedirs(ofolder, exist_ok=True)
    for f in tqdm.tqdm(glob.glob(os.path.join(afolder, '*.fa'))):
        if os.path.getsize(f) == 0:
            continue
        try:
            msa = AlignIO.read(f, "phylip-relaxed")
        except ValueError:
            msa = AlignIO.read(f, "fasta")
        if len(msa) >= min_species:
            out_path = os.path.join(ofolder, os.path.basename(f).split(".")[0]+".phy")
            # the context manager closes the file even if writing fails
            with open(out_path, "w") as align_output:
                AlignIO.write(msa, align_output, "phylip-relaxed")
17 |
18 |
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to run().
    import argparse
    parser = argparse.ArgumentParser(
        description="""Select alignments with a minimum number of """
                    """sequences and write them as relaxed PHYLIP""")
    parser.add_argument('--afolder', default=None, required=True,
                        help='Folder that contains the input alignments '
                        '(*.fa files).')
    parser.add_argument('--ofolder', default='alignments_selected', required=True,
                        help='Path to the output directory; it is '
                        'created if it does not exist.')
    parser.add_argument('--min_species', type=int, default=0,
                        help='Min number of species in selected '
                        'alignments. ')

    conf = parser.parse_args()

    run(conf.afolder, conf.ofolder, conf.min_species)
36 |
--------------------------------------------------------------------------------
/archive/scripts/trim_alignment.py:
--------------------------------------------------------------------------------
1 | import Bio.AlignIO
2 | import Bio.Align
3 | import collections
4 | import math
5 |
def load_msa(fn):
    """Load the first alignment stored in *fn*.

    The format is chosen by extension: ``.phy`` is read as relaxed PHYLIP,
    ``.fa`` as FASTA.

    :param fn: path to the alignment file
    :return: the first alignment found in the file
    :raises UnknownFormatError: if *fn* has neither extension
    """
    if fn.endswith('.phy'):
        msa_format = 'phylip-relaxed'
    elif fn.endswith('.fa'):
        msa_format = 'fasta'
    else:
        # the misspelled class name used here before raised NameError
        raise UnknownFormatError('unknown format for '+fn)
    with open(fn, 'rt') as fh:
        # parse the handle opened above rather than reopening the path
        msa = next(Bio.AlignIO.parse(fh, msa_format))
    return msa
16 |
17 |
def write_msa(fn, msa):
    """Write *msa* to the file *fn* in relaxed PHYLIP format."""
    out_format = 'phylip-relaxed'
    with open(fn, 'wt') as fh:
        Bio.AlignIO.write(msa, fh, out_format)
21 |
22 |
def count_nucs(data):
    """Count the items of *data* that are one of ``A``, ``T``, ``C``, ``G``, ``N``.

    Only these upper-case letters are counted; everything else is ignored.
    """
    valid_letters = {'A', 'T', 'C', 'G', 'N'}
    return sum(1 for letter in data if letter in valid_letters)
27 |
def trim(msa, min_residue):
    """Keep the alignment columns that hold enough counted letters.

    A column is kept when count_nucs reports at least *min_residue* for
    it. The number of kept columns is printed.

    :param msa: alignment supporting column access and column slicing
    :param min_residue: minimum count a column needs to be kept
    :return: tuple (list of kept column indices, trimmed alignment); the
             trimmed alignment has zero columns when nothing is kept
    """
    keep = [col for col in range(msa.get_alignment_length())
            if count_nucs(msa[:, col]) >= min_residue]
    print(len(keep))
    if not keep:
        # nothing passed the filter: return an empty slice instead of
        # failing on keep[0]
        return keep, msa[:, 0:0]

    # merge neighbouring columns into runs so each run is sliced once
    runs = []
    start = prev = keep[0]
    for col in keep[1:]:
        if col == prev + 1:
            prev = col
            continue
        runs.append((start, prev + 1))
        start = prev = col
    runs.append((start, prev + 1))

    first_start, first_stop = runs[0]
    trimmed = msa[:, first_start:first_stop]
    for run_start, run_stop in runs[1:]:
        trimmed = trimmed + msa[:, run_start:run_stop]
    return keep, trimmed
38 |
def filter_taxa(msa, min_residue):
    """Keep the taxa whose count_nucs value is above *min_residue*.

    NOTE(review): the comparison is strict, so a taxon with exactly
    *min_residue* counted letters is dropped - confirm this is intended.

    :param msa: alignment whose records are filtered
    :param min_residue: threshold a taxon has to exceed
    :return: new alignment with the remaining records
    """
    kept = (taxon for taxon in msa if count_nucs(taxon) > min_residue)
    return Bio.Align.MultipleSeqAlignment(kept)
42 |
class UnknownFormatError(Exception):
    """Error for an alignment file whose format cannot be determined."""
45 |
46 |
if __name__ == "__main__":
    # Command-line entry point: trim columns, filter taxa, write the result.
    import argparse
    parser = argparse.ArgumentParser(description="sample part of the alignment that contains enough data, and through out species which have too little data")
    parser.add_argument('alignment', help="path to multiple sequence alignment")
    parser.add_argument('--min-per-col', type=int, help="Min nr of taxa that need to have a nuc at a column to be included. Defaults to ceil(nr_taxa*0.3)")
    parser.add_argument('--min-res-per-species', default=400, type=int, help="Minimum number of residues for a taxon in the trimmed alignment to not be kicked out. Defaults to 400")
    parser.add_argument('--out', help="Outfile of trimmed alignment")
    conf = parser.parse_args()

    msa = load_msa(conf.alignment)
    nr_taxa = len(msa)
    if conf.min_per_col is None:
        conf.min_per_col = math.ceil(0.3*nr_taxa)
    if conf.out is None:
        conf.out = conf.alignment+".trimmed"

    loaded_msg = "Loaded MSA ({}x{}). Filter cols with less than {} residue"
    print(loaded_msg.format(nr_taxa, msa.get_alignment_length(), conf.min_per_col))

    keep, trimmed_msa = trim(msa, conf.min_per_col)
    print(" after filtering columns: {}x{}".format(len(trimmed_msa), trimmed_msa.get_alignment_length()))

    filtered = filter_taxa(trimmed_msa, conf.min_res_per_species)
    print(" after filtering taxa: {}x{}".format(len(filtered), filtered.get_alignment_length()))

    write_msa(conf.out, filtered)
69 |
--------------------------------------------------------------------------------
/archive/set_marker_genes/bacteria_markergenes.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/set_marker_genes/bacteria_markergenes.zip
--------------------------------------------------------------------------------
/archive/set_marker_genes/mammalia_markergenes.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/set_marker_genes/mammalia_markergenes.zip
--------------------------------------------------------------------------------
/archive/tests/info.log:
--------------------------------------------------------------------------------
1 | 2018-11-19 08:44:51,173:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
2 | 2018-11-19 08:44:51,180:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
3 | 2018-11-19 08:44:51,183:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
4 | 2018-11-19 08:48:03,835:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
5 | 2018-11-19 08:48:03,839:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
6 | 2018-11-19 08:48:03,842:read2tree.Reads:test: --- Splitting reads from tests/data/reads/test.fq.gz ---
7 |
--------------------------------------------------------------------------------
/archive/tests/input.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/tests/input.log
--------------------------------------------------------------------------------
/archive/tests/test_aligner.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import gzip
4 | import argparse
5 | from Bio import SeqIO
6 | from read2tree.Reads import Reads
7 | from read2tree.FastxReader import FastxReader
8 | dirname = os.path.dirname(__file__)
9 |
10 |
class ReadTest(unittest.TestCase):
    """Fixture helper for aligner tests built on a read2tree-like CLI."""

    def setup_reads_paired(self, sampling=False):
        """Parse a fixed paired-end command line and build an Aligner.

        :param sampling: accepted for signature compatibility; not used here
        :return: the object returned by ``Aligner(args, ogset.ogs, load=True)``
        """
        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument('--standalone_path', default='.',
                                help='[Default is current directory] Path to '
                                     'oma standalone directory.')

        arg_parser.add_argument('--reads', nargs='+', default=None,
                                help='Reads to be mapped to reference. If paired '
                                     'end add separated by space.')

        arg_parser.add_argument('--read_type', default='short',
                                help='[Default is short reads] Type of reads to '
                                     'use for mapping. Either ngm for short reads or '
                                     'ngmlr for long will be used.')

        arg_parser.add_argument('--dna_reference', default='',
                                help='Reference file that contains nucleotide '
                                     'sequences (fasta, hdf5). If not given it will use'
                                     'the RESTapi and retrieve sequences '
                                     'from http://omabrowser.org directly. '
                                     'NOTE: internet connection required!')

        arg_parser.add_argument('--keep_all_ogs', action='store_true',
                                help='Keep all orthologs after addition of '
                                     'mapped seq, which means also the groups that '
                                     'have no mapped sequence. Otherwise only groups '
                                     'are used that have the mapped sequence for '
                                     'alignment and tree inference.')

        arg_parser.add_argument('-r', '--reference', action='store_true',
                                help='Just generate the reference dataset for '
                                     'mapping.')

        arg_parser.add_argument('--remove_species_ogs', default=None,
                                help='[Default is none] Remove species present '
                                     'in data set after mapping step completed to '
                                     'build OGs. Input is comma separated list '
                                     'without spaces, e.g. XXX,YYY,AAA.')

        arg_parser.add_argument('-s', '--species_name', default=None,
                                help='[Default is name of read] Name of species '
                                     'for mapped sequence.')

        arg_parser.add_argument('--output_path', default='.', required=True,
                                help='[Default is current directory] Path to '
                                     'output directory.')

        argv = ['--standalone_path', 'tests/data/marker_genes/',
                '--dna_reference', 'tests/data/dna.fa', '--reads',
                'tests/data/mapper/test3/test_1b.fq',
                'tests/data/mapper/test3/test_2b.fq',
                '--output_path', 'tests/data/output', '--read_type',
                'short', '--keep_all_ogs', '--reference',
                '--remove_species_ogs', 'CIOIN', '--species_name', 'ass']

        args = arg_parser.parse_args(argv)
        # The original statement was ``return alignments = Aligner(...)``,
        # which is a syntax error; only the call is returned now.
        # NOTE(review): ``Aligner`` and ``ogset`` are neither imported nor
        # defined in this module, so calling this helper still raises
        # NameError until they are provided.
        return Aligner(args, ogset.ogs, load=True)
71 |
--------------------------------------------------------------------------------
/archive/tests/test_og.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from Bio import SeqIO
4 | from read2tree.OGSet import OG
5 |
6 | dirname = os.path.dirname(__file__)
7 |
8 |
class OGTest(unittest.TestCase):
    """Tests for ``OG`` using the OG4 amino-acid and DNA FASTA fixtures."""

    def setup(self):
        """Return an ``OG`` whose ``aa``/``dna`` lists come from the fixtures."""
        aa_records, dna_records = (
            list(SeqIO.parse(path, format='fasta'))
            for path in ('data/OG4.aa', 'data/OG4.dna')
        )
        fixture = OG()
        fixture.aa = aa_records
        fixture.dna = dna_records
        return fixture

    def test_init(self):
        """The first DNA record carries the expected identifier."""
        fixture = self.setup()
        first_dna = fixture.dna[0]
        self.assertEqual(first_dna.id, 'MOUSE21964_OG4')

    def test_get_og_dict(self):
        """``_get_og_dict`` exposes the mouse record under its gene key."""
        fixture = self.setup()
        by_gene = fixture._get_og_dict(fixture)
        self.assertEqual(by_gene['MOUSE21964'].name, 'MOUSE21964_OG4')

    def test_remove_species_records(self):
        """Removing MOUSE leaves four records in each returned collection."""
        fixture = self.setup()
        remaining = fixture.remove_species_records('MOUSE')
        for position in (0, 1):
            self.assertEqual(len(remaining[position]), 4)

    def test_get_species_id(self):
        """Both the DNA and the AA record resolve to species MOUSE."""
        fixture = self.setup()
        for record in (fixture.dna[0], fixture.aa[0]):
            self.assertEqual(fixture._get_species_id(record), 'MOUSE')


if __name__ == "__main__":
    unittest.main()
44 |
--------------------------------------------------------------------------------
/archive/tests/test_ogset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from read2tree import OGSet
4 |
5 | API_URL = 'http://omabrowser.org/api'
6 |
7 | class OGSetTest(unittest.TestCase):
8 | def setUp(self):
9 | arg_parser = argparse.ArgumentParser(prog='read2tree')
10 |
11 | arg_parser.add_argument('--reads', nargs='+', default=None,
12 | help='Reads to be mapped to reference. If paired end '
13 | 'add separated by space.')
14 | arg_parser.add_argument('--read_split_length', type=int, default=400,
15 | help='Set read split length.')
16 | arg_parser.add_argument('--read_split_overlap', type=int, default=50,
17 | help='Set read split length overlap.')
18 | arg_parser.add_argument('-s', '--species_name', default=None,
19 | help='[Default is name of read] Name of species '
20 | 'for mapped sequence.')
21 |
22 | argv = ['--reads', 'tests/data/reads/test.fq']
23 |
24 | args = arg_parser.parse_args(argv)
25 | return OGSet(args)
26 |
27 | def test_OGSet(self):
28 | raise NotImplementedError
29 |
30 | def test_marker_genes_input(self):
31 | raise NotImplementedError
32 |
33 | def test_omastandalone_input(self):
34 | raise NotImplementedError
35 |
36 | def test_output_folder_structure(self):
37 | raise NotImplementedError
38 |
39 | def test_species_removal(self):
40 | raise NotImplementedError
41 |
42 | def test_species_removal_after_mapping(self):
43 | raise NotImplementedError
44 |
45 | def test_rest_api_connection(self):
46 | OGSet._read
47 |
48 | def test_rest_api_dna_downlaod(self):
49 | raise NotImplementedError
50 |
51 |
52 | if __name__ == "__main__":
53 | unittest.main()
54 |
--------------------------------------------------------------------------------
/archive/tests/test_reads.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import gzip
4 | import argparse
5 | from Bio import SeqIO
6 | from read2tree.Reads import Reads
7 | from read2tree.FastxReader import FastxReader
8 | from read2tree.main import parse_args
9 | from read2tree._utils import exe_name
10 | dirname = os.path.dirname(__file__)
11 |
12 |
class ReadTest(unittest.TestCase):
    """Tests for ``Reads`` helpers, using fixtures under ``data/reads``."""

    def setup_long_reads(self, split=False):
        """Build a ``Reads`` object for the single-file fixture.

        With ``split=True`` the splitting and sampling options are added to
        the command line.
        """
        argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz']
        if split:
            argv += ['--split_reads',
                     '--split_overlap', '50', '--split_len', '400', '--sample_reads', '--coverage', '10',
                     '--genome_len', '1000']
        parsed = parse_args(argv, exe_name(), '')
        return Reads(parsed)

    def setup_reads_paired(self, sampling=False):
        """Build a ``Reads`` object for the paired-end fixture.

        With ``sampling=True`` the sampling options are added to the
        command line.
        """
        argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz',
                'data/reads/test_2a.fq.gz']
        if sampling:
            argv += ['--sample_reads', '--coverage', '10', '--genome_len', '1000']
        parsed = parse_args(argv, exe_name(), '')
        return Reads(parsed)

    def test_split(self):
        sequence = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGT'
        wanted = ['ACGTTTTTTG', 'GAAGAGTTAG', 'AGATTTTTAG', 'AGAGGAGGGG',
                  'GAGGAGGGGT']
        long_reads = self.setup_long_reads()
        pieces = long_reads._split_len_overlap(sequence, 10, 0)
        self.assertEqual(wanted, pieces)

    def test_splitOverlap(self):
        sequence = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGTTT'
        wanted = ['ACGTTTTTTG', 'TTTTGGAAGA', 'GAAGAGTTAG', 'GTTAGAGATT',
                  'AGATTTTTAG', 'TTTAGAGAGG', 'AGAGGAGGGG', 'GGAGGGGTTT']
        long_reads = self.setup_long_reads()
        pieces = long_reads._split_len_overlap(sequence, 10, 5)
        self.assertEqual(wanted, pieces)

    def test_get_4_line_fastq_string(self):
        long_reads = self.setup_long_reads()
        wanted = '@SRR00001 length=16\nACGTTTGGGAAGGTTT\n+SRR00001 ' \
                 'length=16\n????????????????\n'
        rendered = long_reads._get_4_line_fastq_string(
            'SRR00001', 'ACGTTTGGGAAGGTTT', '????????????????', x=0)
        self.assertEqual(rendered, wanted)

    def test_read_num_split(self):
        long_reads = self.setup_long_reads(split=True)
        count = long_reads._get_num_reads('data/reads/test.fq.gz')
        self.assertEqual(count, 18)

    def test_read_len_split(self):
        long_reads = self.setup_long_reads(split=True)
        length = long_reads._get_read_len('data/reads/test.fq.gz', 1000)
        self.assertEqual(length, 400)

    def test_read_num_paired(self):
        paired = self.setup_reads_paired()
        count = paired._get_num_reads('data/reads/test_1a.fq.gz')
        self.assertEqual(count, 1000)

    def test_read_len_paired(self):
        paired = self.setup_reads_paired()
        length = paired._get_read_len('data/reads/test_1a.fq.gz', 1000)
        self.assertEqual(length, 151.0)

    def test_read_num_by_coverage_paired(self):
        paired = self.setup_reads_paired(sampling=True)
        count = paired._get_num_reads_by_coverage(
            'data/reads/test_1a.fq.gz', 1000)
        self.assertEqual(count, 34)

    def test_read_num_by_coverage_split(self):
        long_reads = self.setup_long_reads(split=True)
        count = long_reads._get_num_reads_by_coverage(['data/reads/test.fq.gz'], 1000)
        self.assertEqual(count, 25)

    def test_read_vec_paired(self):
        paired = self.setup_reads_paired(sampling=True)
        picked = paired._get_vector_random_reads(
            'data/reads/test_1a.fq.gz')
        self.assertEqual(len(picked), 34)


if __name__ == "__main__":
    unittest.main()
106 |
--------------------------------------------------------------------------------
/archive/tests/test_use.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import argparse
3 | import warnings
4 | warnings.filterwarnings('ignore')
5 | from read2tree.Progress import Progress
6 | from read2tree.stats.Coverage import Coverage
7 | from read2tree.stats.SeqCompleteness import SeqCompleteness
8 | import os
9 |
class Use(unittest.TestCase):
    """Placeholder test case; none of the tests are implemented yet.

    The methods originally had no body at all, which made the module fail
    to parse. Each now reports itself as skipped so the gap stays visible
    in test reports instead of silently passing.
    """

    def test_OGSet(self):
        self.skipTest('not implemented')

    def test_write_progress(self):
        self.skipTest('not implemented')

    def test_read_progress(self):
        self.skipTest('not implemented')


if __name__ == "__main__":
    unittest.main()
21 |
--------------------------------------------------------------------------------
/archive/wiki_images/covid1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/covid1.jpg
--------------------------------------------------------------------------------
/archive/wiki_images/covid2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/covid2.jpg
--------------------------------------------------------------------------------
/archive/wiki_images/figure1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/figure1.jpg
--------------------------------------------------------------------------------
/archive/wiki_images/figure_1sp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/figure_1sp.jpg
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_0.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_1.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_2.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_3.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_4.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_5.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_6.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_7.png
--------------------------------------------------------------------------------
/archive/wiki_images/oma_page_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_8.png
--------------------------------------------------------------------------------
/bin/read2tree:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python -W ignore
2 | '''
3 | Wrapper to enable the user to call the installed read2tree without the '.py'
4 | ending.
5 |
6 | -- Alex Warwick Vesztrocy, June 2016
7 | '''
8 | from read2tree.main import main
9 | from read2tree._utils import exe_name
10 | import sys
11 |
12 |
if __name__ == '__main__':
    # Description text handed to main() through its ``desc`` argument.
    desc = ('read2tree is a pipeline allowing to use read data in combination with '
            'an OMA standalone output run to produce high quality trees. ')
    cli_arguments = sys.argv[1:]
    main(cli_arguments, exe_name=exe_name(), desc=desc)
17 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: read2tree_env
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | dependencies:
6 | - python=3.9
7 | - numpy
8 | - biopython
9 | - ete3
10 | - lxml
11 | - tqdm
12 | - scipy
13 | - pyparsing
14 | - requests
15 | - natsort
16 | - pyyaml
17 | - filelock
18 | - dendropy
19 | - mafft
20 | - iqtree
21 | - ngmlr
22 | - nextgenmap
23 | - samtools
24 | - filelock
25 | - pyham
26 | - pysam
--------------------------------------------------------------------------------
/read2tree/Analyzer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | This file contains definitions of a class which surrounds possible alignment methods
4 |
5 | -- David Dylus, July--XXX 2017
6 | '''
7 | import os
8 | from Bio import AlignIO
9 | import re
10 |
class Analyzer(object):
    """Collects simple statistics about a run (coverage, sequence content).

    Several methods are declared but not implemented yet and raise
    ``NotImplementedError``.
    """

    def __init__(self, args, og_set=None):
        """
        :param args: parsed command-line namespace; ``gt_length`` and
            ``reads`` are read here
        :param og_set: accepted but not used by this constructor
        """
        print('--- Alignment of OGs ---')
        self.args = args
        self.cov = {}
        self.seq_completeness = {}

        self._genome_or_transcriptome_length = args.gt_length

        # ``reads`` may be one path, several space-separated paths in one
        # string, or a list of paths. Only a space-separated string is split.
        reads = args.reads
        if isinstance(reads, str) and " " in reads:
            reads = reads.rstrip().split(" ")
        self._reads = reads

        # The species name is the basename of the first read file up to its
        # first dot. Taking the first entry regardless of list length avoids
        # the crash the old code had for lists that were not exactly two long.
        first_read = reads if isinstance(reads, str) else reads[0]
        self._species_name = first_read.split("/")[-1].split(".")[0]

        self.treeStats = {}
        self.alignmentStats = {}

    def _get_coverage_reads(self, args):
        """Estimate the read coverage from the first read file.

        The read length is parsed from the ``length=`` field of the first
        header line, and the record count is the number of lines divided by
        four.

        :param args: namespace whose ``reads`` is indexed with ``[0]`` to get
            the file to inspect (assumes a list of paths; TODO confirm)
        :return: coverage
        :raises ValueError: if the first header has no integer after ``length=``
        """
        with open(args.reads[0]) as handle:
            header = handle.readline()
            # Convert to int: multiplying by the raw string would repeat it.
            read_length = int(header.split("length=")[-1])
            # The header line has already been consumed, so count it too.
            num_lines = 1 + sum(1 for _ in handle)

        total_records = int(num_lines / 4)
        coverage = (total_records * read_length * len(args.reads)) / self._genome_or_transcriptome_length
        return coverage

    def _get_number_results(self):
        raise NotImplementedError

    def _get_rf_dist(self, ref_tree):
        raise NotImplementedError

    def _get_length_align(self):
        raise NotImplementedError

    def _get_num_OGs(self):
        raise NotImplementedError

    def _get_mean_ACGT(self, args):
        """Print, per ``05_*`` folder, the mean fraction of non-``X`` residues.

        Only records whose id contains the first five characters of the
        species name are considered; gaps (``-``) are removed before counting.

        :param args: namespace providing ``output`` (the folder to scan)
        """
        import glob
        for folder in glob.iglob(args.output + '/05_*', recursive=True):
            print(folder)
            all_coverages = []

            for phylip_file in glob.iglob(folder + '/*.phy'):
                align = AlignIO.read(phylip_file, "phylip-relaxed")
                for record in align:
                    if self._species_name[0:5] in record.id:
                        seq = re.sub('-', '', str(record.seq))
                        if not seq:
                            # A gap-only row has no residues to rate.
                            continue
                        resolved = len(seq) - seq.count("X")
                        all_coverages.append(resolved / len(seq))
            # Avoid a ZeroDivisionError when nothing matched in this folder.
            if all_coverages:
                print(sum(all_coverages) / len(all_coverages))

    def _get_branch_length_mapped_seq(self):
        raise NotImplementedError

    def write_to_csv(self):
        raise NotImplementedError
--------------------------------------------------------------------------------
/read2tree/FastxReader.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import logging
3 | import gzip
4 | import mimetypes
5 | # from memory_profiler import memory_usage
6 |
class FastxReader(object):
    """Line-based reader for FASTA/FASTQ files, plain or gzip-compressed.

    The record generators assume exactly four lines per FASTQ record and two
    lines per FASTA record. Because they call ``next()`` inside a generator,
    a truncated final record surfaces as ``RuntimeError`` on Python 3.7+.
    """

    def __init__(self, file):
        """
        :param file: path of the file to read; its name decides whether it
            is opened as gzip or as plain text
        """
        self._file = file
        encoding = mimetypes.guess_type(file)[1]
        # Default to plain text. The old code left the attribute unset when
        # mimetypes reported no encoding (e.g. for ``.fq``), which made
        # open_fastx() fail with AttributeError.
        if encoding and 'gzip' in encoding:
            self._file_handle = 'gzip'
        else:
            self._file_handle = 'txt'

    def open_fastx(self):
        """Open the file in text mode, through gzip when it is compressed."""
        if self._file_handle == 'gzip':
            return gzip.open(self._file, 'rt')
        return open(self._file, 'rt')

    def readfq_id(self, file_handle):
        """Yield the id of each FASTQ record (header up to the first space)."""
        for header in file_handle:
            # Skip the sequence, separator and quality lines of this record.
            next(file_handle)
            next(file_handle)
            next(file_handle)
            yield header.rstrip().split(' ')[0]

    def readfq(self, file_handle):
        """Yield ``(name, seq, qual)`` for each four-line FASTQ record."""
        for header in file_handle:
            name = header.rstrip()
            seq = next(file_handle).rstrip()
            next(file_handle)  # separator line
            qual = next(file_handle).rstrip()
            yield name, seq, qual

    def readfa(self, file_handle):
        """Yield ``(name, seq)`` for each two-line FASTA record."""
        for header in file_handle:
            name = header.rstrip()
            seq = next(file_handle).rstrip()
            yield name, seq

    def readfx(self, file_handle):
        """Yield ``(name, seq, qual)`` from FASTQ or two-line FASTA input.

        ``qual`` is ``None`` unless the header starts with ``@``.
        """
        for header in file_handle:
            name = header.rstrip()
            seq = next(file_handle).rstrip()
            # Reset per record so a header that is neither '@' nor '>' cannot
            # reuse the previous record's quality or hit an unbound name.
            qual = None
            if name.startswith('@'):
                next(file_handle)  # separator line
                qual = next(file_handle).rstrip()
            yield name, seq, qual
57 |
58 | # def readfx(self, file_handle): # this is a generator function
59 | # '''
60 | # This function was copy and pasted from https://github.com/lh3/readfq
61 | # Readfq is a fast implementation of a read iterator and provides a
62 | # massive spead up compared to regular
63 | # implementations
64 | # :param file_handle: is a filehandle
65 | # :return: name, seq, quality
66 | # '''
67 | # last = None # this is a buffer keeping the last unprocessed line
68 | # while True: # mimic closure; is it a bad idea?
69 | # if not last: # the first record or a record following a fastq
70 | # for l in file_handle: # search for the start of the next record
71 | # if l[0] in '>@': # fasta/q header line
72 | # last = l[:-1] # save this line
73 | # break
74 | # if not last:
75 | # break
76 | # name, seqs, last = last, [], None
77 | # for l in file_handle: # read the sequence
78 | # if l[0] in '@+>':
79 | # last = l[:-1]
80 | # break
81 | # seqs.append(l[:-1])
82 | # if not last or last[0] != '+': # this is a fasta record
83 | # yield name, ''.join(seqs), None # yield a fasta record
84 | # if not last:
85 | # break
86 | # else: # this is a fastq record
87 | # seq, leng, seqs = ''.join(seqs), 0, []
88 | # for l in file_handle: # read the quality
89 | # seqs.append(l[:-1])
90 | # leng += len(l) - 1
91 | # if leng >= len(seq): # have read enough quality
92 | # last = None
93 | # yield name, seq, ''.join(seqs) # yield a fastq record
94 | # break
95 | # if last: # reach EOF before reading enough quality
96 | # yield name, seq, None # yield a fasta record instead
97 | # break
98 |
--------------------------------------------------------------------------------
/read2tree/GuidedAssembler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | This file contains definitions of a class which surrounds possible alignment methods
4 |
5 | -- David Dylus, July--XXX 2017
6 | '''
7 | import logging
8 |
9 | logger = logging.getLogger(__name__)
10 |
class Aligner(object):
    """Holds the arguments and alignments handed to the guided assembler."""

    def __init__(self, args=None, alignments=None):
        """
        :param args: optional argument namespace, stored as-is
        :param alignments: optional object exposing ``placement_dic``
        """
        self.args = args
        self.alignments = alignments
        # ``alignments`` defaults to None, so only dereference it when given;
        # otherwise the default construction raised AttributeError.
        self.placement_dic = None if alignments is None else alignments.placement_dic
18 |
19 |
20 |
--------------------------------------------------------------------------------
/read2tree/MultiProcessingLog.py:
--------------------------------------------------------------------------------
1 | # taken from https://gist.github.com/JesseBuesking/10674086
2 |
3 | from logging.handlers import RotatingFileHandler
4 | import multiprocessing, threading, logging, sys, traceback
5 | #import os
6 |
7 |
class MultiProcessingLog(logging.Handler):
    """Logging handler that funnels records through a multiprocessing queue.

    ``emit`` puts records on the queue; a daemon thread started in
    ``__init__`` takes them off and writes them with a
    ``RotatingFileHandler``.
    """

    def __init__(self, name, mode, maxsize, rotate):
        """
        The four arguments are passed positionally to ``RotatingFileHandler``
        (filename, mode, maxBytes, backupCount).
        """
        logging.Handler.__init__(self)

        self._handler = RotatingFileHandler(name, mode, maxsize, rotate)
        self.queue = multiprocessing.Queue(-1)

        # Daemon thread so a pending queue.get() never blocks interpreter exit.
        t = threading.Thread(target=self.receive)
        t.daemon = True
        t.start()

    def setFormatter(self, fmt):
        """Set the formatter on this handler and on the wrapped file handler."""
        logging.Handler.setFormatter(self, fmt)
        self._handler.setFormatter(fmt)

    def receive(self):
        """Take records from the queue and write them until the queue closes."""
        while True:
            try:
                record = self.queue.get()
                self._handler.emit(record)
            except (KeyboardInterrupt, SystemExit):
                raise
            except EOFError:
                break
            except Exception:
                # Keep the consumer alive; report the problem on stderr.
                traceback.print_exc(file=sys.stderr)

    def send(self, s):
        """Put a prepared record on the queue without blocking."""
        self.queue.put_nowait(s)

    def _format_record(self, record):
        """Make ``record`` safe to pickle and return it.

        Arguments are merged into the message and exception info is
        stringified, which removes objects that may not be picklable and may
        reduce the size of what is sent over the pipe.
        """
        if record.args:
            # getMessage() applies str() to msg first, so non-string message
            # objects (e.g. exceptions) are formatted instead of raising.
            record.msg = record.getMessage()
            record.args = None
        if record.exc_info:
            # Called for its side effect: the standard Formatter stores the
            # rendered traceback on the record before exc_info is dropped.
            self.format(record)
            record.exc_info = None

        return record

    def emit(self, record):
        """Prepare ``record`` and queue it; errors go to ``handleError``."""
        try:
            s = self._format_record(record)
            self.send(s)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        """Close the wrapped file handler, then this handler."""
        self._handler.close()
        logging.Handler.close(self)
--------------------------------------------------------------------------------
/read2tree/ReferenceSet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | This file contains definitions of a class which allows to create
4 | the reference orthologous groups with their DNA sequences.
5 |
6 | -- David Dylus, July--XXX 2017
7 | '''
8 |
9 | import os
10 | import glob
11 | import logging
12 | import time
13 | from tqdm import tqdm
14 | from Bio import SeqIO
15 | from Bio.SeqIO.FastaIO import FastaWriter
16 |
17 | from read2tree.Progress import Progress
18 |
19 |
class ReferenceSet(object):
    '''
    Per-species reference sequences, keyed by species code in ``self.ref``.
    '''

    def __init__(self, args, og_set=None, load=True, progress=None):
        """
        :param args: parsed command-line namespace; ``species_name`` and
            ``output_path`` are read by this class
        :param og_set: mapping of OG name to OG objects exposing ``aa`` and
            ``dna`` record lists
        :param load: when ``False`` the references are re-read from the
            ``02_ref_dna`` folder; when ``True`` and ``og_set`` is given they
            are generated from ``og_set`` and written to that folder
        :param progress: stored on the instance; not used in this class
        """
        self.ref = {}
        self.load = load
        self.args = args
        self.progress = progress

        self.logger = logging.getLogger(__name__)
        self._species_name = self.args.species_name

        if load is False:
            self.ref = self._load_records_folder()
        elif og_set is not None and load is True:
            self.ref = self._generate_reference(og_set)
            self.write()

    def _read_fasta(self, ref_file):
        '''
        Index a FASTA file with ``SeqIO.index``.

        :param ref_file: file that contains all the DNA sequences from the oma database
        :return: the index object returned by ``SeqIO.index``
        '''
        print('--- Reading DNA reference into memory ---')
        return SeqIO.index(ref_file, "fasta")

    def _load_records_folder(self):
        """
        Parse species with their dna sequences from folder

        :return: dict mapping species name (file name up to the first ``_``)
            to a ``Reference`` whose ``dna`` list holds the parsed records
        """
        ref_dict = {}
        print('--- Generating reference for mapping from folder ---')
        ref_dna = os.path.join(self.args.output_path, '02_ref_dna')
        for fasta_path in tqdm(glob.glob(os.path.join(ref_dna, "*.fa")), desc="Re-loading references for mapping from folder", unit=" species"):
            # basename() instead of splitting on "/" keeps this working with
            # the platform's own path separator.
            species_name = os.path.basename(fasta_path).split("_")[0]
            ref_dict[species_name] = Reference()
            ref_dict[species_name].dna = list(SeqIO.parse(fasta_path, 'fasta'))

        return ref_dict

    def _generate_reference(self, og_set):
        '''
        Split records into dictionary with keys being species and the values the corresponded sequence records

        The species key is the first five characters of each record id.
        '''
        print('--- Generating reference for mapping ---')
        start = time.time()
        ref_set = {}
        for name, og in tqdm(og_set.items(), desc="Loading records", unit=" record"):
            # Amino-acid and DNA records are grouped the same way, each into
            # the list of the same name on the species' Reference.
            for kind in ('aa', 'dna'):
                for record in getattr(og, kind):
                    species = record.id[0:5]
                    if species not in ref_set:
                        ref_set[species] = Reference()
                    getattr(ref_set[species], kind).append(record)
        end = time.time()
        elapsed_time = end - start
        self.logger.info('{}: Extracted {} reference species form {} ogs took {}'
                         .format(self._species_name, len(ref_set.keys()),
                                 len(og_set.keys()), elapsed_time))
        return ref_set

    def write(self):
        '''
        Write for each species all the DNA sequences into separate fasta files

        Files go to ``<output_path>/02_ref_dna``, which is created if missing.
        '''
        out_dna = os.path.join(self.args.output_path, '02_ref_dna')
        if not os.path.exists(out_dna):
            os.makedirs(out_dna)
        for key, value in self.ref.items():
            if value.dna:  # only write if not empty
                value.write_dna(key, out_dna)

    def _remove_species(self):
        raise NotImplementedError
120 |
121 |
class Reference(object):
    """Amino-acid and DNA records of one species, with FASTA writers."""

    def __init__(self, args=None):
        self.args = args
        self.aa = []   # amino-acid records
        self.dna = []  # DNA records

    def write_aa(self, species, output_folder):
        """Write ``self.aa`` to ``<output_folder>/<species>_OGs.fa``.

        NOTE(review): this is the same file name ``write_dna`` uses, so
        writing both into one folder overwrites the first file.
        """
        # The context manager closes the file even if writing raises.
        with open(os.path.join(output_folder, species + '_OGs.fa'), "w") as handle:
            writer = FastaWriter(handle, wrap=None)
            writer.write_file(self.aa)

    def write_dna(self, species, output_folder):
        """Write ``self.dna`` to ``<output_folder>/<species>_OGs.fa``."""
        with open(os.path.join(output_folder, species + '_OGs.fa'), "w") as handle:
            writer = FastaWriter(handle, wrap=None)
            writer.write_file(self.dna)
140 |
--------------------------------------------------------------------------------
/read2tree/TreeInference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | This file contains definitions of a class which surrounds the wrappers to build trees given a set of command line arguments.
4 |
5 | -- David Dylus, July--XXX 2017
6 | '''
7 | import os
8 | import time
9 | import logging
10 | from read2tree.wrappers.treebuilders import Fasttree, Iqtree
11 | from read2tree.wrappers.treebuilders.base_treebuilder import DataType
12 |
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
class TreeInference(object):
    '''
    Runs the tree builder on a concatenated alignment and stores the result.

    The species name used for the output file is derived from the read file
    name, overridden by ``args.species_name`` when given, and falls back to
    'merge' when neither is available.
    '''

    def __init__(self, args, concat_alignment=None):
        print('--- Tree inference ---')

        self.args = args
        self.elapsed_time = 0

        reads = self.args.reads
        if reads:
            paired = len(reads) == 2
            # Two entries: keep the list; otherwise keep only the first file.
            self._reads = reads if paired else reads[0]
            name_source = self._reads[0] if paired else self._reads
            self._species_name = name_source.split("/")[-1].split(".")[0]

        if self.args.species_name:
            self._species_name = self.args.species_name
        elif not reads:
            self._species_name = 'merge'

        self.tree = None
        if concat_alignment is not None:
            self.tree = self._infer_tree(concat_alignment)

    def _infer_tree(self, concat_alignment):
        '''
        Build a tree with IQ-TREE (model LG, ``args.threads`` threads), write
        it to ``tree_<species>.nwk`` in ``args.output_path`` and return it.

        :param concat_alignment: alignment handed to the Iqtree wrapper
        :return: whatever the Iqtree wrapper call returns
        '''
        started = time.time()
        target_dir = self.args.output_path
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # A Fasttree-based variant existed here but is disabled.
        builder = Iqtree(concat_alignment, datatype=DataType.PROTEIN)
        builder.options.options['-m'].set_value('LG')
        builder.options.options['-nt'].set_value(self.args.threads)
        tree = builder()

        tree_file = os.path.join(target_dir,
                                 "tree_" + self._species_name + ".nwk")
        with open(tree_file, "w") as text_file:
            text_file.write("{}".format(tree))
        self.tree = "{}".format(tree)

        self.elapsed_time = time.time() - started
        logger.info('{}: Tree inference took {}.'.format(self._species_name,
                                                         self.elapsed_time))

        return tree
64 |
--------------------------------------------------------------------------------
/read2tree/__init__.py:
--------------------------------------------------------------------------------
1 | from datetime import date
2 | import logging
3 | import logging.config
4 | import yaml
5 | import os
6 | from pkg_resources import resource_string
# Attach a no-op handler to the package logger; the real handlers are
# installed further down from the bundled YAML configuration.
logging.getLogger(__name__).addHandler(logging.NullHandler())

__version__ = '0.1.5'
# The copyright range always ends with the current year.
__copyright__ = 'read2tree (C) 2017-{:d} David Dylus' \
    .format(date.today().year)

# Former variant that read the configuration from the working directory:
# path = './log.yaml'
# if os.path.exists(path):
#     with open(path, 'rt') as f:
#         config = yaml.load(f.read())
#     logging.config.dictConfig(config)

# Load the logging configuration shipped inside the package.
conf = resource_string(__name__, 'logging/log.yaml')

# NOTE(review): the file is a packaged resource, so FullLoader is not fed
# external data here; yaml.safe_load would presumably suffice - confirm.
D = yaml.load(conf, Loader=yaml.FullLoader)
# dictConfig requires a 'version' key; supply it when the file omits it.
D.setdefault('version', 1)
# Side effect at import time: configures logging for the whole process.
logging.config.dictConfig(D)
# del D
25 |
--------------------------------------------------------------------------------
/read2tree/_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | Utilities for parsing the annotations files.
4 |
5 | -- Alex Warwick Vesztrocy - March--June 2016
6 | '''
7 | import bz2
8 | import gzip
9 | import os
10 | import sys
11 |
12 |
# File opening. This is based on the example on SO here:
# http://stackoverflow.com/a/26986344
# Maps the leading "magic bytes" of a compressed file to its opener.
fmagic = {b'\x1f\x8b\x08': gzip.open,
          b'\x42\x5a\x68': bz2.BZ2File}


def auto_open(fn, *args):
    '''
    Open *fn* with the opener matching its content or, failing that, its
    name.

    An existing, non-empty file is recognised by its magic bytes (gzip or
    bz2). A missing or empty file is judged by its suffix ('gz' / 'bz2').
    Everything else is opened with the builtin ``open``.

    :param fn: path of the file
    :param args: positional arguments forwarded to the chosen opener
    :return: the opened file object
    '''
    if os.path.isfile(fn) and os.stat(fn).st_size > 0:
        probe_size = max(len(signature) for signature in fmagic)
        with open(fn, 'rb') as probe:
            head = probe.read(probe_size)
        for signature, opener in fmagic.items():
            if head.startswith(signature):
                return opener(fn, *args)
    elif fn.endswith('gz'):
        return gzip.open(fn, *args)
    elif fn.endswith('bz2'):
        return bz2.BZ2File(fn, *args)

    return open(fn, *args)
38 |
39 |
def exe_name():
    '''
    Return the basename of the running script (``sys.argv[0]``), e.g. for
    use in argparse help texts.
    '''
    script_path = sys.argv[0]
    return os.path.basename(script_path)
46 |
47 |
class LazyProperty(object):
    '''
    Decorator turning a method into an attribute that is computed on first
    access and then cached on the instance.

    Python Cookbook (Denis Otkidach)
    http://stackoverflow.com/users/168352/denis-otkidach

    (Include from pyoma.browser.models - Adrian Altenhoff)
    '''
    def __init__(self, method, name=None):
        # keep the wrapped function and the attribute name to cache under
        self.method = method
        self.name = name or method.__name__
        self.__doc__ = method.__doc__

    def __get__(self, inst, cls):
        if inst is not None:
            value = self.method(inst)
            # Storing the value on the instance shadows this descriptor, so
            # later accesses do not call the method again.
            setattr(inst, self.name, value)
            return value
        # Accessed on the class itself: hand back the descriptor.
        return self
75 |
76 |
def get_job_id():
    '''
    Gets job ID.

    The scheduler variables JOB_ID (SGE), LSB_JOBID (LSF), PBS_JOBID
    (PBS / Torque) and SLURM_ARRAY_JOB_ID (Slurm) are checked in that order
    and the first one that is set decides the result.

    :return: the job ID as int; None when no variable is set or the value
             does not start with a number.
    '''
    for variable in ('JOB_ID', 'LSB_JOBID', 'PBS_JOBID',
                     'SLURM_ARRAY_JOB_ID'):
        if variable not in os.environ:
            continue
        raw = os.environ[variable]
        try:
            return int(raw)
        except ValueError:
            # Values such as "1234.server" are not plain integers; keep the
            # leading number instead of failing.
            text = raw.strip()
            leading = text[:len(text) - len(text.lstrip('0123456789'))]
            return int(leading) if leading else None

    # No parallelism detected.
    return None
96 |
97 |
def get_worker_id():
    '''
    Gets worker ID from the array ID in the job handler.

    Checks SGE_TASK_ID (SGE), LSB_JOBINDEX (LSF), PBS_ARRAYID
    (PBS / Torque) and SLURM_ARRAY_TASK_ID (Slurm) in that order; only the
    first variable that is set is considered.

    :return: the worker ID as int, or None when no variable is set or the
             value is not an integer.
    '''
    for variable in ('SGE_TASK_ID', 'LSB_JOBINDEX', 'PBS_ARRAYID',
                     'SLURM_ARRAY_TASK_ID'):
        if variable in os.environ:
            try:
                return int(os.environ[variable])
            except ValueError:
                # int() to base10 error
                return None

    # No parallelism detected.
    return None
121 |
122 |
def check_array_ids(args):
    '''
    Checks the IDs added to args for array jobs. Raises errors if not setup
    correctly.

    :param args: object with the attributes ``job_id``, ``worker_id`` and
                 ``array`` (the array size)
    :raises RuntimeError: when the job or worker ID is missing, or when the
                          worker ID lies outside 1..array
    '''
    # Test for missing IDs first: comparing None with the array size would
    # fail with a TypeError before this message could be shown.
    if args.job_id is None or args.worker_id is None:
        # NOTE(review): "HOGPROP" looks like a leftover program name - confirm.
        raise RuntimeError('User requested HOGPROP to run as job array. '
                           'Can\'t find job ID ({}) or array ID ({}).'
                           .format(args.job_id, args.worker_id))
    if not 1 <= args.worker_id <= args.array:
        raise RuntimeError('Recognised: worker ID {} and array size {}. '
                           'Worker IDs should run from 1-N (N is array size'
                           ').'.format(args.worker_id, args.array))
136 |
--------------------------------------------------------------------------------
/read2tree/file_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .context_managers import *
2 |
--------------------------------------------------------------------------------
/read2tree/file_utils/context_managers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tempfile
4 |
5 | __all__ = ['TempFile', 'TempDir', 'ChDir', 'MkDir', 'NonDeletingTempDir']
6 |
7 |
class TempFile(object):
    """
    Context manager for working with a temporary file
    that automatically cleans up.

    Usage:

    with TempFile() as tmp:
        # In scope, the file exists on the disk and tmp is its absolute path
        # Do some work with it, e.g. open(tmp, 'w').write('something')

    # Out of scope, the file is deleted

    with TempFile('local_temp_space') as tmp:
        # the file is created in the directory 'local_temp_space'
        # The specified directory must exist, or an error is thrown

    """

    def __init__(self, dir_=None):
        """
        :param dir_: existing directory for the file, or None for the default
        :raises IOError: when *dir_* is given but does not exist
        """
        if dir_ is not None and not os.path.exists(dir_):
            raise IOError('Directory "{}" does not exist'.format(dir_))
        self.dir = dir_

    def __enter__(self):
        self._fd, self._wrapped_tmp = tempfile.mkstemp(dir=self.dir)
        return os.path.abspath(self._wrapped_tmp)

    def __exit__(self, type, value, tb):
        # Remove the file even when closing the descriptor fails.
        try:
            os.close(self._fd)
        finally:
            os.remove(self._wrapped_tmp)
39 |
40 |
class TempDir(object):
    """
    Context manager for working with a temporary directory
    that automatically cleans up.

    Usage:

    with TempDir() as tmpd:
        # In scope, tmpd (an absolute path) exists on the disk
        # Do some work with tmpd ...

    # Out of scope, tmpd is deleted along with all its content

    Can be nested with TempFile, e.g.

    with TempDir() as tmpd, TempFile(tmpd) as tmpf:
        # tempfile tmpf is created inside temporary directory tmpd
        # On exit, everything is deleted

    """

    def __enter__(self):
        created = tempfile.mkdtemp()
        self._wrapped_tmpdir = created
        return os.path.abspath(created)

    def __exit__(self, type, value, tb):
        doomed = self._wrapped_tmpdir
        shutil.rmtree(doomed)
68 |
69 |
class NonDeletingTempDir(TempDir):
    """
    Variant of TempDir whose exit handler does nothing, so the created
    directory is left on disk.
    """

    def __exit__(self, tpye, value, tb):
        return None
73 |
74 |
class ChDir(object):
    """
    Context manager to switch to a working directory,
    and return to the current directory (like 'Dir.chdir do' block in Ruby)

    The directory to return to is recorded when the object is constructed.

    Usage:

    with TempDir() as dir, ChDir(dir):
        # Do some work in the working temp directory 'dir'

    # Exit 'dir'
    """

    def __init__(self, working_dir):
        """
        :param working_dir: existing directory to switch to
        :raises IOError: when *working_dir* does not exist
        """
        if not os.path.exists(working_dir):
            raise IOError('Directory "{}" does not exist'.format(working_dir))
        self._cdir = os.getcwd()
        self._wdir = working_dir

    def __enter__(self):
        os.chdir(self._wdir)

    def __exit__(self, type, value, tb):
        os.chdir(self._cdir)
99 |
100 |
class MkDir(ChDir):
    """
    Context manager that makes sure a directory exists.

    The directory is created on construction when it is missing. Entering
    and leaving the context do nothing: the current working directory is
    not changed by this class.

    Usage:

    with TempDir() as dir, MkDir(dir):
        # 'dir' is guaranteed to exist here

    """

    def __init__(self, working_dir):
        if os.path.exists(working_dir):
            return
        try:
            os.makedirs(working_dir)
        except OSError as err:
            # errno 17: the path was created by another thread / process in
            # the meantime - a race condition, but probably benign.
            created_meanwhile = err.errno == 17
            if not created_meanwhile:
                raise

    def __enter__(self):
        return None

    def __exit__(self, type, value, tb):
        return None
129 |
--------------------------------------------------------------------------------
/read2tree/logging/log.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | version: 1
3 | disable_existing_loggers: False
4 | formatters:
5 | simple:
6 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
7 | handlers:
8 | console:
9 | class: logging.StreamHandler
10 | level: INFO
11 | formatter: simple
12 | stream: ext://sys.stdout
13 | mplog:
14 | class: read2tree.MultiProcessingLog.MultiProcessingLog
15 | level: DEBUG
16 | formatter: simple
17 | name: mplog.log
18 | mode: a
19 | maxsize: 1024
20 | rotate: 0
21 | root:
22 | level: DEBUG
23 | handlers: [console, mplog]
--------------------------------------------------------------------------------
/read2tree/logging/log.yaml.bak:
--------------------------------------------------------------------------------
1 | ---
2 | version: 1
3 | disable_existing_loggers: False
4 | formatters:
5 | simple:
6 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
7 | handlers:
8 | console:
9 | class: logging.StreamHandler
10 | level: INFO
11 | formatter: simple
12 | stream: ext://sys.stdout
13 | mplog:
14 | class: read2tree.MultiProcessingLog.MultiProcessingLog
15 | level: DEBUG
16 | formatter: simple
17 | name: mplog.log
18 | mode: a
19 | maxsize: 1024
20 | rotate: 0
21 | root:
22 | level: DEBUG
23 | handlers: [console, mplog]
--------------------------------------------------------------------------------
/read2tree/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .OMAOutputParser import *
--------------------------------------------------------------------------------
/read2tree/stats/Coverage.py:
--------------------------------------------------------------------------------
1 | import pysam
2 | import numpy as np
3 |
4 |
class Coverage(object):
    """
    Collects per-reference coverage (mean and standard deviation of the
    pileup depth) and writes it as a comma separated table.
    """

    def __init__(self, args):
        """
        :param args: settings object; ``min_cons_coverage`` is read from it
        """
        self.args = args
        self.coverage = {}  # cleaned reference id -> [mean, std]

    def get_coverage_bam(self, file_name):
        """
        Compute the coverage of every reference in a BAM file and store it
        in ``self.coverage``.

        :param file_name: path of the BAM file
        """
        mybam = pysam.AlignmentFile(file_name, 'rb')
        # Close the BAM file even when the pileup fails.
        try:
            for ref in mybam.references:
                self.coverage[self._get_clean_id(ref)] \
                    = self._get_gene_coverage(mybam, ref)
        finally:
            mybam.close()

    def _get_clean_id(self, id):
        """Return the first two '_'-separated fields of the id's first word."""
        fields = id.split(" ")[0].split("_")
        return fields[0] + "_" + fields[1]

    def add_coverage(self, ref, coverage):
        """Store *coverage* (a [mean, std] pair) under the key *ref*."""
        self.coverage[ref] = coverage

    def write_coverage_bam(self, file_name):
        """
        Write all stored coverages to *file_name*, one line per reference.

        The species column holds the first five characters of the key, the
        og column its last '_'-separated field and the gene_id column its
        first one.
        """
        lines = ['#species,og,gene_id,coverage,std\n']
        for key, value in self.coverage.items():
            fields = key.split("_")
            lines.append(",".join([key[0:5], fields[-1], fields[0],
                                   str(value[0]), str(value[1])]) + "\n")

        with open(file_name, "w") as myfile:
            myfile.write("".join(lines))

    def read_coverage_from_file(self, file_name):
        raise NotImplementedError

    def _get_gene_coverage(self, mybam, ref):
        """
        :param mybam: bam_file object from pysam
        :param ref: the gene_id reference to pileup the number of reads
                    per column
        :return: [mean, std] of the depth over all columns whose depth is at
                 least ``args.min_cons_coverage``; both values are nan when
                 no column qualifies
        """
        # The pileup is limited to positions 0-100000 of the reference.
        column_coverage = [
            pileupcolumn.n
            for pileupcolumn in mybam.pileup(ref, 0, 100000)
            if pileupcolumn.n >= self.args.min_cons_coverage
        ]
        np_column_coverage = np.array(column_coverage)
        return [np.mean(np_column_coverage), np.std(np_column_coverage)]
57 |
--------------------------------------------------------------------------------
/read2tree/stats/SeqCompleteness.py:
--------------------------------------------------------------------------------
1 | import pysam
2 | import numpy as np
3 |
4 |
class SeqCompleteness(object):
    """
    Computes how much of a mapped sequence is resolved (not N / X) and
    writes the values as a comma separated table.
    """

    def __init__(self, mapped_ref=None, tested_ref=None):
        """
        :param mapped_ref: records the reads were mapped to (optional)
        :param tested_ref: records used as alternative length reference
                           (optional)
        """
        self.seq_completeness = {}

        if mapped_ref:
            self.map_ref_records = self._get_og_dict(mapped_ref)
        else:
            self.map_ref_records = None

        if tested_ref:
            self.ref_records = self._get_og_dict(tested_ref)
        else:
            self.ref_records = None

    def get_seq_completeness(self, records):
        """Compute and store the completeness of every record, keyed by id."""
        for record in records:
            self.seq_completeness[
                record.id] = self._get_single_seq_completeness(record)

    def _get_single_seq_completeness(self, mapped_record, gene_code='dna'):
        """
        Calculate single sequence completeness using the number of dna or aa
        positions that are not N/X divided by either the length of the
        sequence or the full length of the reference.

        :param mapped_record: sequence record that was produced by mapping
        :param gene_code: 'dna' or 'aa'
        :return: list [map_seq_completeness, ref_seq_completeness,
                 non_n_len, map_seq_len, ref_seq_len]
        :raises ValueError: for any other *gene_code*
        """
        og_id = self._get_og_id(mapped_record.id)
        map_ref_seq = str(self.map_ref_records[og_id].seq).upper()
        map_seq = str(mapped_record.seq).upper()
        if self.ref_records and og_id in self.ref_records:
            ref_seq = str(self.ref_records[og_id].seq).upper()
        else:
            ref_seq = map_ref_seq

        ref_seq_len = len(ref_seq)
        if gene_code == 'dna':
            # Length comes from the mapping reference, not the mapped record.
            map_seq_len = len(map_ref_seq)
            non_n_len = map_seq_len - map_seq.count('N')
        elif gene_code == 'aa':
            map_seq_len = len(map_seq)
            non_n_len = map_seq_len - map_seq.count('X')
        else:
            raise ValueError(
                "gene_code must be 'dna' or 'aa', got {!r}".format(gene_code))
        map_seq_completeness = non_n_len / map_seq_len
        ref_seq_completeness = non_n_len / ref_seq_len
        return [map_seq_completeness, ref_seq_completeness,
                non_n_len, map_seq_len, ref_seq_len]

    def _get_og_dict(self, ref_og):
        """
        Index records by the second '_'-separated field of their id.

        Each record's id is shortened in place to its first two fields.
        Records whose id contains no '_' are skipped, so they can never be
        filed under the key of a previous record.
        """
        dna_dict = {}
        for record in ref_og:
            if '_' not in record.id:
                continue
            split_id = record.id.split("_")
            record.id = split_id[0] + "_" + split_id[1]
            dna_dict[split_id[1]] = record
        return dna_dict

    def _get_og_id(self, id):
        """Return the second '_'-separated field of *id*."""
        return id.split("_")[1]

    def _get_gene_id(self, id):
        """Return the first '_'-separated field of *id*."""
        return id.split("_")[0]

    def add_seq_completeness(self, ref, seq_completeness):
        """Store a precomputed five-element completeness list under *ref*."""
        self.seq_completeness[ref] = seq_completeness

    def write_seq_completeness(self, file_name):
        """
        Write all stored values to *file_name*, one line per key.

        The species column holds the first five characters of the key, the
        og column its last '_'-separated field and the gene_id column its
        first one.
        """
        lines = ['#species,og,gene_id,map_seq_completeness,'
                 'ref_seq_completeness,inferred_len,given_len,ref_len\n']
        for key, value in self.seq_completeness.items():
            fields = key.split("_")
            columns = [key[0:5], fields[-1], fields[0]]
            columns.extend(str(value[i]) for i in range(5))
            lines.append(",".join(columns) + "\n")

        with open(file_name, "w") as myfile:
            myfile.write("".join(lines))
104 |
--------------------------------------------------------------------------------
/read2tree/stats/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/read2tree/stats/__init__.py
--------------------------------------------------------------------------------
/read2tree/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .seq_utils import *
2 |
--------------------------------------------------------------------------------
/read2tree/wrappers/__init__.py:
--------------------------------------------------------------------------------
class WrapperError(Exception):
    """Error raised by the wrappers around external programs."""
3 |
--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/__init__.py:
--------------------------------------------------------------------------------
1 | from .mafft import Mafft
2 | from .muscle import Muscle
3 | from .prographmsa import ProGraphMSA
4 | from .probcons import ProbCons
5 | from .base_aligner import AlignmentInput, DataType, WrapperError
--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/base_aligner.py:
--------------------------------------------------------------------------------
1 | import os, types, itertools
2 | from abc import ABCMeta, abstractmethod
3 | from enum import Enum
4 | from Bio import AlignIO, SeqIO
5 | from Bio.Align import MultipleSeqAlignment
6 | from read2tree.utils.seq_utils import is_dna
7 |
8 |
9 | from read2tree.wrappers import WrapperError
10 |
11 |
12 | AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
13 | DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
14 |
class Aligner(object):
    """
    Base class for wrappers of Multiple Sequence Aligner software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteAligner(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result

    """
    # NOTE(review): this Python 2 spelling is ignored by Python 3, so the
    # abstract methods are presumably not enforced there - confirm.
    __metaclass__ = ABCMeta

    def __init__(self, input_, datatype=DataType.UNKNOWN, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the Aligner to work on it.

        :param input_: can be either a filename or a biopython multiple
            sequence alignment (a collection of :class:`Bio.SeqRecord.SeqRecord`)

        :param binary: is the alignment's executable file, or None. If set to
            None, it is assumed to be found in the PATH.

        :param datatype: means is it DNA or protein?

        :raises WrapperError: when setting up the command line interface
            fails with an IOError
        """
        self.input_type = identify_input(input_)  # Figure out what it is - file or object

        if datatype == DataType.UNKNOWN:
            if self.input_type == AlignmentInput.OBJECT:
                # Guess on a duplicate so that a one-shot iterator is not
                # consumed before the aligner gets to read it.
                dup, input_ = itertools.tee(input_)
                self.datatype = guess_datatype(dup, False)
            else:
                self.datatype = guess_datatype(input_, True)
        else:
            self.datatype = datatype

        self.input = input_  # store it
        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying aligner
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Return the command line interface object for *binary*.
        """
        pass
86 |
87 | import logging
88 | logger = logging.getLogger()
89 |
def guess_datatype(alignment, from_filename=False):
    """
    Guess whether *alignment* holds DNA or protein sequences.

    :param alignment: sequence records, or a file name when
        *from_filename* is True
    :param from_filename: parse *alignment* as a file (fasta first,
        phylip-relaxed as fallback) before guessing
    :return: DataType.DNA when ``is_dna`` accepts the records, otherwise
        DataType.PROTEIN
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        try:
            alignment = SeqIO.parse(alignment, 'fasta')
        except Exception:
            # Narrowed from a bare except so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            alignment = SeqIO.parse(alignment, 'phylip-relaxed')
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
98 |
99 |
def identify_input(alignment):
    """
    Work out if we're dealing with an in-memory object or a file.

    :param alignment: a MultipleSeqAlignment, generator or list, or the path
        of an existing file
    :return: AlignmentInput.OBJECT or AlignmentInput.FILENAME
    :raises ValueError: for anything else (including a path that does not
        exist)
    """
    if isinstance(alignment, (MultipleSeqAlignment, types.GeneratorType, list)):
        # `alignment` is a Biopython alignment or a collection of records
        return AlignmentInput.OBJECT

    if isinstance(alignment, str) and os.path.exists(alignment):
        # `alignment` is a filepath
        return AlignmentInput.FILENAME

    # `alignment` is some other thing we can't handle. Raising here replaces
    # the former silent None result, which surfaced later as an obscure
    # failure.
    raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))
117 |
118 |
119 | # TODO: Break the identify_input function into two parts - one to work out the datatype, one to work out whether
120 | # this is a file or an object
121 |
--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/muscle.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import time
3 | from Bio import AlignIO, SeqIO
4 | from six import StringIO
5 | from ..abstract_cli import AbstractCLI
6 | from .base_aligner import Aligner, AlignmentInput, DataType
7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, TreeInputOption, OptionSet
8 |
9 |
class MuscleCLI(AbstractCLI):
    """
    Muscle low-level command line interface

    example:
    muscle_cli = MuscleCLI()
    process = muscle_cli(cmd='muscle args...')
    stdout = muscle_cli.get_stdout()
    """
    @property
    def _default_exe(self):
        # name of the executable used when no binary is given explicitly
        executable = 'muscle'
        return executable
22 |
23 | # def _set_help(self):
24 | # self(help=True, wait=True)
25 | # self._help = self.get_stdout()
26 |
def set_default_dna_options(aligner):
    """
    Give *aligner* a fresh default option set (no DNA-specific settings).
    """
    defaults = get_default_options()
    aligner.options = defaults
32 |
33 |
def set_default_protein_options(aligner):
    """
    Give *aligner* a fresh default option set (no protein-specific settings).
    """
    defaults = get_default_options()
    aligner.options = defaults
39 |
class Muscle(Aligner):
    """
    Convenient wrapper for Muscle multiple sequence aligner

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a basic implementation that can be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = Muscle(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        super(Muscle, self).__init__(input_, *args, **kwargs)
        self.result = None  # filled by __call__
        self.options = get_default_options()

        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Anything to do with calling Muscle should go here.
        If any extra arguments need to be passed they can
        be specified (listed as *args and **kwargs for now).

        :return: the alignment read back from the program output
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
            with tempfile.NamedTemporaryFile(mode="wt") as filehandle:
                SeqIO.write(self.input, filehandle, 'fasta')
                # Push buffered records to disk before the external program
                # reads the file.
                filehandle.flush()
                output, error = self._call(filehandle.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        # Keep the raw streams first so they stay available for diagnosis
        # when parsing the output fails.
        self.stdout = output
        self.stderr = error
        self.result = self._read_result(output)  # store result

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level MuscleCLI wrapper.
        Extra *args and **kwargs are accepted but not used
        [This only covers the simplest automatic
         case]

        :return: tuple (stdout, stderr) of the finished run
        """
        self.cli('{} -in {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current options rendered as a command line string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result (fasta text) as an alignment.
        """
        fileobj = StringIO(output)
        return AlignIO.read(fileobj, 'fasta')

    def _init_cli(self, binary):
        return MuscleCLI(executable=binary)
120 |
121 |
def get_default_options():
    """
    Return a new OptionSet with the Muscle options known to this wrapper;
    all of them start inactive.
    """
    algorithm_options = [
        # Find diagonals (faster for similar sequences)
        FlagOption('-diags', False, active=False),

        # Maximum number of iterations(integer, default 16)
        IntegerOption('-maxiters', 16, active=False),

        # Maximum time to iterate in hours (default no limit)
        FloatOption('-maxhours', 0.0, active=False),

        # disabled: TreeInputOption('-usetree', '', active=False)
    ]
    return OptionSet(algorithm_options)
137 |
--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/probcons.py:
--------------------------------------------------------------------------------
1 | import time
2 | from Bio import AlignIO, SeqIO
3 | from six import StringIO
4 | from ..abstract_cli import AbstractCLI
5 | from .base_aligner import Aligner, AlignmentInput, DataType
6 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
7 | import tempfile
8 |
9 |
class ProbConsCLI(AbstractCLI):
    """
    ProbCons low-level command line interface

    :Example:

    ::

        probcons_cli = ProbConsCLI()
        process = probcons_cli(cmd='probcons args...')
        stdout = probcons_cli.get_stdout()
    """
    @property
    def _default_exe(self):
        # name of the executable used when no binary is given explicitly
        executable = 'probcons'
        return executable
25 |
26 | # def _set_help(self):
27 | # self(help=True, wait=True)
28 | # self._help = self.get_stdout()
29 |
30 |
def set_default_dna_options(aligner):
    """
    Give *aligner* a fresh default option set (no DNA-specific settings).
    """
    defaults = get_default_options()
    aligner.options = defaults
36 |
37 |
def set_default_protein_options(aligner):
    """
    Give *aligner* a fresh default option set (no protein-specific settings).
    """
    defaults = get_default_options()
    aligner.options = defaults
43 |
44 |
class ProbCons(Aligner):
    """
    Convenient wrapper for ProbCons multiple sequence aligner

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a basic implementation that can be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ProbCons(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result


    .. note:: There exists an ipython notebook on how to work with wrappers,
              including dealing with non-default parameters.
    """

    def __init__(self, input_, *args, **kwargs):
        super(ProbCons, self).__init__(input_, *args, **kwargs)
        self.result = None  # filled by __call__
        self.options = get_default_options()
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Anything to do with calling ProbCons should go here.
        If any extra arguments need to be passed they can
        be specified (listed as *args and **kwargs for now).

        :return: the alignment read back from the program output
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:  # different operation depending on what it is
            with tempfile.NamedTemporaryFile(mode='wt') as filehandle:
                SeqIO.write(self.input, filehandle, 'fasta')
                # Push buffered records to disk before the external program
                # reads the file.
                filehandle.flush()
                output, error = self._call(filehandle.name, *args, **kwargs)

        else:
            output, error = self._call(self.input, *args, **kwargs)

        # Keep the raw streams first so they stay available for diagnosis
        # when parsing the output fails.
        self.stdout = output
        self.stderr = error
        self.result = self._read_result(output)  # store result

        end = time.time()
        self.elapsed_time = end - start
        return self.result
        # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call underlying low level ProbConsCLI wrapper.
        Extra *args and **kwargs are accepted but not used
        [This only covers the simplest automatic
         case]

        :return: tuple (stdout, stderr) of the finished run
        """
        self.cli('{} {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current options rendered as a command line string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result (fasta text) as an alignment.
        """
        fileobj = StringIO(output)
        return AlignIO.read(fileobj, 'fasta')

    def _init_cli(self, binary):
        return ProbConsCLI(executable=binary)
129 |
130 |
def get_default_options():
    """
    Build the default option set for the ProbCons wrapper.

    Every option is created inactive.
    """
    defaults = [
        # Output: use CLUSTALW output format instead of MFA
        FlagOption('-clustalw', False, active=False),

        # Consistency transformation passes, 0 <= REPS <= 5
        # NOTE(review): the stored value is 0 although the tool's help text
        # quotes a default of 2 -- confirm before activating this option.
        IntegerOption('-c', 0, active=False),

        # Iterative-refinement passes, 0 <= REPS <= 1000 (default: 100)
        IntegerOption('-ir', 100, active=False),

        # Pretraining rounds, 0 <= REPS <= 20 (default: 0)
        IntegerOption('-pre', 0, active=False),

        # Generate all-pairs pairwise alignments
        FlagOption('-pairs', False, active=False),

        # Use the Viterbi algorithm to generate all pairs
        # (automatically enables -pairs)
        FlagOption('-viterbi', False, active=False),

        # Write annotation for the multiple alignment to FILENAME
        StringOption('-annot', '', active=False),

        # Print sequences in alignment order rather than input order
        # (default: off)
        FlagOption('-a', False, active=False),
    ]
    return OptionSet(defaults)
160 |
--------------------------------------------------------------------------------
/read2tree/wrappers/aligners/prographmsa.py:
--------------------------------------------------------------------------------
1 | import time
2 | from Bio import AlignIO, SeqIO
3 | import tempfile
4 | from six import StringIO
5 | from ..abstract_cli import AbstractCLI
6 | from .base_aligner import Aligner, AlignmentInput, DataType
7 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
8 |
9 |
class ProGraphMSACLI(AbstractCLI):
    """
    Low-level command line interface to ProGraphMSA.

    :Example:

    ::

        prograph_cli = ProGraphMSACLI()
        process = prograph_cli('--fasta input.fa', wait=True)
        stdout = prograph_cli.get_stdout()
    """

    @property
    def _default_exe(self):
        # Name of the executable reported as the default.
        default_binary = 'ProGraphMSA'
        return default_binary
26 |
27 |
def set_default_dna_options(aligner):
    """
    Install the default option set on ``aligner`` for DNA input.

    No DNA-specific settings are applied: the options are exactly those
    returned by ``get_default_options``.
    """
    default_options = get_default_options()
    aligner.options = default_options
33 |
34 |
def set_default_protein_options(aligner):
    """
    Install the default option set on ``aligner`` for protein input.

    No protein-specific settings are applied: the options are exactly
    those returned by ``get_default_options``.
    """
    default_options = get_default_options()
    aligner.options = default_options
40 |
41 |
class ProGraphMSA(Aligner):
    """
    Callable wrapper around the ProGraphMSA multiple sequence aligner.

    An instance keeps state about the run it performs: the parsed result,
    the captured stdout/stderr and the elapsed time. ``__init__`` does the
    setup and ``__call__`` does the work; all other methods are helpers.

    :Example:

    ::

        callable_wrapper = ProGraphMSA(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        """
        Forward all arguments to the parent constructor, then install the
        option set through the initialiser matching ``self.datatype``.
        """
        super(ProGraphMSA, self).__init__(input_, *args, **kwargs)
        self.options = get_default_options()
        if self.datatype == DataType.DNA:
            apply_defaults = set_default_dna_options
        else:
            apply_defaults = set_default_protein_options
        apply_defaults(self)

    def __call__(self, *args, **kwargs):
        """
        Run ProGraphMSA on the stored input and return the parsed result.

        Extra positional and keyword arguments are forwarded to ``_call``.
        """
        started = time.time()  # time the execution

        if self.input_type != AlignmentInput.OBJECT:
            # Anything that is not an in-memory object is passed unchanged.
            stdout, stderr = self._call(self.input, *args, **kwargs)
        else:
            # In-memory sequences go through a temporary fasta file.
            with tempfile.NamedTemporaryFile(mode="wt") as tmp_fasta:
                SeqIO.write(self.input, tmp_fasta, 'fasta')
                # Rewinding a text file also flushes the buffered content.
                tmp_fasta.seek(0)
                stdout, stderr = self._call(tmp_fasta.name, *args, **kwargs)

        self.result = self._read_result(stdout)  # store result
        self.stdout = stdout
        self.stderr = stderr

        self.elapsed_time = time.time() - started
        return self.result

    # Helper methods
    def _call(self, filename, *args, **kwargs):
        """
        Invoke the low-level ProGraphMSA wrapper on ``filename``.

        The command line is the rendered option set followed by the input
        filename. ``*args`` and ``**kwargs`` are accepted but not used.

        :return: tuple ``(stdout, stderr)`` as reported by the CLI object
        """
        command_line = '{} {}'.format(self.command(), filename)
        self.cli(command_line, wait=True)
        captured_out = self.cli.get_stdout()
        captured_err = self.cli.get_stderr()
        return captured_out, captured_err

    def command(self):
        """Return the current option set rendered as a string."""
        rendered_options = str(self.options)
        return rendered_options

    def _read_result(self, output):
        """
        Parse the text produced by the aligner as a fasta alignment.
        """
        return AlignIO.read(StringIO(output), 'fasta')

    def _init_cli(self, binary):
        """Create the command line interface object for ``binary``."""
        cli = ProGraphMSACLI(executable=binary)
        return cli
122 |
123 |
def get_default_options():
    """
    Build the default option set for the ProGraphMSA wrapper.

    Only ``--fasta`` is active; every other option is created inactive.
    """
    defaults = [
        # Output fasta format (instead of stockholm); preferred because
        # no tree output is produced
        FlagOption('--fasta', True, active=True),

        # Output all ancestral sequences
        FlagOption('--ancestral_seqs', False, active=False),

        # Output sequences in input order (default: tree order)
        FlagOption('--input_order', False, active=False),

        # Output all intermediate guide trees
        FlagOption('--all_trees', False, active=False),

        # Use ML distances with gap
        FlagOption('--mldist_gap', False, active=False),

        # Use ML distances
        FlagOption('--mldist', False, active=False),

        # Guide tree to use
        StringOption('--tree', '', active=False),
    ]
    return OptionSet(defaults)
150 |
--------------------------------------------------------------------------------
/read2tree/wrappers/read_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | from .ngm import NGM
2 | from .ngmlr import NGMLR
--------------------------------------------------------------------------------
/read2tree/wrappers/read_mappers/base_mapper.py:
--------------------------------------------------------------------------------
1 | import os, types
2 | from abc import ABCMeta, abstractmethod
3 | from enum import Enum
4 | from Bio.SeqRecord import SeqRecord
5 | from read2tree.wrappers import WrapperError
6 |
7 | import logging
8 | logger = logging.getLogger(__name__)
9 |
# Categories returned by identify_reference() for a reference input.
ReferenceInput = Enum('ReferenceInput', 'OBJECT STRING FILENAME')
# Categories returned by identify_reads() for a read input.
ReadInput = Enum('ReadInput', 'OBJECT STRING FILENAME')
12 |
class ReadMapper(object):
    """
    Base class for wrappers of read mapping software.

    The wrapper is a callable class, so an instance can keep state about
    the operation it performs (inputs, execution time, captured output)
    in addition to doing the work.

    Concrete wrappers extend this class: ``__init__`` does the setup,
    ``__call__`` does the work and ``_init_cli`` provides the command
    line interface object.

    :Example:

    ::

        callable_wrapper = ConcreteMapper(reference, reads)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
    """
    __metaclass__ = ABCMeta

    def __init__(self, reference=None, reads=None, tmp_folder=None, binary=None):
        """
        Store the inputs and create the command line interface.

        :param reference: reference input, classified with
            ``identify_reference`` unless it is None
        :param reads: read input, classified with ``identify_reads``
            unless it is None
        :param tmp_folder: folder stored as ``self.tmp_folder``; the
            current folder ("./") is used when None
        :param binary: passed on to ``_init_cli``
        :raises WrapperError: if ``_init_cli`` raises an IOError
        """
        # Reference: remember how it was classified and the input itself.
        self.ref_input_type = None if reference is None else identify_reference(reference)
        self.ref_input = reference

        # Reads: same treatment as the reference.
        self.read_input_type = None if reads is None else identify_reads(reads)
        self.read_input = reads

        # Fall back to the current folder when none is given.
        self.tmp_folder = "./" if tmp_folder is None else tmp_folder

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        Run the underlying read mapper; implemented by subclasses.
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
95 |
def identify_reference(sequence):
    """
    Classify a reference input.

    :param sequence: a ``SeqRecord``, a generator or a list (treated as an
        in-memory object), or a string (a path if it exists on disk,
        otherwise a plain string).
    :return: ``ReferenceInput.OBJECT``, ``ReferenceInput.FILENAME`` or
        ``ReferenceInput.STRING``
    :raises ValueError: if ``sequence`` is none of the supported kinds
    """
    if isinstance(sequence, (SeqRecord, types.GeneratorType, list)):
        # In-memory sequence data
        return ReferenceInput.OBJECT

    if isinstance(sequence, str):
        # A string naming an existing path is a file; any other string is
        # reported as STRING.
        if os.path.exists(sequence):
            return ReferenceInput.FILENAME
        return ReferenceInput.STRING

    # The type checks above never raise, so unsupported input has to be
    # rejected explicitly instead of falling through and returning None.
    raise ValueError('{} is not an sequence object or a valid filename'.format(sequence))
117 |
118 |
def identify_reads(reads):
    """
    Classify a read input.

    When ``reads`` is a list, the decision is based on its first element.

    :param reads: a ``SeqRecord``, a generator, a string, or a list whose
        first element is one of those (or itself a list).
    :return: ``ReadInput.OBJECT``, ``ReadInput.FILENAME`` (string naming an
        existing path) or ``ReadInput.STRING`` (any other string)
    :raises ValueError: if ``reads`` is an empty list or is none of the
        supported kinds
    """
    if isinstance(reads, list):
        if not reads:
            # Nothing to inspect; indexing would raise an IndexError.
            raise ValueError('{} is not an sequence object or a valid filename'.format(reads))
        read = reads[0]
    else:
        read = reads

    if isinstance(read, (SeqRecord, types.GeneratorType, list)):
        # In-memory sequence data
        return ReadInput.OBJECT

    if isinstance(read, str):
        if os.path.exists(read):
            return ReadInput.FILENAME
        return ReadInput.STRING

    # Report the offending input itself; unsupported input is rejected
    # explicitly instead of falling through and returning None.
    raise ValueError('{} is not an sequence object or a valid filename'.format(reads))
145 |
146 |
147 |
--------------------------------------------------------------------------------
/read2tree/wrappers/read_mappers/parser.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pysam
3 | from pyparsing import Suppress, SkipTo, Word, Regex, Literal, OneOrMore, \
4 | Group, LineEnd, CharsNotIn, nums, alphanums, ParseException
5 |
6 |
7 | logger = logging.getLogger(__name__)
8 | logger.addHandler(logging.StreamHandler())
9 |
# Token made of digits, '.' and '-'; the matched text is converted to float.
FLOAT = Word(nums + '.-').setParseAction(lambda x: float(x[0]))
# Token made of digits only; the matched text is converted to int.
INT = Word(nums).setParseAction(lambda x: int(x[0]))
# Token made of alphanumerics plus '_', '-', '%', '.' and space.
WORD = Word(alphanums + '_-%. ')
# Token made of alphanumerics plus space and '_'.
SPACEDWORD = Word(alphanums+' _')
14 |
15 |
class NGMParser(object):
    """
    Parser for the summary line NGM prints when it finishes, e.g.::

        [MAIN] Done (15778 reads mapped (4.14%), 365184 reads not mapped, 15778 lines written)(elapsed: 73.131973s)
    """

    def __init__(self):
        self.READS_MAPPED = Literal('[MAIN] Done (')
        self.TOTAL_READS = Regex(r'\[MAIN\] Done \(\d+ reads mapped \(\d+\.\d+\%\), ')
        self.MAPPING_TIME = Literal('elapsed: ')
        # Each grammar skips ahead to its marker and captures the number
        # that directly follows it.
        self.rm = Suppress(SkipTo(self.READS_MAPPED)) + Suppress(self.READS_MAPPED) + INT
        # NOTE(review): the integer following this pattern is the
        # "reads not mapped" count in the example line above, although it
        # is reported as total_reads -- confirm the intended meaning.
        self.tr = Suppress(SkipTo(self.TOTAL_READS)) + Suppress(self.TOTAL_READS) + INT
        self.mt = Suppress(SkipTo(self.MAPPING_TIME)) + Suppress(self.MAPPING_TIME) + FLOAT

    def parse(self, stdout):
        """
        Extract the summary numbers from the mapper output.

        :param stdout: text captured from the mapper
        :return: tuple ``(reads_mapped, total_reads, mapping_time)``, or
            None if the summary could not be parsed (the failure is logged)
        """
        try:
            reads_mapped = self.rm.parseString(stdout).asList()[0]
            total_reads = self.tr.parseString(stdout).asList()[0]
            mapping_time = self.mt.parseString(stdout).asList()[0]
        except ParseException as err:
            logger.error('Could not parse NGM output: %s\n%s', err, stdout)
            return None
        return reads_mapped, total_reads, mapping_time

    def to_dict(self, file, stdout):
        """
        Combine the parsed summary with the opened alignment file.

        :param file: path of the alignment file, opened with
            ``pysam.AlignmentFile(file, "r")``
        :param stdout: text captured from the mapper
        :return: dict with keys ``file``, ``reads_mapped``, ``total_reads``,
            ``mapping_time`` and ``sam``; the three numbers are None when
            the summary could not be parsed
        """
        parsed = self.parse(stdout)
        if parsed is None:
            # parse() signals failure by returning None; unpacking it
            # directly would raise a TypeError.
            reads_mapped = total_reads = mapping_time = None
        else:
            reads_mapped, total_reads, mapping_time = parsed
        samfile = pysam.AlignmentFile(file, "r")
        result = {'file': file,
                  'reads_mapped': reads_mapped,
                  'total_reads': total_reads,
                  'mapping_time': mapping_time,
                  'sam': samfile}
        return result
56 |
57 |
class NGMLRParser(object):
    """
    Parser for NGMLR progress and summary lines, e.g.::

        Processed: 75400 (0.00), R/S: 60.15, RL: 7675, Time: 3.00 11.00 10.07, Align: 1.00, 310, 3.04
        Done (77 reads mapped (0.10%), 75323 reads not mapped, 75402 lines written)(elapsed: 20m, 0 r/s)
    """

    def __init__(self):
        self.TOTAL_MAPPED_READS = Literal('Done (')
        self.TOTAL_READS = Literal('Processed: ')
        # Each grammar skips ahead to its marker and captures the number
        # that directly follows it.
        self.tmr = Suppress(SkipTo(self.TOTAL_MAPPED_READS)) + \
                   Suppress(self.TOTAL_MAPPED_READS) + FLOAT
        self.tr = Suppress(SkipTo(self.TOTAL_READS)) + Suppress(self.TOTAL_READS) + FLOAT

    def parse(self, stdout):
        """
        Extract the read counts from the mapper output.

        :param stdout: text captured from the mapper
        :return: tuple ``(total_mapped_reads, total_reads)``; a value is
            None when it could not be parsed (the failure is logged)
        """
        # Pre-bind both values so a parse failure yields None instead of
        # an UnboundLocalError at the return statement.
        total_mapped_reads = None
        total_reads = None
        try:
            total_mapped_reads = self.tmr.parseString(stdout).asList()[0]
            total_reads = self.tr.parseString(stdout).asList()[0]
        except ParseException as err:
            logger.error(err)

        return total_mapped_reads, total_reads

    def to_dict(self, file, stdout):
        """
        Combine the parsed counts with the opened alignment file.

        :param file: path of the alignment file, opened with
            ``pysam.AlignmentFile(file, "r")``
        :param stdout: text captured from the mapper
        :return: dict with keys ``file``, ``reads_mapped``, ``total_reads``
            and ``sam``
        """
        total_mapped_reads, total_reads = self.parse(stdout)
        samfile = pysam.AlignmentFile(file, "r")
        result = {'file': file,
                  'reads_mapped': total_mapped_reads,
                  'total_reads': total_reads,
                  'sam': samfile}

        return result
93 |
--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/__init__.py:
--------------------------------------------------------------------------------
1 | from .phyml import Phyml
2 | from .raxml import Raxml
3 | from .iqtree import Iqtree
4 | from .fasttree import Fasttree
5 |
--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/base_treebuilder.py:
--------------------------------------------------------------------------------
1 | import os, types, itertools
2 | from abc import ABCMeta, abstractmethod
3 | from enum import Enum
4 | from Bio import AlignIO, SeqIO
5 | from Bio.Align import MultipleSeqAlignment
6 | from read2tree.utils.seq_utils import is_dna
7 |
8 | from read2tree.wrappers import WrapperError
9 | from read2tree.wrappers.aligners.base_aligner import identify_input
10 |
11 | import logging
12 | logger = logging.getLogger(__name__)
13 |
# Categories returned by identify_input() for an alignment input.
AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME')
# Sequence alphabet of an alignment; UNKNOWN triggers guessing in TreeBuilder.
DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN')
16 |
17 |
class TreeBuilder(object):
    """
    Base class for wrappers of tree building software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteTreeBuilder(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """
    __metaclass__ = ABCMeta

    def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None):
        """
        Store the alignment, determine its datatype and create the CLI.

        :param alignment: the input alignment, classified with
            ``identify_input``, or None when no input is given yet
        :param datatype: a ``DataType`` member; when it is
            ``DataType.UNKNOWN`` and an alignment is given, the datatype is
            guessed with ``guess_datatype``
        :param binary: passed on to ``_init_cli``
        :raises WrapperError: if ``_init_cli`` raises an IOError
        """
        if alignment is not None:
            self.input_type = identify_input(alignment)  # Figure out what it is - file or object
            if datatype == DataType.UNKNOWN:
                self.datatype = guess_datatype(
                    alignment,
                    from_filename=self.input_type == AlignmentInput.FILENAME)
            else:
                self.datatype = datatype

            self.input = alignment  # store it
        else:
            self.input_type = None
            self.input = None
            # Always define the attribute: subclasses read self.datatype
            # and would otherwise fail with an AttributeError.
            self.datatype = datatype

        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
    # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying tree builder
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Set up the command-line interface to the wrapped software
        :param binary: filename of executable binary file
        :return: concrete CLI type inheriting from AbstractCLI
        """
        pass
96 |
97 |
def guess_datatype(alignment, from_filename=False):
    """
    Guess whether an alignment holds DNA or protein sequences.

    :param alignment: sequence records, or a filename when
        ``from_filename`` is true
    :param from_filename: read the records from ``alignment`` first,
        trying fasta and then relaxed phylip
    :return: ``DataType.DNA`` if ``is_dna`` accepts the records, otherwise
        ``DataType.PROTEIN``
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        try:
            records = list(SeqIO.parse(alignment, 'fasta'))
        except Exception:
            records = []
        if not records:
            # Fall back to phylip both when the fasta parser raised and
            # when it found no record at all.
            records = list(SeqIO.parse(alignment, 'phylip-relaxed'))
        alignment = records
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
106 |
107 |
def identify_input(alignment):
    """
    Classify an alignment input.

    :param alignment: either a Biopython MultipleSeqAlignment, a generator
        or a list (in-memory object), or a filename pointing to an
        existing file.
    :return: ``AlignmentInput.OBJECT`` or ``AlignmentInput.FILENAME``;
        None when ``alignment`` matches neither case
    :raises ValueError: if any of the checks themselves raise
    """
    try:
        object_types = (MultipleSeqAlignment, types.GeneratorType, list)
        if isinstance(alignment, object_types):
            # `alignment` is in-memory sequence data
            return AlignmentInput.OBJECT

        is_existing_path = isinstance(alignment, str) and os.path.exists(alignment)
        if is_existing_path:
            # `alignment` is a filepath
            return AlignmentInput.FILENAME

    except:
        # one of the checks failed on input we can't handle
        raise ValueError('{} is not an alignment object or a valid filename'.format(alignment))

    # NOTE(review): unsupported input ends up here and yields None rather
    # than an error -- confirm whether callers rely on that.
    return None
128 |
129 |
130 |
--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/fasttree.py:
--------------------------------------------------------------------------------
1 | # Author: Ivana Pilizota
2 | # Date: 1 November 2016
3 |
4 | import logging
5 | import os
6 | import time
7 |
8 | from Bio import SeqIO
9 | from pyparsing import ParseException
10 | import tempfile
11 |
12 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
13 | from .parsers import FasttreeParser
14 |
15 | from ..abstract_cli import AbstractCLI
16 | from ..options import OptionSet, StringOption, IntegerOption
17 | from ...file_utils import TempFile, TempDir
18 |
19 | logger = logging.getLogger(__name__)
20 | logger.addHandler(logging.StreamHandler())
21 | logger.setLevel(logging.INFO)
22 |
23 |
class FasttreeCLI(AbstractCLI):
    """Low-level command line interface to FastTree."""

    @property
    def _default_exe(self):
        # Name of the executable reported as the default.
        default_binary = 'FastTree'
        return default_binary
28 |
29 |
def set_default_dna_options(treebuilder):
    """
    Install the default option set on ``treebuilder`` for DNA input.

    No DNA-specific settings are applied: the options are exactly those
    returned by ``get_default_options``.
    """
    default_options = get_default_options()
    treebuilder.options = default_options
35 |
36 |
def set_default_protein_options(treebuilder):
    """
    Install the default option set on ``treebuilder`` for protein input.

    No protein-specific settings are applied: the options are exactly
    those returned by ``get_default_options``.
    """
    default_options = get_default_options()
    treebuilder.options = default_options
42 |
43 |
class Fasttree(TreeBuilder):
    """
    Callable wrapper around the FastTree tree builder.

    Calling an instance runs FastTree on the stored alignment and returns
    the ``"tree"`` entry of the parsed result.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input alignment, forwarded to ``TreeBuilder``
            together with any extra arguments
        """
        self.options = get_default_options()
        super(Fasttree, self).__init__(alignment=alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run FastTree through ``_call`` and parse what it printed.

        An in-memory alignment is first written to a temporary file in
        relaxed phylip format. The parsed result, the captured
        stdout/stderr and the elapsed time are stored on the instance.

        :return: the ``"tree"`` entry of the parsed result, or None when
            the output could not be parsed (the failure is logged)
        """
        start = time.time()  # time the execution
        if self.input_type == AlignmentInput.OBJECT:
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
                # Make sure the content is on disk before FastTree reads it.
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)
        else:
            filename = os.path.abspath(self.input)
            output, error = self._call(filename, *args, **kwargs)

        self.result = self._read_result(output, error)  # store result
        # Keep the raw output available, as the other wrappers do.
        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        # _read_result yields None after a logged failure; do not index it.
        if self.result is None:
            return None
        return self.result["tree"]

    # Helper methods
    def _call(self, filename, *args, **kwargs):
        """
        Invoke the low-level FastTree wrapper on ``filename``.

        The command line is the rendered option set followed by the input
        filename. ``*args`` and ``**kwargs`` are accepted but not used.

        :return: tuple ``(stdout, stderr)`` as reported by the CLI object
        """
        self.cli('{} {seq_file}'.format(self.command(), seq_file=filename), wait=True)

        return (self.cli.get_stdout(), self.cli.get_stderr())

    def command(self):
        """Return the current option set rendered as a string."""
        return str(self.options)

    def _read_result(self, stdout, stderr):
        """
        Parse the captured output with ``FasttreeParser``.

        :param stdout: passed to the parser as ``tree``
        :param stderr: passed to the parser as ``other``
        :return: the parser's dict, or None if reading or parsing failed
        """
        parser = FasttreeParser()

        try:
            parser.parse(tree=stdout, other=stderr)
            result = parser.to_dict()
        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # The exception must go through a %s placeholder; passing it as
            # a bare extra argument breaks the log record formatting.
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Create the command line interface object for ``binary``."""
        return FasttreeCLI(executable=binary)
115 |
116 |
def get_default_options():
    """
    Build the default option set for the FastTree wrapper.

    Every option is created inactive.
    """
    defaults = [
        # Treat the alignment as nucleotide (nt) data; amino acid data is
        # assumed when this is not active.
        StringOption('-nt', active=False),

        # Use the WAG model for amino acid alignments
        # (default: Jones-Taylor-Thornton).
        StringOption('-wag', active=False),

        # Use the GTR model for nucleotide alignments.
        StringOption('-gtr', active=False),

        # Switch on the gamma option.
        StringOption('-gamma', active=False),

        # Number of rate categories of sites. Default 20.
        IntegerOption('-cat', 20, active=False),

        # Starting tree
        StringOption('-intree', '', active=False),

        # Speed up the neighbor joining phase & reduce memory usage
        # (recommended for >50,000 sequences)
        StringOption('-fastest', active=False),

        # Number of rounds of maximum-likelihood NNIs.
        # Default 4*log2(N), N = the number of unique sequences
        IntegerOption('-mlnni', 0, active=False),
    ]
    return OptionSet(defaults)
147 |
--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/iqtree.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import logging
4 | import tempfile
5 | from pyparsing import ParseException
6 |
7 | from Bio import SeqIO
8 | from .parsers import IqtreeParser
9 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
10 |
11 | from ..abstract_cli import AbstractCLI
12 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
13 |
14 | from ...file_utils import TempFile,TempDir
15 |
16 | logger = logging.getLogger(__name__)
17 | logger.addHandler(logging.StreamHandler())
18 | logger.setLevel(logging.INFO)
19 |
20 |
class IqtreeCLI(AbstractCLI):
    """Low-level command line interface to iqtree."""

    @property
    def _default_exe(self):
        # Name of the executable reported as the default.
        default_binary = 'iqtree'
        return default_binary
25 |
26 |
def set_default_dna_options(treebuilder):
    """
    Install the default option set on ``treebuilder`` for DNA input.

    No DNA-specific settings are applied: the options are exactly those
    returned by ``get_default_options``.
    """
    default_options = get_default_options()
    treebuilder.options = default_options
32 |
33 |
def set_default_protein_options(treebuilder):
    """
    Install the default option set on ``treebuilder`` for protein input.

    No protein-specific settings are applied: the options are exactly
    those returned by ``get_default_options``.
    """
    default_options = get_default_options()
    treebuilder.options = default_options
39 |
40 |
class Iqtree(TreeBuilder):
    """
    Callable wrapper around the iqtree tree builder.

    Calling an instance runs iqtree with its output written to a temporary
    directory and returns the ``"tree"`` entry of the parsed tree file.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input alignment, forwarded to ``TreeBuilder``
            together with any extra arguments
        """
        self.options = get_default_options()
        super(Iqtree, self).__init__(alignment=alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run iqtree through ``_call`` and read back the tree file.

        An in-memory alignment is first written to the temporary output
        directory in relaxed phylip format. The result, the captured
        stdout/stderr and the elapsed time are stored on the instance.

        :return: the parsed tree, or None when the result could not be
            read (the failure is logged)
        """
        start = time.time()  # time the execution
        # dir=None makes tempfile choose its default location, so a single
        # call covers both the TMPDIR-set and the TMPDIR-unset case. The
        # context manager removes the directory even if iqtree fails.
        with tempfile.TemporaryDirectory(prefix='iqtree_',
                                         dir=os.environ.get("TMPDIR")) as tmpd:
            if self.input_type is AlignmentInput.OBJECT:
                filename = os.path.join(tmpd, 'tmp_output.phy')
                SeqIO.write(self.input, filename, 'phylip-relaxed')  # default interleaved
            elif self.input_type is AlignmentInput.FILENAME:
                filename = self.input
            else:
                filename = None
            output, error = self._call(filename, tmpd, *args, **kwargs)
            # Must happen before the temporary directory is removed.
            self.result = self._read_result(tmpd)  # store result

        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        return self.result

    # Helper methods
    def _call(self, filename, tmpd, *args, **kwargs):
        """
        Invoke the low-level iqtree wrapper.

        The command line is the rendered option set, then ``-pre`` with the
        output prefix ``<tmpd>/tmp_output`` and ``-s`` with the input
        filename. ``*args`` and ``**kwargs`` are accepted but not used.

        :return: tuple ``(stdout, stderr)`` as reported by the CLI object
        """
        self.cli('{} -pre {tmp_path} -s {seqfile}'.format(self.command(),
                                                          tmp_path=os.path.join(tmpd, 'tmp_output'),
                                                          seqfile=filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current option set rendered as a string."""
        return str(self.options)

    def _read_result(self, tmpd):
        """
        Parse ``<tmpd>/tmp_output.treefile`` with ``IqtreeParser``.

        :return: the ``"tree"`` entry of the parser's dict, or None if
            reading or parsing failed
        """
        expected_outfiles = [os.path.join(tmpd, 'tmp_output.treefile')]

        parser = IqtreeParser()

        try:
            result = parser.to_dict(*expected_outfiles)
        except IOError:
            logger.error('Error reading results')
            return None
        except ParseException as parseerr:
            # The exception must go through a %s placeholder; passing it as
            # a bare extra argument breaks the log record formatting.
            logger.error('Other parse error: %s', parseerr)
            return None

        return result["tree"]

    def _init_cli(self, binary):
        """Create the command line interface object for ``binary``."""
        return IqtreeCLI(executable=binary)
122 |
123 |
def get_default_options():
    """
    Build the default option set for the iqtree wrapper.

    ``-nt`` and ``-mem`` are active; every other option is inactive.
    """
    defaults = [
        # Number of threads
        IntegerOption('-nt', 1, active=True),

        # Model for either DNA or AA alignment
        StringOption('-m', '', active=False),

        # (disabled) If set to true will assume sequential format
        # FlagOption('-q', False, active=False),

        # Limit memory needs to 4G
        StringOption('-mem', '4G', active=True),

        # Seed, 12345 when activated
        IntegerOption('-seed', 12345, active=False),

        # Ultrafast bootstrap (>=1000)
        IntegerOption('-bb', 0, active=False),

        # SH-like approximate likelihood ratio test (SH-aLRT)
        IntegerOption('-alrt', 0, active=False),

        # Bootstrap + ML tree + consensus tree (>=100)
        IntegerOption('-b', 0, active=False),
    ]
    return OptionSet(defaults)
150 |
--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/phyml.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import tempfile
4 | import logging
5 | from pyparsing import ParseException
6 | from Bio import AlignIO, SeqIO
7 |
8 | from .base_treebuilder import TreeBuilder, AlignmentInput, DataType
9 | from .parsers import PhymlParser
10 |
11 | from ..abstract_cli import AbstractCLI
12 | from ..options import StringOption, FlagOption, IntegerOption, FloatOption, MultiOption, OptionSet
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 | logger.addHandler(logging.StreamHandler())
17 | logger.setLevel(logging.INFO)
18 |
19 |
class PhymlCLI(AbstractCLI):
    """Low-level command line interface to phyml."""

    @property
    def _default_exe(self):
        # Name of the executable reported as the default.
        default_binary = 'phyml'
        return default_binary
24 |
25 |
def set_default_dna_options(treebuilder):
    """
    Install the default option set on ``treebuilder`` for DNA input.

    No DNA-specific settings are applied: the options are exactly those
    returned by ``get_default_options``.
    """
    default_options = get_default_options()
    treebuilder.options = default_options
31 |
32 |
def set_default_protein_options(treebuilder):
    """
    Install the default option set on ``treebuilder`` for protein input.

    No protein-specific settings are applied: the options are exactly
    those returned by ``get_default_options``.
    """
    default_options = get_default_options()
    treebuilder.options = default_options
38 |
39 |
class Phyml(TreeBuilder):
    """ Phyml tree reconstruction

    This wrapper can be called to reconstruct a phylogenetic tree
    using PhyML.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input multiple sequence alignment. This can be either
            a filename or an biopython SeqRecord collection.
        """
        super(Phyml, self).__init__(alignment, *args, **kwargs)
        self.options = get_default_options()
        # Only look at the datatype when there is an input, as the other
        # tree builder wrappers do.
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run PhyML through ``_call`` and read back its output files.

        An in-memory alignment is first written to a temporary file in
        relaxed phylip format. For a filename input PhyML is run from the
        file's directory, and the previous working directory is restored
        afterwards. The result, the captured stdout/stderr and the elapsed
        time are stored on the instance.

        :return: the ``"tree"`` entry of the parsed result, or None when
            the output could not be read (the failure is logged)
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
                # Make sure the content is on disk before PhyML reads it.
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)
                self.result = self._read_result(fh.name)  # store result
        else:
            # phyml can not deal with the long filenames caused by a long
            # path, so run it from inside the file's directory.
            path, filename = os.path.split(self.input)
            previous_cwd = os.getcwd()
            try:
                if path:
                    # A bare filename has no directory part; os.chdir('')
                    # would fail.
                    os.chdir(path)
                output, error = self._call(filename, *args, **kwargs)
                self.result = self._read_result(filename)  # store result
            finally:
                # Do not leave the process in a different directory.
                os.chdir(previous_cwd)

        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        # _read_result yields None after a logged failure; do not index it.
        if self.result is None:
            return None
        return self.result["tree"]

    # Helper methods
    def _call(self, filename, *args, **kwargs):
        """
        Invoke the low-level PhyML wrapper on ``filename``.

        The command line is the rendered option set followed by ``-i`` and
        the input filename. ``*args`` and ``**kwargs`` are accepted but not
        used.

        :return: tuple ``(stdout, stderr)`` as reported by the CLI object
        """
        self.cli('{} -i {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current option set rendered as a string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Parse the stats and tree files written for the input ``output``.

        :param output: name of the input file the output names derive from
        :return: the parser's dict, or None if reading or parsing failed
        """

        #TODO: change the output dictionary into a better format
        expected_outfiles = ['{}_phyml_stats'.format(output), '{}_phyml_tree'.format(output)]
        parser = PhymlParser()

        # Phyml outputs two outfiles, a stats file and a tree file.
        # Sometimes it appends .txt, sometimes not. Seems to be platform-specific.
        # Here we assume they are without .txt, but if we can't find them, try
        # looking for the .txt ones instead
        try:
            # Check if these are the .txt style outfiles
            if not os.path.exists(expected_outfiles[0]):
                expected_outfiles = [x + '.txt' for x in expected_outfiles]
            result = parser.to_dict(*expected_outfiles)

        except IOError:
            logger.error('Error reading results')
            result = None
        except ParseException as parseerr:
            # The exception must go through a %s placeholder; passing it as
            # a bare extra argument breaks the log record formatting.
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Create the command line interface object for ``binary``."""
        return PhymlCLI(executable=binary)
133 |
134 |
def get_default_options():
    """
    Build a new ``OptionSet`` with this wrapper's default PhyML options.

    Only ``-d`` is created active (value ``'aa'``); ``-m``, ``-q``, ``-b``
    and ``-s`` are created inactive.
    """
    # Set datatype to nt or aa
    datatype = StringOption('-d', 'aa', active=True)

    # Set the model for either DNA or AA alignment
    model = StringOption('-m', '', active=False)

    # If set to true will assume sequential format
    sequential = FlagOption('-q', False, active=False)

    # Set bootstrap value
    bootstrap = IntegerOption('-b', 0, active=False)

    # Tree topology search operation option
    topology_search = StringOption('-s', 'NNI', active=False)

    return OptionSet([datatype, model, sequential, bootstrap, topology_search])
154 |
--------------------------------------------------------------------------------
/read2tree/wrappers/treebuilders/src/pip-delete-this-directory.txt:
--------------------------------------------------------------------------------
1 | This file is placed here by pip to indicate the source was put
2 | here by pip.
3 |
4 | Once this package is successfully installed this source code will be
5 | deleted (unless you remove this file).
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import ast

from setuptools import setup, find_packages

name = 'read2tree'

# Read the version straight from the package's __init__.py. Only the literal
# on the right-hand side of the assignment is evaluated (ast.literal_eval),
# so no code from that file is executed while building the package.
__version__ = None
with open('{:s}/__init__.py'.format(name), 'rt') as fp:
    for line in fp:
        if line.startswith('__version__'):
            _, separator, value = line.partition('=')
            if separator:  # ignore lines that merely mention __version__
                __version__ = ast.literal_eval(value.strip())

# conda install -c conda-forge biopython numpy Cython ete3 lxml tqdm scipy pyparsing requests natsort pyyaml
# conda install -c bioconda dendropy
requirements = ["numpy", "biopython", "ete3", "lxml", "tqdm", "scipy",
                "pyparsing", "requests", "natsort", "pyyaml", "dendropy",
                "pysam", "pyham", "filelock"]

# The README doubles as the long description of the distribution.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name=name,
    version=__version__,
    author='David Dylus and Fritz Sedlaczek',
    author_email='daviddylus@gmail.com',
    description='read2tree allows to build high quality phylogenetic trees '
                'using reads and a reference set of orthologous groups '
                '(DNA + Protein).',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/dessimozlab/read2tree",
    packages=find_packages(".", exclude=["archive"]),
    include_package_data=True,
    package_data={
        'read2tree': ['logging/log.yaml']
    },
    install_requires=requirements,
    classifiers=[
         "Programming Language :: Python :: 3",
         "Environment :: Console",
         "License :: OSI Approved :: MIT License",
     ],
    scripts=['bin/read2tree'],
    python_requires=">=3.5",
)
45 |
--------------------------------------------------------------------------------
/src/pip-delete-this-directory.txt:
--------------------------------------------------------------------------------
1 | This file is placed here by pip to indicate the source was put
2 | here by pip.
3 |
4 | Once this package is successfully installed this source code will be
5 | deleted (unless you remove this file).
6 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1001241.fa:
--------------------------------------------------------------------------------
1 | >MNELE00784 | OMA1001241 | ML06054a | [Mnemiopsis leidyi]
2 | MFRNPKIIYSWPYGQHFCKYLRRNASFGEVHPLFESLIAGNRAALARAITLSESTLERHKQESAHLMSSVLKHNLQNRSL
3 | RIGISGPPGAGKSTFIEAIGLHITELNNKLAVLAVDPSSTRSGGSLLADKTRMQQLSVEKLAYIRPSPNRGHLGGVARAT
4 | NAAIQLCEAGGYNVIIVETVGAGQSEIAVANMTDIFVLLVPPGSGDELQGIKKGIVEVADMILVTKADGNLKTAARLVKT
5 | EYSRALRLLRNHDDTSWKPFVQTVSSISGKGISDAWSDMLEFHQEMISTGKYQDRRKKQRVTWLWDHVQDELLEHLRKDT
6 | LNAKFQEKLEADVRNGVILPSTAAQKLLNLFLKGKDNIS
7 |
8 | >HUMAN77595 | OMA1001241 | Q495G5 | [Homo sapiens]
9 | MPMLLPHPHQHFLKGLLRAPFRCYHFIFHSSTHLGSGIPCAQPFNSLGLHCTKWMLLSDGLKRKLCVQTTLKDHTEGLSD
10 | KEQRFVDKLYTGLIQGQRACLAEAITLVESTHSRKKELAQVLLQKVLLYHREQEQSNKGKPLAFRVGLSGPPGAGKSTFI
11 | EYFGKMLTERGHKLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGAGYDIIL
12 | IETVGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVAVTKSDGDLIVPARRIQAEYVSALKLLRKRSQVW
13 | KPKVIRISARSGEGISEMWDKMKDFQDLMLASGELTAKRRKQQKVWMWNLIQESVLEHFRTHPTVREQIPLLEQKVLIGA
14 | LSPGLAADFLLKALKAETNKIHPV
15 |
16 | >RATNO14529 | OMA1001241 | D3ZNY3 | [Rattus norvegicus]
17 | MTIPTLLLSPYRRLLTRLSRVPSPQLLHSSLPTLHPRDALPNSFGHHCSKRVLLSDGFRRTLCIRATLKDHTEGLSDKEQ
18 | RFVDRLYMGLVQGQRACLAEAITLVESTHTRKKELAQVLLQRVLAHQRERELQNHGKPFTFRVGLSGPPGAGKSTFIECF
19 | GKMLTERGHRLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTSGTLGGVTRTTNEAIVLCEGGGYDIILIET
20 | VGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVVITKSDGDLVVPARRIQAEYVSALKLLRRRSEVWRPK
21 | VIRISARSGEGITEMWDIMREFQHRMLASGELAAKRQTQHKVWMWNLIQENVLEHFKTHPSIREQIPLMEREVLSGALSP
22 | GRAADLLLKAFKSRH
23 |
24 | >GORGO31007 | OMA1001241 | G3QJC8 | [Gorilla gorilla gorilla]
25 | MPMLLPHPHQHFLKGLLRAPFRCYHFIFHSSTHLGSGIPCAQPFNSLGLHCTKWMLLSDGLKRKLCVQTTLKDHTEGLSD
26 | KEQRFVDKLYTGLIQGQRACLAEAITLVESTHSRKKELAQVLLQKVLLYHREQEQSNKGKPLAFRVGLSGPPGAGKSTFI
27 | EYFGKMLTERGHKLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGAGYDIIL
28 | IETVGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVAVTKSDGDLIVPARRIQAEYVSALKLLRKRSQVW
29 | KPKVIRISARSGEGISEMWDKMKDFQDLMLASGELTAKRRKQQKVWMWNLIQESVLEHFRTHPTVREQIPLLEQKVLIGA
30 | LSPGLAADFLLKTNKIHPV
31 |
32 | >XENLA00784 | OMA1001241 | XELAEV_18005522mg | [Xenopus laevis]
33 | MQGITLCCIKTIAHPVSRYFTRNIVSLVKPAQSLGTVSESCKRKTDSFIKLFRTRLCISAVTHQDADILTEKEKRLLNNL
34 | YTGLIRGQRACLAEAITLVESTHSRKREMAQVLLHMVLSHHREQEKLNSGKPLAFRVGLSGPPGAGKSTFIEIFGKMLTE
35 | EGHKVAVLAVDPSSSTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGSGYNIILVETVGVGQS
36 | EFAVADMVDMFVLLLPPAGGDELQVMRISARTGEGIQELWNKLLEFQSNMLTSGELIGKRRSQQRVWMWNLIQENVLLYF
37 | RNHPAVKDQIPVLEERVRTGTLSPGLAADMLLKAFSKSS
38 |
39 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1008242.fa:
--------------------------------------------------------------------------------
1 | >MNELE00922 | OMA1008242 | ML11532a | [Mnemiopsis leidyi]
2 | MTDFDKLPSFKALEILAEKAKSVQLKDLFANDPNRFSKYSQAIEIDELKLLVDFSKNKIDEDIFGELLKLVKDAQVEEMR
3 | DKMFKGEPINFTEQRAVLHIALRNRSNNPILVDGQDVTPKVNQVLEKMKIFADNLRNGTWKGVTGKAITDVVNIGIGGSD
4 | LGPLMVTEALKSYRGNGPDVHFVSNIDGTHIATVLEKVNFESTLFIIASKTFGTLETLTNARTAREWFIKKSGDPSGVAK
5 | HFIALSTNAKLVSEFGIDTANMFEFWDWVGGRYSLWSAIGMSIMCHIGSDNFIKLLEGAHAMDNHFTSAPVEKNIPIILA
6 | VLGVWYNNFLGAQTHALLPYDQYMHRFAAYFQQGDMESNGKGVSREGTRVKYSTGPIVWGEPGTNGQHAFYQLIHQGTKL
7 | IPCDFIMPVQSLNPIGDHHEILTANFLAQTAALMTGRGNEEARKELSSMSAEDQDRLSIYKEFTGDRPTNSILFTKLTPA
8 | MLGALIVMYEHKIFVQGVLWNINSFDQMGVELGKKLALKIQPLLKDDNNVDSEDSSTNGLINFIKANRK
9 |
10 | >HUMAN42527 | OMA1008242 | G6PI_HUMAN | [Homo sapiens]
11 | MAALTRDPQFQKLQQWYREHRSELNLRRLFDANKDRFNHFSLTLNTNHGHILVDYSKNLVTEDVMRMLVDLAKSRGVEAA
12 | RERMFNGEKINYTEGRAVLHVALRNRSNTPILVDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYTGKTITDVINIGIGGS
13 | DLGPLMVTEALKPYSSGGPRVWYVSNIDGTHIAKTLAQLNPESSLFIIASKTFTTQETITNAETAKEWFLQAAKDPSAVA
14 | KHFVALSTNTTKVKEFGIDPQNMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEQLLSGAHWMDQHFRTTPLEKNAPVLL
15 | ALLGIWYINCFGCETHAMLPYDQYLHRFAAYFQQGDMESNGKYITKSGTRVDHQTGPIVWGEPGTNGQHAFYQLIHQGTK
16 | MIPCDFLIPVQTQHPIRKGLHHKILLANFLAQTEALMRGKSTEEARKELQAAGKSPEDLERLLPHKVFEGNRPTNSIVFT
17 | KLTPFMLGALVAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSAQVTSHDASTNGLINFIKQQREARVQ
18 |
19 | >RATNO16818 | OMA1008242 | G6PI_RAT | [Rattus norvegicus]
20 | MAALTRNPEFQKLLEWHRANSANLKLRELFEADPERFNHFSLNLNTNHGHILLDYSKNLVNKEVLHMLVDLAKSRGVEAA
21 | RDNMFSGLKINSTEDRAVLHVALRNRSNRSIMMDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYTGKAITDIINIGIGGS
22 | DLGPLMVTEALKPYSKGGPRVWFVSNIDGTHIAKTLANLNPESSLFIIASKTFTTQETITNAETAKEWFLQAAKDPSAVA
23 | KHFVALSTNTDKVKEFGIDPKNMFEFWDWVGGRYSLWSAIGLSIALHVGFDHFEQLLSGAHWMDQHFMKTPLDKNAPVLL
24 | ALLGIWYINFYGCETHAMLPYDQYMHRFAAYFQQGDMESNGKYITKSGARVDYQTGPIVWGEPGTNGQHAFYQLIHQGTK
25 | MIPCDFLIPVQTQHPIRNGLHHKILLANFLAQTEALMKGKSPEEARKELQAAGKSPEELEKLLPHKVFEGNRPTNSIVFT
26 | KLTPFILGALIAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSSAVTSHDSSTNGLIGFIKLQRDTKID
27 |
28 | >GORGO15800 | OMA1008242 | A0A2I2YE48 | [Gorilla gorilla gorilla]
29 | TSGQRPAKRRRKSPAMAALTRDPQFQKLQQWYREHGSELNLRRLFDANKDRFNHFSLTLNTNHGHILVDYSKNLVTEDVM
30 | RMLVDLAKSRGVEAARERMFNGEKINYTEGRAVLHVALRNRSNTPILVDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYT
31 | GKTITDVINIGIGGSDLGPLMVTEALKPYSSGGPRVWYVSNIDGTHIAKTLAQLNPESSLFIIASKTFTTQETITNAETA
32 | KEWFLQAAKDPSAVAKHFVALSTNTTKVKEFGIDPQNMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEQLLSGAHWMDQ
33 | HFRTTPLEKNAPVLLALLGIWYINCFGCETHAMLPYDQYLHRFAAYFQQGDMESNGKYITKSGTRVDHQTGPIVWGEPGT
34 | NGQHAFYQLIHQGTKMIPCDFLIPVQTQHPIRKGLHHKILLANFLAQTEALMRGKSTEEARKELQAAGKSPEDLERLLPH
35 | KVFEGNRPTNSIVFTKLTPFMLGALVAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSAQVTSHDASTNGL
36 | INFIKQQREARVQ
37 |
38 | >XENLA17790 | OMA1008242 | A0A1L8GL32 | [Xenopus laevis]
39 | MALSCDPVYQKLSQWYEAHHGSLNMRQMFEADKDRFSKFSKKLATDDGDILLDYSKNLVNEEVLKLLIELAHSRGVESAR
40 | QKMFSAEKINFTENRAVLHIALRNRSNTPITLEGKDVMPEVNAVLEKMKAFCQKVRSGDWKGYTGKAITDVINIGIGGSD
41 | LGPLMVTESLKPYSKGGPRVWFVSNIDGTHIAKTLAELNPETSLFIIASKTFTTQETITNAETAKEWFLTSAKDASAVAK
42 | HFVALSTNAPKVKDFGIDTANMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEKLLAGAHWMDNHFNKTPLENNVPVLLA
43 | MLGIWYTNFYGCETHALLPYDQYMHRFAAYFQQGDMESNGKYITKTGARVNYSTGPVVWGEPGTNGQHAFYQLIHQGTRK
44 | IPCDFLIPAQTQHPIRNGLHHKILLSNFLAQTEALMKGKSTEEAKKELQASGLTGDALEKLLPHKVFEGNRPTNSIVFTK
45 | LNPFILGALIAMYEHKIFVQGVVWDINSYDQWGVELGKQLAKKIEPELESDATITSHDSSTNGLIDFIKKHRG
46 |
47 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1065415.fa:
--------------------------------------------------------------------------------
1 | >MNELE00913 | OMA1065415 | ML14561a | [Mnemiopsis leidyi]
2 | MLRDPETVHPLDECKTWPEIRDKLRLWRKENVRCSDQIVELGEYALKHYQTNLGREKWAVFEQVCVAALDLCPAKMKLVN
3 | TCIKELAEQFPSSLRVSMLEGLKYEYLKKWDDALEMYEDMIEYEPTFPAPYKRKVAILKAQNKISDAVNDLNRYLNTFSC
4 | DHESWLELSDIYISNQNYKQALFCVEELLLQYPHNHLYHQRYADILFTIGGKDNLELSCKYYCKAAELNPGNVRALFGIQ
5 | LASSTLSSIGKLSSKAKSDNQSLAAWASDMIEDFYKSQKTSKNLIIEVAGVLDKLSLK
6 |
7 | >HUMAN95181 | OMA1065415 | EMC2_HUMAN | [Homo sapiens]
8 | MAKVSELYDVTWEEMRDKMRKWREENSRNSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELR
9 | RQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWHE
10 | LAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHIA
11 | SNPKASAKTKKDNMKYASWAASQINRAYQFAGRSKKETKYSLKAVEDMLETLQITQS
12 |
13 | >RATNO39215 | OMA1065415 | EMC2_RAT | [Rattus norvegicus]
14 | MAKVTERYDVTWEEMRDKMRKWREENSRNSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELR
15 | RQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWHE
16 | LAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHIA
17 | SNPKASAKMKKDNIRYAGWAANQINRAYQFAGRSKKETKSSLKAVEDMLETLQITQS
18 |
19 | >GORGO40150 | OMA1065415 | G3S2S4 | [Gorilla gorilla gorilla]
20 | MKYSSSHTLYCLKEEMRDKMRKWREENSRNSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQEL
21 | RRQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWH
22 | ELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHI
23 | ASNPKASAKTKKDNMKYASWAASQINRAYQFAGRSKKETKYSLKAVEDMLETLQITQS
24 |
25 | >XENLA27199 | OMA1065415 | EMC2A_XENLA | [Xenopus laevis]
26 | MSKVSDLFDVTWEDMRDKMKTWREENYRNSEHVIEVGEELINEHASKLGDDIWIIYEQVMIAALDCGRDDIAMSCLQELR
27 | RQFPGSHRVKRLTGLRFEAMERYDDALQIYDRILQDDPTNTAARKRKIAIRKAQGRNSEAIRELNEYLEQFVGDQEAWHE
28 | LAELYINELDYAKAAFCLEELILTNPHNHFYYQQFAEVKYTQGGLENLELSRKYFSQALKLNNHNMRALFGLYISSVHIA
29 | SNPKASAKMKKDNVKYATWAASQIKKAYQLAGRTMTDTQTSLKAVEDMLETLQITQS
30 |
31 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1121053.fa:
--------------------------------------------------------------------------------
1 | >MNELE00419 | OMA1121053 | ML26071a | [Mnemiopsis leidyi]
2 | MEKAVLLAALLIATAGAASVQVSDPAKCTLCQAVVTELKVVMEDKDTKDFLAVLQTFICENVPIEDCNNWVSGELAQLDS
3 | LVEGLDPNQACSSLALCAVHTSPLLSSIQCDFCEFLGDEVVKRVLTNATIDEVVTAAETICSELPFGSNECNALVKEYGH
4 | YYLELLVGSIDVAQLCSEVGLCSEQVREMVLNSRLFQILQRGLKDDEGCKACVDGMDVIKEVLSSKDTLDLLHIAVHEIC
5 | GLVSVTGCELIADTALDQIIEKLLPMFVPETVCQQIGACPALTAQDVFSPATVGDDSPLCTGCHDLLGEVKKVANDPETK
6 | QINKDLAPVLCEVLSIPFCQSLISKFLEGALEKAQNLDVDETCVSLKACEAADEVVENWKDTCSECAMIADLILKELQDP
7 | SVQQEIESVVDELCSVLPISDCKETLHSYLVMIESLIAGMNGKTLCGYIGLCSSKMSPMKKATGVTEITKLDFTPSKVGD
8 | TCSECTMIAGEVISLLENGTIDSLIKEAISELCTVLPISDCEATIDGYFDEIVALLKNLDGKTLCSLVGLC
9 |
10 | >HUMAN01568 | OMA1121053 | ENSG00000197746.14 | [Homo sapiens]
11 | MYALFLLASLLGAALAGPVLGLKECTRGSAVWCQNVKTASDCGAVKHCLQTVWNKPTVKSLPCDICKDVVTAAGDMLKDN
12 | ATEEEILVYLEKTCDWLPKPNMSASCKEIVDSYLPVILDIIKGEMSRPGEVCSALNLCESLQKHLAELNHQKQLESNKIP
13 | ELDMTEVVAPFMANIPLLLYPQDGPRSKPQPKDNGDVCQDCIQMVTDIQTAVRTNSTFVQALVEHVKEECDRLGPGMADI
14 | CKNYISQYSEIAIQMMMHMSLQQPKEICALVGFCDEVKEMPMQTLVPAKVASKNVIPALELVEPIKKHEVPAKSDVYCEV
15 | CEFLVKEVTKLIDNNKTEKEILDAFDKMCSKLPKSLSEECQEVVDTYGSSILSILLEEVSPELVCSMLHLCSGTRLPALT
16 | VHVTQPKDGGFCEVCKKLVGYLDRNLEKNSTKQEILAALEKGCSFLPDPYQKQCDQFVAEYEPVLIEILVEVMDPSFVCL
17 | KIGACPSAHKPLLGTEKCIWGPSYWCQNTETAAQCNAVEHCKRHVWN
18 |
19 | >RATNO22029 | OMA1121053 | A0A8I6ASQ4 | [Rattus norvegicus]
20 | MYALALLASLLVTALTSPVQDPKICSGGSAVVCRDVKTAVDCRAVKHCQQMVWSKPTAKSLPCDICKTVVTEAGNLLKDN
21 | ATEEEILHYLEKTCAWIHDSSLSASCKEVVDSYLPVILDMIKGEMSNPGEVCSALNLCQSLQEYLAEQNQRQLESNKIPE
22 | VDLARVVAPFMSNIPLLLYPQDRPRSQPQPKANEDVCQDCMKLVTDIQTAVRTNSSFVQGLVDHVKEDCDRLGPGVSDIC
23 | KNYVDQYSEVAVQMMMHMQDQQPKEICVMVGFCDEVKRVPMRTLVPATEAIKNILPALELTDPYEDVIQAQNVIFCQVCQ
24 | LVMRKLSELIINNATEELLIKGLSKACSLLPAPASTKCQEVLVTFGPSLLDVLMHEVNPNFLCGVISLCSANPNLVGTLE
25 | QPAAAIVSALPKEPAPPKQPEEPKQSALRAHVPPQKNGGFCEVCKKLVIYLEHNLEKNSTKEEILAALEKGCSFLPDPYQ
26 | KQCDEFVAEYEPLLLEILVEVMDPSFVCSKIGVCPSAYKLLLGTEKCVWGPGYWCQNMETAARCNAVDHCKRHVWN
27 |
28 | >GORGO01692 | OMA1121053 | G3S0G3 | [Gorilla gorilla gorilla]
29 | MYALFLLASLLGAALAGPVLGLKECTRGSAVWCQNVKTASDCGAVKHCLQTVWNKPTVKSLPCDICKDVVTAAGDMLKDN
30 | ATEEEILVYLEKTCDWLPKPNMSASCKEIVDSYLPVILDIIKGEMSRPGEVCSALNLCESLQKHLAELNHQKQLESNKIP
31 | ELDMTEVVAPFMANIPLLLYPQDGPRSKPQPKDNGDVCQDCIQMVTDIQTAVRTNSTFVQALVEHVKEECDRLGPGMADI
32 | CKNYISQYSEIAIQMMMHMQPKEICALVGFCDEVKEMPMQTLVPAKVASKNVIPALELVEPIKKHEVPAKSDVYCEVCEF
33 | LVKEVTKLIDNNKTEKEILDAFDKMCSKLPKSLSEECQEVVDTYGSSILSILLEEVSPELVCSMLHLCSGTRLPALTVHV
34 | TQPKDGGFCEVCKKLVGYLDRNLEKNSTKQEILAALEKGCSFLPDPYQKQCDQFVAEYEPVLIEILVEVMDPSFVCLKIG
35 | ACPSAHKPLLGTEKCVWGPSYWCQNTETAAQCNAVEHCKRHVWN
36 |
37 | >XENLA29771 | OMA1121053 | XELAEV_18034910mg | [Xenopus laevis]
38 | MKKFAVLVCALAVVAATPLFGTEQCAKGPEVWCENVRTASQCGAVKHCQQNVWNKPTVKSMPCDFCKEVVTVLGNYLKDN
39 | ITQDEIKQYLNKVCDFIPDPGLASTCKQEVSDYFTIVLNLLEQELSNPGVLCSSLGLCTSLQRHLASLKQPTQLLTNEIP
40 | DVDAAKLVYPYIVNIPQLLYPQEKTLKEPKTGDICNDCTKLVSDVQDALRSNSSFSKKLVDHFLQECNLLDPAIAEMCKS
41 | YINQYSDIAIQVLLQMQPKQLCGMAGFCDQEKSTPLQNIIPAKSLIPAVKVQPAVKITKNPLPGNNVLCEVCELMVSQLE
42 | KLLDNNRTRENIKHGLEKVCKLLPSQYTQKCEDMIEEYSDALIELLEQEANPQAICTALGYCSGSKNLKIVKISAEKAAA
43 | GDYCAVCKMLMRYVDELLEKNATEIRIKAFLGRICNFLPDSMQNECSALVNEYEPLFIQLLLEALDPSFICIKVNLCQNK
44 | KVLLGTEKCMWGPSYWCKDMETAANCNALEHCRRHVWN
45 |
46 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1125645.fa:
--------------------------------------------------------------------------------
1 | >MNELE00647 | OMA1125645 | ML13582a | [Mnemiopsis leidyi]
2 | MVFYFKTVVNGREYMIYMGRDKMENEDLLRWGFPEDVWFHVDKLSSAHVYLRLNKGEGVADIPKELVDQCCQLVKANSIQ
3 | GCKLANVDIVYTPYPNLKKTGDMVAGQVGFHKNKKVVKVNVEKNNEVWKKLEKTREEREVDLQEERNRREREEQEELKKQ
4 | KKLQREMEKLQIEKEKKEREMKSYKLMNADPEKCTSNQFDNESDAERELVDDFM
5 |
6 | >HUMAN93157 | OMA1125645 | CCD25_HUMAN | [Homo sapiens]
7 | MVFYFTSSSVNSSAYTIYMGKDKYENEDLIKHGWPEDIWFHVDKLSSAHVYLRLHKGENIEDIPKEVLMDCAHLVKANSI
8 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKVERFPDLAAEKECRDREERNEKK
9 | AQIQEMKKREKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM
10 |
11 | >RATNO10219 | OMA1125645 | D4AAU6 | [Rattus norvegicus]
12 | MVFYFTSSSVNSSTYTIYMGKDKYENEDLIKYGWPEDVWFHVDKLSSAHVYLRLQKGEKIEDIPKEVLMDCAHLVKANSI
13 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKLEKFPDLAAEKEGRDREERNEKK
14 | AQIQEMKRKEKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM
15 |
16 | >GORGO40924 | OMA1125645 | G3R2K5 | [Gorilla gorilla gorilla]
17 | MVFYFTSSSVNSSAYTIYMGKDKYENEDLIKHGWPEDIWFHVDKLSSAHVYLRLHKGENIEDIPKEVLMDCAHLVKANSI
18 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKVERFPDLAAEKECRDREERNEKK
19 | AQIQEMKKREKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM
20 |
21 | >XENLA23340 | OMA1125645 | Q7T0Y7 | [Xenopus laevis]
22 | MVFYFTSDVISPAYTIYMGKDKYENEDLIKYGWPEDIWFHVDKLSSAHVYLRLQKGQTIEDIPKEVLLDCVQLVKANSIQ
23 | GCKMNNLNVVYTPWANLKKTADMDVGQIGFYRQKDVKTMSVEKVNKIVNRLEKTKDERFPDLAAEKEARDREERNEKKAQ
24 | IQEIKKKEKDEMKKKKEMEELRSYSSLMKSENMSSNQDGNDSDDFM
25 |
26 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1133018.fa:
--------------------------------------------------------------------------------
1 | >MNELE00906 | OMA1133018 | ML18772a | [Mnemiopsis leidyi]
2 | MTITSERRDQVLLGPLPPSFLRLETRTTDGETVTTVTADPAVGRQVELPPQGAGAPNNGAGAPNNGGGAPNNGGQPRPPP
3 | QHYTAHPVQPYVPQAGLSITIAQAVLNKSYTLIGSMDPYVRLKVGHNTYETFTHAGADKTPCWNKVYHCPLPNTHSVRTV
4 | SVEIFDEKALTDDQRIAYAKISVPQSVFEGHTLDEWFPLSGKLGEAKEGSINLIISYTTMPMITLPYQRIHYPYVGAAPQ
5 | HFTPRPPPQISEADVTSIKDMFPAVDKEVIRTVLESKHGNVESAVAAILQIMEGEQGGQQ
6 |
7 | >HUMAN03808 | OMA1133018 | TOLIP_HUMAN | [Homo sapiens]
8 | MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYGGAVGTVGRLNITVVQAKLAKNYGMTRMDPYCRLR
9 | LGYAVYETPTAHNGAKNPRWNKVIHCTVPPGVDSFYLEIFDERAFSMDDRIAWTHITIPESLRQGKVEDKWYSLSGRQGD
10 | DKEGMINLVMSYALLPAAMVMPPQPVVLMPTVYQQGVGYVPITGMPAVCSPGMVPVALPPAAVNAQPRCSEEDLKAIQDM
11 | FPNMDQEVIRSVLEAQRGNKDAAINSLLQMGEEP
12 |
13 | >RATNO18182 | OMA1133018 | TOLIP_RAT | [Rattus norvegicus]
14 | MATTVSTQRGPVYIGELPQDFLRITPTQQQQQIQLDAQAAQQLQYGGAVGTVGRLSITVVQAKLAKNYGMTRMDPYCRLR
15 | LGYAVYETPTAHNGAKNPRWNKVIQCTVPPGVDSFYLEIFDERAFSMDDRIAWTHITIPESLKQGQVEDEWYSLSGRQGD
16 | DKEGMINLVMSYTSLPAAMMMPPQPVVLMPTVYQQGVGYVPIAGMPAVCSPGMVPMAMPPPAVAPQPRCNEEDLKAIQDM
17 | FPNMDREVIRSVLEAQRGNKDAAINSLLQMGEES
18 |
19 | >GORGO04229 | OMA1133018 | G3R9G9 | [Gorilla gorilla gorilla]
20 | MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYGGAVGTVGRLNITVVQAKLAKNYGMTRMDPYCRLR
21 | LGYAVYETPTAHNGAKNPRWNKVIHCTVPPGVDSFYLEIFDELLPAAMVMPPQPVVLMPTVYQQGVGYVPITGMPAVCSP
22 | GMVPVALPPAAVNAQPRCSEEDLKAIQDMFPNMDQEVIRSVLEAQRGNKDAAINSLLQMGEEP
23 |
24 | >XENLA19417 | OMA1133018 | TOIPA_XENLA | [Xenopus laevis]
25 | MATSISTQRGQVFIGELPQDFLRIAPTQQQQQIQLDAQAAQQLQYSGVMGTMGRLSITVVQAKLAKNYGMTRMDPYCRIR
26 | LGYAVYETPTAHNGAKNPRWNKVIQCTIPPGVDSFYLEIFDERAFSMDDRIAWTHITIPETLKEGKHVDEWFSLSGRQGD
27 | DKEGMINLVMSYTSVPAMMPAQPVVLMPTVYQQGVGYVPIAGPVYNPGMPMIASPPAVNPQHQTQEVDIQSIKDMFPTID
28 | PEVIRSVLEAQGGNRDAAINSLLQMVEDS
29 |
30 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1151179.fa:
--------------------------------------------------------------------------------
1 | >MNELE00706 | OMA1151179 | ML10761a | [Mnemiopsis leidyi]
2 | MALSRFYIPCHALVKLAPQTRTAVTSVVLERLEQKKKEALLGGGQHRIDAQHKKGKLTARERIEVLLDEGSFVEWDQLVE
3 | HDCIDWGMDKTHFAGDGVVTGTGTVNGRQVFLFSQDFTVFGGSLSAAYASKICKIMDHAEMVGAPLLGLNDSGGARIQEG
4 | VASLGGYGDIFLRNVLLSGVVPQISLIMGPCAGGAVYSPAITDFTFMVKGTSHMFITGPDVVKQVTNETVTQEELGGSAA
5 | HCSTSGCAAGACENDVHLLLQTRRLLEFLPSNNQEKSPVRPCSDPAEREIPALDNIVPNSPISPYDIKHIVEFLVDEGDF
6 | FEIMPDYAKNIVVGFARMNGETVGIVGNQPLVAAGCLDINASVKGARFVRFCDSFNIPLIILEDVPGFLPGTQQEHGGII
7 | KHGAKLLYALAEATVPKLTVITRKAYGGAYVVMNSKHIRADVNYAWPSSEIAVMGSKGAVAIICRGDPDLAKREEEYIDT
8 | FANPFPTAKKGFVDDVIMPRDTRKRLCADLKWLRNKSQKNPWKKHGNIPL
9 |
10 | >HUMAN72443 | OMA1151179 | PCCB_HUMAN | [Homo sapiens]
11 | MAAALRVAAVGARLSVLASGLRAAVRSLCSQATSVNERIENKRRTALLGGGQRRIDAQHKRGKLTARERISLLLDPGSFV
12 | ESDMFVEHRCADFGMAADKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGLND
13 | SGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNEDVT
14 | QEELGGAKTHTTMSGVAHRAFENDVDALCNLRDFFNYLPLSSQDPAPVRECHDPSDRLVPELDTIVPLESTKAYNMVDII
15 | HSVVDEREFFEIMPNYAKNIIVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPLITFVDVPGFLPG
16 | TAQEYGGIIRHGAKLLYAFAEATVPKVTVITRKAYGGAYDVMSSKHLCGDTNYAWPTAEIAVMGAKGAVEIIFKGHENVE
17 | AAQAEYIEKFANPFPAAVRGFVDDIIQPSSTRARICCDLDVLASKKVQRPWRKHANIPL
18 |
19 | >RATNO41763 | OMA1151179 | Q68FZ8 | [Rattus norvegicus]
20 | MAAAIRIRAMAAGTRLRVLNCGLRTTIRSLCSQPVSVNERIENKRHAALLGGGQRRIDAQHKRGKLTARERISLLLDPGS
21 | FVESDMFVEHRCADFGMAAEKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGL
22 | NDSGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPEVVKSVTNED
23 | VTQEQLGGAKTHTTVSGVAHRAFDNDVDALCNLREFFNFLPLSNQDPAPIRECHDPSDRLVPELDTVVPLESSKAYNMLD
24 | IIHAVIDEREFFEIMPNYAKNIVIGFARMNGRTVGIVGNQPNVASGCLDINSSVKGARFVRFCDAFSIPLITFVDVPGFL
25 | PGTAQEYGGIIRHGAKLLYAFAEATVPKITVITRKAYGGAYDVMSSKHLLGDTNYAWPTAEIAVMGAKGAVEIIFKGHQD
26 | VEAAQAEYVEKFANPFPAAVRGFVDDIIQPSSTRARICCDLEVLASKKVHRPWRKHANVPL
27 |
28 | >GORGO29347 | OMA1151179 | A0A2I2YT25 | [Gorilla gorilla gorilla]
29 | MAAALRVAAAGARLSVLASGLRAAVRSLCSQATSVNERIENKRRTALLGGGQRRIDAQHKRGKLTARERISLLLDPGSFV
30 | ESDMFVEHRCADFGMAADKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGLND
31 | SGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNEDVT
32 | QEELGGAKTHTTVSGVAHRAFENDVDALCNLRDFFNYLPLSSQDPAPVRECHDPSDRLVPELDTIVPLESTKAYNMVDII
33 | HSVVDEHEFFEIMPNYAKNIIVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPLITFVDVPGFLPG
34 | TAQEYGGIIRHGAKLLYAFAEATVPKVTVITRKAYGGAYDVMSSKHLCGDTNYAWPTAEIAVMGAKGAVEIIFKGHENVE
35 | AAQAEYIEKFANPFPAAVRGFVDDIIQPSSTRARICCDLDVLASKKVRVQEVFHQVVQSGHRDGRGAVRW
36 |
37 | >XENLA22949 | OMA1151179 | Q52L44 | [Xenopus laevis]
38 | MAAVRSVSRFLAAVRGSGSVCGPRGLLRAYSVSHLSVPERIEKKRREALLGGGEQRIEAQHRRGKLTARERISLLLDPGS
39 | FAEYDMFVEHRCSDFGMEEDRNKYPGDSVVTGQGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAVMVGAPVIGL
40 | NDSGGARIQEGVESLAGYADIFLRNVLSSGVVPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNED
41 | VTQEDLGGAKTHTALSGVAHRAFENDIDALLNLREFFNFLPLSNKDSAPVRKCHDPSDRLVPGLDTVVPMESTKAYDMLD
42 | IIHSIIDEREFFEIMPNYAKNIVVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPIITFVDVPGFL
43 | PGTAQEYGGIIRHGAKLLFAFAEATVPKITVITRKAYGGAYDVMSSKHLRGDVNYAWPTAEVAVMGAKGAVQIIFRGKQN
44 | QAEAEEEYVEKFANPFPAAVRGFVDDIIQPSKTRMRICRDLEVLASKQQVNPWKKHANIPL
45 |
46 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1163384.fa:
--------------------------------------------------------------------------------
1 | >MNELE00619 | OMA1163384 | ML08751a | [Mnemiopsis leidyi]
2 | MEKSELSKAWSIDPRERIKALSELASGSVSISHGIPIKRYYRSGVELERMAKVYEDENNLEKAFFLYMKYTTLFVECLPK
3 | HPDYKSPQTSNERKVVRSKLKTIFDRAEFIKNNLTITYAGQHKKWIMEEQIRKAEAEQKRLEEEARIEAEAVAAKRAEME
4 | RRETELALELEQIEKQLEETRTIAVKASEKPVVVPPPMHRQATYPSLPVETPAKQSNTSSFNEAFNMRSPATPLAAPSLS
5 | LPSAPSAPSAHIQIISDTGPFPTVDRSTKPAAPQIDRSTKPATLAASDMFAEMMTQDSQRAVIIPSSLPDKFLSVCLDNT
6 | QKNVETCGILAAKLTANNFTITHVILPKQRGTPDSCQTLAEEELFEYQDKLDLITVGWIHTHPTQSAFLSSVDLHTHCSY
7 | QLMLREAIAIVCAPKHNRLFTVILKELLEDSRSFNGDPPHLTLNHGFPVIRLSRPKKIA
8 |
9 | >HUMAN63278 | OMA1163384 | STABP_HUMAN | [Homo sapiens]
10 | MSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY
11 | KSAVIPEKKDTVKKLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQQELEKEKQRVAQQKQQQLEQE
12 | QFHAFEEMIRNQELEKERLKIVQEFGKVDPGLGGPLVPDLEKPSLDVFPTLTVSSIQPSDCHTTVRPAKPPVVDRSLKPG
13 | ALSNSESIPTIDGLRHVVVPGRLCPQFLQLASANTARGVETCGILCGKLMRNEFTITHVLIPKQSAGSDYCNTENEEELF
14 | LIQDQQGLITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESVAIVCSPKFQETGFFKLTDHGLEEISSCRQKGFHPHSK
15 | DPPLFCSCSHVTVVDRAVTITDLR
16 |
17 | >RATNO29630 | OMA1163384 | STABP_RAT | [Rattus norvegicus]
18 | MSDHADVSLPPQDRVRILSQLGSAVELNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY
19 | KSAIIPEKKDAVKKLKNVAFPKAEELKTELLKRYTKEYEQYKERKKKEEEELARNIAIQQELEKEKQRVAQQKQKQLEQE
20 | QFHAFEKMIQKQELEKERLKIVQEFGKVDPGPCGPLLPDLEKPCVDVAPSSPFSPTQTSDCNTTLRPAKPPVVDRSLKPG
21 | ALSVIENVPTIEGLRHIVVPRNLCSEFLQLASANTAKGIETCGVLCGKLMRNEFTITHVLIPRQNGGPDYCHTENEEEIF
22 | FMQDDLGLLTLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESIAIVCSPKFQETGFFKLTDYGLQEISTCRQKGFHPHGR
23 | DPPLFCDCSHVTVKDRIVTITDLR
24 |
25 | >GORGO24976 | OMA1163384 | G3RXN2 | [Gorilla gorilla gorilla]
26 | MSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY
27 | KSAVIPEKKDTVKKLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQQELEKEKQRVAQQKQQQLEQE
28 | QFHAFEEMIRNQELEKERLKIVQEFGKVDPGLGGPLVPNLEKPSLDVFPTSTVSSIQPSDCHTTVRPAKPPVVDRSLKPG
29 | ALSNSESIPTIDGLRHVVVPGRLCPQFLQLASANTARGVETCGILCGKLMRNEFTITHVLIPKQSAGSDYCNTENEEELF
30 | LIQDQQGLITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESVAIVCSPKFQETGFFKLTDHGLEEISSCRQKGFHPHSK
31 | DPPLFCSCSHVTVVDRAVTITDLR
32 |
33 | >XENLA13786 | OMA1163384 | XELAEV_18018654mg | [Xenopus laevis]
34 | MPEHSDASLPPEERIRALVLKGTSVEVNDDIPPKRYYRSGVELIRMANVYSGEGSIENAFILYNKYITLFIEKLPKHRDY
35 | KTANVPEKKETLKKLKEIAFPKAEELKKELHKRYKKEYDEYSEKQRKEEEERARRLALQQQLDAEKQRVALLKQQQEQQE
36 | QVQAFEEMMRRKELEAERLRILHQFSKDEPEAEPLGSPLIPGVNEPPVTPLLPSYGTVQPHPPAVDRSLKPSSYGSNSSG
37 | VTSDGLRHVKIPRDVCCKFLQLSENNTQRGVETCGILCGKLMQNEFTVTHVIVPKQSGGPDYCNTESEEELFLIQDQQGL
38 | ITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESIAIVCSPKFQETGFFKLTDYGMKEIGECRQKGFHPHCKEPPLFSAG
39 | GHVSVTEQDVTMMDLR
40 |
41 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1171372.fa:
--------------------------------------------------------------------------------
1 | >MNELE00942 | OMA1171372 | ML05061a | [Mnemiopsis leidyi]
2 | MDTFTSLSTGCFVVKPSKRNERLAEFPQLYTEESKMSLLGRYEQYISSKKKFTDALNITLEDINKEVSSKIVNFINEGLK
3 | ASVSRGIPTLTLKIGVNFKEFSPIYKLAAKEVSETFKIQCATVNSSQHSDMTAVLKDVFGQLLNSENDANKVLLKTLSMR
4 | GLLDHVMTSNNCNGIVLFIPRFDILPSTIMDKLIDICSSRNQEVPFFFVLGLSTGIELSAEWMSSSAISQLNIETVTPPS
5 | PTELLERVLFKSLLDTETSFKLSYRTFEVLLSRFSFSSYSLHDVMKTIDVALLSHSMHQPLFKHIEKTSSNIQFHANNLD
6 | DEEKALLLQLPSVQKYIEKCVVSNKSLALQLLEGDADAIDNLYQQCTENYQVLFHSFKVLHSLIKNIPGSSLVGKPLELY
7 | SFCLQGCVNEVKEVCIALKCFAMLQSSAFLERLQSAYQECQSVEKPCTKLQLLEKVLKVQIEEMKSILTETSKYELDCSD
8 | KKSYSEKVKSIQQSFISRLENYFSTLTAPTSWPMHEIIWYSNHVELQNMLVGKCRTALHRGLTNPNLYLQGSSDITAKPD
9 | LCHLYDLFQEHGKMISLHDWIQSFNALLGKKKISSETHAQFISAVSELHFMGYLKPTKRKTDHVAKLSLLGY
10 |
11 | >HUMAN85723 | OMA1171372 | ORC3_HUMAN | [Homo sapiens]
12 | MATSSMSKGCFVFKPNSKKRKISLPIEDYFNKGKNEPEDSKLRFETYQLIWQQMKSENERLQEELNKNLFDNLIEFLQKS
13 | HSGFQKNSRDLGGQIKLREIPTAALVLGVNVTDHDLTFGSLTEALQNNVTPYVVSLQAKDCPDMKHFLQKLISQLMDCCV
14 | DIKSKEEESVHVTQRKTHYSMDSLSSWYMTVTQKTDPKMLSKKRTTSSQWQSPPVVVILKDMESFATKVLQDFIIISSQH
15 | LHEFPLILIFGIATSPIIIHRLLPHAVSSLLCIELFQSLSCKEHLTTVLDKLLLTTQFPFKINEKVLQVLTNIFLYHDFS
16 | VQNFIKGLQLSLLEHFYSQPLSVLCCNLPEAKRRINFLSNNQCENIRRLPSFRRYVEKQASEKQVALLTNERYLKEETQL
17 | LLENLHVYHMNYFLVLRCLHKFTSSLPKYPLGRQIRELYCTCLEKNIWDSEEYASVLQLLRMLAKDELMTILEKCFKVFK
18 | SYCENHLGSTAKRIEEFLAQFQSLDETKEEEDASGSQPKGLQKTDLYHLQKSLLEMKELRRSKKQTKFEVLRENVVNFID
19 | CLVREYLLPPETQPLHEVVYFSAAHALREHLNAAPRIALHTALNNPYYYLKNEALKSEEGCIPNIAPDICIAYKLHLECS
20 | RLINLVDWSEAFATVVTAAEKMDANSATSEEMNEIIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC
21 |
22 | >RATNO32812 | OMA1171372 | F1LSH3 | [Rattus norvegicus]
23 | RHTGPRTMATSSVSKGCFVFKPDFKKRKISVPIEDYFNNEELDSEDSKLRFETYRLLWQRIKSETEQLQEGLNENLFDNL
24 | VDFLQKSHSELQKNSGNWGSQMRLREIPTAALILGVNVTDHDVIFRSLTETLHNNVTPYVVSLQAKDCPDVKHFLQKLTS
25 | ELIDCCVDRNSKEEKNDKALRRTSYSMDSLSSWYSTVAQKTGPKMTSKKRATCSQWQSPPVVLILKNMESFSTKVLQDFI
26 | IISSQHLHEFPLILIFGIATSPVIIHRLLPHSVSSLLCIELFQSLSCKEHLTVVLDKLLLTPQFPFKLSKKALQVLTNIF
27 | LYHDFSIQNFIKGLKLSLLEHFYSQPLSVLCCDLSEAKKRINVFSVNQCEKIRRLPSFRRYVENQPLEKQVALLTNETFL
28 | KEETQSLLEDLHVYHINYFLVLRCLHNFTSSLPKYPLGRQIRELYCTCLEKKIWDSEEYESALQLLRMLAKDELMGILEQ
29 | CVKVLNSSTEKQLSNTAQKIKGFLTQFQNLDADSKEEEDACGSQPKGLQKTDLYHLQKSLLEMKELRRTTKKPTKFEMLR
30 | ENVINFIDNLVRDYLLPPEGQPLHEVVYFSAANTLREHLNAAPRIALHTALNNPYYYLKNEALKSEEGCIPSVAPDICIA
31 | YKLHLECSRLINLVDWSEAFATVVTAAEKMDTNSTVSEEMSEIIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC
32 |
33 | >GORGO37065 | OMA1171372 | G3S685 | [Gorilla gorilla gorilla]
34 | MSNQWEKDGLYNKGGFFKPNVIIYRLQEELNKNLFDNLIEFLQKSHSGFQKNSRDLGGQIKLREIPTAALVLGVNVTDHD
35 | LTFRSLTEALQNNVTPYVVSLQAKDCPDMKHFLQKLISQLMDCCVDIKSKEEESVHVTQRKTHYSMDSLSSWYMTVTQKT
36 | DPKMLSKKRTTCSQWQSPPVVVILKDMESFATKVLQDFIIISSQHLHEFPLILIFGIATSPIIIHRLLPHAVSSLLCIEL
37 | FQSLSCKEHLTTVLDKLLLTTQFPFKINEKVLQVLTNIFLYHDFSIQNFIKGLQLSLLEHFYSQPLSVLCCNLPEAKRRI
38 | NFLSNNQCENIRRLPSFRRYVEKQASEKQVALLTNERYLKEETQLLLENLHVYHMNYFLVLRCLHKFTSSLPKYPLGRQI
39 | RELYCTCLEKNIWDSEEYASVLQLLRMLAKDELMTILEKCFKVFKSYCENHLGSTAKRIEEFLAQFQSLDAETKEEEDAS
40 | GSQPKGLQKTDLYHLQKSLLEMKELRRSKKQTKFEVLRENVVNFIDCLVREYLLPPETQPLHEVVYFSAAHALREHLNAA
41 | PRIALHTALNNPYYYLKNEALKSEEGCIPNIAPDICIAYKLHLECSRLINLVDWSEAFATVVTAAEKMDANSATSEEMNE
42 | IIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC
43 |
44 | >XENLA24336 | OMA1171372 | A0A1L8G315 | [Xenopus laevis]
45 | MTTSSVSKGCFVFKPSAKKKKTSLTVADYFNEGLRDSEDSKKRFESCQLLWQQMKSQTEQLQEEMNKRLFENLIGFLRKS
46 | HADFHNKKDDWSCRMRASEIPTAALVLGVNVTDHDLTFNSLSDILHETITPFVVLLQSKECTGIKQLLQKLLTQLMGNTV
47 | DIDLEEEEEQVTISQRKMNCTLASLSDWYKRATKKSASPKKKRSLTSTHWESPPVVVIFKDLESFTASVLQEFIVISSGY
48 | VQDLPLVLVFGIATSPMIIHRLLSHSVSSRLCIELFQSMSCTEHLATVVDQLLLTNHFPIKLSGRVMQVLITIFLYHDFS
49 | VQNFIKGLQLSVVEHFYTQPLSVLCCSLSESRKRIKNLSHAQCENIRHLSSFMSYVESQTPENQVNLLTNDRFLKEMTQE
50 | FLERLNSYHENFTPILRCLHHFTCILPKYPLGKQIREIYCACLEKKVWETEDYNSALPLLRMLAKDEIVATLQKCVAVLK
51 | PYSEKKLGNALEKLEEFLINFQSLEETTQNEEDEDTSPQKSLQKKTDLYQLQKKLLEMKETRRTKKPSRFELLRQDVVDF
52 | IDGLVREYLLPPEMLPLHEVVYFSAASTLRRHLNAAPRVALHTALNNPASYLKCLENEGGSISNAAPDICIAYKLHLECG
53 | RLINLYDWLEAFATVVHAAEGSESDSAQQVDDVTHARFIRAVSELELLGFVKPTKQKTDHVARLTWGGC
54 |
55 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_1188079.fa:
--------------------------------------------------------------------------------
1 | >MNELE00930 | OMA1188079 | ML01881a | [Mnemiopsis leidyi]
2 | FLGMTQSSISSFFKPKRQLEDEDGKENKVSKMLKCSDDSVLKDWKISTSWEKCLSNELTKSYFTDISSFVAKERVSKTIY
3 | PSHDEVFSWTHHCKLDDVKVVILGQDPYHGPNQAHGLCFSVKVGVPPPPSLKNIFKAIKKDLDKFEEPGHGYLVGWARQG
4 | VLMLNAVLTVEKSKANSHKSKGWEKLTDHVIKYIGFHMKSCVFLLWGTPAMKKQSLINKTNHLVLTSGHPSPLSAHRGFF
5 | DCKHFSKANEYLLKNKKDAIDWNRLPTE
6 |
7 | >HUMAN15265 | OMA1188079 | UNG_HUMAN | [Homo sapiens]
8 | MGVFCLGPWGLGRKLRTPGKGPLQLLSRLCGDHLQAIPAKKAPAGQEEPGTPPSSPLSAEQLDRIQRNKAAALLRLAARN
9 | VPVGFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFTWTQMCDIKDVKVVILGQDPYHGPNQAHGLCFSV
10 | QRPVPPPPSLENIYKELSTDIEDFVHPGHGDLSGWAKQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVSWLNQNSNGL
11 | VFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKTNELLQKSGKKPIDWKEL
12 |
13 | >RATNO05802 | OMA1188079 | A0A8J8YLI4 | [Rattus norvegicus]
14 | MIGQKTLYSFFSPTPTGKRTTRSPQPAPGSGVTAENSSDAAASPAKKARVEQDEPATPPSSPLSAEQLVRIQRNKAAALL
15 | RLAARNVPAGLGESWKQQLCGEFGKPYFVKLMGFVAEERKHHKVYPPPEQVFTWTQMCDIRDVKVVILGQDPYHGPNQAH
16 | GLCFSVQRPVPPPPSLENIFKELSTDIDGFVHPGHGDLSGWARQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVSWLN
17 | QNLNGLVFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKANELLQRSGKKPISWKEL
18 |
19 | >GORGO06002 | OMA1188079 | G3RY62 | [Gorilla gorilla gorilla]
20 | MIGQKTLYSFFSPSPARKRHAPSPEPAVQGTGVAGVPEESGDAAAIPAKKAPAGQEEPGTPPSSPLSAEQLDRIQRNKAA
21 | ALLRLAARNVPVGFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFTWTQMCDIRDVKVVILGQDPYHGPN
22 | QAHGLCFSVQRPVPPPPSLENIYKELSTDIEDFVHPGHGDLSGWAKQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVS
23 | WLNQNSNGLVFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKTNELLQKSGKKPIDWKEL
24 |
25 | >XENLA02694 | OMA1188079 | XELAEV_18007455mg | [Xenopus laevis]
26 | MIGQRTINSFFGAAVKKRAASTVWDGEDSCKAGETTPVKKSRPSNENDIPSAVSPPLSPEQLERMQRNKAAALQKLAARH
27 | APQGLGESWKQELLAEFAKPYFVKLSNFIAEERKKCTVYPPPEEVFTWTQMVDIKDVKVVILGQDPYHGPNQAHGLCFSV
28 | KKPVPPPPSLVNMYKELETDIEGFSRPGHGDLTGWAKQGVLLLNAVLTVRAHNANSHKDCGWEQFTDVVVSWLNKNMDGL
29 | VFMLWGAYAQKKGSNIDRKRHHVLQTVHPSPLSVHRGFFGCRHFSKTNAYLQGLGKKPIDWKAL
30 |
31 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_649216.fa:
--------------------------------------------------------------------------------
1 | >MNELE00920 | OMA649216 | ML39325a | [Mnemiopsis leidyi]
2 | MSLSRLGFQLFRRGTLSTVSPAQQLVSARLFNYDQSHDADNYLKFGLLGGLGLFVTAMCKEEAANEEPANHSEATQEEEE
3 | EQKPKKKKKKGFGERKVMEYENRIREFSTPDKIFRYFATVRVEFENGKKEIFMTPKDFMRSITPGELQPSHLGLDLYRDV
4 | PISKLLDHVDEEEGEQPEFLSRLAQHGLISFQDYIFLLTLLSTPKHDCEIAFKMFDLYGDGCVSYQEFLDTRSVLESRSS
5 | MGKRHRDNIYSGNTINKDGHSALTKYFFGEDSAKKLTLDDFVVFMDGLKEDVFRMEFNKYDPVDGKITEQDFANLLLLHA
6 | TLSNQAKSKFVRRVKKAYKNESQGITFDQFMTFNHFLDHLDDVEILVSVYFAAGMKFNKASLKQVAHVVADVELDSHIID
7 | LVFTIFDDNGDELLSNREFISVLKERAHRGLEKPSDTGFVRLITALGACVASYVKGEEL
8 |
9 | >HUMAN01609 | OMA649216 | MICU1_HUMAN | [Homo sapiens]
10 | MFRLNSLSALAELAVGSRWYHGGSQPIQIRRRLMMVAFLGASAVTASTGLLWKRAHAESPPCVDNLKSDIGDKGKNKDEG
11 | DVCNHEKKTADLAPHPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVISEPGEAEVFMTPEDFVRSITPNE
12 | KQPEHLGLDQYIIKRFDGKKISQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNGDGEVDM
13 | EEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFERHDPVDG
14 | RITERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKEGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLDKVTMQQVA
15 | RTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFALPKQ
16 |
17 | >RATNO21729 | OMA649216 | A0A8I6A7K0 | [Rattus norvegicus]
18 | MFRLNALSALAELAMGSRWYHGTSQPTQTKRRLMLVAFLGASAVTASTGLLWKKAHAESPPSVNSKKTDAGDKGKSKDTR
19 | EVSSHEGSAADTAAEPYPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVINEPGETEVFMTPQDFVRSITP
20 | NEKQPEHLGLDQYIIKRFDGKEFWQTEKIAQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDL
21 | NGDGEVDMEEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEF
22 | ERHDPVDGRISERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKDGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLD
23 | KVTMQQVARTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFA
24 | LPK
25 |
26 | >GORGO00558 | OMA649216 | A0A2I2YKG1 | [Gorilla gorilla gorilla]
27 | MFRLNSLSALAELAVGSRWYHGGSQPIQIRRRLMMVAFLGASAVTASTGLLWKRAHAESPPCVDNLKSDIGDKGKNKDEG
28 | DVCNHEKKTADLVPHPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVISEPGEAEVFMTPEDFVRSITPNE
29 | KQPEHLGLDQYIIKRFDGKDFWQTEKISQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNG
30 | DGEVDMEEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFER
31 | HDPVDGRITERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKEGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLDKV
32 | TMQQVARTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFALP
33 | KQ
34 |
35 | >XENLA31939 | OMA649216 | A0A1L8FF34 | [Xenopus laevis]
36 | MFRLRFIPAVAGLAAVSRRYHGVANHTRSRRRLMMAAFVGATAVSASAGLLWKRANAEAQSSVKHNMREESSEKEKEPED
37 | TDQAVESSDEEQQQEGKKKKRVGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVIHESGESEVFMTPQDFVRSITPNEKQ
38 | PENLGLDQFIVKRYDGKKISQEREKFADEDSIFYSLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNGDGEVDMEE
39 | FEQVQSIIRSQTSMGMRHRDRSTTGNTLKTGFSSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFERQDPVDGRI
40 | TERQFGSMLLAYSGVQSKKLTHMLKQLKKRFKDAEGLTFEEVENFFTFLKNINDVDTALSFYHMAGASLDKVTMQQVART
41 | VAKVELSDHVCDVVFALFDCDGNGELSNKEFIAIMKQRLMRGLEKPKDMGFTRLMRAMWKCAQETAWDFAMPKQ
42 |
43 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_681083.fa:
--------------------------------------------------------------------------------
1 | >MNELE00869 | OMA681083 | ML02403a | [Mnemiopsis leidyi]
2 | MPMKLKFLFRASNKIKTKPSLNFLVMVKRLADPEVDENVEDIDSDFSDDDGASSVSDTGSVEETAQGKRLRLAKQYLDKL
3 | ENEQLKSENDTEINRDLIAHRLQQDVLAEKGKLETRVGKRLCVLENKFTLKGHRLSPTCLAITDTHLFSGSKDGAIIKWD
4 | LSTGKKLSVVKHDSKKQILALAASSDNVYLASGGQDKIIVLWDIESMTFVKCFRKHRGPITALTFQRNSHLLMSGSADRS
5 | VNLWNCDDKLYIESLYGHQDMVADMDSFLQERVVTVGGHDKTLRLWKIQEESQLVFNGHKNTVLDCVSMLNEEHFVTGSQ
6 | DNVLAVWHIKKKKPAITQLQAHAKGSWVSAVAGLKNTECFISGSNGGNVKVWACAENYRSMECIRSIEIIGTVNSIVISH
7 | DNSCFALAVGQEPKMGRWWSDKAARNRVLVFPMAIEDEVNR
8 |
9 | >HUMAN70334 | OMA681083 | U3IP2_HUMAN | [Homo sapiens]
10 | MSATAAARKRGKPASGAGAGAGAGKRRRKADSAGDRGKSKGGGKMNEEISSDSESESLAPRKPEEEEEEELEETAQEKKL
11 | RLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPASADIRVLRGHQLSITCLVVTPDDSA
12 | IFSAAKDCSIIKWSVESGRKLHVIPRAKKGAEGKPPGHSSHVLCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTFTGH
13 | RDAVSGLAFRRGTHQLYSTSHDRSVKVWNVAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQLVF
14 | YGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLRGEPGLEQPFWISSVAALLNTDLVATGSHSSCV
15 | RLWQCGEGFRQLDLLCDIPLVGFINSLKFSSSGDFLVAGVGQEHRLGRWWRIKEARNSVCIIPLRRVPVPPAAGS
16 |
17 | >RATNO41032 | OMA681083 | B0BND5 | [Rattus norvegicus]
18 | MNSMSTAVATRKRAKPAPGPGAAPVDGKRRRKVDSAASRGKSKGGGKMNEEISSDSESESLAPRKTEEEEEEELEETAQE
19 | KKLRLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPAPTDIRVLRGHQLSITCLVITPD
20 | DLAIFSAAKDCTIIKWSVETGRKLHVIPRAKKGTQGQPSGHSSHILCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTF
21 | TGHRDAVSGLAFRRGTHQLYSTSHDRSVKVWNAAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQ
22 | LVFYGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLHGEPGLEQPFWISSVAALLNTDLVATGSHN
23 | ACVRLWQCGEGFRQLDPLCDIPLVGFINSLKFSSGGDFLVAGVGQEHRLGRWWRIKEARNSVCIIPLRRVPVSPVAGS
24 |
25 | >GORGO30225 | OMA681083 | G3R9Q2 | [Gorilla gorilla gorilla]
26 | MSATAAARKRGKPASGVGAGAGAGKRRRKADSAGDRGKSKGGGKMNEEISSDSESESLAPRKPEEEEEEELEETAQEKKL
27 | RLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPASADIRVLRGHQLSITCLVVTPDDSA
28 | IFSAAKDCTIIKWSVESGRKLRVIPRAKKGAEGKPPGHSSHVLCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTFTGH
29 | RDAVSGLAFRRGTHQLYSTSHDRSVKVWNVAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQLVF
30 | YGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLRGEPGLEQPFWISSVAALLNTDLVATGSHSSCV
31 | RLWQCGEGFRQLDLLCDIPLVGFINSLKFSSSGDFLVAGVGQEHRLGRWWRIKQARNSVCIIPLRRVPVPPAAGS
32 |
33 | >XENLA20807 | OMA681083 | A0A1L8GHE5 | [Xenopus laevis]
34 | MSGLFIKKKSGVTPRRRRAEGNDAEATSQKKKKPKDTHLREEIESDSDTEIAPARTKPRQDEEDLDETAQEKKLRLAKEY
35 | LKQLQQQEEEQKEDEDQDTIANRLQEDVLEQRGRLQRPLAKELLPPEPSEIRLLRGHQGPITCLVISPDDSYLFSGSKDC
36 | SIIKWSVSDGKKIHKIPGGRKGTESTHVGHTGHVLCMALSSDGKYLASGDRNKLIFIWDPVTCQNLHKFQGHRDAVSGLS
37 | FQKGTHQLFSVSHDRSVKVWNVEENAYIETLFGHQDAITGLDSLSRERCVTVGGRDGTMRIWKIAEETQLVFSGHEGSID
38 | CVRLINEEHIVTGADDGSLALWTVGKKKPLTQMKCAHGSHGDAGLEQPYWISSIAAALNSDVVASGSHDGFVHVWRCGEG
39 | FRSLSPLFTVPVVGFVNSLQFSSSANFLVAGVGQEHRLGRWWRKKEAKNALCIIPFKRTLVLGS
40 |
41 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_683078.fa:
--------------------------------------------------------------------------------
1 | >MNELE00595 | OMA683078 | ML13641a | [Mnemiopsis leidyi]
2 | MENTVLRAKLIVLGDASVGKSSLVQVFHSDSQQGFPKAYSMTSDVQLQVKSVKIPDSPYTVELYVYDCAGQETFQPFISK
3 | ILGSSALVLLVSDLTNQSSLSAAVKWFERARNANKDFKMQGALVGNKCDLDLRRAIKASEAEETAANLGIPYFECSAKEG
4 | VQVDEPFYFLANCLYEQYIEQTQEFQNIADTV
5 |
6 | >HUMAN60325 | OMA683078 | IFT27_HUMAN | [Homo sapiens]
7 | MVKLAAKCILAGDPAVGKTALAQIFRSDGAHFQKSYTLTTGMDLVVKTVPVPDTGDSVELFIFDSAGKELFSEMLDKLWE
8 | SPNVLCLVYDVTNEESFNNCSKWLEKARSQAPGISLPGVLVGNKTDLAGRRAVDSAEARAWALGQGLECFETSVKEMENF
9 | EAPFHCLAKQFHQLYREKVEVFRALA
10 |
11 | >RATNO39676 | OMA683078 | A0A8I5ZYK4 | [Rattus norvegicus]
12 | QVKMDNDFSSALASGDPAVGKTALVQMFRSDGTHFQKNYTLTTGVDLVVKTVPVLDTNDSVELFIFDSAGKELFSEMLDK
13 | LWENPNVLCLVYDVTNEQSFISCTKWLEKVRSQTPGISLPGVLVGTKTDLAGRQTVDSAQAQAWALSQGLEFFETSVKEM
14 | DNYEAPFHCLAKQFYQLYREKVDIFHTLV
15 |
16 | >GORGO23896 | OMA683078 | G3R464 | [Gorilla gorilla gorilla]
17 | MRTKAFFFFFSDLTGDPAVGKTALAQIFRSDGAHFQKSYTLTTGMDLVVKTVPVPDTGDSVELFIFDSAGKELFSEMLDK
18 | LWESPNVLCLVYDVTNEESFNNCSKWLEKARSQAPGISLPGVLVGNKTDLAGRRAVDSAEARAWALGQGLECFETSVKEM
19 | ENLEAPFHCLAKQFHQLYREKVEVFRALA
20 |
21 | >XENLA18783 | OMA683078 | A0A1L8GP42 | [Xenopus laevis]
22 | MVKLSAKCIVAGDTAVGKSTLVQLFRSDGSHFPKNYSMTATVEVSVKTVQIPDTGDSVELFLCDSPGKAIFYEMTEKLWD
23 | QPGALCLVFDVTNESSFSSCTKWLQRVRSKTLSPHLPGVLVGNKTDMAGLRAVEKGQAEEWAASNGLEYFETSAKELENF
24 | ERPFQALAKAFHHLYQERVEHFQSLV
25 |
26 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_894224.fa:
--------------------------------------------------------------------------------
1 | >MNELE00417 | OMA894224 | ML36131a | [Mnemiopsis leidyi]
2 | MGPKGMDKILVSMGQDGYPGDIQVTNDGATILRSIGVDNPAAKVLVNISKVQDDEVGDGTTSVTVLAAELLREAEQLVAK
3 | KLHPQTIISGYRAALKVAVQVLTDTAIDNGKDNEAFKKDLMNIARTTLSSKILNQHKEHFAELAVNAVLRLKGSTDLELV
4 | QILKKTGGSIEDSYLDEGFLLEKEIGHNQPKRIENARILVANTPMDTDKIKVFGSKVKVDSTAKVADIELAEKNKMKQKV
5 | DKILSHDITCFINRQLIYDYPDQLFADAGIMAIEHADFDGIERLSKVLGAEIVSTFDQPDKVTLGSCKVIEEVILGEDKL
6 | IKFSGVKQGEACTVVLRGATKMIVDEAERSLHDALCVLTQTVKETRTVFGGGCSEMRMARHVEELAARTPGKEALAIESF
7 | ARALRQIPTIIADNGGYDSSQLVSELRAMHSQDELYMGLNMTTGEVGDMRELGITESFAVKHAVVNSAAEAAEMILRVDD
8 | ILKATPRQRGGNDCM
9 |
10 | >HUMAN14228 | OMA894224 | TCPB_HUMAN | [Homo sapiens]
11 | MASLSLAPVNIFKAGADEERAETARLTSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGATILKNIGVDNPA
12 | AKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHGSDEVKFRQDLM
13 | NIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHIIKKLGGSLADSYLDEGFLLDKKIGVNQPKRIENAKILIA
14 | NTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGVMAIEHADFAGV
15 | ERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERSLHDALCVLAQT
16 | VKDSRTVYGGGCSEMLMAHAVTQLANRTPGKEAVAMESYAKALRMLPTIIADNAGYDSADLVAQLRAAHSEGNTTAGLDM
17 | REGTIGDMAILGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC
18 |
19 | >RATNO37617 | OMA894224 | A0A8I6GLE7 | [Rattus norvegicus]
20 | MPSSGVSEHSNKASLSLAPVNIFKAGADEERAETARLSSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGAT
21 | ILKNIGVDNPAAKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHG
22 | SDEVKFWQDLMNIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHVIKKLGGSLADSYLDEGFLLDKKIGVNQP
23 | KRIENAKILIANTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGV
24 | MAIEHADFAGVERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERS
25 | LHDALCVLAQTVKDPRTVYGGGCSEMLMAHAVTMLASRTPGKEAVAMESFAKALRMLPTIIADNAGYDSADLVAQLRAAH
26 | SEGRITAGLDMKEGSIGDMAVLGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC
27 |
28 | >GORGO04796 | OMA894224 | A0A2I2Y483 | [Gorilla gorilla gorilla]
29 | MASLSLAPVNIFKAGADEERAETARLTSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGATILKNIGVDNPA
30 | AKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHGSDEVKFRQDLM
31 | NIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHIIKKLGGSLADSYLDEGFLLDKKIGVNQPKRIENAKILIA
32 | NTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGVMAIEHADFAGV
33 | ERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERSLHDALCVLAQT
34 | VKDSRTVYGGGCSEMLMAHAVTQLANRTPGKEAVAMESYAKALRMLPTIIADNAGYDSADLVAQLRAAHSEGNTTAGLDM
35 | REGTIGDMAILGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC
36 |
37 | >XENLA12675 | OMA894224 | A0A1L8GYM0 | [Xenopus laevis]
38 | MASLSLAPVNIFKAGADEEKAETARLSSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDSSVTVTNDGATILKAIGIDNPA
39 | AQVLVDMSKVQDDEVGDGTTSVTVLAAELLREAEILVAKKIHPQTIVSGWRQATQVAREALLKASMDHGNDEEKFCCDLM
40 | NIARTTLSSKLLTHHKDHFSKLAVEAVLRLKGSGNLEAIHLIKKLGGSLTESYLDEGFLLDKKIGVNQPKRIENAKILIA
41 | NTGMDTDKIKVFGSRVRVDSTAKVAEIELAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFAAAGVMAIEHADFAGV
42 | ERLALVTGGEIASTFDHPELVKLGTCKLIEEVMIGEDKLIHFSGVAMGEACTIVLRGATQQILDEAERSLHDALCVLAQT
43 | VKDTRTVYGGGCSEMLMAHAVTELANRTPGKESVAMESFAKALRMLPTIIADNAGYDSADLVSQLRAAHSEGKSTYGLDM
44 | KNGIIGDMGELGITESFQVKRQVLLSASEAAEVILRVDNIIKAAPRKRVPDHHPC
45 |
46 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_944789.fa:
--------------------------------------------------------------------------------
1 | >MNELE00946 | OMA944789 | ML24671a | [Mnemiopsis leidyi]
2 | MIRGGKVAKRKSTTVLQEGTKKKCVNGAASSRQTSLQSHFKPLQPKQAPTTSSTGSKRRYVDSRTTSWEEVNKIRPELPT
3 | PPFGEDSPAYSSQIPNSVVIPLPPLNFAPNDKIWAGLKQQEEKRQASKLGILNHPFIKQGARAILLDWLIEVSQLYCLKR
4 | ETFYLSMDYIDRFISKRYDIKKEQLQLVGITALHMAAKLEEIYPPGLEKLSYITDNSCSKEAMWKMELEMMKALDWRLAA
5 | LTVNTWLNLYLQIEYYRGTSCSTFQFLRGEYSQSDFVKIIQLIDLCSLDVKSVEYRPSMIAASALWLVVPSKLKEVTGYS
6 | WDDLISCRHWMQPYAQVLKDQPAQQLKDFEDVEKKDRHHIQTHFKAIPLLHDVYELQESQPLTPDSDSDNENAEVAHYLT
7 | PNSSTHSSPSSSTKHR
8 |
9 | >HUMAN42399 | OMA944789 | CCNE1_HUMAN | [Homo sapiens]
10 | MPRERRERDAKERDTMKEDGGAEFSARSRKRKANVTVFLQDPDEEMAKIDRTARDQCGSQPWDNNAVCADPCSLIPTPDK
11 | EDDDRVYPNSTCKPRIIAPSRGSPLPVLSWANREEVWKIMLNKEKTYLRDQHFLEQHPLLQPKMRAILLDWLMEVCEVYK
12 | LHRETFYLAQDFFDRYMATQENVVKTLLQLIGISSLFIAAKLEEIYPPKLHQFAYVTDGACSGDEILTMELMIMKALKWR
13 | LSPLTIVSWLNVYMQVAYLNDLHEVLLPQYPQQIFIQIAELLDLCVLDVDCLEFPYGILAASALYHFSSSELMQKVSGYQ
14 | WCDIENCVKWMVPFAMVIRETGSSKLKHFRGVADEDAHNIQTHRDSLDLLDKARAKKAMLSEQNRASPLPSGLLTPPQSG
15 | KKQSSGPEMA
16 |
17 | >RATNO19647 | OMA944789 | B1WC54 | [Rattus norvegicus]
18 | MPRERKERDSKDHSNMKEEGGSDLSVRSRKRKANVAVFLQDPDEEIAKIDKTVKSQDSSQPWDDDSACVDPCSFIPTPNK
19 | EEDNELEYPKTAFQPRKIRPPRASPLPVLNWGNREEVWRIMLNKEKTYLRDEHFLQRHPLLQARMRAVLLDWLMEVCEVY
20 | KLHRETFYLAQDFFDRYMASQQNIIKTLLQLIGISALFIASKLEEIYPPKLHQFAYVTDGACSGDEILTMELMMMKALKW
21 | RLSPLTIVSWLNVYVQVAYVNDTGEVLMPQYPQQVFVQIAELLDLCVLDVGCLEFPYGVLAASALYHFSSLELMQKVSGY
22 | QWCDIEKCVKWMVPFAMVIREMGSSKLKHFRGVPMEDSHNIQTHTNSLDLLDKAQAKKAILSEQNRISPPPSGVLTPPHS
23 | SKKQSSEQETE
24 |
25 | >GORGO16559 | OMA944789 | G3QZF2 | [Gorilla gorilla gorilla]
26 | MPRERRERDAKERDTMKEDGGAEFSARSRKRKANVAVFLQDPDEEMAKIDRTARDQCGSQPWDNNAVCADPCSLIPTPDK
27 | EDDERVYPNSTCKPQIIAPSRGSPLPVLSWANREEVWKIMLNKEKTYLRDQHFLEQHPLLQPKMRAILLDWLMEVCEVYK
28 | LHRETFYLAQDFFDRYMATQENVVKTLLQLIGISSLFIAAKLEEIYPPKLHQFAYVTDGACSGDEILTMELMIMKALKWR
29 | LSPLTIVSWLNVYMQVAYLNDLHEVLLPQYPQQIFIQIAELLDLCVLDVDCLEFPYGILAASALYHFSSSELMQKVSGYQ
30 | WCDIENCVKWMVPFAMVIRETGSSKLKHFRGVADEDAHNIQTHRDSLDLLDKARAKKAMLSEQNRASPLPSGLLTPPQSG
31 | KKQSSGPEMA
32 |
33 | >XENLA17827 | OMA944789 | CCNE1_XENLA | [Xenopus laevis]
34 | MPVISNPAVEKSTKDEGTASCSVRSRKRKADVAIFLQDPDETLDSLEMTKKKQYQDRGPWSNEMTCKSPHKLIPTPEKEE
35 | HEPNPTNYSHFASLRFSPVSVSPLPRLGWANQDDVWRNMLNKDRIYLRDKNFFQKHPQLQPNMRAILLDWLMEVCEVYKL
36 | HRETFYLAQDFFDRFMATQKNVIKSRLQLIGITSLFIAAKLEEIYPPKLHQFSFITDGACTEDEITRMELIIMKDLGWCL
37 | SPMTIVSWFNVFLQVAYIRELQQFLRPQFPQEIYIQIVQLLDLCVLDICCLEYPYGVLAASAMYHFSCPELVEKVSGFKV
38 | TELQGCIKWLVPFAMAIKEGGKSKLNFFKGVDIEDAHNIQTHSGCLELMEKVYINQALLEEQNRTSPIPTGVLTPPQSNK
39 | KQKSDRAD
40 |
41 |
--------------------------------------------------------------------------------
/tests/marker_genes/OMAGroup_974829.fa:
--------------------------------------------------------------------------------
1 | >MNELE00836 | OMA974829 | ML01593a | [Mnemiopsis leidyi]
2 | MDLVQVGITTDELDRYAHDLIVQHAAYPAPLNYRGYPKSICTSVNNVLCHGIPNSRELQDGDIISIDVSIFYKGVFGDCC
3 | STRVVGEGDSTAHKLAKVTRDSTLAAIETCKPGTRLSSVGNTISKYAKEAGLSICKEFIGHGIGSYFHGLPEVYHYANSH
4 | GPTLRPGMVFTIEPILMEGRDTMAILADGWTAVSADSKRAAQFEHTILITDSEPEILSPHR
5 |
6 | >HUMAN65562 | OMA974829 | MAP12_HUMAN | [Homo sapiens]
7 | MAAPSGVHLLVRRGSHRIFSSPLNHIYLHKQSSSQQRRNFFFRRQRDISHSIVLPAAVSSAHPVPKHIKKPDYVTTGIVP
8 | DWGDSIEVKNEDQIQGLHQACQLARHVLLLAGKSLKVDMTTEEIDALVHREIISHNAYPSPLGYGGFPKSVCTSVNNVLC
9 | HGIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDECGKKLVEVARRCRDEAIAACRAGAPFSVIGNTISHITHQ
10 | NGFQVCPHFVGHGIGSYFHGHPEIWHHANDSDLPMEEGMAFTIEPIITEGSPEFKVLEDAWTVVSLDNQRSAQFEHTVLI
11 | TSRGAQILTKLPHEA
12 |
13 | >RATNO27947 | OMA974829 | G3V670 | [Rattus norvegicus]
14 | MAAPIGVHLLVRGGCQRILSSPLHHLFLHKRAGSQQRRYFFWRQRDISHSVVSPAAVSPAHPVPEHIKKPDYVTTGIVPD
15 | WGDSIEVKNEDQIQGLREACRLARHVLLLAGKSLKVGMTTEEIDALVHREIIRRDAYPSPLGYGRFPKSVCTSVNNVLCH
16 | GIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDESGTKLVEVARACRDEAIAACRAGAPFSVIGNTISHITRQN
17 | GLQVCPHFVGHGIGSYFHGHPEIWHHANDNDLPMEERMAFTIEPIITEGSPEFKVLEDAWTVVSLDNRRSAQFEHTVLIT
18 | PRGVEILTKVPQEA
19 |
20 | >GORGO26472 | OMA974829 | A0A2I2ZWI0 | [Gorilla gorilla gorilla]
21 | MAAPSGVHQLVRRGSHRIFSSPLNHIYLHKQSSSQQRRNFFFRRQRDISHSIVLPAAVSSAHPVPKHIKKPDYVTTGIVP
22 | DWGDSIEVKNEDQIQGLHQACQLARHVLLLAGKSLKVDMTTEEIDALVHREIISHNAYPSPLGYGGFPKSVCTSVNNVLC
23 | HGIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDECGKKLVEVARRCRDEAIAACRAGAPFSVIGNTISHITHQ
24 | NGFQVCPHFVGHGIGSYFHGHPEIWHHANDNDLPMEEGMAFTIEPIITEGSPEFKVLEDAWTVVSLDNQRSAQFEHTVLI
25 | TSRGAQILTKLPHEA
26 |
27 | >XENLA39438 | OMA974829 | A0A1L8EW64 | [Xenopus laevis]
28 | MQVAGVISCALRRGSVTGCQRVFPLAVNHIYLHRQLNIQQRRYFFFRKQRSAAYDIVWPGTVSPAHPVPEHIMKPDYVTT
29 | GIVPDWGDYIEIKDEDQIQGLRQACQLARHILLMAGKSLKVGMTTEEIDALVHENIISWNAYPSPLGYGGFPKSVCTSVN
30 | NVVCHGIPDSRALQDGDIINIDVTVYFGGYHGDTSETFLVGNVDKCGRGLVKIARRCRDEAIAVCKPGAPFSSIGNTISR
31 | IAGENGFRVCPSFVGHGIGSFFHGHPEIWHHANNNDMPMEEGMAFTIEPIIMEGSPDFKILKDKWTAVSVDNKRSAQCEH
32 | TIVITSGGAEILTKLPQEE
33 |
34 |
--------------------------------------------------------------------------------
/tests/test_aligner.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import gzip
4 | import argparse
5 | from Bio import SeqIO
6 | from read2tree.Reads import Reads
7 | from read2tree.FastxReader import FastxReader
8 | dirname = os.path.dirname(__file__)
9 |
10 |
class ReadTest(unittest.TestCase):
    """Fixture helper for the aligner tests (no test methods are defined yet)."""

    def setup_reads_paired(self, sampling=False):
        """Parse a fixed paired-end command line and build an ``Aligner``.

        Args:
            sampling: Accepted for signature compatibility; this helper does
                not read it.

        Returns:
            The object produced by ``Aligner(args, ogset.ogs, load=True)``.
        """
        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument('--standalone_path', default='.',
                                help='[Default is current directory] Path to '
                                     'oma standalone directory.')

        arg_parser.add_argument('--reads', nargs='+', default=None,
                                help='Reads to be mapped to reference. If paired '
                                     'end add separated by space.')

        arg_parser.add_argument('--read_type', default='short',
                                help='[Default is short reads] Type of reads to '
                                     'use for mapping. Either ngm for short reads or '
                                     'ngmlr for long will be used.')

        # The original literals concatenated to "...it will usethe RESTapi";
        # the missing separating space is restored here.
        arg_parser.add_argument('--dna_reference', default='',
                                help='Reference file that contains nucleotide '
                                     'sequences (fasta, hdf5). If not given it will use '
                                     'the RESTapi and retrieve sequences '
                                     'from http://omabrowser.org directly. '
                                     'NOTE: internet connection required!')

        arg_parser.add_argument('--keep_all_ogs', action='store_true',
                                help='Keep all orthologs after addition of '
                                     'mapped seq, which means also the groups that '
                                     'have no mapped sequence. Otherwise only groups '
                                     'are used that have the mapped sequence for '
                                     'alignment and tree inference.')

        arg_parser.add_argument('-r', '--reference', action='store_true',
                                help='Just generate the reference dataset for '
                                     'mapping.')

        arg_parser.add_argument('--remove_species_ogs', default=None,
                                help='[Default is none] Remove species present '
                                     'in data set after mapping step completed to '
                                     'build OGs. Input is comma separated list '
                                     'without spaces, e.g. XXX,YYY,AAA.')

        arg_parser.add_argument('-s', '--species_name', default=None,
                                help='[Default is name of read] Name of species '
                                     'for mapped sequence.')

        arg_parser.add_argument('--output_path', default='.', required=True,
                                help='[Default is current directory] Path to '
                                     'output directory.')

        argv = ['--standalone_path', 'tests/data/marker_genes/',
                '--dna_reference', 'tests/data/dna.fa', '--reads',
                'tests/data/mapper/test3/test_1b.fq',
                'tests/data/mapper/test3/test_2b.fq',
                '--output_path', 'tests/data/output', '--read_type',
                'short', '--keep_all_ogs', '--reference',
                '--remove_species_ogs', 'CIOIN', '--species_name', 'ass']

        args = arg_parser.parse_args(argv)
        # The original statement was ``return alignments = Aligner(...)``,
        # a SyntaxError that prevented this module from being imported.
        # NOTE(review): neither ``Aligner`` nor ``ogset`` is imported or
        # defined in this file, so calling this helper still raises NameError
        # until both are provided -- confirm the intended fixture.
        return Aligner(args, ogset.ogs, load=True)
71 |
--------------------------------------------------------------------------------
/tests/test_og.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from Bio import SeqIO
4 | from read2tree.OGSet import OG
5 |
6 | dirname = os.path.dirname(__file__)
7 |
8 |
class OGTest(unittest.TestCase):
    """Tests for ``OG`` driven by the ``OG4`` amino-acid and DNA fixtures."""

    def setup(self):
        """Return an ``OG`` whose ``aa`` and ``dna`` hold the OG4 fixture records.

        This is a plain helper (lower-case name), not the unittest ``setUp``
        hook: every test calls it explicitly and uses the returned object.
        """
        # Anchor the fixture paths on this module's directory so the tests do
        # not depend on the working directory they are launched from.
        data_dir = os.path.join(dirname, 'data')
        aa = list(SeqIO.parse(os.path.join(data_dir, 'OG4.aa'),
                              format='fasta'))
        dna = list(SeqIO.parse(os.path.join(data_dir, 'OG4.dna'),
                               format='fasta'))
        og = OG()
        og.aa = aa
        og.dna = dna
        return og

    def test_init(self):
        """The first DNA record carries the expected identifier."""
        og = self.setup()
        self.assertEqual(og.dna[0].id, 'MOUSE21964_OG4')

    def test_get_og_dict(self):
        """``_get_og_dict`` maps ``MOUSE21964`` to the matching record."""
        og = self.setup()
        dna_dict = og._get_og_dict(og)
        self.assertEqual(dna_dict['MOUSE21964'].name, 'MOUSE21964_OG4')

    def test_remove_species_records(self):
        """Removing ``MOUSE`` leaves four entries in both returned items."""
        og = self.setup()
        og_wo_mouse = og.remove_species_records('MOUSE')
        self.assertEqual(len(og_wo_mouse[0]), 4)
        self.assertEqual(len(og_wo_mouse[1]), 4)

    def test_get_species_id(self):
        """``_get_species_id`` yields ``MOUSE`` for the first DNA and AA records."""
        og = self.setup()
        dna = og.dna[0]
        aa = og.aa[0]
        self.assertEqual(og._get_species_id(dna), 'MOUSE')
        self.assertEqual(og._get_species_id(aa), 'MOUSE')


if __name__ == "__main__":
    unittest.main()
44 |
--------------------------------------------------------------------------------
/tests/test_ogset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from read2tree import OGSet
4 |
5 | API_URL = 'http://omabrowser.org/api'
6 |
class OGSetTest(unittest.TestCase):
    """Tests for the OGSet component; most cases are still placeholders."""

    def setUp(self):
        """Parse a minimal command line and build the object under test.

        The result is stored on ``self.ogset``. unittest discards whatever
        ``setUp`` returns, so returning it (as the original did) never made
        it reachable from the test methods.
        """
        # ``argparse`` is not imported at module level in this file.
        import argparse

        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument('--reads', nargs='+', default=None,
                                help='Reads to be mapped to reference. If paired end '
                                     'add separated by space.')
        arg_parser.add_argument('--read_split_length', type=int, default=400,
                                help='Set read split length.')
        arg_parser.add_argument('--read_split_overlap', type=int, default=50,
                                help='Set read split length overlap.')
        arg_parser.add_argument('-s', '--species_name', default=None,
                                help='[Default is name of read] Name of species '
                                     'for mapped sequence.')

        argv = ['--reads', 'tests/data/reads/test.fq']

        args = arg_parser.parse_args(argv)
        # NOTE(review): ``from read2tree import OGSet`` presumably binds the
        # *module* read2tree.OGSet, which is not callable. Use a class of the
        # same name inside it when present; otherwise fall back to the
        # imported name unchanged -- confirm the intended constructor.
        ogset_factory = getattr(OGSet, 'OGSet', OGSet)
        self.ogset = ogset_factory(args)

    # The placeholders below are skipped rather than raising
    # NotImplementedError, so they are reported as skipped (and bypass
    # setUp) instead of showing up as errors.

    @unittest.skip('not implemented yet')
    def test_OGSet(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_marker_genes_input(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_omastandalone_input(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_output_folder_structure(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_species_removal(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_species_removal_after_mapping(self):
        """Placeholder: not implemented yet."""

    def test_rest_api_connection(self):
        # NOTE(review): this only evaluates the attribute ``OGSet._read`` and
        # asserts nothing; left unchanged pending the intended check.
        OGSet._read

    @unittest.skip('not implemented yet')
    def test_rest_api_dna_downlaod(self):
        """Placeholder: not implemented yet."""


if __name__ == "__main__":
    unittest.main()
54 |
--------------------------------------------------------------------------------
/tests/test_reads.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import gzip
4 | import argparse
5 | from Bio import SeqIO
6 | from read2tree.Reads import Reads
7 | from read2tree.FastxReader import FastxReader
8 | from read2tree.main import parse_args
9 | from read2tree._utils import exe_name
10 | dirname = os.path.dirname(__file__)
11 |
12 |
class ReadTest(unittest.TestCase):
    """Tests for ``Reads`` helpers, driven by parsed read2tree command lines."""

    def setup_long_reads(self, split=False):
        """Return a ``Reads`` built for the single-file read set.

        With ``split`` set, the splitting and sampling options are appended
        to the command line before it is parsed.
        """
        cli = ['--output_path', 'data/output',
               '--reads', 'data/reads/test.fq.gz']
        if split:
            cli.extend(['--split_reads',
                        '--split_overlap', '50', '--split_len', '400',
                        '--sample_reads', '--coverage', '10',
                        '--genome_len', '1000'])
        parsed = parse_args(cli, exe_name(), '')
        return Reads(parsed)

    def setup_reads_paired(self, sampling=False):
        """Return a ``Reads`` built for the paired read files.

        With ``sampling`` set, the sampling options are appended to the
        command line before it is parsed.
        """
        cli = ['--output_path', 'data/output',
               '--reads', 'data/reads/test_1a.fq.gz',
               'data/reads/test_2a.fq.gz']
        if sampling:
            cli.extend(['--sample_reads', '--coverage', '10',
                        '--genome_len', '1000'])
        parsed = parse_args(cli, exe_name(), '')
        return Reads(parsed)

    def test_split(self):
        """``_split_len_overlap`` with length 10 and overlap 0."""
        reader = self.setup_long_reads()
        sequence = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGT'
        wanted = ['ACGTTTTTTG', 'GAAGAGTTAG', 'AGATTTTTAG', 'AGAGGAGGGG',
                  'GAGGAGGGGT']
        self.assertEqual(wanted, reader._split_len_overlap(sequence, 10, 0))

    def test_splitOverlap(self):
        """``_split_len_overlap`` with length 10 and overlap 5."""
        reader = self.setup_long_reads()
        sequence = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGTTT'
        wanted = ['ACGTTTTTTG', 'TTTTGGAAGA', 'GAAGAGTTAG', 'GTTAGAGATT',
                  'AGATTTTTAG', 'TTTAGAGAGG', 'AGAGGAGGGG', 'GGAGGGGTTT']
        self.assertEqual(wanted, reader._split_len_overlap(sequence, 10, 5))

    def test_get_4_line_fastq_string(self):
        """``_get_4_line_fastq_string`` renders the four-line FASTQ record."""
        reader = self.setup_long_reads()
        wanted = ('@SRR00001 length=16\n'
                  'ACGTTTGGGAAGGTTT\n'
                  '+SRR00001 length=16\n'
                  '????????????????\n')
        rendered = reader._get_4_line_fastq_string(
            'SRR00001', 'ACGTTTGGGAAGGTTT', '????????????????', x=0)
        self.assertEqual(rendered, wanted)

    def test_read_num_split(self):
        """``_get_num_reads`` on the single-file read set."""
        reader = self.setup_long_reads(split=True)
        self.assertEqual(reader._get_num_reads('data/reads/test.fq.gz'), 18)

    def test_read_len_split(self):
        """``_get_read_len`` on the single-file read set."""
        reader = self.setup_long_reads(split=True)
        self.assertEqual(
            reader._get_read_len('data/reads/test.fq.gz', 1000), 400)

    def test_read_num_paired(self):
        """``_get_num_reads`` on the first paired read file."""
        reader = self.setup_reads_paired()
        self.assertEqual(
            reader._get_num_reads('data/reads/test_1a.fq.gz'), 1000)

    def test_read_len_paired(self):
        """``_get_read_len`` on the first paired read file."""
        reader = self.setup_reads_paired()
        self.assertEqual(
            reader._get_read_len('data/reads/test_1a.fq.gz', 1000), 151.0)

    def test_read_num_by_coverage_paired(self):
        """``_get_num_reads_by_coverage`` on the first paired read file."""
        reader = self.setup_reads_paired(sampling=True)
        self.assertEqual(
            reader._get_num_reads_by_coverage(
                'data/reads/test_1a.fq.gz', 1000),
            34)

    def test_read_num_by_coverage_split(self):
        """``_get_num_reads_by_coverage`` given a one-element list of files."""
        reader = self.setup_long_reads(split=True)
        self.assertEqual(
            reader._get_num_reads_by_coverage(
                ['data/reads/test.fq.gz'], 1000),
            25)

    def test_read_vec_paired(self):
        """``_get_vector_random_reads`` returns 34 entries for the paired file."""
        reader = self.setup_reads_paired(sampling=True)
        picked = reader._get_vector_random_reads('data/reads/test_1a.fq.gz')
        self.assertEqual(len(picked), 34)


if __name__ == "__main__":
    unittest.main()
106 |
--------------------------------------------------------------------------------
/tests/test_use.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import argparse
3 | import warnings
4 | warnings.filterwarnings('ignore')
5 | from read2tree.Progress import Progress
6 | from read2tree.stats.Coverage import Coverage
7 | from read2tree.stats.SeqCompleteness import SeqCompleteness
8 | import os
9 |
class Use(unittest.TestCase):
    """Placeholder usage tests; none of them is implemented yet.

    The original methods had no body at all, which is a syntax error and
    made the whole module unimportable. Each one now has a body and is
    marked as skipped so the test run reports it explicitly.
    """

    @unittest.skip('not implemented yet')
    def test_OGSet(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_write_progress(self):
        """Placeholder: not implemented yet."""

    @unittest.skip('not implemented yet')
    def test_read_progress(self):
        """Placeholder: not implemented yet."""


if __name__ == "__main__":
    unittest.main()
21 |
--------------------------------------------------------------------------------