├── .github └── workflows │ └── docker-image.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── archive ├── .idea │ ├── encodings.xml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── libraries │ │ └── R_User_Library.xml │ ├── misc.xml │ ├── modules.xml │ ├── other.xml │ ├── pore2tree.iml │ └── vcs.xml ├── TODO ├── down_py_script.sh ├── r2t_py_script.sh ├── requirements.txt ├── rm_py_script.sh ├── scripts │ ├── SraRunTable.txt │ ├── adjust_mapping_names.py │ ├── clean_fasta_cdna_cds.py │ ├── concat_alignments.py │ ├── down_py_script.sh │ ├── from_assemblies.py │ ├── get_alignment_similarity.py │ ├── get_computation_progress.py │ ├── get_highly_conserved_og_dna_hdf5.py │ ├── get_og_from_readmapping.py │ ├── get_reconstructed_seq_by_species.py │ ├── get_seq_completeness.py │ ├── get_topological_likelihoods.py │ ├── lsf_submit.py │ ├── lsf_submit_mouse.py │ ├── lsf_submit_paired.py │ ├── map2align.py │ ├── map2align_test.py │ ├── monitor_folder_size.py │ ├── orthogroups_fasta_to_marker_genes.py │ ├── orthogroups_fasta_to_marker_genes_by_groups.py │ ├── protein_converter.py │ ├── r2t_py_script.sh │ ├── relabel_msa.py │ ├── remove_species_from_alignment.py │ ├── rm_py_script.sh │ ├── sample_from_reads.py │ ├── sample_reads.py │ ├── sge_submit.py │ ├── sge_submit_paired.py │ ├── sge_submit_paired_comic.py │ ├── slurm_submit.py │ ├── subsample_nextstrain_covid_genomes_with_sra_accession.py │ ├── treecl │ │ └── select_alignments.py │ └── trim_alignment.py ├── set_marker_genes │ ├── bacteria_markergenes.zip │ └── mammalia_markergenes.zip ├── tests │ ├── info.log │ ├── input.log │ ├── test_aligner.py │ ├── test_og.py │ ├── test_ogset.py │ ├── test_reads.py │ ├── test_seqCompleteness.py │ └── test_use.py └── wiki_images │ ├── covid1.jpg │ ├── covid2.jpg │ ├── figure1.jpg │ ├── figure_1sp.jpg │ ├── oma_page_0.png │ ├── oma_page_1.png │ ├── oma_page_2.png │ ├── oma_page_3.png │ ├── oma_page_4.png │ ├── oma_page_5.png │ ├── oma_page_6.png │ ├── 
oma_page_7.png │ └── oma_page_8.png ├── bin └── read2tree ├── environment.yml ├── read2tree ├── Aligner.py ├── Analyzer.py ├── FastxReader.py ├── GuidedAssembler.py ├── Mapper.py ├── MultiProcessingLog.py ├── OGSet.py ├── Progress.py ├── Reads.py ├── ReferenceSet.py ├── TreeInference.py ├── __init__.py ├── _utils.py ├── file_utils │ ├── __init__.py │ └── context_managers.py ├── logging │ ├── log.yaml │ └── log.yaml.bak ├── main.py ├── parser │ ├── OMAOutputParser.py │ └── __init__.py ├── stats │ ├── Coverage.py │ ├── SeqCompleteness.py │ └── __init__.py ├── utils │ ├── __init__.py │ └── seq_utils.py └── wrappers │ ├── __init__.py │ ├── abstract_cli.py │ ├── aligners │ ├── __init__.py │ ├── base_aligner.py │ ├── mafft.py │ ├── muscle.py │ ├── probcons.py │ └── prographmsa.py │ ├── options.py │ ├── read_mappers │ ├── __init__.py │ ├── base_mapper.py │ ├── ngm.py │ ├── ngmlr.py │ └── parser.py │ └── treebuilders │ ├── __init__.py │ ├── base_treebuilder.py │ ├── fasttree.py │ ├── iqtree.py │ ├── parsers.py │ ├── phyml.py │ ├── raxml.py │ └── src │ └── pip-delete-this-directory.txt ├── setup.py ├── src └── pip-delete-this-directory.txt └── tests ├── dna_ref.fa ├── marker_genes ├── OMAGroup_1001241.fa ├── OMAGroup_1008242.fa ├── OMAGroup_1065415.fa ├── OMAGroup_1121053.fa ├── OMAGroup_1125645.fa ├── OMAGroup_1133018.fa ├── OMAGroup_1151179.fa ├── OMAGroup_1163384.fa ├── OMAGroup_1171372.fa ├── OMAGroup_1188079.fa ├── OMAGroup_649157.fa ├── OMAGroup_649216.fa ├── OMAGroup_671579.fa ├── OMAGroup_681083.fa ├── OMAGroup_681195.fa ├── OMAGroup_683078.fa ├── OMAGroup_894224.fa ├── OMAGroup_898327.fa ├── OMAGroup_944789.fa └── OMAGroup_974829.fa ├── sample_1.fastq ├── sample_2.fastq ├── test_aligner.py ├── test_og.py ├── test_ogset.py ├── test_reads.py ├── test_seqCompleteness.py └── test_use.py /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ 
main ] 6 | pull_request: 7 | branches: [ main ] 8 | release: 9 | type: [published] 10 | 11 | env: 12 | TEST_TAG: dessimozlab/read2tree:test 13 | 14 | jobs: 15 | 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - 22 | name: Checkout 23 | uses: actions/checkout@v2 24 | with: 25 | submodules: recursive 26 | 27 | - 28 | name: Docker meta 29 | id: meta 30 | uses: docker/metadata-action@v4 31 | with: 32 | # list of Docker images to use as base name for tags 33 | images: | 34 | dessimozlab/read2tree 35 | # generate Docker tags based on the following events/attributes 36 | tags: | 37 | type=schedule 38 | type=ref,event=branch 39 | type=ref,event=pr 40 | type=semver,pattern={{version}} 41 | type=semver,pattern={{major}}.{{minor}} 42 | type=semver,pattern={{major}} 43 | type=sha 44 | - 45 | name: Set up QEMU 46 | uses: docker/setup-qemu-action@v2 47 | - 48 | name: Set up Docker Buildx 49 | uses: docker/setup-buildx-action@v2 50 | 51 | - 52 | name: Build and export to docker for testing 53 | uses: docker/build-push-action@v3 54 | with: 55 | context: . 56 | load: true 57 | tags: ${{ env.TEST_TAG }} 58 | - 59 | name: Test 60 | run: | 61 | docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/dna_ref.fa --reads /reads/sample_1.fastq --output_path /out 62 | if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi 63 | - 64 | name: Login to DockerHub 65 | if: github.event_name != 'pull_request' && github.event_name != 'push' 66 | uses: docker/login-action@v2 67 | with: 68 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 69 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 70 | 71 | - 72 | name: Build and push 73 | uses: docker/build-push-action@v3 74 | with: 75 | context: . 
76 | platforms: linux/amd64 77 | push: ${{ github.event_name != 'push' && github.event_name != 'pull_request' }} 78 | tags: ${{ steps.meta.outputs.tags }} 79 | labels: ${{ steps.meta.outputs.labels }} 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | build 3 | dist 4 | read2tree.egg-info 5 | docs/* 6 | tmp/* 7 | .Rhistory 8 | .RData 9 | *pyc 10 | tmp 11 | **/.ropeproject 12 | **/__pycache__ 13 | **/.ipynb_checkpoints 14 | .idea/workspace.xml 15 | .python-version 16 | .DS_Store 17 | tests/output 18 | tests/mplog.log 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # base image 2 | FROM continuumio/miniconda3 3 | 4 | LABEL software="read2tree" 5 | 6 | 7 | WORKDIR /app 8 | 9 | # Create the environment: 10 | COPY environment.yml . 11 | 12 | RUN apt-get -qq update \ 13 | && apt-get install -y --no-install-recommends \ 14 | wget \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | RUN conda env create -f environment.yml 18 | 19 | # Make RUN commands use the new environment: 20 | SHELL ["conda", "run", "-n", "read2tree_env", "/bin/bash", "-c"] 21 | 22 | # Make sure the environment is activated: 23 | RUN echo "Make sure numpy is installed:" \ 24 | && python -c "import numpy" \ 25 | && python -c "import ete3" \ 26 | && python -c "import pysam" 27 | 28 | COPY . . 
29 | RUN python setup.py install 30 | 31 | ENV PATH /app/read2tree/bin:/opt/conda/envs/read2tree_env/bin:$PATH 32 | 33 | WORKDIR /run 34 | 35 | ENTRYPOINT ["read2tree"] 36 | 37 | CMD ["-h"] 38 | 39 | 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 webfucktory 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /archive/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /archive/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /archive/.idea/libraries/R_User_Library.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /archive/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /archive/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /archive/.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /archive/.idea/pore2tree.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | 14 | 16 | -------------------------------------------------------------------------------- /archive/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /archive/TODO: -------------------------------------------------------------------------------- 1 | added here something to test 
windows bash 2 | * add better error handling when mapping doesn't work 3 | * tables needs hdf5 dependencies 4 | * some git repositories are difficult to access 5 | * each time the mapping function finishes it should check whether the currently running mapping is the 6 | last one and if so then once this is done it should delete all unnecessary files 7 | -------------------------------------------------------------------------------- /archive/down_py_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.o%J 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.e%J 4 | #BSUB -u david.dylus@unil.ch 5 | #BSUB -J down_GLYSP 6 | #BSUB -n 4 7 | #BSUB -R "span[ptile=4]" 8 | #BSUB -R "rusage[mem=4000]" 9 | #BSUB -M 4000000 10 | srr=SRR3115005 11 | speciesid=GLYSP 12 | module add Utility/aspera_connect/3.7.4.147727 13 | source activate r2t 14 | mkdir /scratch/beegfs/weekly/ddylus/avian/reads/$speciesid 15 | echo 'Created read $speciesid' 16 | cd /scratch/beegfs/weekly/ddylus/avian/reads/$speciesid 17 | ascp -v -QT -k1 -l100M -i /software/Utility/aspera_connect/3.7.4.147727/etc/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/${srr:0:6}/$srr/$srr.sra ./ 18 | echo 'Finished download' 19 | parallel-fastq-dump -s *.sra -t 4 -O . --split-files --tmpdir . 
20 | echo 'Finished getting fastq from sra and split files' 21 | mv *\_1.fastq $speciesid\_1.fq 22 | mv *\_2.fastq $speciesid\_2.fq 23 | echo 'Finished moving files' -------------------------------------------------------------------------------- /archive/r2t_py_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.o%J 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/r2t_GLYSP.e%J 4 | #BSUB -u david.dylus@unil.ch 5 | #BSUB -J r2t_GLYSP 6 | #BSUB -n 4 7 | #BSUB -R "span[ptile=4]" 8 | #BSUB -R "rusage[mem=10000]" 9 | #BSUB -M 10000000 10 | source activate r2t 11 | reads=/scratch/beegfs/weekly/ddylus/avian/reads/GLYSP 12 | cd /scratch/beegfs/weekly/ddylus/avian/r2t/ 13 | python -W ignore ~/opt/read2tree/bin/read2tree --standalone_path /scratch/beegfs/weekly/ddylus/avian/marker_genes/ --dna_reference /scratch/beegfs/weekly/ddylus/avian/eukaryotes.cdna.fa --reads $reads/GLYSP_1.fq $reads/GLYSP_2.fq --output_path /scratch/beegfs/weekly/ddylus/avian/r2t/ --single_mapping /scratch/beegfs/weekly/ddylus/avian/r2t/02_ref_dna/MELGA_OGs.fa --threads 4 --min_species 8 -------------------------------------------------------------------------------- /archive/requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | numpy>=1.13.3 3 | scipy 4 | lxml 5 | pandas 6 | Cython 7 | ete3>=3.0.0b35 8 | pyparsing>=2.1.10 9 | pysam>=0.11.2.2 10 | six>=1.10.0 11 | requests>=2.13.0 12 | dendropy>=4.3.0 13 | tqdm>=4.19.1 14 | pyham 15 | pyyaml 16 | multiprocessing_logging 17 | -------------------------------------------------------------------------------- /archive/rm_py_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/rm_GLYSP.o%J 3 | #BSUB -e 
"""Rename read-mapping outputs to the ``<species>_OGs_*`` naming scheme.

Script: archive/scripts/adjust_mapping_names.py
"""
import getopt
import glob
import os
import shutil
import sys

_USAGE = 'adjust_mapping_names.py -m <mapping_folder>'


def _rename_to_og_scheme(mapping_folder, pattern, suffix):
    """Move files matching *pattern* to ``<species><suffix>`` inside the folder.

    Files whose base name already contains ``_OGs`` are left alone.  The
    species name is everything before the first underscore of the base name.
    """
    for path in glob.glob(os.path.join(mapping_folder, pattern)):
        base_name = os.path.basename(path)
        if "_OGs" in base_name:
            continue
        species_name = base_name.split("_")[0]
        shutil.move(path, os.path.join(mapping_folder, species_name + suffix))


def main():
    """Parse the command line and rename consensus / coverage files.

    Options:
        -m, --mapping_folder    folder holding ``*.fa`` and ``*cov.txt`` files
        -r, --reference_folder  accepted but not used by this script
        -h                      print usage and exit

    Exits with status 2 when the command line cannot be parsed.
    """
    try:
        opts, _args = getopt.getopt(
            sys.argv[1:], "m:r:h", ["mapping_folder=", "reference_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(_USAGE)
        sys.exit(2)

    mapping_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
        elif opt in ("-r", "--reference_folder"):
            # Declared in the getopt spec, so it must not abort the run;
            # this script has no use for the value.
            continue
        else:
            raise AssertionError("unhandled option")

    # An empty or missing folder means there is nothing to rename.
    if mapping_folder:
        _rename_to_og_scheme(mapping_folder, "*.fa", "_OGs_consensus.fa")
        _rename_to_og_scheme(mapping_folder, "*cov.txt", "_OGs_cov.txt")


if __name__ == "__main__":
    main()
"""Clean cDNA/CDS FASTA files and translate them to protein.

Script: archive/scripts/clean_fasta_cdna_cds.py
"""
import os
import sys


def _parse_fasta(path):
    """Return every record of the FASTA file at *path* as a list."""
    # Imported here so the module can be loaded without Biopython installed.
    from Bio import SeqIO
    return list(SeqIO.parse(path, "fasta"))


def read_fasta_files(input_folder, format_input="fna"):
    """Read every ``*.<format_input>`` FASTA file found in *input_folder*.

    Args:
        input_folder: directory to scan; a trailing separator is optional.
        format_input: file extension (without the dot) of the files to read.

    Returns:
        ``(file_names, records_all)`` where ``records_all[i]`` is the list of
        records parsed from ``file_names[i]``.  Files are visited in sorted
        order so the result does not depend on directory listing order.
    """
    file_names = []
    records_all = []
    for file in sorted(os.listdir(input_folder)):
        path = os.path.join(input_folder, file)
        if file.split(".")[-1] == format_input:
            file_names.append(file)
            records_all.append(_parse_fasta(path))
        else:
            print("we are not reading the file " + path
                  + " since extension is not " + format_input + ".")
    if records_all:
        print("there are ", len(file_names), format_input,
              " files, and the first file has ", len(records_all[0]),
              "sequences in it.")
    else:
        print("there is no " + format_input, " files in ", input_folder)
    return file_names, records_all


def create_five_letter(file_names,
                       output_five_letter_tsv="clean_five_letter_species.tsv"):
    """Assign a five-character code (``s0000``, ``s0001``, ...) to each file.

    The mapping is also written to *output_five_letter_tsv* as
    ``<file name><TAB><code>`` lines.

    Returns:
        dict mapping each file name to its code.
    """
    fiveLetter_species_dic = {}
    for countr, file_name in enumerate(file_names):
        fiveLetter_species_dic[file_name] = "s" + str(countr).zfill(4)
    with open(output_five_letter_tsv, "w") as file_out:
        for species_name, fiveLetter in fiveLetter_species_dic.items():
            file_out.write(species_name + "\t" + fiveLetter + "\n")
    print("the five letter codes for each faa files are written in "
          + output_five_letter_tsv)
    return fiveLetter_species_dic


def clean_translate(records, species_fivelet):
    """Pad, rename and translate nucleotide records of one species.

    Each sequence is padded with ``N`` to a multiple of three (the padded
    sequence is also stored back on the input record), its identifier is
    stripped of ``_`` and ``.`` and prefixed with *species_fivelet*, and it is
    translated to protein.

    Returns:
        ``(records_nuc, records_aa)``: new nucleotide and protein records.
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    records_nuc = []
    records_aa = []
    for record in records:
        sequence = record.seq
        remainder = len(sequence) % 3
        if remainder != 0:
            sequence += Seq('N' * (3 - remainder))
            record.seq = sequence

        id_old = str(record.id).replace("_", "").replace(".", "")
        id_new = species_fivelet + id_old

        records_nuc.append(SeqRecord(
            sequence, id=id_new, description="cleaned for r2t", name=id_new))
        records_aa.append(SeqRecord(
            sequence.translate(), id=id_new, description="cleaned for r2t",
            name=id_new))

    print("the clean aa and nuc for " + species_fivelet + " is ready")
    return records_nuc, records_aa


def main():
    """Clean every ``.fna`` file of the folder given as first argument.

    Writes one protein FASTA per species into ``clean_aa/``, all cleaned
    nucleotide records into ``dna_ref.fa`` and the species code table into
    ``clean_five_letter_species.tsv`` (all relative to the working directory).
    """
    if len(sys.argv) < 2:
        print('clean_fasta_cdna_cds.py <input_folder_fna>')
        sys.exit(2)

    from Bio import SeqIO

    input_folder_fna = sys.argv[1] + "/"

    file_names, records_all = read_fasta_files(input_folder_fna, "fna")
    fiveLetter_species_dic = create_five_letter(file_names)

    folder_aa = "clean_aa"
    if not os.path.exists(folder_aa):
        os.makedirs(folder_aa)
    else:
        # Existing content is kept; the run only warns and carries on.
        print("ERROR the folder exists " + folder_aa + " better to remove it ")

    records_nuc_all_clean = []
    for file_name, records in zip(file_names, records_all):
        species_fivelet = fiveLetter_species_dic[file_name]
        records_nuc, records_aa = clean_translate(records, species_fivelet)
        SeqIO.write(records_aa, folder_aa + "/" + species_fivelet + ".fa",
                    "fasta")
        records_nuc_all_clean += records_nuc  # one big list

    SeqIO.write(records_nuc_all_clean, "dna_ref.fa", "fasta")

    print("we wrote " + str(len(file_names)) + " faa fiels in the folder "
          + folder_aa
          + " and the nucluetide sequences all together in dna_ref.fa")
    print("Now you can use the folder with OMA standalone")


if __name__ == '__main__':
    main()
"""Concatenate per-orthologous-group alignments into one alignment.

Script: archive/scripts/concat_alignments.py
"""
import getopt
import glob
import os
import sys

_USAGE = 'concat_alignments.py -f <folder> -m <min_taxa> -o <out_file>'


def _read_alignment(path):
    """Read *path* as relaxed PHYLIP, falling back to FASTA on ValueError."""
    # Imported here so the module can be loaded without Biopython installed.
    from Bio import AlignIO
    try:
        return AlignIO.read(path, "phylip-relaxed")
    except ValueError:
        return AlignIO.read(path, "fasta")


def concatenate_alignments(folder, min_taxa=0):
    """Concatenate all non-empty ``*.phy`` alignments found in *folder*.

    Args:
        folder: directory containing the alignments.
        min_taxa: alignments with fewer rows than this are skipped.

    Returns:
        The concatenated alignment, or ``None`` when no alignment qualified.
    """
    all_og_alignments = []
    for f in glob.glob(os.path.join(folder, '*.phy')):
        if os.path.getsize(f) == 0:
            continue
        msa = _read_alignment(f)
        if len(msa) >= min_taxa:
            print(f)
            all_og_alignments.append(msa)

    print('OGs used: {}!'.format(len(all_og_alignments)))
    if not all_og_alignments:
        # Nothing to concatenate; main() already treats None as "no output".
        return None

    from zoo.seq_utils.utils import concatenate
    return concatenate(all_og_alignments)


def main():
    """Command-line entry point.

    Options:
        -f, --folder    folder with ``*.phy`` alignments (required)
        -o, --out_file  output file (required); when ``-m`` is positive,
                        ``_<min_taxa>.phy`` is appended to the name
        -m, --min_taxa  minimum number of rows per alignment (default 0)
        -h              print usage and exit

    Exits with status 2 on a malformed or incomplete command line.
    """
    try:
        opts, _args = getopt.getopt(
            sys.argv[1:], "f:m:o:h", ["folder=", "min_taxa=", "out_file="])
    except getopt.GetoptError as e:
        print(str(e))
        print(_USAGE)
        sys.exit(2)

    seq_folder = None
    out_file = None
    min_taxa = 0

    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            sys.exit()
        elif opt in ("-f", "--folder"):
            seq_folder = arg
        elif opt in ("-o", "--out_file"):
            out_file = arg
        elif opt in ("-m", "--min_taxa"):
            min_taxa = int(arg)
        else:
            raise AssertionError("unhandled option")

    if not seq_folder or not out_file:
        print(_USAGE)
        sys.exit(2)

    if min_taxa > 0:
        out_file = out_file + "_" + str(min_taxa) + ".phy"

    alignment = concatenate_alignments(seq_folder, min_taxa=min_taxa)
    if alignment is not None:
        from Bio import AlignIO
        with open(out_file, "w") as align_output:
            AlignIO.write(alignment, align_output, "phylip-relaxed")


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /archive/scripts/down_py_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #BSUB -o /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.o%J 3 | #BSUB -e /scratch/beegfs/weekly/ddylus/avian/lsf_out/down_GLYSP.e%J 4 | #BSUB -u david.dylus@unil.ch 5 | #BSUB -J down_GLYSP 6 | #BSUB -n 1 7 | #BSUB -R "span[ptile=1]" 8 | #BSUB -R "rusage[mem=2000]" 9 | #BSUB -M 2000000 10 | srr=SRR3115005 11 | speciesid=GLYSP 12 | module add Utility/aspera_connect/3.7.4.147727 13 | module add UHTS/Analysis/sratoolkit/2.8.2.1 14 | source activate r2t 15 | mkdir /scratch/beegfs/weekly/ddylus/avian/reads/$speciesid 16 | echo 'Created read $speciesid' 17 | cd /scratch/beegfs/weekly/ddylus/avian/reads/$speciesid 18 | ascp -v -QT -k1 -l100M -i /software/Utility/aspera_connect/3.7.4.147727/etc/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/${srr:0:6}/$srr/$srr.sra ./ 19 | echo 'Finished download' 20 | fastq-dump --split-files --gzip $srr.sra 21 | echo 'Finished getting fastq from sra and split files' 22 | mv *\_1.* $speciesid\_1.fq.gz 23 | mv *\_2.* $speciesid\_2.fq.gz 24 | echo 'Finished moving files' -------------------------------------------------------------------------------- /archive/scripts/get_highly_conserved_og_dna_hdf5.py: -------------------------------------------------------------------------------- 1 | from tables import * 2 | from Bio import SeqIO 3 | from Bio.Seq import Seq 4 | from Bio.SeqRecord import SeqRecord 5 | from pyoma.browser import db 6 | import familyanalyzer as fa 7 | 8 | # parameters 9 | MIN_SPECIES = 20 10 | DUP_RATIO = 0 11 | DIR = '/Users/daviddylus/Research/read2tree/reference_datasets/Dataset1/Output/' 12 | 13 | # read in files 14 | hog_XML = DIR+'HierarchicalGroups.orthoxml' 15 | og_XML = DIR+'OrthologousGroups.orthoxml' 16 | h5file = 
open_file("/Volumes/Untitled/OmaServer.h5", mode="r") 17 | 18 | genomeTab = h5file.root.Genome 19 | dbObj = db.Database(h5file) 20 | omaIdObj = db.OmaIdMapper(dbObj) 21 | 22 | if DUP_RATIO != 0: 23 | hog_op = fa.OrthoXMLParser(hog_XML) 24 | gene_family_xml_nodes_hog = hog_op.getToplevelGroups() 25 | # select all the families with more than X species and duplication ratio smaller than Y 26 | hog_families_X = {} 27 | for i, family in enumerate(gene_family_xml_nodes_hog): 28 | family_id = family.get('id') 29 | genes_per_hog = [val for sublist in hog_op.getGenesPerSpeciesInFam(family).values() for val in sublist] 30 | species_per_hog = hog_op.getGenesPerSpeciesInFam(family).keys() 31 | duplication_ratio = float(len(genes_per_hog)) / float(len(species_per_hog)) 32 | if len(species_per_hog) >= MIN_SPECIES and duplication_ratio <= DUP_RATIO: 33 | hog_families_X[family_id] = genes_per_hog 34 | 35 | print(len(hog_families_X)) 36 | 37 | 38 | og_op = fa.OrthoXMLParser(og_XML) 39 | gene_family_xml_nodes_og = og_op.getToplevelGroups() 40 | og_families_X = {} 41 | for i, family in enumerate(gene_family_xml_nodes_og): 42 | family_id = family.get('id') 43 | genes_per_og = [val for sublist in og_op.getGenesPerSpeciesInFam(family).values() for val in sublist] 44 | species_per_og = og_op.getGenesPerSpeciesInFam(family).keys() 45 | if len(species_per_og) >= MIN_SPECIES: 46 | og_families_X[family_id] = genes_per_og 47 | 48 | print(len(og_families_X)) 49 | 50 | if DUP_RATIO != 0: 51 | family_map = {} 52 | entries_map_omaids = {} 53 | cpt = 0 54 | for og in og_families_X: 55 | cpt += 1 56 | if cpt % 10 == 0: 57 | print("{} on {}".format(cpt, len(og_families_X))) 58 | a = og_families_X[og] 59 | for hog in hog_families_X: 60 | b = hog_families_X[hog] 61 | if len(set(a).intersection(b)) == 30: 62 | oma_ids_full = [og_op.mapGeneToXRef(val, 'protId') for val in og_families_X[og]] 63 | oma_ids = [og_op.mapGeneToXRef(val, 'protId').split(' | ')[0] for val in og_families_X[og]] 64 | entries = 
[omaIdObj.omaid_to_entry_nr(val) for val in oma_ids] 65 | for oma_id in oma_ids_full: 66 | entries_map_omaids[omaIdObj.omaid_to_entry_nr(oma_id.split(' | ')[0])] = oma_id 67 | family_map[og] = entries 68 | break 69 | print(len(entries_map_omaids)) 70 | else: 71 | family_map = {} 72 | entries_map_omaids = {} 73 | cpt = 0 74 | for og in og_families_X: 75 | cpt += 1 76 | if cpt % 1000 == 0: 77 | print(og_op.mapGeneToXRef(og_families_X[og][0], 'protId').split(' | ')[0]) 78 | print("{} on {}".format(cpt, len(og_families_X))) 79 | oma_ids_full = [og_op.mapGeneToXRef(val, 'protId') for val in og_families_X[og] if og_op.mapGeneToXRef(val, 'protId')] 80 | oma_ids = [val.split(' | ')[0] for val in oma_ids_full] 81 | entries = [omaIdObj.omaid_to_entry_nr(val) for val in oma_ids if omaIdObj.omaid_to_entry_nr(val)] 82 | print(entries) 83 | for oma_id in oma_ids_full: 84 | entries_map_omaids[omaIdObj.omaid_to_entry_nr(oma_id.split(' | ')[0])] = oma_id 85 | family_map[og] = entries 86 | print(len(entries_map_omaids)) 87 | 88 | 89 | family_map_invert = {} 90 | for key in family_map: 91 | for val in family_map[key]: 92 | family_map_invert[val]=key 93 | 94 | print(len(family_map_invert)) 95 | 96 | records = [] 97 | for key in family_map_invert: 98 | new_id = entries_map_omaids[key] + '| OG' + family_map_invert[key] 99 | record = SeqRecord(Seq(dbObj.get_cdna(key)), id=new_id, description="") 100 | records.append(record) 101 | 102 | with open("dataset2.fasta", "w") as output_handle: 103 | SeqIO.write(records, output_handle, "fasta") 104 | 105 | -------------------------------------------------------------------------------- /archive/scripts/get_og_from_readmapping.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import getopt 3 | from Bio import SeqIO 4 | from Bio.SeqIO import FastaIO 5 | 6 | def get_ogs(mapped_reads, og_data): 7 | og_data_names = [record.name for record in og_data] 8 | og_data_ogs = [record.description.split("| 
")[-1] for record in og_data] 9 | list_of_ogs = {} 10 | for record in mapped_reads: 11 | if record.name in og_data_names: 12 | og_index = og_data_names.index(record.name) 13 | og_name = og_data[og_index].description.split("| ")[-1] 14 | indices = [i for i, x in enumerate(og_data_ogs) if x == og_name] 15 | seq_to_write = [og_data[i] for i in indices] 16 | record.seq = record.seq.upper() 17 | record.id = "SRR400661_" + record.id 18 | record.name = "SRR400661_" + record.name 19 | record.description = "SRR400661_" + record.description 20 | seq_to_write.append(record) 21 | list_of_ogs[og_name] = seq_to_write 22 | return list_of_ogs 23 | 24 | 25 | 26 | def main(): 27 | 28 | try: 29 | opts, args = getopt.getopt(sys.argv[1:], "r:d:o:", ["mapped_reads=", "ref_data=", "out_folder="]) 30 | except getopt.GetoptError as e: 31 | print(str(e)) 32 | print('concat_alignments.py -r -d -o ') 33 | sys.exit(2) 34 | 35 | mapped_reads = None 36 | ref_data = None 37 | out_folder = None 38 | 39 | for opt, arg in opts: 40 | if opt == '-h': 41 | print('concat_alignments.py -r -d -o ') 42 | sys.exit() 43 | elif opt in ("-r", "--reads"): 44 | mapped_reads = arg 45 | elif opt in ("-d", "--ref_data"): 46 | ref_data = arg 47 | elif opt in ("-o", "--out_folder"): 48 | out_folder = arg 49 | else: 50 | assert False, "unhandled option" 51 | 52 | read_mappings = list(SeqIO.parse(mapped_reads, "fasta")) 53 | og_data = list(SeqIO.parse(ref_data, "fasta")) 54 | 55 | if out_folder[-1] is not "/": 56 | out_folder += "/" 57 | 58 | list_of_ogs = get_ogs(read_mappings, og_data) 59 | if list_of_ogs is not None: 60 | for og in list_of_ogs: 61 | file_name = out_folder + og + ".fasta" 62 | fasta_out = FastaIO.FastaWriter(open(file_name, "w"), wrap=None) 63 | fasta_out.write_file(list_of_ogs[og]) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() -------------------------------------------------------------------------------- /archive/scripts/get_reconstructed_seq_by_species.py: 
"""Write the reconstructed sequences selected by a completeness table.

Script: archive/scripts/get_reconstructed_seq_by_species.py
"""
import getopt
import glob
import os
import sys

import pandas as pd

_USAGE = ('get_reconstructed_seq_by_species.py -m <mapping_folder> '
          '-s <sc_file>')


def read_seq_records(folder):
    """Index the records of every ``*.fa`` file in *folder*.

    Returns:
        dict mapping the species name (file base name up to the first
        underscore) to a ``{record id: record}`` dict.
    """
    # Imported here so the module can be loaded without Biopython installed.
    from Bio import SeqIO

    out_dic = {}
    for file in glob.glob(os.path.join(folder, "*.fa")):
        sp_name = os.path.basename(file).split("_")[0]
        out_dic[sp_name] = {rec.id: rec for rec in SeqIO.parse(file, "fasta")}
    return out_dic


def read_sc_file(file):
    """Return ``<gene_id>_<og>_<og>`` for every row of the CSV *file*.

    The file must provide ``gene_id`` and ``og`` columns holding strings.
    """
    table = pd.read_csv(file)
    return [gene_id + "_" + og + "_" + og
            for gene_id, og in zip(table['gene_id'], table['og'])]


def main():
    """Command-line entry point.

    Options:
        -m, --mapping_folder  folder with the reconstructed ``*.fa`` files
        -s, --sc_file         CSV file listing the sequences to extract
        -h                    print usage and exit

    The selected records are written unwrapped to ``<name>_consensus.fa`` in
    the working directory, where ``<name>`` is the part of the mapping folder
    path following ``03_mapping_``.  Exits with status 2 on a malformed or
    incomplete command line.
    """
    try:
        opts, _args = getopt.getopt(
            sys.argv[1:], "m:s:h", ["mapping_folder=", "sc_file="])
    except getopt.GetoptError as e:
        print(str(e))
        print(_USAGE)
        sys.exit(2)

    mapping_folder = None
    sc_file = None

    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
        elif opt in ("-s", "--sc_file"):
            sc_file = arg
        else:
            raise AssertionError("unhandled option")

    if not mapping_folder or not sc_file:
        print(_USAGE)
        sys.exit(2)
    if not mapping_folder.endswith("/"):
        mapping_folder += "/"

    from Bio.SeqIO.FastaIO import FastaWriter

    all_records = read_seq_records(mapping_folder)
    # The first five characters of each identifier are used as the species key.
    selected_seq = [all_records[idx[0:5]][idx]
                    for idx in read_sc_file(sc_file)]
    print(selected_seq)
    file_name = (mapping_folder.split("03_mapping_")[-1].split("/")[0]
                 + "_consensus.fa")
    with open(file_name, "w") as handleF:
        FastaWriter(handleF, wrap=None).write_file(selected_seq)


if __name__ == "__main__":
    main()
def read_seq_records(file):
    """Return all records of the FASTA file *file* as a list."""
    return list(SeqIO.parse(file, "fasta"))


def main():
    """Command line entry point: compute sequence completeness per reference file.

    For every ``*.fa`` file in ``-r/--reference_folder`` the matching
    ``<basename>_consensus.fa`` is looked up in ``-m/--mapping_folder``; when it
    exists, a ``SeqCompleteness`` object built from the reference records is
    fed the mapped records and its result is written to
    ``<species>_OGs_sc.txt`` inside the mapping folder.  Exits with status 2 on
    a usage error.
    """
    usage = 'get_seq_completeness.py -m <mapping_folder> -r <reference_folder>'

    try:
        opts, _ = getopt.getopt(sys.argv[1:], "m:r:h", ["mapping_folder=", "reference_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    reference_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            mapping_folder = arg
        elif opt in ("-r", "--reference_folder"):
            reference_folder = arg
        else:
            assert False, "unhandled option"

    # Previously a missing option made the script finish silently without
    # doing anything; report it as a usage error instead.
    if not reference_folder or not mapping_folder:
        print(usage)
        sys.exit(2)

    # os.path.join copes with folders given with or without a trailing
    # slash, so no manual (and formerly identity-based) "/" handling is needed.
    for ref_file in glob.glob(os.path.join(reference_folder, "*.fa")):
        base_name = os.path.basename(ref_file)
        species = base_name.split("_")[0]
        print(species)
        ref_records = read_seq_records(ref_file)
        mapping_file = os.path.join(mapping_folder, base_name.split(".")[0] + "_consensus.fa")
        if os.path.exists(mapping_file):
            map_records = read_seq_records(mapping_file)
            seqC = SeqCompleteness(ref_records)
            seqC.get_seq_completeness(map_records)
            seqC.write_seq_completeness(os.path.join(mapping_folder, species + "_OGs_sc.txt"))


if __name__ == "__main__":
    main()
def get_coverage(og):
    """Return the fraction of positions that are not ``'X'`` in the last record of *og*.

    :param og: sequence of records; only ``og[-1].seq`` is inspected
    :return: value between 0 and 1 (0.0 for an empty sequence)
    """
    seq = og[-1].seq
    if len(seq) == 0:
        # Avoid a ZeroDivisionError on an empty sequence.
        return 0.0
    return (len(seq) - seq.count('X')) / len(seq)


def perform_mapping(DIR_MAPPING, FILE_OGS):
    """Collect, per orthologous group, the records of the groups present in *DIR_MAPPING*.

    Records of *FILE_OGS* are grouped by the text after the last ``" | "`` of
    their description (duplicate ids within a group are dropped).  For every
    ``*.fa`` file in *DIR_MAPPING* the group named by the last ``_``-separated
    part of the file name is selected, its record ids are shortened to their
    first five characters and the group is written to
    ``<DIR_MAPPING>origin_og/<group>.fa``.

    :param DIR_MAPPING: mapping folder, must end with a path separator
    :param FILE_OGS: FASTA file with all group members
    :return: dict ``{group name: list of records}``
    :raises KeyError: if a mapping file names a group missing from *FILE_OGS*
    """
    og_dict = {}
    seen_ids = {}
    for record in SeqIO.parse(FILE_OGS, "fasta"):
        key = record.description.split(" | ")[-1]
        ids = seen_ids.setdefault(key, set())
        if record.id not in ids:
            ids.add(record.id)
            og_dict.setdefault(key, []).append(record)

    # parse the mapped reads to ogs to dictionary
    all_dict = {}
    for mapping_file in glob.glob(DIR_MAPPING + "*.fa"):
        # Only look at the file name so that "_" or "." in a parent
        # directory cannot corrupt the group name.
        og_name = os.path.basename(mapping_file).split("_")[-1].split(".")[0]
        og = og_dict[og_name]

        # change ids to species names
        for record in og:
            record.id = record.id[0:5]
        all_dict[og_name] = og

    OG_OUT = DIR_MAPPING + 'origin_og/'
    os.makedirs(OG_OUT, exist_ok=True)

    for key, item in all_dict.items():
        # Close every output file so the data is flushed to disk.
        with open(OG_OUT + key + ".fa", "w") as handle:
            FastaIO.FastaWriter(handle, wrap=None).write_file(item)

    print("FINISHED PARSING OGs!")
    return all_dict


def read_alignments(folder):
    """Read every ``*.phy`` file in *folder* (relaxed PHYLIP) and return the alignments as a list.

    :param folder: folder path, must end with a path separator
    """
    align_list = [AlignIO.read(filename, "phylip-relaxed")
                  for filename in glob.glob(folder + "*.phy")]
    print("FINISHED READING ALIGNMENTS!")
    return align_list


def perform_alignment(all_dict, DIR_MAPPING):
    """Align every group with the Mafft wrapper and store the results as PHYLIP files.

    The wrapper is run with ``--localpair`` and ``--maxiterate 1000``; the
    alignments are written to ``<DIR_MAPPING>origin_align/<group>.phy``.

    :param all_dict: dict ``{group name: list of records}``
    :param DIR_MAPPING: mapping folder, must end with a path separator
    :return: list of the alignments, in iteration order of *all_dict*
    """
    align_dict = {}
    align_list = []
    for counter, (key, value) in enumerate(all_dict.items(), start=1):
        mafft_wrapper = Mafft(value, datatype="PROTEIN")
        mafft_wrapper.options.options['--localpair'].set_value(True)
        mafft_wrapper.options.options['--maxiterate'].set_value(1000)
        alignment = mafft_wrapper()
        align_dict[key] = alignment
        align_list.append(alignment)
        if counter % 50 == 0:
            print('{} of {} alignments done'.format(counter, len(all_dict)))

    ALIGN_OUT = DIR_MAPPING + 'origin_align/'
    os.makedirs(ALIGN_OUT, exist_ok=True)
    print("WRITING ALIGNMENT FILES INTO: {}!".format(ALIGN_OUT))
    for key, value in align_dict.items():
        with open(ALIGN_OUT + key + ".phy", "w") as output_handle:
            AlignIO.write(value, output_handle, "phylip")
    print("FINISHED ALIGNMENTS!")
    return align_list


def concatenate_alignment(align_list, DIR_MAPPING):
    """Concatenate *align_list* and write it to ``<DIR_MAPPING>origin_align/CONCAT.phy``.

    :return: the concatenated alignment
    """
    ALIGN_OUT = DIR_MAPPING + 'origin_align/'
    # The folder does not exist yet when the alignments were read from a
    # separate alignment folder instead of being computed here.
    os.makedirs(ALIGN_OUT, exist_ok=True)
    concat_align = concatenate(align_list)

    with open(ALIGN_OUT + "CONCAT.phy", "w") as output_handle:
        AlignIO.write(concat_align, output_handle, "phylip")
    print("FINISHED CONCATINATION!")
    return concat_align


def build_tree(concat_align, DIR_MAPPING):
    """Infer a tree with the Fasttree wrapper and write it to ``<DIR_MAPPING>original_tree.nwk``.

    :return: the tree as returned by the wrapper
    """
    fasttree_wrapper = Fasttree(concat_align, datatype="PROTEIN")
    tree = fasttree_wrapper()
    print("FINISHED TREE INFERENCE!")
    with open(DIR_MAPPING + "original_tree.nwk", "w") as text_file:
        text_file.write("{};".format(tree))
    print("Resulting tree: {}".format(tree))
    return tree


def main():
    """Command line entry point: group -> align -> concatenate -> tree.

    ``-m/--mapping_folder`` is always required (all output goes there).  With
    ``-a/--alignment_folder`` existing ``*.phy`` alignments are reused,
    otherwise ``-o/--ortholog_file`` is required and the alignments are
    computed.  Exits with status 2 on a usage error.
    """
    usage = 'map2align_test.py -m <mapping_folder> -o <ortholog_file> -a <alignment_folder>'

    try:
        # "alignmnet_folder" is the historical (misspelt) option name; it is
        # still accepted next to the corrected spelling.
        opts, _ = getopt.getopt(
            sys.argv[1:], "m:o:a:h",
            ["mapping_folder=", "ortholog_file=", "alignment_folder=", "alignmnet_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(usage)
        sys.exit(2)

    mapping_folder = None
    ortholog_file = None
    alignment_folder = None

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-m", "--mapping_folder"):
            # os.path.join(arg, "") appends the separator only when missing
            # (the old ``is not "/"`` compared identity, not value).
            mapping_folder = os.path.join(arg, "")
        elif opt in ("-a", "--alignment_folder", "--alignmnet_folder"):
            alignment_folder = os.path.join(arg, "")
        elif opt in ("-o", "--ortholog_file"):
            ortholog_file = arg
        else:
            assert False, "unhandled option"

    if not mapping_folder or (not alignment_folder and not ortholog_file):
        print(usage)
        sys.exit(2)

    if alignment_folder:
        align = read_alignments(alignment_folder)
    else:
        mapping = perform_mapping(mapping_folder, ortholog_file)
        align = perform_alignment(mapping, mapping_folder)

    concatenation = concatenate_alignment(align, mapping_folder)
    build_tree(concatenation, mapping_folder)


if __name__ == "__main__":
    main()
38 | 39 | bjobs_exist = True 40 | folder_size = [] 41 | number_jobs = [] 42 | total_time = [] 43 | current_time = 0 44 | time_interval = 10 45 | 46 | try: 47 | with open('./monitoring.csv', 'a') as file: 48 | file.write('current_time,folder_size,num_bjobs\n') 49 | while True and bjobs_exist: 50 | folder_size.append(du(path)) 51 | number_jobs.append(bjobs()) 52 | total_time.append(current_time) 53 | to_write = str(current_time)+','+str(folder_size[-1])+','+str(number_jobs[-1])+'\n' 54 | file.write(to_write) 55 | current_time += time_interval 56 | time.sleep(time_interval) 57 | # if "No unfinished job found" in output_shell("bjobs"): 58 | # bjobs_exist = False 59 | except KeyboardInterrupt: 60 | #time.sleep(time_interval) 61 | file.close() 62 | # d = {"folder_size": folder_size, "current_time": total_time, "num_bjobs": number_jobs} 63 | # df = pd.DataFrame(d) 64 | # df.to_csv("./monitoring.csv") 65 | raise -------------------------------------------------------------------------------- /archive/scripts/orthogroups_fasta_to_marker_genes.py: -------------------------------------------------------------------------------- 1 | from Bio.SeqIO.FastaIO import FastaWriter 2 | from Bio import SeqIO 3 | import tqdm, os, glob, re 4 | from xml.dom import minidom 5 | 6 | 7 | 8 | def _find_index_substring(ids, search_string, tmp_list): 9 | best_index = None 10 | max_occurence = 0 11 | tmp_ids = [re.sub(r'\..*', '', tmp) for tmp in tmp_list] 12 | use_ids = [re.sub(r'\W+', '', tmp_id) for tmp_id in tmp_ids] 13 | index = [i for i, s in enumerate(ids) if search_string in s] 14 | for i in index: 15 | string_occurence = len([k for k in use_ids if k in ids[i]]) 16 | if string_occurence > max_occurence: 17 | best_index = i 18 | max_occurence = string_occurence 19 | if best_index: 20 | return best_index 21 | else: 22 | return None 23 | 24 | 25 | def _get_all_ids(f_orthoxml): 26 | all_prot_ids = [] 27 | xmldoc = minidom.parse(f_orthoxml) 28 | itemlist = xmldoc.getElementsByTagName('gene') 
29 | print(" --- loading all protids ---") 30 | for s in tqdm.tqdm(itemlist): 31 | tmp = s.attributes['protId'].value 32 | all_prot_ids.append(tmp) 33 | return all_prot_ids 34 | 35 | 36 | def _write(file, value): 37 | """ 38 | Write output to fasta file 39 | :param file: file and location of outputfile 40 | :param value: 41 | :return: 42 | """ 43 | handle = open(file, "w") 44 | writer = FastaWriter(handle, wrap=None) 45 | writer.write_file(value) 46 | handle.close() 47 | 48 | 49 | def _get_species_id(record): 50 | if '[' in record.description and ']' in record.description: 51 | return record.description[record.description.find( 52 | "[")+1:record.description.find("]")] 53 | else: 54 | return record.id[0:5] 55 | 56 | def run(orthogroups_fasta_folder, orthogroups_xml, output_path, min_species): 57 | if not os.path.exists(output_path): 58 | os.makedirs(output_path) 59 | all_prot_ids = _get_all_ids(orthogroups_xml) 60 | for f in tqdm.tqdm(glob.glob(os.path.join(orthogroups_fasta_folder, '*.fa'))): 61 | records = list(SeqIO.parse(f, 'fasta')) 62 | if len(records) >= min_species: 63 | for rec in records: 64 | sp_id = _get_species_id(rec) 65 | tmp_lst = rec.description.split() 66 | if sp_id not in tmp_lst[0]: 67 | tmp = tmp_lst[-2] 68 | tmp_id = re.sub(r'\..*', '', tmp) 69 | use_id = re.sub(r'\W+', '', tmp_id) 70 | new_id = _find_index_substring(all_prot_ids, use_id, tmp_lst) 71 | if new_id: 72 | rec.id = all_prot_ids[new_id] 73 | new_description = rec.description.split()[-1] 74 | rec.description = new_description 75 | rec.name = '' 76 | output_file = os.path.join(output_path, 77 | os.path.basename(f)) 78 | _write(output_file, records) 79 | 80 | 81 | if __name__ == "__main__": 82 | import argparse 83 | parser = argparse.ArgumentParser( 84 | description="""Transform OrthogroupsFasta into marker_genes""") 85 | parser.add_argument('--oxml', default=None, 86 | help='[Default is none] Remove species present ' 87 | 'in data set after mapping step completed to ' 88 | 'build OGs. 
Input is comma separated list ' 89 | 'without spaces, e.g. XXX,YYY,AAA.') 90 | parser.add_argument('--ofolder', default='marker_genes', required=True, 91 | help='[Default is current directory] Path to ' 92 | 'output directory.') 93 | parser.add_argument('--ofasta', default='.', required=True, 94 | help='[Default is current directory] Path to ' 95 | 'output directory.') 96 | parser.add_argument('--min_species', type=int, default=None, 97 | help='Min number of species in selected ' 98 | 'orthologous groups. If not selected it will be ' 99 | 'estimated such that around 1000 OGs ' 100 | 'are available.') 101 | 102 | conf = parser.parse_args() 103 | 104 | run(conf.ofasta, conf.oxml, conf.ofolder, conf.min_species) 105 | -------------------------------------------------------------------------------- /archive/scripts/orthogroups_fasta_to_marker_genes_by_groups.py: -------------------------------------------------------------------------------- 1 | from Bio.SeqIO.FastaIO import FastaWriter 2 | from Bio import SeqIO 3 | import tqdm, os, glob 4 | 5 | def _oma_replace(row): 6 | if 'OMA0000' in row: 7 | return 'OMA0000' 8 | elif 'OMA000' in row: 9 | return 'OMA000' 10 | elif 'OMA00' in row: 11 | return 'OMA00' 12 | elif 'OMA0' in row: 13 | return 'OMA0' 14 | elif 'OMA' in row: 15 | return 'OMA' 16 | 17 | 18 | def _get_all_ids(orthogroups_txt): 19 | with open(orthogroups_txt) as f: 20 | lines = f.readlines() 21 | x = [] 22 | for l in lines: 23 | if '#' not in l: 24 | x.append(l.rstrip("\n").split("\t")) 25 | og_dic = {} 26 | for r in x: 27 | tmp = r[0].replace(_oma_replace(r[0]), 'OG') 28 | r[0] = tmp 29 | og_dic[tmp] = {i[0:5]: i[6:] for i in r[1:]} 30 | return og_dic 31 | 32 | 33 | def _write(file, value): 34 | """ 35 | Write output to fasta file 36 | :param file: file and location of outputfile 37 | :param value: 38 | :return: 39 | """ 40 | handle = open(file, "w") 41 | writer = FastaWriter(handle, wrap=None) 42 | writer.write_file(value) 43 | handle.close() 44 | 45 | 46 | 
def _get_species_id(record): 47 | if '[' in record.description and ']' in record.description: 48 | return record.description[record.description.find( 49 | "[")+1:record.description.find("]")] 50 | else: 51 | return record.id[0:5] 52 | 53 | def run(orthogroups_fasta_folder, og_dic, output_path, min_species): 54 | if not os.path.exists(output_path): 55 | os.makedirs(output_path) 56 | for f in tqdm.tqdm(glob.glob(os.path.join(orthogroups_fasta_folder, '*.fa'))): 57 | new_name_dic = og_dic[os.path.basename(f).split(".")[0]] 58 | records = list(SeqIO.parse(f, 'fasta')) 59 | if len(records) >= min_species: 60 | for rec in records: 61 | sp_id = _get_species_id(rec) 62 | new_id = new_name_dic[sp_id].split()[0] 63 | rec.id = new_id 64 | rec.description = new_name_dic[sp_id].replace(new_id, "") + " [" + sp_id + "]" 65 | output_file = os.path.join(output_path, 66 | os.path.basename(f)) 67 | _write(output_file, records) 68 | 69 | 70 | if __name__ == "__main__": 71 | import argparse 72 | parser = argparse.ArgumentParser( 73 | description="""Transform OrthogroupsFasta into marker_genes""") 74 | parser.add_argument('--ogroups', default=None, 75 | help='[Default is none] Remove species present ' 76 | 'in data set after mapping step completed to ' 77 | 'build OGs. Input is comma separated list ' 78 | 'without spaces, e.g. XXX,YYY,AAA.') 79 | parser.add_argument('--ofolder', default='marker_genes', required=True, 80 | help='[Default is current directory] Path to ' 81 | 'output directory.') 82 | parser.add_argument('--ofasta', default='.', required=True, 83 | help='[Default is current directory] Path to ' 84 | 'output directory.') 85 | parser.add_argument('--min_species', type=int, default=None, 86 | help='Min number of species in selected ' 87 | 'orthologous groups. 
# Get input and output file paths from command-line arguments
# Daniel Paiva Agustinho
def main(argv=None):
    """Translate every record of a nucleotide FASTA file and write the proteins as FASTA.

    :param argv: ``[input_file, output_file, ...]``; defaults to ``sys.argv[1:]``.
        Additional arguments are ignored, as before.
    Exits with an explanatory message when fewer than two paths are given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: protein_converter.py <input_fasta> <output_fasta>")
    input_file, output_file = args[0], args[1]

    with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            # Keep the entire original header on the translated record.
            protein_record = SeqIO.SeqRecord(
                record.seq.translate(), id=record.id, description=record.description
            )
            SeqIO.write(protein_record, output_handle, "fasta")
    print("done", str(output_file))


if __name__ == "__main__":
    main()
def load_oma_species(fn):
    """Read a tab separated species table into a relabelling dict.

    Lines starting with ``#`` are ignored, as are rows with fewer than three
    columns (e.g. blank lines), which used to raise an IndexError.

    :param fn: path of the table
    :return: dict mapping column 0 to ``"<column 2 with spaces as _>__<column 1>"``
    """
    with open(fn, 'rt') as fh:
        reader = csv.reader((line for line in fh if not line.startswith('#')),
                            dialect="excel-tab")
        mapping = {row[0]: row[2].replace(' ', '_') + "__" + row[1]
                   for row in reader if len(row) >= 3}
    return mapping


def load_nextstrain_metadata(fn):
    """Read a tab separated metadata table (with header) into a relabelling dict.

    :param fn: path of a table with the columns ``sra_accession``, ``strain``,
        ``Nextstrain_clade`` and ``date``
    :return: dict mapping each ``sra_accession`` to
        ``"<accession>__<strain>__<clade><date>"`` where spaces become ``_``
        and parentheses in the clade become square brackets
    """
    with open(fn, 'rt') as fh:
        reader = csv.DictReader(fh, dialect="excel-tab")
        mapping = {}
        for row in reader:
            accession = row['sra_accession']
            strain = row['strain'].replace(' ', '_')
            clade = row['Nextstrain_clade'].replace(' ', '_').replace('(', '[').replace(')', ']')
            # NOTE(review): clade and date are joined without a "__"
            # separator, unlike the other fields - confirm this is intended.
            mapping[accession] = accession + "__" + strain + "__" + clade + row['date']
    return mapping


def update_msa_ids(msa_path, new_path, mapping, format="phylip-relaxed"):
    """Copy an alignment, replacing every record id found in *mapping*.

    :param msa_path: alignment to read
    :param new_path: where the relabelled alignment is written
    :param mapping: dict ``{old id: new id}``; ids without entry are kept
    :param format: Bio.AlignIO format name used for reading and writing
    """
    msa = Bio.AlignIO.read(msa_path, format=format)
    for rec in msa:
        rec.id = mapping.get(rec.id, rec.id)
    Bio.AlignIO.write(msa, new_path, format=format)
def get_alignment(file, species_list):
    """Read a relaxed-PHYLIP alignment and drop the records of unwanted species.

    :param file: path of the alignment
    :param species_list: record ids to remove, either a comma separated
        string without spaces (the form used on the command line, e.g.
        ``"XXX,YYY,AAA"``), any iterable of ids, or ``None`` to keep everything
    :return: a new ``MultipleSeqAlignment`` with the remaining records
    """
    if species_list is None:
        excluded = set()
    elif isinstance(species_list, str):
        # Testing ``record.id not in "XXX,YYY"`` directly would be a
        # substring test and would also drop ids such as "XX".
        excluded = {name.strip() for name in species_list.split(",") if name.strip()}
    else:
        excluded = set(species_list)

    alignment = AlignIO.read(file, 'phylip-relaxed')
    keep_species = [record for record in alignment if record.id not in excluded]
    # No alphabet argument: Biopython >= 1.78 removed Bio.Alphabet and rejects
    # it, older releases fall back to a generic alphabet.
    # NOTE(review): the module level ``from Bio.Alphabet import IUPAC, Gapped``
    # still fails on Biopython >= 1.78 and should be dropped as well.
    return MultipleSeqAlignment(keep_species)


def write_alignment(output, alignment):
    """Write *alignment* to *output* in relaxed PHYLIP format."""
    AlignIO.write(alignment, output, 'phylip-relaxed')
#from __future__ import division
# bp length of mouse transcriptome in OMA: 37.914.531
# bp length of CANVA genome 2.5Mpb


def _count_fastq_records(path):
    """Return the number of complete four-line records in the FASTQ file *path*."""
    with open(path) as handle:
        return sum(1 for _ in handle) // 4


def _copy_sampled_records(path, sample_sets, outputs):
    """Stream the FASTQ file *path* and copy the selected records.

    Record ``k`` (0-based) is written to ``outputs[i]`` when ``k`` is in
    ``sample_sets[i]``.

    :return: tuple ``(bases in all records, bases written, records written)``;
        sequence lengths exclude the line terminator
    """
    total_bases = 0
    sampled_bases = 0
    sampled_records = 0
    record_number = 0
    with open(path) as read_input:
        while True:
            record = [read_input.readline() for _ in range(4)]
            if not record[0]:
                break
            seq_length = len(record[1].rstrip("\r\n"))
            total_bases += seq_length
            for wanted, output in zip(sample_sets, outputs):
                if record_number in wanted:
                    output.writelines(record)
                    sampled_bases += seq_length
                    sampled_records += 1
            record_number += 1
    return total_bases, sampled_bases, sampled_records


def main(argv=None):
    """Randomly subsample one (single end) or two (paired end) FASTQ files.

    ``--sample`` independent subsamples are drawn; subsample ``i`` is written
    to ``<output>_0_<i>.fq`` and, for paired input, ``<output>_1_<i>.fq`` using
    the same record positions for both mates.  The printed length statistics
    refer to the first input file only.

    :param argv: argument list, defaults to ``sys.argv[1:]``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", nargs='+', default=None, help="input FASTQ filename")
    parser.add_argument("-out", "--output", help="output FASTQ filename")
    parser.add_argument("-f", "--fraction", type=float, help="fraction of reads to sample")
    parser.add_argument("-n", "--number", type=int, help="number of reads to sample")
    parser.add_argument("-s", "--sample", type=int, help="number of output files to write", default=1)
    args = parser.parse_args(argv)

    if args.fraction and args.number:
        sys.exit("give either a fraction or a number, not both")

    if not args.fraction and not args.number:
        sys.exit("you must give either a fraction or a number")

    if not args.input:
        parser.error("at least one input FASTQ file is required (-i)")
    if not args.output:
        parser.error("an output file prefix is required (-out)")

    print("counting records....")
    total_records = _count_fastq_records(args.input[0])

    if args.fraction:
        args.number = int(total_records * args.fraction)

    if args.number > total_records:
        sys.exit("cannot sample " + str(args.number) + " out of " + str(total_records) + " records")

    print("sampling " + str(args.number) + " out of " + str(total_records) + " records")

    paired = len(args.input) > 1
    # Valid record positions are 0 .. total_records - 1; sampling from
    # range(total_records + 1) could select a record that does not exist.
    sample_sets = [set(random.sample(range(total_records), args.number))
                   for _ in range(args.sample)]

    output_file_left = []
    output_file_right = []
    try:
        for i in range(args.sample):
            output_file_left.append(open(args.output + "_0_" + str(i) + ".fq", "w"))
            if paired:
                output_file_right.append(open(args.output + "_1_" + str(i) + ".fq", "w"))

        initial_length, sampling_length, sampled_records = _copy_sampled_records(
            args.input[0], sample_sets, output_file_left)
        if paired:
            _copy_sampled_records(args.input[1], sample_sets, output_file_right)
    finally:
        # Always flush and close the outputs, even after an error.
        for handle in output_file_left + output_file_right:
            handle.close()

    mean_all = initial_length / total_records if total_records else 0
    mean_sampled = sampling_length / sampled_records if sampled_records else 0
    print("The mean length of all reads is {} and the mean length of the subsampled reads is {}".format(mean_all, mean_sampled))
    print("The sum length of all reads is {} and the sum length of the subsampled reads is {}".format(initial_length, sampling_length))
    print("done!")


if __name__ == "__main__":
    main()
def run(afolder, ofolder, min_species):
    """Copy sufficiently large alignments to *ofolder* as relaxed PHYLIP files.

    Every non-empty ``*.fa`` file in *afolder* is read as relaxed PHYLIP or,
    if that raises ``ValueError``, as FASTA.  Alignments with at least
    *min_species* rows are written to ``<ofolder>/<name>.phy`` where ``<name>``
    is the file name up to its first ``.``.

    :param afolder: folder holding the input alignments
    :param ofolder: output folder, created when missing
    :param min_species: minimum number of rows an alignment needs to be kept
    """
    os.makedirs(ofolder, exist_ok=True)
    for f in tqdm.tqdm(glob.glob(os.path.join(afolder, '*.fa'))):
        if os.path.getsize(f) == 0:
            continue
        try:
            msa = AlignIO.read(f, "phylip-relaxed")
        except ValueError:
            msa = AlignIO.read(f, "fasta")
        if len(msa) >= min_species:
            out_path = os.path.join(ofolder, os.path.basename(f).split(".")[0] + ".phy")
            # The context manager closes the file even if writing fails.
            with open(out_path, "w") as align_output:
                AlignIO.write(msa, align_output, "phylip-relaxed")
class UnknownFormatError(Exception):
    """Raised when the alignment format cannot be derived from the file name."""
    pass


def load_msa(fn):
    """Load the first alignment of *fn*.

    The format is chosen from the file extension: ``.phy`` is read as relaxed
    PHYLIP, ``.fa`` as FASTA.

    :raises UnknownFormatError: for any other extension
    """
    if fn.endswith('.phy'):
        fmt = 'phylip-relaxed'
    elif fn.endswith('.fa'):
        fmt = 'fasta'
    else:
        # The exception name used to be misspelt, producing a NameError here.
        raise UnknownFormatError('unknown format for ' + fn)
    with open(fn, 'rt') as fh:
        # Parse from the handle that was opened (it used to be ignored).
        msa = next(Bio.AlignIO.parse(fh, fmt))
    return msa


def write_msa(fn, msa):
    """Write *msa* to the file *fn* in relaxed PHYLIP format."""
    with open(fn, 'wt') as fh:
        Bio.AlignIO.write(msa, fh, 'phylip-relaxed')


def count_nucs(data):
    """Return how many items of *data* are one of the upper case letters ``A``, ``T``, ``C``, ``G``, ``N``."""
    c = collections.Counter(data)
    return sum(c[x] for x in 'ATCGN')


def trim(msa, min_residue):
    """Keep only the columns of *msa* with at least *min_residue* counted letters.

    Letters are counted with :func:`count_nucs`.  The number of kept columns
    is printed.

    :return: tuple ``(list of kept column indices, trimmed alignment)``
    :raises ValueError: if no column reaches *min_residue*
    """
    keep = [col for col in range(msa.get_alignment_length())
            if count_nucs(msa[:, col]) >= min_residue]
    print(len(keep))
    if not keep:
        raise ValueError('no alignment column contains at least {} residues'.format(min_residue))

    # Merge neighbouring columns into [start, stop) runs so that the
    # alignment is concatenated once per run instead of once per column.
    runs = []
    for col in keep:
        if runs and runs[-1][1] == col:
            runs[-1][1] = col + 1
        else:
            runs.append([col, col + 1])

    trimmed = msa[:, runs[0][0]:runs[0][1]]
    for start, stop in runs[1:]:
        trimmed = trimmed + msa[:, start:stop]
    return keep, trimmed


def filter_taxa(msa, min_residue):
    """Return a new alignment with the rows of *msa* that have more than *min_residue* counted letters."""
    return Bio.Align.MultipleSeqAlignment(
        [taxon for taxon in msa if count_nucs(taxon) > min_residue])
Defaults to ceil(nr_taxa*0.3)") 52 | parser.add_argument('--min-res-per-species', default=400, type=int, help="Minimum number of residues for a taxon in the trimmed alignment to not be kicked out. Defaults to 400") 53 | parser.add_argument('--out', help="Outfile of trimmed alignment") 54 | conf = parser.parse_args() 55 | 56 | msa = load_msa(conf.alignment) 57 | if conf.min_per_col is None: 58 | conf.min_per_col = math.ceil(0.3*len(msa)) 59 | if conf.out is None: 60 | conf.out = conf.alignment+".trimmed" 61 | 62 | print("Loaded MSA ({}x{}). Filter cols with less than {} residue" 63 | .format(len(msa), msa.get_alignment_length(), conf.min_per_col)) 64 | keep, trimmed_msa = trim(msa, conf.min_per_col) 65 | print(" after filtering columns: {}x{}".format(len(trimmed_msa), trimmed_msa.get_alignment_length())) 66 | filtered = filter_taxa(trimmed_msa, conf.min_res_per_species) 67 | print(" after filtering taxa: {}x{}".format(len(filtered), filtered.get_alignment_length())) 68 | write_msa(conf.out, filtered) 69 | -------------------------------------------------------------------------------- /archive/set_marker_genes/bacteria_markergenes.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/set_marker_genes/bacteria_markergenes.zip -------------------------------------------------------------------------------- /archive/set_marker_genes/mammalia_markergenes.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/set_marker_genes/mammalia_markergenes.zip -------------------------------------------------------------------------------- /archive/tests/info.log: -------------------------------------------------------------------------------- 1 | 2018-11-19 08:44:51,173:read2tree.Reads:test: --- Splitting reads 
class ReadTest(unittest.TestCase):
    """Fixture holder for the (archived) aligner tests; defines no test methods."""

    def setup_reads_paired(self, sampling=False):
        """Build an ``Aligner`` from a fixed paired-end command line.

        The original ended with ``return alignments = Aligner(...)``, which is
        a syntax error and referenced two names (``Aligner``, ``ogset``) that
        were never bound.

        :param sampling: accepted for signature parity with the read tests;
            it is not used by this fixture.
        :return: the ``Aligner`` built from the parsed arguments.
        """
        # NOTE(review): import paths and the ``OGSet(args)`` call are assumed
        # from the package layout (read2tree/Aligner.py, read2tree/OGSet.py)
        # and from test_ogset.py -- confirm against the real constructors.
        # Imported lazily so a missing optional module only affects callers
        # of this fixture, not collection of the whole file.
        from read2tree.Aligner import Aligner
        from read2tree.OGSet import OGSet

        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument('--standalone_path', default='.',
                                help='[Default is current directory] Path to '
                                'oma standalone directory.')

        arg_parser.add_argument('--reads', nargs='+', default=None,
                                help='Reads to be mapped to reference. If paired '
                                'end add separated by space.')

        arg_parser.add_argument('--read_type', default='short',
                                help='[Default is short reads] Type of reads to '
                                'use for mapping. Either ngm for short reads or '
                                'ngmlr for long will be used.')

        arg_parser.add_argument('--dna_reference', default='',
                                help='Reference file that contains nucleotide '
                                'sequences (fasta, hdf5). If not given it will use'
                                'the RESTapi and retrieve sequences '
                                'from http://omabrowser.org directly. '
                                'NOTE: internet connection required!')

        arg_parser.add_argument('--keep_all_ogs', action='store_true',
                                help='Keep all orthologs after addition of '
                                'mapped seq, which means also the groups that '
                                'have no mapped sequence. Otherwise only groups '
                                'are used that have the mapped sequence for '
                                'alignment and tree inference.')

        arg_parser.add_argument('-r', '--reference', action='store_true',
                                help='Just generate the reference dataset for '
                                'mapping.')

        arg_parser.add_argument('--remove_species_ogs', default=None,
                                help='[Default is none] Remove species present '
                                'in data set after mapping step completed to '
                                'build OGs. Input is comma separated list '
                                'without spaces, e.g. XXX,YYY,AAA.')

        arg_parser.add_argument('-s', '--species_name', default=None,
                                help='[Default is name of read] Name of species '
                                'for mapped sequence.')

        arg_parser.add_argument('--output_path', default='.', required=True,
                                help='[Default is current directory] Path to '
                                'output directory.')

        argv = ['--standalone_path', 'tests/data/marker_genes/',
                '--dna_reference', 'tests/data/dna.fa', '--reads',
                'tests/data/mapper/test3/test_1b.fq',
                'tests/data/mapper/test3/test_2b.fq',
                '--output_path', 'tests/data/output', '--read_type',
                'short', '--keep_all_ogs', '--reference',
                '--remove_species_ogs', 'CIOIN', '--species_name', 'ass']

        args = arg_parser.parse_args(argv)
        ogset = OGSet(args)
        return Aligner(args, ogset.ogs, load=True)
class OGSetTest(unittest.TestCase):
    """Placeholder suite for ``OGSet``.

    Every case is still unimplemented, so each one is reported as *skipped*
    rather than erroring with ``NotImplementedError``.
    """

    _TODO = 'not implemented'

    def setUp(self):
        """Parse the fixed command line and keep the namespace on ``self.args``.

        unittest ignores the return value of ``setUp``, so the parsed
        arguments are stored on the instance instead of being returned.
        """
        # Local import: the file header does not import argparse, so the
        # original failed with NameError before reaching any test body.
        import argparse

        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument('--reads', nargs='+', default=None,
                                help='Reads to be mapped to reference. If paired end '
                                'add separated by space.')
        arg_parser.add_argument('--read_split_length', type=int, default=400,
                                help='Set read split length.')
        arg_parser.add_argument('--read_split_overlap', type=int, default=50,
                                help='Set read split length overlap.')
        arg_parser.add_argument('-s', '--species_name', default=None,
                                help='[Default is name of read] Name of species '
                                'for mapped sequence.')

        argv = ['--reads', 'tests/data/reads/test.fq']

        # NOTE(review): the original tried ``return OGSet(args)``, but the
        # file-level ``from read2tree import OGSet`` binds a module, which is
        # not callable. Construct the real object in the test that needs it
        # once the correct class import is confirmed.
        self.args = arg_parser.parse_args(argv)

    def test_OGSet(self):
        self.skipTest(self._TODO)

    def test_marker_genes_input(self):
        self.skipTest(self._TODO)

    def test_omastandalone_input(self):
        self.skipTest(self._TODO)

    def test_output_folder_structure(self):
        self.skipTest(self._TODO)

    def test_species_removal(self):
        self.skipTest(self._TODO)

    def test_species_removal_after_mapping(self):
        self.skipTest(self._TODO)

    def test_rest_api_connection(self):
        # The original body was the bare expression ``OGSet._read``, which
        # asserts nothing.
        self.skipTest(self._TODO)

    def test_rest_api_dna_downlaod(self):
        self.skipTest(self._TODO)
-------------------------------------------------------------------------------- /archive/tests/test_reads.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import gzip 4 | import argparse 5 | from Bio import SeqIO 6 | from read2tree.Reads import Reads 7 | from read2tree.FastxReader import FastxReader 8 | from read2tree.main import parse_args 9 | from read2tree._utils import exe_name 10 | dirname = os.path.dirname(__file__) 11 | 12 | 13 | class ReadTest(unittest.TestCase): 14 | 15 | def setup_long_reads(self, split=False): 16 | if split: 17 | argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz', '--split_reads', 18 | '--split_overlap', '50', '--split_len', '400', '--sample_reads', '--coverage', '10', 19 | '--genome_len', '1000'] 20 | else: 21 | argv = ['--output_path', 'data/output', '--reads', 'data/reads/test.fq.gz'] 22 | 23 | args = parse_args(argv, exe_name(), '') 24 | # args = arg_parser.parse_args(argv) 25 | return Reads(args) 26 | 27 | def setup_reads_paired(self, sampling=False): 28 | 29 | if sampling: 30 | argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz', 31 | 'data/reads/test_2a.fq.gz', '--sample_reads', '--coverage', '10', '--genome_len', '1000'] 32 | else: 33 | argv = ['--output_path', 'data/output', '--reads', 'data/reads/test_1a.fq.gz', 34 | 'data/reads/test_2a.fq.gz'] 35 | args = parse_args(argv, exe_name(), '') 36 | return Reads(args) 37 | 38 | def test_split(self): 39 | test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGT' 40 | expected = ['ACGTTTTTTG', 'GAAGAGTTAG', 'AGATTTTTAG', 'AGAGGAGGGG', 41 | 'GAGGAGGGGT'] 42 | reads = self.setup_long_reads() 43 | # obtained = reads._split_len(test_seq, 10) 44 | obtained = reads._split_len_overlap(test_seq, 10, 0) 45 | self.assertEqual(expected, obtained) 46 | 47 | def test_splitOverlap(self): 48 | test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGTTT' 49 | expected = ['ACGTTTTTTG', 
'TTTTGGAAGA', 'GAAGAGTTAG', 'GTTAGAGATT', 50 | 'AGATTTTTAG', 'TTTAGAGAGG', 'AGAGGAGGGG', 'GGAGGGGTTT'] 51 | reads = self.setup_long_reads() 52 | obtained = reads._split_len_overlap(test_seq, 10, 5) 53 | # print(reads._split_len_overlap('TTTTTAGAGAGGAGGGGTTT', 10, 5)) 54 | self.assertEqual(expected, obtained) 55 | 56 | def test_get_4_line_fastq_string(self): 57 | reads = self.setup_long_reads() 58 | expected = '@SRR00001 length=16\nACGTTTGGGAAGGTTT\n+SRR00001 ' \ 59 | 'length=16\n????????????????\n' 60 | read_id = 'SRR00001' 61 | seq = 'ACGTTTGGGAAGGTTT' 62 | qual = '????????????????' 63 | name = reads._get_4_line_fastq_string(read_id, seq, qual, x=0) 64 | self.assertEqual(name, expected) 65 | 66 | def test_read_num_split(self): 67 | reads = self.setup_long_reads(split=True) 68 | num_reads = reads._get_num_reads('data/reads/test.fq.gz') 69 | self.assertEqual(num_reads, 18) 70 | 71 | def test_read_len_split(self): 72 | reads = self.setup_long_reads(split=True) 73 | len_reads = reads._get_read_len('data/reads/test.fq.gz',1000) 74 | self.assertEqual(len_reads, 400) 75 | 76 | def test_read_num_paired(self): 77 | reads = self.setup_reads_paired() 78 | num_reads = reads._get_num_reads('data/reads/test_1a.fq.gz') 79 | self.assertEqual(num_reads, 1000) 80 | 81 | def test_read_len_paired(self): 82 | reads = self.setup_reads_paired() 83 | num_reads = reads._get_read_len('data/reads/test_1a.fq.gz', 1000) 84 | self.assertEqual(num_reads, 151.0) 85 | 86 | def test_read_num_by_coverage_paired(self): 87 | reads = self.setup_reads_paired(sampling=True) 88 | num_reads = reads._get_num_reads_by_coverage( 89 | 'data/reads/test_1a.fq.gz', 1000) 90 | self.assertEqual(num_reads, 34) 91 | 92 | def test_read_num_by_coverage_split(self): 93 | reads = self.setup_long_reads(split=True) 94 | num_reads = reads._get_num_reads_by_coverage(['data/reads/test.fq.gz'],1000) 95 | self.assertEqual(num_reads, 25) 96 | 97 | def test_read_vec_paired(self): 98 | reads = 
class Use(unittest.TestCase):
    """Placeholder end-to-end suite.

    The original methods had no bodies at all, which makes the module fail
    with an IndentationError. Each case now reports itself as skipped until
    it is implemented.
    """

    def test_OGSet(self):
        self.skipTest('not implemented')

    def test_write_progress(self):
        self.skipTest('not implemented')

    def test_read_progress(self):
        self.skipTest('not implemented')
/archive/wiki_images/figure_1sp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/figure_1sp.jpg -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_0.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_1.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_2.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_3.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_4.png -------------------------------------------------------------------------------- 
/archive/wiki_images/oma_page_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_5.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_6.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_7.png -------------------------------------------------------------------------------- /archive/wiki_images/oma_page_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DessimozLab/read2tree/925b6502ecc35686b7af68bc5a8e5dd6eb89eb50/archive/wiki_images/oma_page_8.png -------------------------------------------------------------------------------- /bin/read2tree: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python -W ignore 2 | ''' 3 | Wrapper to enable the user to call the installed hogprop without the '.py' 4 | ending. 5 | 6 | -- Alex Warwick Vesztrocy, June 2016 7 | ''' 8 | from read2tree.main import main 9 | from read2tree._utils import exe_name 10 | import sys 11 | 12 | 13 | if __name__ == '__main__': 14 | desc = 'read2tree is a pipeline allowing to use read data in combination with ' \ 15 | 'an OMA standalone output run to produce high quality trees. 
' 16 | main(sys.argv[1:], exe_name=exe_name(), desc=desc) 17 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: read2tree_env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.9 7 | - numpy 8 | - biopython 9 | - ete3 10 | - lxml 11 | - tqdm 12 | - scipy 13 | - pyparsing 14 | - requests 15 | - natsort 16 | - pyyaml 17 | - filelock 18 | - dendropy 19 | - mafft 20 | - iqtree 21 | - ngmlr 22 | - nextgenmap 23 | - samtools 24 | - filelock 25 | - pyham 26 | - pysam -------------------------------------------------------------------------------- /read2tree/Analyzer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | This file contains definitions of a class which surrounds possible alignment methods 4 | 5 | -- David Dylus, July--XXX 2017 6 | ''' 7 | import os 8 | from Bio import AlignIO 9 | import re 10 | 11 | class Analyzer(object): 12 | 13 | def __init__(self, args, og_set=None): 14 | print('--- Alignment of OGs ---') 15 | self.args = args 16 | self.cov = {} 17 | self.seq_completeness = {} 18 | 19 | self._genome_or_transcriptome_length = args.gt_length 20 | 21 | if " " in args.reads: 22 | self._reads = args.reads.rstrip().split(" ") 23 | else: 24 | self._reads = args.reads 25 | 26 | if len(self._reads) == 2: 27 | self._species_name = self._reads[0].split("/")[-1].split(".")[0] 28 | else: 29 | self._species_name = self._reads.split("/")[-1].split(".")[0] 30 | 31 | self.treeStats = {} 32 | self.alignmentStats = {} 33 | 34 | # def __call__(self, *args, **kwargs): 35 | # raise NotImplementedError 36 | 37 | def _get_coverage_reads(self, args): 38 | """ 39 | 40 | :param args: 41 | :return: coverage 42 | """ 43 | with open(args.reads[0]) as input: 44 | read_length = input.readline().split("length=")[-1] 45 | num_lines = sum([1 for line in 
class FastxReader(object):
    """Line-based reader for plain or gzip-compressed FASTA/FASTQ files.

    The record generators assume single-line sequences: a FASTQ record is
    four lines, a FASTA record is two. Blank header lines are skipped and a
    record that is cut short by the end of the input is dropped, so a
    trailing newline no longer surfaces as ``StopIteration`` inside the
    generator.
    """

    def __init__(self, file):
        """Remember *file* and decide how it has to be opened.

        :param file: path to the FASTA/FASTQ file.
        """
        self._file = file
        # guess_type() returns (type, encoding); only the encoding tells us
        # about compression. It is None for uncompressed files, in which
        # case the original never set ``_file_handle`` at all.
        encoding = mimetypes.guess_type(file)[1]
        if encoding and 'gzip' in encoding:
            self._file_handle = 'gzip'
        else:
            self._file_handle = 'txt'

    def open_fastx(self):
        """Return a text-mode handle on the file, decompressing if needed."""
        if self._file_handle == 'gzip':
            return gzip.open(self._file, 'rt')
        return open(self._file, 'rt')

    @staticmethod
    def _take(file_handle, count):
        """Return the next *count* right-stripped lines, or None at end of input."""
        lines = []
        for _ in range(count):
            line = next(file_handle, None)
            if line is None:
                return None
            lines.append(line.rstrip())
        return lines

    def readfq_id(self, file_handle):
        """Yield the first space-separated token of every FASTQ header."""
        for l in file_handle:
            name = l.rstrip()
            if not name:
                continue
            # Consume sequence, separator and quality lines of this record.
            if self._take(file_handle, 3) is None:
                return
            yield name.split(' ')[0]

    def readfq(self, file_handle):
        """Yield ``(name, seq, qual)`` for every four-line FASTQ record."""
        for l in file_handle:
            name = l.rstrip()
            if not name:
                continue
            rest = self._take(file_handle, 3)
            if rest is None:
                return
            seq, _separator, qual = rest
            yield name, seq, qual

    def readfa(self, file_handle):
        """Yield ``(name, seq)`` for every two-line FASTA record."""
        for l in file_handle:
            name = l.rstrip()
            if not name:
                continue
            rest = self._take(file_handle, 1)
            if rest is None:
                return
            yield name, rest[0]

    def readfx(self, file_handle):
        """Yield ``(name, seq, qual)`` from FASTQ or FASTA input.

        ``qual`` is None for every record whose header does not start with
        ``@``; previously it was unbound (or stale from the previous record)
        when the header started with neither ``@`` nor ``>``.
        """
        for l in file_handle:
            name = l.rstrip()
            if not name:
                continue
            is_fastq = name.startswith('@')
            rest = self._take(file_handle, 3 if is_fastq else 1)
            if rest is None:
                return
            qual = rest[2] if is_fastq else None
            yield name, rest[0], qual
class Aligner(object):
    """Holds the run arguments and the alignments used for guided assembly."""

    def __init__(self, args=None, alignments=None):
        """Store *args* and *alignments* and expose the placement dictionary.

        :param args: parsed command line arguments (optional).
        :param alignments: object exposing ``placement_dic`` (optional).
        """
        self.args = args
        self.alignments = alignments
        # ``alignments`` defaults to None, so it must not be dereferenced
        # unconditionally; the no-argument constructor used to raise
        # AttributeError here.
        self.placement_dic = (alignments.placement_dic
                              if alignments is not None else None)
class MultiProcessingLog(logging.Handler):
    """Logging handler that funnels records through a multiprocessing queue.

    ``emit`` puts each record on the queue; a daemon thread started in the
    constructor takes records off the queue and hands them to a single
    ``RotatingFileHandler``.
    """

    def __init__(self, name, mode, maxsize, rotate):
        """Create the file handler, the queue and the receiver thread.

        The four arguments are passed positionally to
        ``RotatingFileHandler`` (filename, mode, maxBytes, backupCount).
        """
        logging.Handler.__init__(self)

        self._handler = RotatingFileHandler(name, mode, maxsize, rotate)
        self.queue = multiprocessing.Queue(-1)

        t = threading.Thread(target=self.receive)
        t.daemon = True
        t.start()

    def setFormatter(self, fmt):
        """Apply *fmt* to this handler and to the wrapped file handler."""
        logging.Handler.setFormatter(self, fmt)
        self._handler.setFormatter(fmt)

    def receive(self):
        """Receiver loop: write queued records until the queue reports EOF."""
        while True:
            try:
                record = self.queue.get()
                self._handler.emit(record)
            except (KeyboardInterrupt, SystemExit):
                raise
            except EOFError:
                break
            except Exception:
                # Keep the receiver alive on any ordinary failure, but make
                # the problem visible.
                traceback.print_exc(file=sys.stderr)

    def send(self, s):
        """Put *s* on the queue without blocking."""
        self.queue.put_nowait(s)

    def _format_record(self, record):
        """Stringify ``args`` and ``exc_info`` so the record can be pickled.

        :return: the same record object, modified in place.
        """
        if record.args:
            # getMessage() applies str() to the message before formatting,
            # so non-string message objects work as well.
            record.msg = record.getMessage()
            record.args = None
        if record.exc_info:
            # Formatting caches the traceback text on the record, which
            # survives dropping the exc_info tuple below.
            self.format(record)
            record.exc_info = None

        return record

    def emit(self, record):
        """Prepare *record* for pickling and queue it for the receiver."""
        try:
            s = self._format_record(record)
            self.send(s)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        """Close the wrapped file handler, then this handler."""
        self._handler.close()
        logging.Handler.close(self)
def _load_records_folder(self):
    """
    Parse species with their dna sequences from folder

    Reads every ``*.fa`` file in ``<output_path>/02_ref_dna``. The species
    key is the part of the file name before the first underscore.

    :return: dictionary mapping species name to a ``Reference`` whose
        ``dna`` list holds the records of that file
    """
    ref_dict = {}
    print('--- Generating reference for mapping from folder ---')
    ref_dna = os.path.join(self.args.output_path, '02_ref_dna')
    for file in tqdm(glob.glob(os.path.join(ref_dna, "*.fa")), desc="Re-loading references for mapping from folder", unit=" species"):
        # basename() honours the platform separator; splitting on "/" left
        # the directory in the key wherever paths use another separator.
        species_name = os.path.basename(file).split("_")[0]
        reference = Reference()
        reference.dna = list(SeqIO.parse(file, 'fasta'))
        ref_dict[species_name] = reference

    return ref_dict
class Reference(object):
    """Amino-acid and DNA records collected for one reference species."""

    def __init__(self, args=None):
        """Start with empty record lists.

        :param args: optional run arguments, stored unchanged.
        """
        self.args = args
        self.aa = []
        self.dna = []

    def write_aa(self, species, output_folder):
        """Write the amino-acid records to ``<output_folder>/<species>_OGs.fa``."""
        # NOTE(review): this is the same file name write_dna() uses, so
        # writing both into one folder overwrites the first -- confirm
        # whether that is intended.
        target = os.path.join(output_folder, species + '_OGs.fa')
        # ``with`` closes the handle even if the writer raises.
        with open(target, "w") as handle:
            writer = FastaWriter(handle, wrap=None)
            writer.write_file(self.aa)

    def write_dna(self, species, output_folder):
        """Write the DNA records to ``<output_folder>/<species>_OGs.fa``."""
        target = os.path.join(output_folder, species + '_OGs.fa')
        with open(target, "w") as handle:
            writer = FastaWriter(handle, wrap=None)
            writer.write_file(self.dna)
class TreeInference(object):
    """Runs IQ-TREE on a concatenated alignment and writes the tree to disk."""

    def __init__(self, args, concat_alignment=None):
        """Derive the species name and, if an alignment is given, infer the tree.

        :param args: namespace providing ``reads`` and ``species_name``;
            ``output_path`` and ``threads`` are read when a tree is inferred.
        :param concat_alignment: alignment handed to the tree builder; no
            tree is inferred when it is None.
        """
        print('--- Tree inference ---')

        self.args = args

        self.elapsed_time = 0

        # Always defined, so the attribute also exists for runs without reads.
        self._reads = None

        if self.args.reads:
            if len(self.args.reads) == 2:
                self._reads = self.args.reads
                first_read = self._reads[0]
            else:
                self._reads = self.args.reads[0]
                first_read = self._reads
            self._species_name = os.path.basename(first_read).split(".")[0]

        # An explicit species name wins over the one derived from the reads.
        if self.args.species_name:
            self._species_name = self.args.species_name

        if not self.args.reads and not self.args.species_name:
            self._species_name = 'merge'

        self.tree = None
        if concat_alignment is not None:
            self.tree = self._infer_tree(concat_alignment)

    def _infer_tree(self, concat_alignment):
        """Infer a tree with IQ-TREE (model LG) and write it as Newick.

        The tree is written to ``<output_path>/tree_<species>.nwk``; the
        elapsed wall-clock time is stored in ``self.elapsed_time``.

        :return: the object returned by the IQ-TREE wrapper call.
        """
        start = time.time()
        output_folder = self.args.output_path
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        iqtree_wrapper = Iqtree(concat_alignment, datatype=DataType.PROTEIN)
        iqtree_wrapper.options.options['-m'].set_value('LG')
        iqtree_wrapper.options.options['-nt'].set_value(self.args.threads)
        tree = iqtree_wrapper()

        tree_file = os.path.join(output_folder,
                                 "tree_" + self._species_name + ".nwk")
        with open(tree_file, "w") as text_file:
            text_file.write("{}".format(tree))
        self.tree = "{}".format(tree)

        end = time.time()
        self.elapsed_time = end - start
        logger.info('{}: Tree inference took {}.'.format(self._species_name,
                                                          self.elapsed_time))

        return tree
-------------------------------------------------------------------------------- /read2tree/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import logging 3 | import logging.config 4 | import yaml 5 | import os 6 | from pkg_resources import resource_string 7 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 8 | 9 | __version__ = '0.1.5' 10 | __copyright__ = 'read2tree (C) 2017-{:d} David Dylus' \ 11 | .format(date.today().year) 12 | 13 | # path = './log.yaml' 14 | # if os.path.exists(path): 15 | # with open(path, 'rt') as f: 16 | # config = yaml.load(f.read()) 17 | # logging.config.dictConfig(config) 18 | 19 | conf = resource_string(__name__, 'logging/log.yaml') 20 | 21 | D = yaml.load(conf, Loader=yaml.FullLoader) 22 | D.setdefault('version', 1) 23 | logging.config.dictConfig(D) 24 | # del D 25 | -------------------------------------------------------------------------------- /read2tree/_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Utilities for parsing the annotations files. 4 | 5 | -- Alex Warwick Vesztrocy - March--June 2016 6 | ''' 7 | import bz2 8 | import gzip 9 | import os 10 | import sys 11 | 12 | 13 | # File opening. This is based on the example on SO here: 14 | # http://stackoverflow.com/a/26986344 15 | fmagic = {b'\x1f\x8b\x08': gzip.open, 16 | b'\x42\x5a\x68': bz2.BZ2File} 17 | 18 | 19 | def auto_open(fn, *args): 20 | ''' 21 | Opens files based on their "magic bytes". Supports bz2 and gzip. If it 22 | finds neither of these, presumption is it is a standard, uncompressed 23 | file. 
class LazyProperty(object):
    '''
    Descriptor-based decorator that computes an attribute on first access.

    The wrapped method is called once per instance; its result is then set
    on the instance under the same name, so the instance attribute takes
    over and the method is not called again.

    Python Cookbook (Denis Otkidach)
    http://stackoverflow.com/users/168352/denis-otkidach

    (Include from pyoma.browser.models - Adrian Altenhoff)
    '''
    def __init__(self, method, name=None):
        # keep the undecorated function, its doc and the attribute name
        self.method = method
        self.__doc__ = method.__doc__
        self.name = name if name else method.__name__

    def __get__(self, inst, cls):
        if inst is not None:
            # first access on this instance: evaluate and cache the value
            value = self.method(inst)
            setattr(inst, self.name, value)
            return value
        # accessed on the class itself: hand back the descriptor
        return self
def check_array_ids(args):
    '''
    Checks the IDs added to args for array jobs. Raises errors if not setup
    correctly.

    The presence check runs first: comparing a missing (None) worker ID
    against the array size would raise a TypeError in Python 3 and hide the
    intended error message.

    :param args: object with the attributes ``job_id``, ``worker_id`` and
                 ``array`` (the array size)
    :raises RuntimeError: if the job or worker ID is missing, or if the
                          worker ID is outside 1..array
    '''
    if args.job_id is None or args.worker_id is None:
        raise RuntimeError('User requested HOGPROP to run as job array. '
                           'Can\'t find job ID ({}) or array ID ({}).'
                           .format(args.job_id, args.worker_id))
    # worker IDs are 1-based, so anything below 1 or above N is invalid
    if args.worker_id > args.array or args.worker_id < 1:
        raise RuntimeError('Recognised: worker ID {} and array size {}. '
                           'Worker IDs should run from 1-N (N is array size'
                           ').'.format(args.worker_id, args.array))
class ChDir(object):
    """
    Context manager that changes into a working directory on entry and
    changes back on exit (like a 'Dir.chdir do' block in Ruby).

    The directory to return to is the current directory at construction
    time, not at entry time.

    Usage:

        with TempDir() as dir, ChDir(dir):
            # Do some work in the working temp directory 'dir'

        # Exit 'dir'

    :raises IOError: if the working directory does not exist
    """

    def __init__(self, working_dir):
        target_exists = os.path.exists(working_dir)
        if not target_exists:
            raise IOError('Directory "{}"" does not exist'.format(working_dir))
        self._wdir = working_dir
        self._cdir = os.getcwd()  # where __exit__ returns to

    def __enter__(self):
        # nothing is bound by 'with ... as'; only the side effect matters
        os.chdir(self._wdir)

    def __exit__(self, type, value, tb):
        # always go back, exceptions are not suppressed
        os.chdir(self._cdir)
105 | 106 | Usage: 107 | 108 | with TempDir() as dir, MkDir(dir): 109 | # Do some work in the working temp directory 'dir' 110 | 111 | # Exit 'dir' 112 | """ 113 | 114 | def __init__(self, working_dir): 115 | if not os.path.exists(working_dir): 116 | try: 117 | os.makedirs(working_dir) 118 | except OSError as e: 119 | if e.errno != 17: 120 | raise 121 | pass # path was created by another thread / process 122 | # this is a race condition, but probably benign 123 | 124 | def __enter__(self): 125 | pass 126 | 127 | def __exit__(self, type, value, tb): 128 | pass 129 | -------------------------------------------------------------------------------- /read2tree/logging/log.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 1 3 | disable_existing_loggers: False 4 | formatters: 5 | simple: 6 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 7 | handlers: 8 | console: 9 | class: logging.StreamHandler 10 | level: INFO 11 | formatter: simple 12 | stream: ext://sys.stdout 13 | mplog: 14 | class: read2tree.MultiProcessingLog.MultiProcessingLog 15 | level: DEBUG 16 | formatter: simple 17 | name: mplog.log 18 | mode: a 19 | maxsize: 1024 20 | rotate: 0 21 | root: 22 | level: DEBUG 23 | handlers: [console, mplog] -------------------------------------------------------------------------------- /read2tree/logging/log.yaml.bak: -------------------------------------------------------------------------------- 1 | --- 2 | version: 1 3 | disable_existing_loggers: False 4 | formatters: 5 | simple: 6 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 7 | handlers: 8 | console: 9 | class: logging.StreamHandler 10 | level: INFO 11 | formatter: simple 12 | stream: ext://sys.stdout 13 | mplog: 14 | class: read2tree.MultiProcessingLog.MultiProcessingLog 15 | level: DEBUG 16 | formatter: simple 17 | name: mplog.log 18 | mode: a 19 | maxsize: 1024 20 | rotate: 0 21 | root: 22 | level: DEBUG 23 | handlers: 
[console, mplog] -------------------------------------------------------------------------------- /read2tree/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .OMAOutputParser import * -------------------------------------------------------------------------------- /read2tree/stats/Coverage.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import numpy as np 3 | 4 | 5 | class Coverage(object): 6 | 7 | def __init__(self, args): 8 | self.args = args 9 | self.coverage = {} 10 | 11 | def get_coverage_bam(self, file_name): 12 | mybam = pysam.AlignmentFile(file_name, 'rb') 13 | for ref in mybam.references: 14 | self.coverage[self._get_clean_id(ref)] \ 15 | = self._get_gene_coverage(mybam, ref) 16 | 17 | def _get_clean_id(self, id): 18 | id = id.split(" ")[0] 19 | id = id.split("_") 20 | return id[0]+"_"+id[1] 21 | 22 | def add_coverage(self, ref, coverage): 23 | self.coverage[ref] = coverage 24 | 25 | def write_coverage_bam(self, file_name): 26 | out_text = '' 27 | header = '#species,og,gene_id,coverage,std\n' 28 | out_text += header 29 | for key, value in self.coverage.items(): 30 | species = key[0:5] 31 | og = key.split("_")[-1] 32 | gene_id = key.split("_")[0] 33 | coverage = value 34 | line = species + "," + og + "," + gene_id + "," + \ 35 | str(coverage[0]) + "," + str(coverage[1]) + "\n" 36 | out_text += line 37 | 38 | with open(file_name, "w") as myfile: 39 | myfile.write(out_text) 40 | 41 | def read_coverage_from_file(self, file_name): 42 | raise NotImplementedError 43 | 44 | def _get_gene_coverage(self, mybam, ref): 45 | """ 46 | 47 | :param mybam: bam_file object from pysam 48 | :param ref: the gene_id reference to pileup the the number of reads per column 49 | :return: average coverage per gene 50 | """ 51 | column_coverage = [] 52 | for pileupcolumn in mybam.pileup(ref, 0, 100000): 53 | if pileupcolumn.n >= self.args.min_cons_coverage: 54 | 
class SeqCompleteness(object):
    """
    Computes and stores, per sequence record, how much of a sequence is
    resolved (not 'N' for DNA, not 'X' for amino acids).

    Records are grouped by the second '_'-separated field of their id, which
    is used as the orthologous-group (OG) key.

    :param mapped_ref: iterable of records the reads were mapped against
                       (each needs ``id`` and ``seq``); optional
    :param tested_ref: iterable of records to compare lengths against;
                       optional, the mapped reference is used when an OG is
                       not found here
    """

    def __init__(self, mapped_ref=None, tested_ref=None):
        # record id -> [map_completeness, ref_completeness,
        #               non_n_len, map_seq_len, ref_seq_len]
        self.seq_completeness = {}
        self.map_ref_records = \
            self._get_og_dict(mapped_ref) if mapped_ref else None
        self.ref_records = \
            self._get_og_dict(tested_ref) if tested_ref else None

    def get_seq_completeness(self, records):
        """
        Compute the completeness of every record and store it under its id.

        :param records: iterable of mapped sequence records
        """
        for record in records:
            self.seq_completeness[
                record.id] = self._get_single_seq_completeness(record)

    def _get_single_seq_completeness(self, mapped_record, gene_code='dna'):
        """
        Calculate single sequence completeness using the number of dna or aa
        positions that are not N/X divided by either the given length or the
        length of the reference.

        :param mapped_record: sequence record that was produced by mapping
        :param gene_code: 'dna' or 'aa'
        :return: list ``[map_seq_completeness, ref_seq_completeness,
                 non_n_len, map_seq_len, ref_seq_len]``
        :raises ValueError: if gene_code is neither 'dna' nor 'aa'
        :raises KeyError: if the record's OG is not in the mapped reference
        """
        if gene_code not in ('dna', 'aa'):
            raise ValueError(
                "gene_code must be 'dna' or 'aa', got {!r}".format(gene_code))

        og_id = self._get_og_id(mapped_record.id)
        map_ref_seq = str(self.map_ref_records[og_id].seq).upper()
        map_seq = str(mapped_record.seq).upper()
        if self.ref_records and og_id in self.ref_records:
            ref_seq = str(self.ref_records[og_id].seq).upper()
        else:
            ref_seq = map_ref_seq

        ref_seq_len = len(ref_seq)
        if gene_code == 'dna':
            # NOTE(review): the DNA case measures against the length of the
            # mapping reference, not of the mapped record itself - presumably
            # both have the same length; confirm with the mapper output.
            map_seq_len = len(map_ref_seq)
            non_n_len = map_seq_len - map_seq.count('N')
        else:
            map_seq_len = len(map_seq)
            non_n_len = map_seq_len - map_seq.count('X')

        map_seq_completeness = non_n_len / map_seq_len
        ref_seq_completeness = non_n_len / ref_seq_len
        return [map_seq_completeness, ref_seq_completeness,
                non_n_len, map_seq_len, ref_seq_len]

    def _get_og_dict(self, ref_og):
        """
        Index records by OG id (second '_'-separated field of the record id).

        The id of each indexed record is shortened in place to its first two
        fields. Records whose id contains no '_' carry no OG id and are left
        out; previously they were stored under the OG id of the preceding
        record (or failed when they came first).

        :param ref_og: iterable of records
        :return: dict mapping OG id to record
        """
        dna_dict = {}
        for record in ref_og:
            if '_' not in record.id:
                continue
            split_id = record.id.split("_")
            record.id = split_id[0] + "_" + split_id[1]
            dna_dict[split_id[1]] = record
        return dna_dict

    def _get_og_id(self, id):
        """Return the second '_'-separated field of *id* (the OG id)."""
        return id.split("_")[1]

    def _get_gene_id(self, id):
        """Return the first '_'-separated field of *id*."""
        return id.split("_")[0]

    def add_seq_completeness(self, ref, seq_completeness):
        """Store an externally computed completeness entry under *ref*."""
        self.seq_completeness[ref] = seq_completeness

    def write_seq_completeness(self, file_name):
        """
        Write all stored entries as comma separated lines with a header.

        :param file_name: path of the file to (over)write
        """
        lines = ['#species,og,gene_id,map_seq_completeness,'
                 'ref_seq_completeness,inferred_len,given_len,ref_len\n']
        for key, value in self.seq_completeness.items():
            parts = key.split("_")
            # species is taken as the first five characters of the key
            fields = [key[0:5], parts[-1], parts[0]]
            fields.extend(str(value[i]) for i in range(5))
            lines.append(",".join(fields) + "\n")

        with open(file_name, "w") as myfile:
            myfile.write("".join(lines))
class Aligner(object):
    """
    Base class for wrappers of Multiple Sequence Aligner software

    The wrapper is written as a callable class.
    This can hold data (state) to do with the operation it performs, so it can keep results,
    execution times and other metadata, as well as perform the task.

    This is a base implementation to be extended. The important parts are
    __init__ (does the setup) and __call__ (does the work). All
    else are helper methods.

    :Example:

    ::

        callable_wrapper = ConcreteAligner(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result

    """
    # Python 2 style metaclass declaration; under Python 3 this is a plain
    # class attribute, so the abstract methods below are not enforced.
    __metaclass__ = ABCMeta

    def __init__(self, input_, datatype=DataType.UNKNOWN, binary=None):
        """
        Should work the same whether you're working with a Biopython object or a file
        but the implementation differs, e.g. a Biopython object will need
        to be written temporarily to disk for the Aligner to work on it.

        :param input_: can be either a filename or a biopython multiple
            sequence alignment (a collection of :class:`Bio.SeqRecord.SeqRecord`)

        :param datatype: means is it DNA or protein? When left as
            DataType.UNKNOWN the type is guessed from the input.

        :param binary: is the alignment's executable file, or None. If set to
            None, it is assumed to be found in the PATH.

        :raises WrapperError: if setting up the command line interface
            fails with an IOError
        """
        self.input_type = identify_input(input_)  # Figure out what it is - file or object

        if datatype == DataType.UNKNOWN:
            if self.input_type == AlignmentInput.OBJECT:
                # Guess on a duplicate so that a one-shot iterable (e.g. a
                # generator of records) is not exhausted before it is
                # stored. The input must not be touched before this tee.
                dup, input_ = itertools.tee(input_)
                self.datatype = guess_datatype(dup, False)
            else:
                self.datatype = guess_datatype(input_, True)
        else:
            self.datatype = datatype

        self.input = input_  # store it
        self.elapsed_time = None
        self.stdout = None
        self.stderr = None
        try:
            self.cli = self._init_cli(binary)
        except IOError as err:
            raise WrapperError('Error searching for binary: {}'.format(err))
        # End setup

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """
        How to call the underlying aligner
        """
        pass

    @abstractmethod
    def _init_cli(self, binary):
        """
        Create the low-level command line interface object for *binary*.
        """
        pass
class Muscle(Aligner):
    """
    Callable wrapper around the Muscle multiple sequence aligner.

    An instance keeps the input, the options, the raw stdout/stderr of the
    last run, the parsed result and the time the run took.

    :Example:

    ::

        callable_wrapper = Muscle(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        super(Muscle, self).__init__(input_, *args, **kwargs)
        self.options = get_default_options()

        # choose the option preset matching the detected data type
        apply_defaults = (set_default_dna_options
                          if self.datatype == DataType.DNA
                          else set_default_protein_options)
        apply_defaults(self)

    def __call__(self, *args, **kwargs):
        """
        Run Muscle on the stored input and return the parsed alignment.

        A filename input is handed over directly; any other input is first
        written as FASTA into a temporary file. Extra arguments are
        forwarded to ``_call``.
        """
        began = time.time()  # time the execution

        if self.input_type != AlignmentInput.OBJECT:
            stdout_text, stderr_text = self._call(self.input, *args, **kwargs)
        else:
            with tempfile.NamedTemporaryFile(mode="wt") as tmp:
                SeqIO.write(self.input, tmp, 'fasta')
                tmp.seek(0)
                stdout_text, stderr_text = self._call(tmp.name,
                                                      *args, **kwargs)

        self.result = self._read_result(stdout_text)  # store result
        self.stdout = stdout_text
        self.stderr = stderr_text

        self.elapsed_time = time.time() - began
        return self.result

    def _call(self, filename, *args, **kwargs):
        """
        Invoke the low-level CLI as '<options> -in <filename>' and wait.

        :return: tuple of (stdout, stderr) of the finished run
        """
        command_line = '{} -in {}'.format(self.command(), filename)
        self.cli(command_line, wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current options rendered as a string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Parse the FASTA text produced by the run into an alignment.
        """
        return AlignIO.read(StringIO(output), 'fasta')

    def _init_cli(self, binary):
        """Create the Muscle CLI object for the given executable."""
        return MuscleCLI(executable=binary)
class ProbCons(Aligner):
    """
    Callable wrapper around the ProbCons multiple sequence aligner.

    An instance keeps the input, the options, the raw stdout/stderr of the
    last run, the parsed result and the time the run took.

    :Example:

    ::

        callable_wrapper = ProbCons(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        super(ProbCons, self).__init__(input_, *args, **kwargs)
        self.options = get_default_options()
        # choose the option preset matching the detected data type
        apply_defaults = (set_default_dna_options
                          if self.datatype == DataType.DNA
                          else set_default_protein_options)
        apply_defaults(self)

    def __call__(self, *args, **kwargs):
        """
        Run ProbCons on the stored input and return the parsed alignment.

        A filename input is handed over directly; any other input is first
        written as FASTA into a temporary file. Extra arguments are
        forwarded to ``_call``.
        """
        began = time.time()  # time the execution

        if self.input_type != AlignmentInput.OBJECT:
            stdout_text, stderr_text = self._call(self.input, *args, **kwargs)
        else:
            with tempfile.NamedTemporaryFile(mode='wt') as tmp:
                SeqIO.write(self.input, tmp, 'fasta')
                tmp.seek(0)
                stdout_text, stderr_text = self._call(tmp.name,
                                                      *args, **kwargs)

        self.result = self._read_result(stdout_text)  # store result
        self.stdout = stdout_text
        self.stderr = stderr_text

        self.elapsed_time = time.time() - began
        return self.result

    def _call(self, filename, *args, **kwargs):
        """
        Invoke the low-level CLI as '<options> <filename>' and wait.

        :return: tuple of (stdout, stderr) of the finished run
        """
        command_line = '{} {}'.format(self.command(), filename)
        self.cli(command_line, wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the current options rendered as a string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Parse the FASTA text produced by the run into an alignment.
        """
        return AlignIO.read(StringIO(output), 'fasta')

    def _init_cli(self, binary):
        """Create the ProbCons CLI object for the given executable."""
        return ProbConsCLI(executable=binary)
class ProGraphMSA(Aligner):
    """
    Convenient wrapper for the ProGraphMSA multiple sequence aligner.

    The wrapper is written as a callable class, so it can keep state about the
    operation it performs (result, captured stdout/stderr, execution time) as
    well as perform the task.

    The important parts are __init__ (does the setup) and __call__ (does the
    work). All else are helper methods.

    :Example:

    ::

        callable_wrapper = ProGraphMSA(aln)
        result = callable_wrapper()
        time_taken = callable_wrapper.elapsed_time
        result_again = callable_wrapper.result
    """

    def __init__(self, input_, *args, **kwargs):
        """
        :param input_: the sequences to align; forwarded unchanged, together
            with *args and **kwargs, to the Aligner base class.
        """
        # NOTE(review): self.datatype, self.input_type, self.input and self.cli
        # are presumably set by Aligner.__init__ - confirm in base_aligner.py.
        super(ProGraphMSA, self).__init__(input_, *args, **kwargs)
        # Each helper assigns self.options, so no separate initialisation is needed.
        if self.datatype == DataType.DNA:
            set_default_dna_options(self)
        else:
            set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run ProGraphMSA on the stored input and return the parsed alignment.

        In-memory input is first written to a temporary FASTA file; any other
        input is handed to the command line as it is.

        :return: the alignment read back from the program's stdout
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:
            with tempfile.NamedTemporaryFile(mode="wt") as fh:
                SeqIO.write(self.input, fh, 'fasta')
                # Push the buffered records to disk before the external
                # program opens the file by name.
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)
        else:
            output, error = self._call(self.input, *args, **kwargs)

        # Keep the raw output before parsing so it stays available for
        # debugging when the output cannot be read as FASTA.
        self.stdout = output
        self.stderr = error
        self.result = self._read_result(output)  # store result

        end = time.time()
        self.elapsed_time = end - start
        return self.result
    # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call the underlying low level ProGraphMSA command line wrapper.

        *args and **kwargs are accepted but not used; the command is built
        from self.options and the file name only.

        :param filename: path of the sequence file to align
        :return: tuple (stdout, stderr) of the finished process
        """
        self.cli('{} {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the currently configured options rendered as a string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result.

        :param output: the program's stdout, expected to be a FASTA alignment
        :return: the alignment parsed with AlignIO
        """
        fileobj = StringIO(output)
        return AlignIO.read(fileobj, 'fasta')

    def _init_cli(self, binary):
        """Create the command line interface object for the given binary."""
        return ProGraphMSACLI(executable=binary)
written as a callable class. 18 | This can hold data (state) to do with the operation it performs, so it can keep results, 19 | execution times and other metadata, as well as perform the task. 20 | 21 | This is a base implementation to be extended. The important parts are 22 | __init__ (does the setup) and __call__ (does the work). All 23 | else are helper methods. 24 | 25 | :Example: 26 | 27 | :: 28 | 29 | callable_wrapper = ConcreteAligner(aln) 30 | result = callable_wrapper() 31 | time_taken = callable_wrapper.elapsed_time 32 | result_again = callable_wrapper.result 33 | """ 34 | __metaclass__ = ABCMeta 35 | 36 | def __init__(self, reference=None, reads=None, tmp_folder=None, binary=None): 37 | """ 38 | ..note:: TODO: this documentation is not correct. it needs to be updateted. 39 | 40 | Should work the same whether you're working with a Biopython object or a file 41 | but the implementation differs, e.g. a Biopython object will need 42 | to be written temporarily to disk for the Aligner to work on it. 
43 | 44 | alignment is one of 4 things: 45 | a filename 46 | a Biopython MSA 47 | a list of Seq objects 48 | anything else (throw an exception) 49 | 50 | binary is the alignment's executable file, or None 51 | """ 52 | if reference is not None: 53 | self.ref_input_type = identify_reference(reference) # Figure out what it is - file or object 54 | self.ref_input = reference # store it 55 | else: 56 | self.ref_input_type = None 57 | self.ref_input = None 58 | 59 | if reads is not None: 60 | self.read_input_type = identify_reads(reads) # Figure out what it is - file or object 61 | self.read_input = reads # store it 62 | else: 63 | self.read_input_type = None 64 | self.read_input = None 65 | 66 | if tmp_folder is not None: 67 | self.tmp_folder = tmp_folder 68 | else: 69 | self.tmp_folder = "./" # set to current folder 70 | 71 | self.elapsed_time = None 72 | self.stdout = None 73 | self.stderr = None 74 | try: 75 | self.cli = self._init_cli(binary) 76 | except IOError as err: 77 | raise WrapperError('Error searching for binary: {}'.format(err)) 78 | # End setup 79 | 80 | @abstractmethod 81 | def __call__(self, *args, **kwargs): 82 | """ 83 | How to call the underlying aligner 84 | """ 85 | pass 86 | 87 | @abstractmethod 88 | def _init_cli(self, binary): 89 | """ 90 | Set up the command-line interface to the wrapped software 91 | :param binary: filename of executable binary file 92 | :return: concrete CLI type inheriting from AbstractCLI 93 | """ 94 | pass 95 | 96 | def identify_reference(sequence): 97 | """ 98 | Work out if we're dealing with a fasta (return True), a file 99 | (return False), or invalid input (raise error) 100 | 101 | :param alignment: either an Biopython MultipleSequenceAlignment or 102 | a filename pointing to an existing msa file. 
def identify_reads(reads):
    """
    Work out what kind of read input we are dealing with.

    If `reads` is a list, its first element decides the type.

    :param reads: a SeqRecord, a generator, a (nested) list, a string, or a
        list whose first element is one of these.
    :return: ReadInput.OBJECT for SeqRecord / generator / list input,
        ReadInput.FILENAME for a string naming an existing path,
        ReadInput.STRING for any other string.
    :raises ValueError: if `reads` is an empty list or of an unsupported type.
    """
    error_message = '{} is not an sequence object or a valid filename'.format(reads)

    if isinstance(reads, list):
        if not reads:
            # An empty list has no first element to inspect.
            raise ValueError(error_message)
        read = reads[0]
    else:
        read = reads

    if isinstance(read, (SeqRecord, types.GeneratorType, list)):
        # `read` is an in-memory sequence object
        return ReadInput.OBJECT
    if isinstance(read, str):
        # an existing path is treated as a file, anything else as raw text
        return ReadInput.FILENAME if os.path.exists(read) else ReadInput.STRING

    # `read` is some other thing we can't handle
    raise ValueError(error_message)
class NGMParser(object):
    """
    Parser for the summary line NGM prints when it is done, e.g.::

        [MAIN] Done (15778 reads mapped (4.14%), 365184 reads not mapped, 15778 lines written)(elapsed: 73.131973s)
    """

    def __init__(self):
        self.READS_MAPPED = Literal('[MAIN] Done (')
        self.TOTAL_READS = Regex(r'\[MAIN\] Done \(\d+ reads mapped \(\d+\.\d+\%\), ')
        self.MAPPING_TIME = Literal('elapsed: ')
        # Each grammar skips ahead to its marker and keeps the number after it.
        self.rm = Suppress(SkipTo(self.READS_MAPPED)) + Suppress(self.READS_MAPPED) + INT
        self.tr = Suppress(SkipTo(self.TOTAL_READS)) + Suppress(self.TOTAL_READS) + INT
        self.mt = Suppress(SkipTo(self.MAPPING_TIME)) + Suppress(self.MAPPING_TIME) + FLOAT

    def parse(self, stdout):
        """
        Extract the mapping statistics from NGM's output.

        NOTE(review): the number returned as `total_reads` is the one that
        follows "reads mapped (x%), ", which in the example line above is the
        count of reads *not* mapped - confirm how callers interpret it.

        :param stdout: text printed by NGM
        :return: tuple (reads_mapped, total_reads, mapping_time), or None if
            the summary line could not be found.
        """
        try:
            reads_mapped = self.rm.parseString(stdout).asList()[0]
            total_reads = self.tr.parseString(stdout).asList()[0]
            mapping_time = self.mt.parseString(stdout).asList()[0]
        except ParseException as err:
            logger.error('Could not parse NGM output:\n%s', stdout)
            logger.error(err)
            return None
        return reads_mapped, total_reads, mapping_time

    def to_dict(self, file, stdout):
        """
        Bundle the parsed statistics with the opened SAM file.

        :param file: path of the SAM file written by the mapper
        :param stdout: text printed by NGM
        :return: dict with keys 'file', 'reads_mapped', 'total_reads',
            'mapping_time' and 'sam'; the three statistics are None when the
            output could not be parsed.
        """
        parsed = self.parse(stdout)
        if parsed is None:
            # parse() already logged the problem; keep going without statistics
            reads_mapped = total_reads = mapping_time = None
        else:
            reads_mapped, total_reads, mapping_time = parsed
        samfile = pysam.AlignmentFile(file, "r")
        result = {'file': file,
                  'reads_mapped': reads_mapped,
                  'total_reads': total_reads,
                  'mapping_time': mapping_time,
                  'sam': samfile}
        return result


class NGMLRParser(object):
    """
    Parser for the progress and summary lines printed by NGMLR, e.g.::

        Processed: 75400 (0.00), R/S: 60.15, RL: 7675, Time: 3.00 11.00 10.07, Align: 1.00, 310, 3.04
        Done (77 reads mapped (0.10%), 75323 reads not mapped, 75402 lines written)(elapsed: 20m, 0 r/s)
    """

    def __init__(self):
        self.TOTAL_MAPPED_READS = Literal('Done (')
        self.TOTAL_READS = Literal('Processed: ')
        # Each grammar skips ahead to its marker and keeps the number after it.
        self.tmr = Suppress(SkipTo(self.TOTAL_MAPPED_READS)) + \
            Suppress(self.TOTAL_MAPPED_READS) + FLOAT
        self.tr = Suppress(SkipTo(self.TOTAL_READS)) + Suppress(self.TOTAL_READS) + FLOAT

    def parse(self, stdout):
        """
        Extract the mapping statistics from NGMLR's output.

        :param stdout: text printed by NGMLR
        :return: tuple (total_mapped_reads, total_reads); both are None if
            either marker could not be found.
        """
        try:
            total_mapped_reads = self.tmr.parseString(stdout).asList()[0]
            total_reads = self.tr.parseString(stdout).asList()[0]
        except ParseException as err:
            logger.error(err)
            # Returning here avoids touching names that were never assigned.
            return None, None

        return total_mapped_reads, total_reads

    def to_dict(self, file, stdout):
        """
        Bundle the parsed statistics with the opened SAM file.

        :param file: path of the SAM file written by the mapper
        :param stdout: text printed by NGMLR
        :return: dict with keys 'file', 'reads_mapped', 'total_reads' and 'sam'
        """
        total_mapped_reads, total_reads = self.parse(stdout)
        samfile = pysam.AlignmentFile(file, "r")
        result = {'file': file,
                  'reads_mapped': total_mapped_reads,
                  'total_reads': total_reads,
                  'sam': samfile}

        return result
read2tree.utils.seq_utils import is_dna 7 | 8 | from read2tree.wrappers import WrapperError 9 | from read2tree.wrappers.aligners.base_aligner import identify_input 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | 14 | AlignmentInput = Enum('AlignmentInput', 'OBJECT FILENAME') 15 | DataType = Enum('DataType', 'DNA PROTEIN UNKNOWN') 16 | 17 | 18 | class TreeBuilder(object): 19 | """ 20 | Base class for wrappers of tree building software 21 | 22 | The wrapper is written as a callable class. 23 | This can hold data (state) to do with the operation it performs, so it can keep results, 24 | execution times and other metadata, as well as perform the task. 25 | 26 | This is a base implementation to be extended. The important parts are 27 | __init__ (does the setup) and __call__ (does the work). All 28 | else are helper methods. 29 | 30 | :Example: 31 | 32 | :: 33 | 34 | callable_wrapper = ConcreteAligner(aln) 35 | result = callable_wrapper() 36 | time_taken = callable_wrapper.elapsed_time 37 | result_again = callable_wrapper.result 38 | """ 39 | __metaclass__ = ABCMeta 40 | 41 | def __init__(self, alignment=None, datatype=DataType.UNKNOWN, binary=None): 42 | """ 43 | ..note:: TODO: this documentation is not correct. it needs to be updateted. 44 | 45 | Should work the same whether you're working with a Biopython object or a file 46 | but the implementation differs, e.g. a Biopython object will need 47 | to be written temporarily to disk for the Aligner to work on it. 
def guess_datatype(alignment, from_filename=False):
    """
    Guess whether an alignment holds DNA or protein sequences.

    :param alignment: the sequences to inspect, or the path of an alignment
        file when `from_filename` is True.
    :param from_filename: if True, `alignment` is first read from disk, trying
        FASTA and then relaxed PHYLIP.
    :return: DataType.DNA if is_dna() accepts the sequences, else DataType.PROTEIN
    """
    logger.warning("Guessing is not recommended - specify the sequence type with option datatype={DNA, PROTEIN}, be more confident")
    if from_filename:
        try:
            records = list(SeqIO.parse(alignment, 'fasta'))
        except Exception:
            records = []
        if not records:
            # NOTE(review): a PHYLIP file read as FASTA may yield no records
            # instead of raising (depends on the Biopython version - confirm),
            # so an empty result is treated like a failed parse.
            records = list(SeqIO.parse(alignment, 'phylip-relaxed'))
        alignment = records
    return DataType.DNA if is_dna(alignment) else DataType.PROTEIN
class Fasttree(TreeBuilder):
    """
    Callable wrapper that builds a tree with FastTree.

    After a call, `result` holds the parsed output, `stdout` / `stderr` the
    raw program output and `elapsed_time` the wall-clock duration in seconds.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: the alignment to build the tree from; forwarded,
            together with *args and **kwargs, to the TreeBuilder base class.
        """
        self.options = get_default_options()
        # Pass `alignment` positionally: passing it by keyword in front of
        # *args raises "multiple values for argument" whenever a further
        # positional argument is supplied.
        super(Fasttree, self).__init__(alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run FastTree on the stored alignment.

        In-memory input is first written to a temporary relaxed-PHYLIP file;
        file input is passed on by absolute path.

        :return: the "tree" entry of the parsed result, or None if the output
            could not be parsed.
        """
        start = time.time()  # time the execution
        if self.input_type == AlignmentInput.OBJECT:
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
                # Push the buffered records to disk before FastTree opens the file.
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)
        else:
            filename = os.path.abspath(self.input)
            output, error = self._call(filename, *args, **kwargs)

        # Keep the raw output, as the other tree builder wrappers do.
        self.stdout = output
        self.stderr = error
        self.result = self._read_result(output, error)  # store result

        end = time.time()
        self.elapsed_time = end - start
        if self.result is None:
            # _read_result already logged why nothing could be parsed
            return None
        return self.result["tree"]
    # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call the underlying low level FastTree command line wrapper.

        *args and **kwargs are accepted but not used; the command is built
        from self.options and the file name only.

        :param filename: path of the alignment file
        :return: tuple (stdout, stderr) of the finished process
        """
        self.cli('{} {seq_file}'.format(self.command(), seq_file=filename), wait=True)

        return (self.cli.get_stdout(), self.cli.get_stderr())

    def command(self):
        """Return the currently configured options rendered as a string."""
        return str(self.options)

    def _read_result(self, stdout, stderr):
        """
        Read back the result.

        :param stdout: program output passed to the parser as the tree
        :param stderr: program output passed to the parser as `other`
        :return: the parser's result dict, or None if parsing failed
        """
        parser = FasttreeParser()

        try:
            parser.parse(tree=stdout, other=stderr)
            result = parser.to_dict()
        except IOError as ioerr:
            logger.error('Error reading results: %s', ioerr)
            result = None
        except ParseException as parseerr:
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Create the command line interface object for the given binary."""
        return FasttreeCLI(executable=binary)
class Iqtree(TreeBuilder):
    """
    Callable wrapper that builds a tree with IQ-TREE.

    After a call, `result` holds the parsed tree, `stdout` / `stderr` the raw
    program output and `elapsed_time` the wall-clock duration in seconds.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: the alignment to build the tree from; forwarded,
            together with *args and **kwargs, to the TreeBuilder base class.
        """
        self.options = get_default_options()
        # Pass `alignment` positionally: passing it by keyword in front of
        # *args raises "multiple values for argument" whenever a further
        # positional argument is supplied.
        super(Iqtree, self).__init__(alignment, *args, **kwargs)
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run iqtree inside a temporary output folder and read the tree back.

        In-memory input is first written to a relaxed-PHYLIP file inside that
        folder; file input is passed on as given.

        :return: the parsed tree, or None if the output could not be read
        """
        start = time.time()  # time the execution
        # dir=None lets tempfile pick its default location. The context
        # manager removes the folder as soon as the result has been read.
        with tempfile.TemporaryDirectory(prefix='iqtree_', dir=os.environ.get("TMPDIR")) as tmpd:
            if self.input_type is AlignmentInput.OBJECT:
                filename = os.path.join(tmpd, 'tmp_output.phy')
                SeqIO.write(self.input, filename, 'phylip-relaxed')  # default interleaved
            elif self.input_type is AlignmentInput.FILENAME:
                filename = self.input
            else:
                # NOTE(review): without input the command ends in "-s None" - confirm intent.
                filename = None
            output, error = self._call(filename, tmpd, *args, **kwargs)

            # Keep the raw output before parsing so it stays available for debugging.
            self.stdout = output
            self.stderr = error
            self.result = self._read_result(tmpd)  # store result

        end = time.time()
        self.elapsed_time = end - start
        return self.result
    # End call

    # Any other accessory methods
    def _call(self, filename, tmpd, *args, **kwargs):
        """
        Call the underlying low level iqtree command line wrapper.

        *args and **kwargs are accepted but not used.

        :param filename: path of the alignment file
        :param tmpd: folder that receives all output, prefixed 'tmp_output'
        :return: tuple (stdout, stderr) of the finished process
        """
        self.cli('{} -pre {tmp_path} -s {seqfile}'.format(self.command(),
                                                          tmp_path=os.path.join(tmpd, 'tmp_output'),
                                                          seqfile=filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the currently configured options rendered as a string."""
        return str(self.options)

    def _read_result(self, tmpd):
        """
        Read back the result.

        :param tmpd: folder in which iqtree wrote 'tmp_output.treefile'
        :return: the "tree" entry of the parsed result, or None if the tree
            file could not be read or parsed.
        """
        expected_outfiles = [os.path.join(tmpd, 'tmp_output.treefile')]

        parser = IqtreeParser()

        try:
            result = parser.to_dict(*expected_outfiles)
        except IOError as ioerr:
            logger.error('Error reading results: %s', ioerr)
            return None
        except ParseException as parseerr:
            logger.error('Other parse error: %s', parseerr)
            return None

        return result["tree"]

    def _init_cli(self, binary):
        """Create the command line interface object for the given binary."""
        return IqtreeCLI(executable=binary)
class Phyml(TreeBuilder):
    """ Phyml tree reconstruction

    This wrapper can be called to reconstruct a phylogenetic tree
    using PhyML.
    """

    def __init__(self, alignment, *args, **kwargs):
        """
        :param alignment: input multiple sequence alignment. This can be either
            a filename or an biopython SeqRecord collection.
        """
        super(Phyml, self).__init__(alignment, *args, **kwargs)
        self.options = get_default_options()
        # The base class sets `datatype` only when an alignment was given.
        if self.input is not None:
            if self.datatype == DataType.DNA:
                set_default_dna_options(self)
            else:
                set_default_protein_options(self)

    def __call__(self, *args, **kwargs):
        """
        Run PhyML on the stored alignment and read its output files back.

        In-memory input is first written to a temporary relaxed-PHYLIP file.
        For file input the process changes into the file's folder for the
        duration of the run and then returns to the previous one.

        :return: the "tree" entry of the parsed result, or None if the output
            files could not be read or parsed.
        """
        start = time.time()  # time the execution

        if self.input_type == AlignmentInput.OBJECT:
            # NOTE(review): PhyML's output files next to the temporary input
            # are not removed here - confirm whether they should be.
            with tempfile.NamedTemporaryFile(mode='wt') as fh:
                SeqIO.write(self.input, fh, 'phylip-relaxed')  # default interleaved
                # Push the buffered records to disk before PhyML opens the file.
                fh.flush()
                output, error = self._call(fh.name, *args, **kwargs)
                self.result = self._read_result(fh.name)  # store result
        else:
            # abspath guarantees a non-empty folder even for a bare file name
            path, filename = os.path.split(os.path.abspath(self.input))
            previous_cwd = os.getcwd()
            os.chdir(path)  # phyml can not deal with the long filenames a long path produces
            try:
                output, error = self._call(filename, *args, **kwargs)
                self.result = self._read_result(filename)  # store result
            finally:
                # do not leave the whole process in the alignment's folder
                os.chdir(previous_cwd)

        self.stdout = output
        self.stderr = error

        end = time.time()
        self.elapsed_time = end - start
        if self.result is None:
            # _read_result already logged why nothing could be parsed
            return None
        return self.result["tree"]
    # End call

    # Any other accessory methods
    def _call(self, filename, *args, **kwargs):
        """
        Call the underlying low level PhyML command line wrapper.

        *args and **kwargs are accepted but not used; the command is built
        from self.options and the file name only.

        :param filename: path of the alignment file, passed with -i
        :return: tuple (stdout, stderr) of the finished process
        """
        self.cli('{} -i {}'.format(self.command(), filename),
                 wait=True)
        return self.cli.get_stdout(), self.cli.get_stderr()

    def command(self):
        """Return the currently configured options rendered as a string."""
        return str(self.options)

    def _read_result(self, output):
        """
        Read back the result.

        :param output: path of the input file; the output file names are
            derived from it.
        :return: the parser's result dict, or None if reading or parsing failed
        """

        #TODO: change the output dictionary into a better format
        expected_outfiles = ['{}_phyml_stats'.format(output), '{}_phyml_tree'.format(output)]
        parser = PhymlParser()

        # Phyml outputs two outfiles, a stats file and a tree file.
        # Sometimes it appends .txt, sometimes not. Seems to be platform-specific.
        # Here we assume they are without .txt, but if we can't find them, try
        # looking for the .txt ones instead
        try:
            # Check if these are the .txt style outfiles
            if not os.path.exists(expected_outfiles[0]):
                expected_outfiles = [x + '.txt' for x in expected_outfiles]
            result = parser.to_dict(*expected_outfiles)

        except IOError as ioerr:
            logger.error('Error reading results: %s', ioerr)
            result = None
        except ParseException as parseerr:
            logger.error('Other parse error: %s', parseerr)
            result = None

        return result

    def _init_cli(self, binary):
        """Create the command line interface object for the given binary."""
        return PhymlCLI(executable=binary)
6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | name = 'read2tree' 4 | 5 | __version__ = None 6 | with open('{:s}/__init__.py'.format(name), 'rt') as fp: 7 | for line in fp: 8 | if line.startswith('__version__'): 9 | exec(line.rstrip()) 10 | 11 | # conda install -c conda-forge biopython numpy Cython ete3 lxml tqdm scipy pyparsing requests natsort pyyaml 12 | # conda install -c bioconda dendropy 13 | requirements = ["numpy", "biopython", "ete3", "lxml", "tqdm", "scipy", 14 | "pyparsing", "requests", "natsort", "pyyaml", "dendropy", 15 | "pysam", "pyham", "filelock"] 16 | 17 | with open("README.md", "r", encoding="utf-8") as fh: 18 | long_description = fh.read() 19 | 20 | setup( 21 | name=name, 22 | version=__version__, 23 | author='David Dylus and Fritz Sedlaczek', 24 | author_email='daviddylus@gmail.com', 25 | description='read2tree allows to build high quality phylogenetic trees ' 26 | 'using reads and a reference set of orthologous groups ' 27 | '(DNA + Protein).', 28 | long_description=long_description, 29 | long_description_content_type="text/markdown", 30 | url="https://github.com/dessimozlab/read2tree", 31 | packages=find_packages(".", exclude=["archive"]), 32 | include_package_data=True, 33 | package_data={ 34 | 'read2tree': ['logging/log.yaml'] 35 | }, 36 | install_requires=requirements, 37 | classifiers=[ 38 | "Programming Language :: Python :: 3", 39 | "Environment :: Console", 40 | "License :: OSI Approved :: MIT License", 41 | ], 42 | scripts=['bin/read2tree'], 43 | python_requires=">=3.5", 44 | ) 45 | -------------------------------------------------------------------------------- /src/pip-delete-this-directory.txt: -------------------------------------------------------------------------------- 1 | This file is placed here by pip to indicate the source was put 2 | here 
by pip. 3 | 4 | Once this package is successfully installed this source code will be 5 | deleted (unless you remove this file). 6 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1001241.fa: -------------------------------------------------------------------------------- 1 | >MNELE00784 | OMA1001241 | ML06054a | [Mnemiopsis leidyi] 2 | MFRNPKIIYSWPYGQHFCKYLRRNASFGEVHPLFESLIAGNRAALARAITLSESTLERHKQESAHLMSSVLKHNLQNRSL 3 | RIGISGPPGAGKSTFIEAIGLHITELNNKLAVLAVDPSSTRSGGSLLADKTRMQQLSVEKLAYIRPSPNRGHLGGVARAT 4 | NAAIQLCEAGGYNVIIVETVGAGQSEIAVANMTDIFVLLVPPGSGDELQGIKKGIVEVADMILVTKADGNLKTAARLVKT 5 | EYSRALRLLRNHDDTSWKPFVQTVSSISGKGISDAWSDMLEFHQEMISTGKYQDRRKKQRVTWLWDHVQDELLEHLRKDT 6 | LNAKFQEKLEADVRNGVILPSTAAQKLLNLFLKGKDNIS 7 | 8 | >HUMAN77595 | OMA1001241 | Q495G5 | [Homo sapiens] 9 | MPMLLPHPHQHFLKGLLRAPFRCYHFIFHSSTHLGSGIPCAQPFNSLGLHCTKWMLLSDGLKRKLCVQTTLKDHTEGLSD 10 | KEQRFVDKLYTGLIQGQRACLAEAITLVESTHSRKKELAQVLLQKVLLYHREQEQSNKGKPLAFRVGLSGPPGAGKSTFI 11 | EYFGKMLTERGHKLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGAGYDIIL 12 | IETVGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVAVTKSDGDLIVPARRIQAEYVSALKLLRKRSQVW 13 | KPKVIRISARSGEGISEMWDKMKDFQDLMLASGELTAKRRKQQKVWMWNLIQESVLEHFRTHPTVREQIPLLEQKVLIGA 14 | LSPGLAADFLLKALKAETNKIHPV 15 | 16 | >RATNO14529 | OMA1001241 | D3ZNY3 | [Rattus norvegicus] 17 | MTIPTLLLSPYRRLLTRLSRVPSPQLLHSSLPTLHPRDALPNSFGHHCSKRVLLSDGFRRTLCIRATLKDHTEGLSDKEQ 18 | RFVDRLYMGLVQGQRACLAEAITLVESTHTRKKELAQVLLQRVLAHQRERELQNHGKPFTFRVGLSGPPGAGKSTFIECF 19 | GKMLTERGHRLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTSGTLGGVTRTTNEAIVLCEGGGYDIILIET 20 | VGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVVITKSDGDLVVPARRIQAEYVSALKLLRRRSEVWRPK 21 | VIRISARSGEGITEMWDIMREFQHRMLASGELAAKRQTQHKVWMWNLIQENVLEHFKTHPSIREQIPLMEREVLSGALSP 22 | GRAADLLLKAFKSRH 23 | 24 | >GORGO31007 | OMA1001241 | G3QJC8 | [Gorilla gorilla gorilla] 25 | MPMLLPHPHQHFLKGLLRAPFRCYHFIFHSSTHLGSGIPCAQPFNSLGLHCTKWMLLSDGLKRKLCVQTTLKDHTEGLSD 26 | 
KEQRFVDKLYTGLIQGQRACLAEAITLVESTHSRKKELAQVLLQKVLLYHREQEQSNKGKPLAFRVGLSGPPGAGKSTFI 27 | EYFGKMLTERGHKLSVLAVDPSSCTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGAGYDIIL 28 | IETVGVGQSEFAVADMVDMFVLLLPPAGGDELQGIKRGIIEMADLVAVTKSDGDLIVPARRIQAEYVSALKLLRKRSQVW 29 | KPKVIRISARSGEGISEMWDKMKDFQDLMLASGELTAKRRKQQKVWMWNLIQESVLEHFRTHPTVREQIPLLEQKVLIGA 30 | LSPGLAADFLLKTNKIHPV 31 | 32 | >XENLA00784 | OMA1001241 | XELAEV_18005522mg | [Xenopus laevis] 33 | MQGITLCCIKTIAHPVSRYFTRNIVSLVKPAQSLGTVSESCKRKTDSFIKLFRTRLCISAVTHQDADILTEKEKRLLNNL 34 | YTGLIRGQRACLAEAITLVESTHSRKREMAQVLLHMVLSHHREQEKLNSGKPLAFRVGLSGPPGAGKSTFIEIFGKMLTE 35 | EGHKVAVLAVDPSSSTSGGSLLGDKTRMTELSRDMNAYIRPSPTRGTLGGVTRTTNEAILLCEGSGYNIILVETVGVGQS 36 | EFAVADMVDMFVLLLPPAGGDELQVMRISARTGEGIQELWNKLLEFQSNMLTSGELIGKRRSQQRVWMWNLIQENVLLYF 37 | RNHPAVKDQIPVLEERVRTGTLSPGLAADMLLKAFSKSS 38 | 39 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1008242.fa: -------------------------------------------------------------------------------- 1 | >MNELE00922 | OMA1008242 | ML11532a | [Mnemiopsis leidyi] 2 | MTDFDKLPSFKALEILAEKAKSVQLKDLFANDPNRFSKYSQAIEIDELKLLVDFSKNKIDEDIFGELLKLVKDAQVEEMR 3 | DKMFKGEPINFTEQRAVLHIALRNRSNNPILVDGQDVTPKVNQVLEKMKIFADNLRNGTWKGVTGKAITDVVNIGIGGSD 4 | LGPLMVTEALKSYRGNGPDVHFVSNIDGTHIATVLEKVNFESTLFIIASKTFGTLETLTNARTAREWFIKKSGDPSGVAK 5 | HFIALSTNAKLVSEFGIDTANMFEFWDWVGGRYSLWSAIGMSIMCHIGSDNFIKLLEGAHAMDNHFTSAPVEKNIPIILA 6 | VLGVWYNNFLGAQTHALLPYDQYMHRFAAYFQQGDMESNGKGVSREGTRVKYSTGPIVWGEPGTNGQHAFYQLIHQGTKL 7 | IPCDFIMPVQSLNPIGDHHEILTANFLAQTAALMTGRGNEEARKELSSMSAEDQDRLSIYKEFTGDRPTNSILFTKLTPA 8 | MLGALIVMYEHKIFVQGVLWNINSFDQMGVELGKKLALKIQPLLKDDNNVDSEDSSTNGLINFIKANRK 9 | 10 | >HUMAN42527 | OMA1008242 | G6PI_HUMAN | [Homo sapiens] 11 | MAALTRDPQFQKLQQWYREHRSELNLRRLFDANKDRFNHFSLTLNTNHGHILVDYSKNLVTEDVMRMLVDLAKSRGVEAA 12 | RERMFNGEKINYTEGRAVLHVALRNRSNTPILVDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYTGKTITDVINIGIGGS 13 | 
DLGPLMVTEALKPYSSGGPRVWYVSNIDGTHIAKTLAQLNPESSLFIIASKTFTTQETITNAETAKEWFLQAAKDPSAVA 14 | KHFVALSTNTTKVKEFGIDPQNMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEQLLSGAHWMDQHFRTTPLEKNAPVLL 15 | ALLGIWYINCFGCETHAMLPYDQYLHRFAAYFQQGDMESNGKYITKSGTRVDHQTGPIVWGEPGTNGQHAFYQLIHQGTK 16 | MIPCDFLIPVQTQHPIRKGLHHKILLANFLAQTEALMRGKSTEEARKELQAAGKSPEDLERLLPHKVFEGNRPTNSIVFT 17 | KLTPFMLGALVAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSAQVTSHDASTNGLINFIKQQREARVQ 18 | 19 | >RATNO16818 | OMA1008242 | G6PI_RAT | [Rattus norvegicus] 20 | MAALTRNPEFQKLLEWHRANSANLKLRELFEADPERFNHFSLNLNTNHGHILLDYSKNLVNKEVLHMLVDLAKSRGVEAA 21 | RDNMFSGLKINSTEDRAVLHVALRNRSNRSIMMDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYTGKAITDIINIGIGGS 22 | DLGPLMVTEALKPYSKGGPRVWFVSNIDGTHIAKTLANLNPESSLFIIASKTFTTQETITNAETAKEWFLQAAKDPSAVA 23 | KHFVALSTNTDKVKEFGIDPKNMFEFWDWVGGRYSLWSAIGLSIALHVGFDHFEQLLSGAHWMDQHFMKTPLDKNAPVLL 24 | ALLGIWYINFYGCETHAMLPYDQYMHRFAAYFQQGDMESNGKYITKSGARVDYQTGPIVWGEPGTNGQHAFYQLIHQGTK 25 | MIPCDFLIPVQTQHPIRNGLHHKILLANFLAQTEALMKGKSPEEARKELQAAGKSPEELEKLLPHKVFEGNRPTNSIVFT 26 | KLTPFILGALIAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSSAVTSHDSSTNGLIGFIKLQRDTKID 27 | 28 | >GORGO15800 | OMA1008242 | A0A2I2YE48 | [Gorilla gorilla gorilla] 29 | TSGQRPAKRRRKSPAMAALTRDPQFQKLQQWYREHGSELNLRRLFDANKDRFNHFSLTLNTNHGHILVDYSKNLVTEDVM 30 | RMLVDLAKSRGVEAARERMFNGEKINYTEGRAVLHVALRNRSNTPILVDGKDVMPEVNKVLDKMKSFCQRVRSGDWKGYT 31 | GKTITDVINIGIGGSDLGPLMVTEALKPYSSGGPRVWYVSNIDGTHIAKTLAQLNPESSLFIIASKTFTTQETITNAETA 32 | KEWFLQAAKDPSAVAKHFVALSTNTTKVKEFGIDPQNMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEQLLSGAHWMDQ 33 | HFRTTPLEKNAPVLLALLGIWYINCFGCETHAMLPYDQYLHRFAAYFQQGDMESNGKYITKSGTRVDHQTGPIVWGEPGT 34 | NGQHAFYQLIHQGTKMIPCDFLIPVQTQHPIRKGLHHKILLANFLAQTEALMRGKSTEEARKELQAAGKSPEDLERLLPH 35 | KVFEGNRPTNSIVFTKLTPFMLGALVAMYEHKIFVQGIIWDINSFDQWGVELGKQLAKKIEPELDGSAQVTSHDASTNGL 36 | INFIKQQREARVQ 37 | 38 | >XENLA17790 | OMA1008242 | A0A1L8GL32 | [Xenopus laevis] 39 | MALSCDPVYQKLSQWYEAHHGSLNMRQMFEADKDRFSKFSKKLATDDGDILLDYSKNLVNEEVLKLLIELAHSRGVESAR 40 | 
QKMFSAEKINFTENRAVLHIALRNRSNTPITLEGKDVMPEVNAVLEKMKAFCQKVRSGDWKGYTGKAITDVINIGIGGSD 41 | LGPLMVTESLKPYSKGGPRVWFVSNIDGTHIAKTLAELNPETSLFIIASKTFTTQETITNAETAKEWFLTSAKDASAVAK 42 | HFVALSTNAPKVKDFGIDTANMFEFWDWVGGRYSLWSAIGLSIALHVGFDNFEKLLAGAHWMDNHFNKTPLENNVPVLLA 43 | MLGIWYTNFYGCETHALLPYDQYMHRFAAYFQQGDMESNGKYITKTGARVNYSTGPVVWGEPGTNGQHAFYQLIHQGTRK 44 | IPCDFLIPAQTQHPIRNGLHHKILLSNFLAQTEALMKGKSTEEAKKELQASGLTGDALEKLLPHKVFEGNRPTNSIVFTK 45 | LNPFILGALIAMYEHKIFVQGVVWDINSYDQWGVELGKQLAKKIEPELESDATITSHDSSTNGLIDFIKKHRG 46 | 47 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1065415.fa: -------------------------------------------------------------------------------- 1 | >MNELE00913 | OMA1065415 | ML14561a | [Mnemiopsis leidyi] 2 | MLRDPETVHPLDECKTWPEIRDKLRLWRKENVRCSDQIVELGEYALKHYQTNLGREKWAVFEQVCVAALDLCPAKMKLVN 3 | TCIKELAEQFPSSLRVSMLEGLKYEYLKKWDDALEMYEDMIEYEPTFPAPYKRKVAILKAQNKISDAVNDLNRYLNTFSC 4 | DHESWLELSDIYISNQNYKQALFCVEELLLQYPHNHLYHQRYADILFTIGGKDNLELSCKYYCKAAELNPGNVRALFGIQ 5 | LASSTLSSIGKLSSKAKSDNQSLAAWASDMIEDFYKSQKTSKNLIIEVAGVLDKLSLK 6 | 7 | >HUMAN95181 | OMA1065415 | EMC2_HUMAN | [Homo sapiens] 8 | MAKVSELYDVTWEEMRDKMRKWREENSRNSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELR 9 | RQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWHE 10 | LAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHIA 11 | SNPKASAKTKKDNMKYASWAASQINRAYQFAGRSKKETKYSLKAVEDMLETLQITQS 12 | 13 | >RATNO39215 | OMA1065415 | EMC2_RAT | [Rattus norvegicus] 14 | MAKVTERYDVTWEEMRDKMRKWREENSRNSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELR 15 | RQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWHE 16 | LAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHIA 17 | SNPKASAKMKKDNIRYAGWAANQINRAYQFAGRSKKETKSSLKAVEDMLETLQITQS 18 | 19 | >GORGO40150 | OMA1065415 | G3S2S4 | [Gorilla gorilla gorilla] 20 | 
MKYSSSHTLYCLKEEMRDKMRKWREENSRNSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQEL 21 | RRQFPGSHRVKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYLEQFVGDQEAWH 22 | ELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENLELSRKYFAQALKLNNRNMRALFGLYMSASHI 23 | ASNPKASAKTKKDNMKYASWAASQINRAYQFAGRSKKETKYSLKAVEDMLETLQITQS 24 | 25 | >XENLA27199 | OMA1065415 | EMC2A_XENLA | [Xenopus laevis] 26 | MSKVSDLFDVTWEDMRDKMKTWREENYRNSEHVIEVGEELINEHASKLGDDIWIIYEQVMIAALDCGRDDIAMSCLQELR 27 | RQFPGSHRVKRLTGLRFEAMERYDDALQIYDRILQDDPTNTAARKRKIAIRKAQGRNSEAIRELNEYLEQFVGDQEAWHE 28 | LAELYINELDYAKAAFCLEELILTNPHNHFYYQQFAEVKYTQGGLENLELSRKYFSQALKLNNHNMRALFGLYISSVHIA 29 | SNPKASAKMKKDNVKYATWAASQIKKAYQLAGRTMTDTQTSLKAVEDMLETLQITQS 30 | 31 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1121053.fa: -------------------------------------------------------------------------------- 1 | >MNELE00419 | OMA1121053 | ML26071a | [Mnemiopsis leidyi] 2 | MEKAVLLAALLIATAGAASVQVSDPAKCTLCQAVVTELKVVMEDKDTKDFLAVLQTFICENVPIEDCNNWVSGELAQLDS 3 | LVEGLDPNQACSSLALCAVHTSPLLSSIQCDFCEFLGDEVVKRVLTNATIDEVVTAAETICSELPFGSNECNALVKEYGH 4 | YYLELLVGSIDVAQLCSEVGLCSEQVREMVLNSRLFQILQRGLKDDEGCKACVDGMDVIKEVLSSKDTLDLLHIAVHEIC 5 | GLVSVTGCELIADTALDQIIEKLLPMFVPETVCQQIGACPALTAQDVFSPATVGDDSPLCTGCHDLLGEVKKVANDPETK 6 | QINKDLAPVLCEVLSIPFCQSLISKFLEGALEKAQNLDVDETCVSLKACEAADEVVENWKDTCSECAMIADLILKELQDP 7 | SVQQEIESVVDELCSVLPISDCKETLHSYLVMIESLIAGMNGKTLCGYIGLCSSKMSPMKKATGVTEITKLDFTPSKVGD 8 | TCSECTMIAGEVISLLENGTIDSLIKEAISELCTVLPISDCEATIDGYFDEIVALLKNLDGKTLCSLVGLC 9 | 10 | >HUMAN01568 | OMA1121053 | ENSG00000197746.14 | [Homo sapiens] 11 | MYALFLLASLLGAALAGPVLGLKECTRGSAVWCQNVKTASDCGAVKHCLQTVWNKPTVKSLPCDICKDVVTAAGDMLKDN 12 | ATEEEILVYLEKTCDWLPKPNMSASCKEIVDSYLPVILDIIKGEMSRPGEVCSALNLCESLQKHLAELNHQKQLESNKIP 13 | ELDMTEVVAPFMANIPLLLYPQDGPRSKPQPKDNGDVCQDCIQMVTDIQTAVRTNSTFVQALVEHVKEECDRLGPGMADI 14 | CKNYISQYSEIAIQMMMHMSLQQPKEICALVGFCDEVKEMPMQTLVPAKVASKNVIPALELVEPIKKHEVPAKSDVYCEV 15 | 
CEFLVKEVTKLIDNNKTEKEILDAFDKMCSKLPKSLSEECQEVVDTYGSSILSILLEEVSPELVCSMLHLCSGTRLPALT 16 | VHVTQPKDGGFCEVCKKLVGYLDRNLEKNSTKQEILAALEKGCSFLPDPYQKQCDQFVAEYEPVLIEILVEVMDPSFVCL 17 | KIGACPSAHKPLLGTEKCIWGPSYWCQNTETAAQCNAVEHCKRHVWN 18 | 19 | >RATNO22029 | OMA1121053 | A0A8I6ASQ4 | [Rattus norvegicus] 20 | MYALALLASLLVTALTSPVQDPKICSGGSAVVCRDVKTAVDCRAVKHCQQMVWSKPTAKSLPCDICKTVVTEAGNLLKDN 21 | ATEEEILHYLEKTCAWIHDSSLSASCKEVVDSYLPVILDMIKGEMSNPGEVCSALNLCQSLQEYLAEQNQRQLESNKIPE 22 | VDLARVVAPFMSNIPLLLYPQDRPRSQPQPKANEDVCQDCMKLVTDIQTAVRTNSSFVQGLVDHVKEDCDRLGPGVSDIC 23 | KNYVDQYSEVAVQMMMHMQDQQPKEICVMVGFCDEVKRVPMRTLVPATEAIKNILPALELTDPYEDVIQAQNVIFCQVCQ 24 | LVMRKLSELIINNATEELLIKGLSKACSLLPAPASTKCQEVLVTFGPSLLDVLMHEVNPNFLCGVISLCSANPNLVGTLE 25 | QPAAAIVSALPKEPAPPKQPEEPKQSALRAHVPPQKNGGFCEVCKKLVIYLEHNLEKNSTKEEILAALEKGCSFLPDPYQ 26 | KQCDEFVAEYEPLLLEILVEVMDPSFVCSKIGVCPSAYKLLLGTEKCVWGPGYWCQNMETAARCNAVDHCKRHVWN 27 | 28 | >GORGO01692 | OMA1121053 | G3S0G3 | [Gorilla gorilla gorilla] 29 | MYALFLLASLLGAALAGPVLGLKECTRGSAVWCQNVKTASDCGAVKHCLQTVWNKPTVKSLPCDICKDVVTAAGDMLKDN 30 | ATEEEILVYLEKTCDWLPKPNMSASCKEIVDSYLPVILDIIKGEMSRPGEVCSALNLCESLQKHLAELNHQKQLESNKIP 31 | ELDMTEVVAPFMANIPLLLYPQDGPRSKPQPKDNGDVCQDCIQMVTDIQTAVRTNSTFVQALVEHVKEECDRLGPGMADI 32 | CKNYISQYSEIAIQMMMHMQPKEICALVGFCDEVKEMPMQTLVPAKVASKNVIPALELVEPIKKHEVPAKSDVYCEVCEF 33 | LVKEVTKLIDNNKTEKEILDAFDKMCSKLPKSLSEECQEVVDTYGSSILSILLEEVSPELVCSMLHLCSGTRLPALTVHV 34 | TQPKDGGFCEVCKKLVGYLDRNLEKNSTKQEILAALEKGCSFLPDPYQKQCDQFVAEYEPVLIEILVEVMDPSFVCLKIG 35 | ACPSAHKPLLGTEKCVWGPSYWCQNTETAAQCNAVEHCKRHVWN 36 | 37 | >XENLA29771 | OMA1121053 | XELAEV_18034910mg | [Xenopus laevis] 38 | MKKFAVLVCALAVVAATPLFGTEQCAKGPEVWCENVRTASQCGAVKHCQQNVWNKPTVKSMPCDFCKEVVTVLGNYLKDN 39 | ITQDEIKQYLNKVCDFIPDPGLASTCKQEVSDYFTIVLNLLEQELSNPGVLCSSLGLCTSLQRHLASLKQPTQLLTNEIP 40 | DVDAAKLVYPYIVNIPQLLYPQEKTLKEPKTGDICNDCTKLVSDVQDALRSNSSFSKKLVDHFLQECNLLDPAIAEMCKS 41 | YINQYSDIAIQVLLQMQPKQLCGMAGFCDQEKSTPLQNIIPAKSLIPAVKVQPAVKITKNPLPGNNVLCEVCELMVSQLE 42 | 
KLLDNNRTRENIKHGLEKVCKLLPSQYTQKCEDMIEEYSDALIELLEQEANPQAICTALGYCSGSKNLKIVKISAEKAAA 43 | GDYCAVCKMLMRYVDELLEKNATEIRIKAFLGRICNFLPDSMQNECSALVNEYEPLFIQLLLEALDPSFICIKVNLCQNK 44 | KVLLGTEKCMWGPSYWCKDMETAANCNALEHCRRHVWN 45 | 46 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1125645.fa: -------------------------------------------------------------------------------- 1 | >MNELE00647 | OMA1125645 | ML13582a | [Mnemiopsis leidyi] 2 | MVFYFKTVVNGREYMIYMGRDKMENEDLLRWGFPEDVWFHVDKLSSAHVYLRLNKGEGVADIPKELVDQCCQLVKANSIQ 3 | GCKLANVDIVYTPYPNLKKTGDMVAGQVGFHKNKKVVKVNVEKNNEVWKKLEKTREEREVDLQEERNRREREEQEELKKQ 4 | KKLQREMEKLQIEKEKKEREMKSYKLMNADPEKCTSNQFDNESDAERELVDDFM 5 | 6 | >HUMAN93157 | OMA1125645 | CCD25_HUMAN | [Homo sapiens] 7 | MVFYFTSSSVNSSAYTIYMGKDKYENEDLIKHGWPEDIWFHVDKLSSAHVYLRLHKGENIEDIPKEVLMDCAHLVKANSI 8 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKVERFPDLAAEKECRDREERNEKK 9 | AQIQEMKKREKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM 10 | 11 | >RATNO10219 | OMA1125645 | D4AAU6 | [Rattus norvegicus] 12 | MVFYFTSSSVNSSTYTIYMGKDKYENEDLIKYGWPEDVWFHVDKLSSAHVYLRLQKGEKIEDIPKEVLMDCAHLVKANSI 13 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKLEKFPDLAAEKEGRDREERNEKK 14 | AQIQEMKRKEKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM 15 | 16 | >GORGO40924 | OMA1125645 | G3R2K5 | [Gorilla gorilla gorilla] 17 | MVFYFTSSSVNSSAYTIYMGKDKYENEDLIKHGWPEDIWFHVDKLSSAHVYLRLHKGENIEDIPKEVLMDCAHLVKANSI 18 | QGCKMNNVNVVYTPWSNLKKTADMDVGQIGFHRQKDVKIVTVEKKVNEILNRLEKTKVERFPDLAAEKECRDREERNEKK 19 | AQIQEMKKREKEEMKKKREMDELRSYSSLMKVENMSSNQDGNDSDEFM 20 | 21 | >XENLA23340 | OMA1125645 | Q7T0Y7 | [Xenopus laevis] 22 | MVFYFTSDVISPAYTIYMGKDKYENEDLIKYGWPEDIWFHVDKLSSAHVYLRLQKGQTIEDIPKEVLLDCVQLVKANSIQ 23 | GCKMNNLNVVYTPWANLKKTADMDVGQIGFYRQKDVKTMSVEKVNKIVNRLEKTKDERFPDLAAEKEARDREERNEKKAQ 24 | IQEIKKKEKDEMKKKKEMEELRSYSSLMKSENMSSNQDGNDSDDFM 25 | 26 | -------------------------------------------------------------------------------- 
/tests/marker_genes/OMAGroup_1133018.fa: -------------------------------------------------------------------------------- 1 | >MNELE00906 | OMA1133018 | ML18772a | [Mnemiopsis leidyi] 2 | MTITSERRDQVLLGPLPPSFLRLETRTTDGETVTTVTADPAVGRQVELPPQGAGAPNNGAGAPNNGGGAPNNGGQPRPPP 3 | QHYTAHPVQPYVPQAGLSITIAQAVLNKSYTLIGSMDPYVRLKVGHNTYETFTHAGADKTPCWNKVYHCPLPNTHSVRTV 4 | SVEIFDEKALTDDQRIAYAKISVPQSVFEGHTLDEWFPLSGKLGEAKEGSINLIISYTTMPMITLPYQRIHYPYVGAAPQ 5 | HFTPRPPPQISEADVTSIKDMFPAVDKEVIRTVLESKHGNVESAVAAILQIMEGEQGGQQ 6 | 7 | >HUMAN03808 | OMA1133018 | TOLIP_HUMAN | [Homo sapiens] 8 | MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYGGAVGTVGRLNITVVQAKLAKNYGMTRMDPYCRLR 9 | LGYAVYETPTAHNGAKNPRWNKVIHCTVPPGVDSFYLEIFDERAFSMDDRIAWTHITIPESLRQGKVEDKWYSLSGRQGD 10 | DKEGMINLVMSYALLPAAMVMPPQPVVLMPTVYQQGVGYVPITGMPAVCSPGMVPVALPPAAVNAQPRCSEEDLKAIQDM 11 | FPNMDQEVIRSVLEAQRGNKDAAINSLLQMGEEP 12 | 13 | >RATNO18182 | OMA1133018 | TOLIP_RAT | [Rattus norvegicus] 14 | MATTVSTQRGPVYIGELPQDFLRITPTQQQQQIQLDAQAAQQLQYGGAVGTVGRLSITVVQAKLAKNYGMTRMDPYCRLR 15 | LGYAVYETPTAHNGAKNPRWNKVIQCTVPPGVDSFYLEIFDERAFSMDDRIAWTHITIPESLKQGQVEDEWYSLSGRQGD 16 | DKEGMINLVMSYTSLPAAMMMPPQPVVLMPTVYQQGVGYVPIAGMPAVCSPGMVPMAMPPPAVAPQPRCNEEDLKAIQDM 17 | FPNMDREVIRSVLEAQRGNKDAAINSLLQMGEES 18 | 19 | >GORGO04229 | OMA1133018 | G3R9G9 | [Gorilla gorilla gorilla] 20 | MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYGGAVGTVGRLNITVVQAKLAKNYGMTRMDPYCRLR 21 | LGYAVYETPTAHNGAKNPRWNKVIHCTVPPGVDSFYLEIFDELLPAAMVMPPQPVVLMPTVYQQGVGYVPITGMPAVCSP 22 | GMVPVALPPAAVNAQPRCSEEDLKAIQDMFPNMDQEVIRSVLEAQRGNKDAAINSLLQMGEEP 23 | 24 | >XENLA19417 | OMA1133018 | TOIPA_XENLA | [Xenopus laevis] 25 | MATSISTQRGQVFIGELPQDFLRIAPTQQQQQIQLDAQAAQQLQYSGVMGTMGRLSITVVQAKLAKNYGMTRMDPYCRIR 26 | LGYAVYETPTAHNGAKNPRWNKVIQCTIPPGVDSFYLEIFDERAFSMDDRIAWTHITIPETLKEGKHVDEWFSLSGRQGD 27 | DKEGMINLVMSYTSVPAMMPAQPVVLMPTVYQQGVGYVPIAGPVYNPGMPMIASPPAVNPQHQTQEVDIQSIKDMFPTID 28 | PEVIRSVLEAQGGNRDAAINSLLQMVEDS 29 | 30 | -------------------------------------------------------------------------------- 
/tests/marker_genes/OMAGroup_1151179.fa: -------------------------------------------------------------------------------- 1 | >MNELE00706 | OMA1151179 | ML10761a | [Mnemiopsis leidyi] 2 | MALSRFYIPCHALVKLAPQTRTAVTSVVLERLEQKKKEALLGGGQHRIDAQHKKGKLTARERIEVLLDEGSFVEWDQLVE 3 | HDCIDWGMDKTHFAGDGVVTGTGTVNGRQVFLFSQDFTVFGGSLSAAYASKICKIMDHAEMVGAPLLGLNDSGGARIQEG 4 | VASLGGYGDIFLRNVLLSGVVPQISLIMGPCAGGAVYSPAITDFTFMVKGTSHMFITGPDVVKQVTNETVTQEELGGSAA 5 | HCSTSGCAAGACENDVHLLLQTRRLLEFLPSNNQEKSPVRPCSDPAEREIPALDNIVPNSPISPYDIKHIVEFLVDEGDF 6 | FEIMPDYAKNIVVGFARMNGETVGIVGNQPLVAAGCLDINASVKGARFVRFCDSFNIPLIILEDVPGFLPGTQQEHGGII 7 | KHGAKLLYALAEATVPKLTVITRKAYGGAYVVMNSKHIRADVNYAWPSSEIAVMGSKGAVAIICRGDPDLAKREEEYIDT 8 | FANPFPTAKKGFVDDVIMPRDTRKRLCADLKWLRNKSQKNPWKKHGNIPL 9 | 10 | >HUMAN72443 | OMA1151179 | PCCB_HUMAN | [Homo sapiens] 11 | MAAALRVAAVGARLSVLASGLRAAVRSLCSQATSVNERIENKRRTALLGGGQRRIDAQHKRGKLTARERISLLLDPGSFV 12 | ESDMFVEHRCADFGMAADKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGLND 13 | SGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNEDVT 14 | QEELGGAKTHTTMSGVAHRAFENDVDALCNLRDFFNYLPLSSQDPAPVRECHDPSDRLVPELDTIVPLESTKAYNMVDII 15 | HSVVDEREFFEIMPNYAKNIIVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPLITFVDVPGFLPG 16 | TAQEYGGIIRHGAKLLYAFAEATVPKVTVITRKAYGGAYDVMSSKHLCGDTNYAWPTAEIAVMGAKGAVEIIFKGHENVE 17 | AAQAEYIEKFANPFPAAVRGFVDDIIQPSSTRARICCDLDVLASKKVQRPWRKHANIPL 18 | 19 | >RATNO41763 | OMA1151179 | Q68FZ8 | [Rattus norvegicus] 20 | MAAAIRIRAMAAGTRLRVLNCGLRTTIRSLCSQPVSVNERIENKRHAALLGGGQRRIDAQHKRGKLTARERISLLLDPGS 21 | FVESDMFVEHRCADFGMAAEKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGL 22 | NDSGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPEVVKSVTNED 23 | VTQEQLGGAKTHTTVSGVAHRAFDNDVDALCNLREFFNFLPLSNQDPAPIRECHDPSDRLVPELDTVVPLESSKAYNMLD 24 | IIHAVIDEREFFEIMPNYAKNIVIGFARMNGRTVGIVGNQPNVASGCLDINSSVKGARFVRFCDAFSIPLITFVDVPGFL 25 | PGTAQEYGGIIRHGAKLLYAFAEATVPKITVITRKAYGGAYDVMSSKHLLGDTNYAWPTAEIAVMGAKGAVEIIFKGHQD 26 | 
VEAAQAEYVEKFANPFPAAVRGFVDDIIQPSSTRARICCDLEVLASKKVHRPWRKHANVPL 27 | 28 | >GORGO29347 | OMA1151179 | A0A2I2YT25 | [Gorilla gorilla gorilla] 29 | MAAALRVAAAGARLSVLASGLRAAVRSLCSQATSVNERIENKRRTALLGGGQRRIDAQHKRGKLTARERISLLLDPGSFV 30 | ESDMFVEHRCADFGMAADKNKFPGDSVVTGRGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAITVGAPVIGLND 31 | SGGARIQEGVESLAGYADIFLRNVTASGVIPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNEDVT 32 | QEELGGAKTHTTVSGVAHRAFENDVDALCNLRDFFNYLPLSSQDPAPVRECHDPSDRLVPELDTIVPLESTKAYNMVDII 33 | HSVVDEHEFFEIMPNYAKNIIVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPLITFVDVPGFLPG 34 | TAQEYGGIIRHGAKLLYAFAEATVPKVTVITRKAYGGAYDVMSSKHLCGDTNYAWPTAEIAVMGAKGAVEIIFKGHENVE 35 | AAQAEYIEKFANPFPAAVRGFVDDIIQPSSTRARICCDLDVLASKKVRVQEVFHQVVQSGHRDGRGAVRW 36 | 37 | >XENLA22949 | OMA1151179 | Q52L44 | [Xenopus laevis] 38 | MAAVRSVSRFLAAVRGSGSVCGPRGLLRAYSVSHLSVPERIEKKRREALLGGGEQRIEAQHRRGKLTARERISLLLDPGS 39 | FAEYDMFVEHRCSDFGMEEDRNKYPGDSVVTGQGRINGRLVYVFSQDFTVFGGSLSGAHAQKICKIMDQAVMVGAPVIGL 40 | NDSGGARIQEGVESLAGYADIFLRNVLSSGVVPQISLIMGPCAGGAVYSPALTDFTFMVKDTSYLFITGPDVVKSVTNED 41 | VTQEDLGGAKTHTALSGVAHRAFENDIDALLNLREFFNFLPLSNKDSAPVRKCHDPSDRLVPGLDTVVPMESTKAYDMLD 42 | IIHSIIDEREFFEIMPNYAKNIVVGFARMNGRTVGIVGNQPKVASGCLDINSSVKGARFVRFCDAFNIPIITFVDVPGFL 43 | PGTAQEYGGIIRHGAKLLFAFAEATVPKITVITRKAYGGAYDVMSSKHLRGDVNYAWPTAEVAVMGAKGAVQIIFRGKQN 44 | QAEAEEEYVEKFANPFPAAVRGFVDDIIQPSKTRMRICRDLEVLASKQQVNPWKKHANIPL 45 | 46 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1163384.fa: -------------------------------------------------------------------------------- 1 | >MNELE00619 | OMA1163384 | ML08751a | [Mnemiopsis leidyi] 2 | MEKSELSKAWSIDPRERIKALSELASGSVSISHGIPIKRYYRSGVELERMAKVYEDENNLEKAFFLYMKYTTLFVECLPK 3 | HPDYKSPQTSNERKVVRSKLKTIFDRAEFIKNNLTITYAGQHKKWIMEEQIRKAEAEQKRLEEEARIEAEAVAAKRAEME 4 | RRETELALELEQIEKQLEETRTIAVKASEKPVVVPPPMHRQATYPSLPVETPAKQSNTSSFNEAFNMRSPATPLAAPSLS 5 | LPSAPSAPSAHIQIISDTGPFPTVDRSTKPAAPQIDRSTKPATLAASDMFAEMMTQDSQRAVIIPSSLPDKFLSVCLDNT 6 | 
QKNVETCGILAAKLTANNFTITHVILPKQRGTPDSCQTLAEEELFEYQDKLDLITVGWIHTHPTQSAFLSSVDLHTHCSY 7 | QLMLREAIAIVCAPKHNRLFTVILKELLEDSRSFNGDPPHLTLNHGFPVIRLSRPKKIA 8 | 9 | >HUMAN63278 | OMA1163384 | STABP_HUMAN | [Homo sapiens] 10 | MSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY 11 | KSAVIPEKKDTVKKLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQQELEKEKQRVAQQKQQQLEQE 12 | QFHAFEEMIRNQELEKERLKIVQEFGKVDPGLGGPLVPDLEKPSLDVFPTLTVSSIQPSDCHTTVRPAKPPVVDRSLKPG 13 | ALSNSESIPTIDGLRHVVVPGRLCPQFLQLASANTARGVETCGILCGKLMRNEFTITHVLIPKQSAGSDYCNTENEEELF 14 | LIQDQQGLITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESVAIVCSPKFQETGFFKLTDHGLEEISSCRQKGFHPHSK 15 | DPPLFCSCSHVTVVDRAVTITDLR 16 | 17 | >RATNO29630 | OMA1163384 | STABP_RAT | [Rattus norvegicus] 18 | MSDHADVSLPPQDRVRILSQLGSAVELNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY 19 | KSAIIPEKKDAVKKLKNVAFPKAEELKTELLKRYTKEYEQYKERKKKEEEELARNIAIQQELEKEKQRVAQQKQKQLEQE 20 | QFHAFEKMIQKQELEKERLKIVQEFGKVDPGPCGPLLPDLEKPCVDVAPSSPFSPTQTSDCNTTLRPAKPPVVDRSLKPG 21 | ALSVIENVPTIEGLRHIVVPRNLCSEFLQLASANTAKGIETCGVLCGKLMRNEFTITHVLIPRQNGGPDYCHTENEEEIF 22 | FMQDDLGLLTLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESIAIVCSPKFQETGFFKLTDYGLQEISTCRQKGFHPHGR 23 | DPPLFCDCSHVTVKDRIVTITDLR 24 | 25 | >GORGO24976 | OMA1163384 | G3RXN2 | [Gorilla gorilla gorilla] 26 | MSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMASIYSEEGNIEHAFILYNKYITLFIEKLPKHRDY 27 | KSAVIPEKKDTVKKLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQQELEKEKQRVAQQKQQQLEQE 28 | QFHAFEEMIRNQELEKERLKIVQEFGKVDPGLGGPLVPNLEKPSLDVFPTSTVSSIQPSDCHTTVRPAKPPVVDRSLKPG 29 | ALSNSESIPTIDGLRHVVVPGRLCPQFLQLASANTARGVETCGILCGKLMRNEFTITHVLIPKQSAGSDYCNTENEEELF 30 | LIQDQQGLITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESVAIVCSPKFQETGFFKLTDHGLEEISSCRQKGFHPHSK 31 | DPPLFCSCSHVTVVDRAVTITDLR 32 | 33 | >XENLA13786 | OMA1163384 | XELAEV_18018654mg | [Xenopus laevis] 34 | MPEHSDASLPPEERIRALVLKGTSVEVNDDIPPKRYYRSGVELIRMANVYSGEGSIENAFILYNKYITLFIEKLPKHRDY 35 | KTANVPEKKETLKKLKEIAFPKAEELKKELHKRYKKEYDEYSEKQRKEEEERARRLALQQQLDAEKQRVALLKQQQEQQE 36 | 
QVQAFEEMMRRKELEAERLRILHQFSKDEPEAEPLGSPLIPGVNEPPVTPLLPSYGTVQPHPPAVDRSLKPSSYGSNSSG 37 | VTSDGLRHVKIPRDVCCKFLQLSENNTQRGVETCGILCGKLMQNEFTVTHVIVPKQSGGPDYCNTESEEELFLIQDQQGL 38 | ITLGWIHTHPTQTAFLSSVDLHTHCSYQMMLPESIAIVCSPKFQETGFFKLTDYGMKEIGECRQKGFHPHCKEPPLFSAG 39 | GHVSVTEQDVTMMDLR 40 | 41 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1171372.fa: -------------------------------------------------------------------------------- 1 | >MNELE00942 | OMA1171372 | ML05061a | [Mnemiopsis leidyi] 2 | MDTFTSLSTGCFVVKPSKRNERLAEFPQLYTEESKMSLLGRYEQYISSKKKFTDALNITLEDINKEVSSKIVNFINEGLK 3 | ASVSRGIPTLTLKIGVNFKEFSPIYKLAAKEVSETFKIQCATVNSSQHSDMTAVLKDVFGQLLNSENDANKVLLKTLSMR 4 | GLLDHVMTSNNCNGIVLFIPRFDILPSTIMDKLIDICSSRNQEVPFFFVLGLSTGIELSAEWMSSSAISQLNIETVTPPS 5 | PTELLERVLFKSLLDTETSFKLSYRTFEVLLSRFSFSSYSLHDVMKTIDVALLSHSMHQPLFKHIEKTSSNIQFHANNLD 6 | DEEKALLLQLPSVQKYIEKCVVSNKSLALQLLEGDADAIDNLYQQCTENYQVLFHSFKVLHSLIKNIPGSSLVGKPLELY 7 | SFCLQGCVNEVKEVCIALKCFAMLQSSAFLERLQSAYQECQSVEKPCTKLQLLEKVLKVQIEEMKSILTETSKYELDCSD 8 | KKSYSEKVKSIQQSFISRLENYFSTLTAPTSWPMHEIIWYSNHVELQNMLVGKCRTALHRGLTNPNLYLQGSSDITAKPD 9 | LCHLYDLFQEHGKMISLHDWIQSFNALLGKKKISSETHAQFISAVSELHFMGYLKPTKRKTDHVAKLSLLGY 10 | 11 | >HUMAN85723 | OMA1171372 | ORC3_HUMAN | [Homo sapiens] 12 | MATSSMSKGCFVFKPNSKKRKISLPIEDYFNKGKNEPEDSKLRFETYQLIWQQMKSENERLQEELNKNLFDNLIEFLQKS 13 | HSGFQKNSRDLGGQIKLREIPTAALVLGVNVTDHDLTFGSLTEALQNNVTPYVVSLQAKDCPDMKHFLQKLISQLMDCCV 14 | DIKSKEEESVHVTQRKTHYSMDSLSSWYMTVTQKTDPKMLSKKRTTSSQWQSPPVVVILKDMESFATKVLQDFIIISSQH 15 | LHEFPLILIFGIATSPIIIHRLLPHAVSSLLCIELFQSLSCKEHLTTVLDKLLLTTQFPFKINEKVLQVLTNIFLYHDFS 16 | VQNFIKGLQLSLLEHFYSQPLSVLCCNLPEAKRRINFLSNNQCENIRRLPSFRRYVEKQASEKQVALLTNERYLKEETQL 17 | LLENLHVYHMNYFLVLRCLHKFTSSLPKYPLGRQIRELYCTCLEKNIWDSEEYASVLQLLRMLAKDELMTILEKCFKVFK 18 | SYCENHLGSTAKRIEEFLAQFQSLDETKEEEDASGSQPKGLQKTDLYHLQKSLLEMKELRRSKKQTKFEVLRENVVNFID 19 | CLVREYLLPPETQPLHEVVYFSAAHALREHLNAAPRIALHTALNNPYYYLKNEALKSEEGCIPNIAPDICIAYKLHLECS 20 | 
RLINLVDWSEAFATVVTAAEKMDANSATSEEMNEIIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC 21 | 22 | >RATNO32812 | OMA1171372 | F1LSH3 | [Rattus norvegicus] 23 | RHTGPRTMATSSVSKGCFVFKPDFKKRKISVPIEDYFNNEELDSEDSKLRFETYRLLWQRIKSETEQLQEGLNENLFDNL 24 | VDFLQKSHSELQKNSGNWGSQMRLREIPTAALILGVNVTDHDVIFRSLTETLHNNVTPYVVSLQAKDCPDVKHFLQKLTS 25 | ELIDCCVDRNSKEEKNDKALRRTSYSMDSLSSWYSTVAQKTGPKMTSKKRATCSQWQSPPVVLILKNMESFSTKVLQDFI 26 | IISSQHLHEFPLILIFGIATSPVIIHRLLPHSVSSLLCIELFQSLSCKEHLTVVLDKLLLTPQFPFKLSKKALQVLTNIF 27 | LYHDFSIQNFIKGLKLSLLEHFYSQPLSVLCCDLSEAKKRINVFSVNQCEKIRRLPSFRRYVENQPLEKQVALLTNETFL 28 | KEETQSLLEDLHVYHINYFLVLRCLHNFTSSLPKYPLGRQIRELYCTCLEKKIWDSEEYESALQLLRMLAKDELMGILEQ 29 | CVKVLNSSTEKQLSNTAQKIKGFLTQFQNLDADSKEEEDACGSQPKGLQKTDLYHLQKSLLEMKELRRTTKKPTKFEMLR 30 | ENVINFIDNLVRDYLLPPEGQPLHEVVYFSAANTLREHLNAAPRIALHTALNNPYYYLKNEALKSEEGCIPSVAPDICIA 31 | YKLHLECSRLINLVDWSEAFATVVTAAEKMDTNSTVSEEMSEIIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC 32 | 33 | >GORGO37065 | OMA1171372 | G3S685 | [Gorilla gorilla gorilla] 34 | MSNQWEKDGLYNKGGFFKPNVIIYRLQEELNKNLFDNLIEFLQKSHSGFQKNSRDLGGQIKLREIPTAALVLGVNVTDHD 35 | LTFRSLTEALQNNVTPYVVSLQAKDCPDMKHFLQKLISQLMDCCVDIKSKEEESVHVTQRKTHYSMDSLSSWYMTVTQKT 36 | DPKMLSKKRTTCSQWQSPPVVVILKDMESFATKVLQDFIIISSQHLHEFPLILIFGIATSPIIIHRLLPHAVSSLLCIEL 37 | FQSLSCKEHLTTVLDKLLLTTQFPFKINEKVLQVLTNIFLYHDFSIQNFIKGLQLSLLEHFYSQPLSVLCCNLPEAKRRI 38 | NFLSNNQCENIRRLPSFRRYVEKQASEKQVALLTNERYLKEETQLLLENLHVYHMNYFLVLRCLHKFTSSLPKYPLGRQI 39 | RELYCTCLEKNIWDSEEYASVLQLLRMLAKDELMTILEKCFKVFKSYCENHLGSTAKRIEEFLAQFQSLDAETKEEEDAS 40 | GSQPKGLQKTDLYHLQKSLLEMKELRRSKKQTKFEVLRENVVNFIDCLVREYLLPPETQPLHEVVYFSAAHALREHLNAA 41 | PRIALHTALNNPYYYLKNEALKSEEGCIPNIAPDICIAYKLHLECSRLINLVDWSEAFATVVTAAEKMDANSATSEEMNE 42 | IIHARFIRAVSELELLGFIKPTKQKTDHVARLTWGGC 43 | 44 | >XENLA24336 | OMA1171372 | A0A1L8G315 | [Xenopus laevis] 45 | MTTSSVSKGCFVFKPSAKKKKTSLTVADYFNEGLRDSEDSKKRFESCQLLWQQMKSQTEQLQEEMNKRLFENLIGFLRKS 46 | HADFHNKKDDWSCRMRASEIPTAALVLGVNVTDHDLTFNSLSDILHETITPFVVLLQSKECTGIKQLLQKLLTQLMGNTV 47 | 
DIDLEEEEEQVTISQRKMNCTLASLSDWYKRATKKSASPKKKRSLTSTHWESPPVVVIFKDLESFTASVLQEFIVISSGY 48 | VQDLPLVLVFGIATSPMIIHRLLSHSVSSRLCIELFQSMSCTEHLATVVDQLLLTNHFPIKLSGRVMQVLITIFLYHDFS 49 | VQNFIKGLQLSVVEHFYTQPLSVLCCSLSESRKRIKNLSHAQCENIRHLSSFMSYVESQTPENQVNLLTNDRFLKEMTQE 50 | FLERLNSYHENFTPILRCLHHFTCILPKYPLGKQIREIYCACLEKKVWETEDYNSALPLLRMLAKDEIVATLQKCVAVLK 51 | PYSEKKLGNALEKLEEFLINFQSLEETTQNEEDEDTSPQKSLQKKTDLYQLQKKLLEMKETRRTKKPSRFELLRQDVVDF 52 | IDGLVREYLLPPEMLPLHEVVYFSAASTLRRHLNAAPRVALHTALNNPASYLKCLENEGGSISNAAPDICIAYKLHLECG 53 | RLINLYDWLEAFATVVHAAEGSESDSAQQVDDVTHARFIRAVSELELLGFVKPTKQKTDHVARLTWGGC 54 | 55 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_1188079.fa: -------------------------------------------------------------------------------- 1 | >MNELE00930 | OMA1188079 | ML01881a | [Mnemiopsis leidyi] 2 | FLGMTQSSISSFFKPKRQLEDEDGKENKVSKMLKCSDDSVLKDWKISTSWEKCLSNELTKSYFTDISSFVAKERVSKTIY 3 | PSHDEVFSWTHHCKLDDVKVVILGQDPYHGPNQAHGLCFSVKVGVPPPPSLKNIFKAIKKDLDKFEEPGHGYLVGWARQG 4 | VLMLNAVLTVEKSKANSHKSKGWEKLTDHVIKYIGFHMKSCVFLLWGTPAMKKQSLINKTNHLVLTSGHPSPLSAHRGFF 5 | DCKHFSKANEYLLKNKKDAIDWNRLPTE 6 | 7 | >HUMAN15265 | OMA1188079 | UNG_HUMAN | [Homo sapiens] 8 | MGVFCLGPWGLGRKLRTPGKGPLQLLSRLCGDHLQAIPAKKAPAGQEEPGTPPSSPLSAEQLDRIQRNKAAALLRLAARN 9 | VPVGFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFTWTQMCDIKDVKVVILGQDPYHGPNQAHGLCFSV 10 | QRPVPPPPSLENIYKELSTDIEDFVHPGHGDLSGWAKQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVSWLNQNSNGL 11 | VFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKTNELLQKSGKKPIDWKEL 12 | 13 | >RATNO05802 | OMA1188079 | A0A8J8YLI4 | [Rattus norvegicus] 14 | MIGQKTLYSFFSPTPTGKRTTRSPQPAPGSGVTAENSSDAAASPAKKARVEQDEPATPPSSPLSAEQLVRIQRNKAAALL 15 | RLAARNVPAGLGESWKQQLCGEFGKPYFVKLMGFVAEERKHHKVYPPPEQVFTWTQMCDIRDVKVVILGQDPYHGPNQAH 16 | GLCFSVQRPVPPPPSLENIFKELSTDIDGFVHPGHGDLSGWARQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVSWLN 17 | QNLNGLVFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKANELLQRSGKKPISWKEL 18 | 19 | >GORGO06002 | OMA1188079 | G3RY62 | [Gorilla 
gorilla gorilla] 20 | MIGQKTLYSFFSPSPARKRHAPSPEPAVQGTGVAGVPEESGDAAAIPAKKAPAGQEEPGTPPSSPLSAEQLDRIQRNKAA 21 | ALLRLAARNVPVGFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFTWTQMCDIRDVKVVILGQDPYHGPN 22 | QAHGLCFSVQRPVPPPPSLENIYKELSTDIEDFVHPGHGDLSGWAKQGVLLLNAVLTVRAHQANSHKERGWEQFTDAVVS 23 | WLNQNSNGLVFLLWGSYAQKKGSAIDRKRHHVLQTAHPSPLSVYRGFFGCRHFSKTNELLQKSGKKPIDWKEL 24 | 25 | >XENLA02694 | OMA1188079 | XELAEV_18007455mg | [Xenopus laevis] 26 | MIGQRTINSFFGAAVKKRAASTVWDGEDSCKAGETTPVKKSRPSNENDIPSAVSPPLSPEQLERMQRNKAAALQKLAARH 27 | APQGLGESWKQELLAEFAKPYFVKLSNFIAEERKKCTVYPPPEEVFTWTQMVDIKDVKVVILGQDPYHGPNQAHGLCFSV 28 | KKPVPPPPSLVNMYKELETDIEGFSRPGHGDLTGWAKQGVLLLNAVLTVRAHNANSHKDCGWEQFTDVVVSWLNKNMDGL 29 | VFMLWGAYAQKKGSNIDRKRHHVLQTVHPSPLSVHRGFFGCRHFSKTNAYLQGLGKKPIDWKAL 30 | 31 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_649216.fa: -------------------------------------------------------------------------------- 1 | >MNELE00920 | OMA649216 | ML39325a | [Mnemiopsis leidyi] 2 | MSLSRLGFQLFRRGTLSTVSPAQQLVSARLFNYDQSHDADNYLKFGLLGGLGLFVTAMCKEEAANEEPANHSEATQEEEE 3 | EQKPKKKKKKGFGERKVMEYENRIREFSTPDKIFRYFATVRVEFENGKKEIFMTPKDFMRSITPGELQPSHLGLDLYRDV 4 | PISKLLDHVDEEEGEQPEFLSRLAQHGLISFQDYIFLLTLLSTPKHDCEIAFKMFDLYGDGCVSYQEFLDTRSVLESRSS 5 | MGKRHRDNIYSGNTINKDGHSALTKYFFGEDSAKKLTLDDFVVFMDGLKEDVFRMEFNKYDPVDGKITEQDFANLLLLHA 6 | TLSNQAKSKFVRRVKKAYKNESQGITFDQFMTFNHFLDHLDDVEILVSVYFAAGMKFNKASLKQVAHVVADVELDSHIID 7 | LVFTIFDDNGDELLSNREFISVLKERAHRGLEKPSDTGFVRLITALGACVASYVKGEEL 8 | 9 | >HUMAN01609 | OMA649216 | MICU1_HUMAN | [Homo sapiens] 10 | MFRLNSLSALAELAVGSRWYHGGSQPIQIRRRLMMVAFLGASAVTASTGLLWKRAHAESPPCVDNLKSDIGDKGKNKDEG 11 | DVCNHEKKTADLAPHPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVISEPGEAEVFMTPEDFVRSITPNE 12 | KQPEHLGLDQYIIKRFDGKKISQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNGDGEVDM 13 | EEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFERHDPVDG 14 | 
RITERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKEGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLDKVTMQQVA 15 | RTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFALPKQ 16 | 17 | >RATNO21729 | OMA649216 | A0A8I6A7K0 | [Rattus norvegicus] 18 | MFRLNALSALAELAMGSRWYHGTSQPTQTKRRLMLVAFLGASAVTASTGLLWKKAHAESPPSVNSKKTDAGDKGKSKDTR 19 | EVSSHEGSAADTAAEPYPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVINEPGETEVFMTPQDFVRSITP 20 | NEKQPEHLGLDQYIIKRFDGKEFWQTEKIAQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDL 21 | NGDGEVDMEEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEF 22 | ERHDPVDGRISERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKDGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLD 23 | KVTMQQVARTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFA 24 | LPK 25 | 26 | >GORGO00558 | OMA649216 | A0A2I2YKG1 | [Gorilla gorilla gorilla] 27 | MFRLNSLSALAELAVGSRWYHGGSQPIQIRRRLMMVAFLGASAVTASTGLLWKRAHAESPPCVDNLKSDIGDKGKNKDEG 28 | DVCNHEKKTADLVPHPEEKKKKRSGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVISEPGEAEVFMTPEDFVRSITPNE 29 | KQPEHLGLDQYIIKRFDGKDFWQTEKISQEREKFADEGSIFYTLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNG 30 | DGEVDMEEFEQVQSIIRSQTSMGMRHRDRPTTGNTLKSGLCSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFER 31 | HDPVDGRITERQFGGMLLAYSGVQSKKLTAMQRQLKKHFKEGKGLTFQEVENFFTFLKNINDVDTALSFYHMAGASLDKV 32 | TMQQVARTVAKVELSDHVCDVVFALFDCDGNGELSNKEFVSIMKQRLMRGLEKPKDMGFTRLMQAMWKCAQETAWDFALP 33 | KQ 34 | 35 | >XENLA31939 | OMA649216 | A0A1L8FF34 | [Xenopus laevis] 36 | MFRLRFIPAVAGLAAVSRRYHGVANHTRSRRRLMMAAFVGATAVSASAGLLWKRANAEAQSSVKHNMREESSEKEKEPED 37 | TDQAVESSDEEQQQEGKKKKRVGFRDRKVMEYENRIRAYSTPDKIFRYFATLKVIHESGESEVFMTPQDFVRSITPNEKQ 38 | PENLGLDQFIVKRYDGKKISQEREKFADEDSIFYSLGECGLISFSDYIFLTTVLSTPQRNFEIAFKMFDLNGDGEVDMEE 39 | FEQVQSIIRSQTSMGMRHRDRSTTGNTLKTGFSSALTTYFFGADLKGKLTIKNFLEFQRKLQHDVLKLEFERQDPVDGRI 40 | TERQFGSMLLAYSGVQSKKLTHMLKQLKKRFKDAEGLTFEEVENFFTFLKNINDVDTALSFYHMAGASLDKVTMQQVART 41 | VAKVELSDHVCDVVFALFDCDGNGELSNKEFIAIMKQRLMRGLEKPKDMGFTRLMRAMWKCAQETAWDFAMPKQ 42 | 43 | 
-------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_681083.fa: -------------------------------------------------------------------------------- 1 | >MNELE00869 | OMA681083 | ML02403a | [Mnemiopsis leidyi] 2 | MPMKLKFLFRASNKIKTKPSLNFLVMVKRLADPEVDENVEDIDSDFSDDDGASSVSDTGSVEETAQGKRLRLAKQYLDKL 3 | ENEQLKSENDTEINRDLIAHRLQQDVLAEKGKLETRVGKRLCVLENKFTLKGHRLSPTCLAITDTHLFSGSKDGAIIKWD 4 | LSTGKKLSVVKHDSKKQILALAASSDNVYLASGGQDKIIVLWDIESMTFVKCFRKHRGPITALTFQRNSHLLMSGSADRS 5 | VNLWNCDDKLYIESLYGHQDMVADMDSFLQERVVTVGGHDKTLRLWKIQEESQLVFNGHKNTVLDCVSMLNEEHFVTGSQ 6 | DNVLAVWHIKKKKPAITQLQAHAKGSWVSAVAGLKNTECFISGSNGGNVKVWACAENYRSMECIRSIEIIGTVNSIVISH 7 | DNSCFALAVGQEPKMGRWWSDKAARNRVLVFPMAIEDEVNR 8 | 9 | >HUMAN70334 | OMA681083 | U3IP2_HUMAN | [Homo sapiens] 10 | MSATAAARKRGKPASGAGAGAGAGKRRRKADSAGDRGKSKGGGKMNEEISSDSESESLAPRKPEEEEEEELEETAQEKKL 11 | RLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPASADIRVLRGHQLSITCLVVTPDDSA 12 | IFSAAKDCSIIKWSVESGRKLHVIPRAKKGAEGKPPGHSSHVLCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTFTGH 13 | RDAVSGLAFRRGTHQLYSTSHDRSVKVWNVAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQLVF 14 | YGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLRGEPGLEQPFWISSVAALLNTDLVATGSHSSCV 15 | RLWQCGEGFRQLDLLCDIPLVGFINSLKFSSSGDFLVAGVGQEHRLGRWWRIKEARNSVCIIPLRRVPVPPAAGS 16 | 17 | >RATNO41032 | OMA681083 | B0BND5 | [Rattus norvegicus] 18 | MNSMSTAVATRKRAKPAPGPGAAPVDGKRRRKVDSAASRGKSKGGGKMNEEISSDSESESLAPRKTEEEEEEELEETAQE 19 | KKLRLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPAPTDIRVLRGHQLSITCLVITPD 20 | DLAIFSAAKDCTIIKWSVETGRKLHVIPRAKKGTQGQPSGHSSHILCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTF 21 | TGHRDAVSGLAFRRGTHQLYSTSHDRSVKVWNAAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQ 22 | LVFYGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLHGEPGLEQPFWISSVAALLNTDLVATGSHN 23 | ACVRLWQCGEGFRQLDPLCDIPLVGFINSLKFSSGGDFLVAGVGQEHRLGRWWRIKEARNSVCIIPLRRVPVSPVAGS 24 | 25 | >GORGO30225 | OMA681083 | G3R9Q2 | [Gorilla gorilla gorilla] 26 | 
MSATAAARKRGKPASGVGAGAGAGKRRRKADSAGDRGKSKGGGKMNEEISSDSESESLAPRKPEEEEEEELEETAQEKKL 27 | RLAKLYLEQLRQQEEEKAEARAFEEDQVAGRLKEDVLEQRGRLQKLVAKEIQAPASADIRVLRGHQLSITCLVVTPDDSA 28 | IFSAAKDCTIIKWSVESGRKLRVIPRAKKGAEGKPPGHSSHVLCMAISSDGKYLASGDRSKLILIWEAQSCQHLYTFTGH 29 | RDAVSGLAFRRGTHQLYSTSHDRSVKVWNVAENSYVETLFGHQDAVAALDALSRECCVTAGGRDGTVRVWKIPEESQLVF 30 | YGHQGSIDCIHLINEEHMVSGADDGSVALWGLSKKRPLALQREAHGLRGEPGLEQPFWISSVAALLNTDLVATGSHSSCV 31 | RLWQCGEGFRQLDLLCDIPLVGFINSLKFSSSGDFLVAGVGQEHRLGRWWRIKQARNSVCIIPLRRVPVPPAAGS 32 | 33 | >XENLA20807 | OMA681083 | A0A1L8GHE5 | [Xenopus laevis] 34 | MSGLFIKKKSGVTPRRRRAEGNDAEATSQKKKKPKDTHLREEIESDSDTEIAPARTKPRQDEEDLDETAQEKKLRLAKEY 35 | LKQLQQQEEEQKEDEDQDTIANRLQEDVLEQRGRLQRPLAKELLPPEPSEIRLLRGHQGPITCLVISPDDSYLFSGSKDC 36 | SIIKWSVSDGKKIHKIPGGRKGTESTHVGHTGHVLCMALSSDGKYLASGDRNKLIFIWDPVTCQNLHKFQGHRDAVSGLS 37 | FQKGTHQLFSVSHDRSVKVWNVEENAYIETLFGHQDAITGLDSLSRERCVTVGGRDGTMRIWKIAEETQLVFSGHEGSID 38 | CVRLINEEHIVTGADDGSLALWTVGKKKPLTQMKCAHGSHGDAGLEQPYWISSIAAALNSDVVASGSHDGFVHVWRCGEG 39 | FRSLSPLFTVPVVGFVNSLQFSSSANFLVAGVGQEHRLGRWWRKKEAKNALCIIPFKRTLVLGS 40 | 41 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_683078.fa: -------------------------------------------------------------------------------- 1 | >MNELE00595 | OMA683078 | ML13641a | [Mnemiopsis leidyi] 2 | MENTVLRAKLIVLGDASVGKSSLVQVFHSDSQQGFPKAYSMTSDVQLQVKSVKIPDSPYTVELYVYDCAGQETFQPFISK 3 | ILGSSALVLLVSDLTNQSSLSAAVKWFERARNANKDFKMQGALVGNKCDLDLRRAIKASEAEETAANLGIPYFECSAKEG 4 | VQVDEPFYFLANCLYEQYIEQTQEFQNIADTV 5 | 6 | >HUMAN60325 | OMA683078 | IFT27_HUMAN | [Homo sapiens] 7 | MVKLAAKCILAGDPAVGKTALAQIFRSDGAHFQKSYTLTTGMDLVVKTVPVPDTGDSVELFIFDSAGKELFSEMLDKLWE 8 | SPNVLCLVYDVTNEESFNNCSKWLEKARSQAPGISLPGVLVGNKTDLAGRRAVDSAEARAWALGQGLECFETSVKEMENF 9 | EAPFHCLAKQFHQLYREKVEVFRALA 10 | 11 | >RATNO39676 | OMA683078 | A0A8I5ZYK4 | [Rattus norvegicus] 12 | QVKMDNDFSSALASGDPAVGKTALVQMFRSDGTHFQKNYTLTTGVDLVVKTVPVLDTNDSVELFIFDSAGKELFSEMLDK 13 | 
LWENPNVLCLVYDVTNEQSFISCTKWLEKVRSQTPGISLPGVLVGTKTDLAGRQTVDSAQAQAWALSQGLEFFETSVKEM 14 | DNYEAPFHCLAKQFYQLYREKVDIFHTLV 15 | 16 | >GORGO23896 | OMA683078 | G3R464 | [Gorilla gorilla gorilla] 17 | MRTKAFFFFFSDLTGDPAVGKTALAQIFRSDGAHFQKSYTLTTGMDLVVKTVPVPDTGDSVELFIFDSAGKELFSEMLDK 18 | LWESPNVLCLVYDVTNEESFNNCSKWLEKARSQAPGISLPGVLVGNKTDLAGRRAVDSAEARAWALGQGLECFETSVKEM 19 | ENLEAPFHCLAKQFHQLYREKVEVFRALA 20 | 21 | >XENLA18783 | OMA683078 | A0A1L8GP42 | [Xenopus laevis] 22 | MVKLSAKCIVAGDTAVGKSTLVQLFRSDGSHFPKNYSMTATVEVSVKTVQIPDTGDSVELFLCDSPGKAIFYEMTEKLWD 23 | QPGALCLVFDVTNESSFSSCTKWLQRVRSKTLSPHLPGVLVGNKTDMAGLRAVEKGQAEEWAASNGLEYFETSAKELENF 24 | ERPFQALAKAFHHLYQERVEHFQSLV 25 | 26 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_894224.fa: -------------------------------------------------------------------------------- 1 | >MNELE00417 | OMA894224 | ML36131a | [Mnemiopsis leidyi] 2 | MGPKGMDKILVSMGQDGYPGDIQVTNDGATILRSIGVDNPAAKVLVNISKVQDDEVGDGTTSVTVLAAELLREAEQLVAK 3 | KLHPQTIISGYRAALKVAVQVLTDTAIDNGKDNEAFKKDLMNIARTTLSSKILNQHKEHFAELAVNAVLRLKGSTDLELV 4 | QILKKTGGSIEDSYLDEGFLLEKEIGHNQPKRIENARILVANTPMDTDKIKVFGSKVKVDSTAKVADIELAEKNKMKQKV 5 | DKILSHDITCFINRQLIYDYPDQLFADAGIMAIEHADFDGIERLSKVLGAEIVSTFDQPDKVTLGSCKVIEEVILGEDKL 6 | IKFSGVKQGEACTVVLRGATKMIVDEAERSLHDALCVLTQTVKETRTVFGGGCSEMRMARHVEELAARTPGKEALAIESF 7 | ARALRQIPTIIADNGGYDSSQLVSELRAMHSQDELYMGLNMTTGEVGDMRELGITESFAVKHAVVNSAAEAAEMILRVDD 8 | ILKATPRQRGGNDCM 9 | 10 | >HUMAN14228 | OMA894224 | TCPB_HUMAN | [Homo sapiens] 11 | MASLSLAPVNIFKAGADEERAETARLTSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGATILKNIGVDNPA 12 | AKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHGSDEVKFRQDLM 13 | NIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHIIKKLGGSLADSYLDEGFLLDKKIGVNQPKRIENAKILIA 14 | NTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGVMAIEHADFAGV 15 | ERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERSLHDALCVLAQT 16 | 
VKDSRTVYGGGCSEMLMAHAVTQLANRTPGKEAVAMESYAKALRMLPTIIADNAGYDSADLVAQLRAAHSEGNTTAGLDM 17 | REGTIGDMAILGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC 18 | 19 | >RATNO37617 | OMA894224 | A0A8I6GLE7 | [Rattus norvegicus] 20 | MPSSGVSEHSNKASLSLAPVNIFKAGADEERAETARLSSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGAT 21 | ILKNIGVDNPAAKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHG 22 | SDEVKFWQDLMNIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHVIKKLGGSLADSYLDEGFLLDKKIGVNQP 23 | KRIENAKILIANTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGV 24 | MAIEHADFAGVERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERS 25 | LHDALCVLAQTVKDPRTVYGGGCSEMLMAHAVTMLASRTPGKEAVAMESFAKALRMLPTIIADNAGYDSADLVAQLRAAH 26 | SEGRITAGLDMKEGSIGDMAVLGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC 27 | 28 | >GORGO04796 | OMA894224 | A0A2I2Y483 | [Gorilla gorilla gorilla] 29 | MASLSLAPVNIFKAGADEERAETARLTSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDASLMVTNDGATILKNIGVDNPA 30 | AKVLVDMSRVQDDEVGDGTTSVTVLAAELLREAESLIAKKIHPQTIIAGWREATKAAREALLSSAVDHGSDEVKFRQDLM 31 | NIAGTTLSSKLLTHHKDHFTKLAVEAVLRLKGSGNLEAIHIIKKLGGSLADSYLDEGFLLDKKIGVNQPKRIENAKILIA 32 | NTGMDTDKIKIFGSRVRVDSTAKVAEIEHAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFGAAGVMAIEHADFAGV 33 | ERLALVTGGEIASTFDHPELVKLGSCKLIEEVMIGEDKLIHFSGVALGEACTIVLRGATQQILDEAERSLHDALCVLAQT 34 | VKDSRTVYGGGCSEMLMAHAVTQLANRTPGKEAVAMESYAKALRMLPTIIADNAGYDSADLVAQLRAAHSEGNTTAGLDM 35 | REGTIGDMAILGITESFQVKRQVLLSAAEAAEVILRVDNIIKAAPRKRVPDHHPC 36 | 37 | >XENLA12675 | OMA894224 | A0A1L8GYM0 | [Xenopus laevis] 38 | MASLSLAPVNIFKAGADEEKAETARLSSFIGAIAIGDLVKSTLGPKGMDKILLSSGRDSSVTVTNDGATILKAIGIDNPA 39 | AQVLVDMSKVQDDEVGDGTTSVTVLAAELLREAEILVAKKIHPQTIVSGWRQATQVAREALLKASMDHGNDEEKFCCDLM 40 | NIARTTLSSKLLTHHKDHFSKLAVEAVLRLKGSGNLEAIHLIKKLGGSLTESYLDEGFLLDKKIGVNQPKRIENAKILIA 41 | NTGMDTDKIKVFGSRVRVDSTAKVAEIELAEKEKMKEKVERILKHGINCFINRQLIYNYPEQLFAAAGVMAIEHADFAGV 42 | ERLALVTGGEIASTFDHPELVKLGTCKLIEEVMIGEDKLIHFSGVAMGEACTIVLRGATQQILDEAERSLHDALCVLAQT 43 | 
VKDTRTVYGGGCSEMLMAHAVTELANRTPGKESVAMESFAKALRMLPTIIADNAGYDSADLVSQLRAAHSEGKSTYGLDM 44 | KNGIIGDMGELGITESFQVKRQVLLSASEAAEVILRVDNIIKAAPRKRVPDHHPC 45 | 46 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_944789.fa: -------------------------------------------------------------------------------- 1 | >MNELE00946 | OMA944789 | ML24671a | [Mnemiopsis leidyi] 2 | MIRGGKVAKRKSTTVLQEGTKKKCVNGAASSRQTSLQSHFKPLQPKQAPTTSSTGSKRRYVDSRTTSWEEVNKIRPELPT 3 | PPFGEDSPAYSSQIPNSVVIPLPPLNFAPNDKIWAGLKQQEEKRQASKLGILNHPFIKQGARAILLDWLIEVSQLYCLKR 4 | ETFYLSMDYIDRFISKRYDIKKEQLQLVGITALHMAAKLEEIYPPGLEKLSYITDNSCSKEAMWKMELEMMKALDWRLAA 5 | LTVNTWLNLYLQIEYYRGTSCSTFQFLRGEYSQSDFVKIIQLIDLCSLDVKSVEYRPSMIAASALWLVVPSKLKEVTGYS 6 | WDDLISCRHWMQPYAQVLKDQPAQQLKDFEDVEKKDRHHIQTHFKAIPLLHDVYELQESQPLTPDSDSDNENAEVAHYLT 7 | PNSSTHSSPSSSTKHR 8 | 9 | >HUMAN42399 | OMA944789 | CCNE1_HUMAN | [Homo sapiens] 10 | MPRERRERDAKERDTMKEDGGAEFSARSRKRKANVTVFLQDPDEEMAKIDRTARDQCGSQPWDNNAVCADPCSLIPTPDK 11 | EDDDRVYPNSTCKPRIIAPSRGSPLPVLSWANREEVWKIMLNKEKTYLRDQHFLEQHPLLQPKMRAILLDWLMEVCEVYK 12 | LHRETFYLAQDFFDRYMATQENVVKTLLQLIGISSLFIAAKLEEIYPPKLHQFAYVTDGACSGDEILTMELMIMKALKWR 13 | LSPLTIVSWLNVYMQVAYLNDLHEVLLPQYPQQIFIQIAELLDLCVLDVDCLEFPYGILAASALYHFSSSELMQKVSGYQ 14 | WCDIENCVKWMVPFAMVIRETGSSKLKHFRGVADEDAHNIQTHRDSLDLLDKARAKKAMLSEQNRASPLPSGLLTPPQSG 15 | KKQSSGPEMA 16 | 17 | >RATNO19647 | OMA944789 | B1WC54 | [Rattus norvegicus] 18 | MPRERKERDSKDHSNMKEEGGSDLSVRSRKRKANVAVFLQDPDEEIAKIDKTVKSQDSSQPWDDDSACVDPCSFIPTPNK 19 | EEDNELEYPKTAFQPRKIRPPRASPLPVLNWGNREEVWRIMLNKEKTYLRDEHFLQRHPLLQARMRAVLLDWLMEVCEVY 20 | KLHRETFYLAQDFFDRYMASQQNIIKTLLQLIGISALFIASKLEEIYPPKLHQFAYVTDGACSGDEILTMELMMMKALKW 21 | RLSPLTIVSWLNVYVQVAYVNDTGEVLMPQYPQQVFVQIAELLDLCVLDVGCLEFPYGVLAASALYHFSSLELMQKVSGY 22 | QWCDIEKCVKWMVPFAMVIREMGSSKLKHFRGVPMEDSHNIQTHTNSLDLLDKAQAKKAILSEQNRISPPPSGVLTPPHS 23 | SKKQSSEQETE 24 | 25 | >GORGO16559 | OMA944789 | G3QZF2 | [Gorilla gorilla gorilla] 26 | 
MPRERRERDAKERDTMKEDGGAEFSARSRKRKANVAVFLQDPDEEMAKIDRTARDQCGSQPWDNNAVCADPCSLIPTPDK 27 | EDDERVYPNSTCKPQIIAPSRGSPLPVLSWANREEVWKIMLNKEKTYLRDQHFLEQHPLLQPKMRAILLDWLMEVCEVYK 28 | LHRETFYLAQDFFDRYMATQENVVKTLLQLIGISSLFIAAKLEEIYPPKLHQFAYVTDGACSGDEILTMELMIMKALKWR 29 | LSPLTIVSWLNVYMQVAYLNDLHEVLLPQYPQQIFIQIAELLDLCVLDVDCLEFPYGILAASALYHFSSSELMQKVSGYQ 30 | WCDIENCVKWMVPFAMVIRETGSSKLKHFRGVADEDAHNIQTHRDSLDLLDKARAKKAMLSEQNRASPLPSGLLTPPQSG 31 | KKQSSGPEMA 32 | 33 | >XENLA17827 | OMA944789 | CCNE1_XENLA | [Xenopus laevis] 34 | MPVISNPAVEKSTKDEGTASCSVRSRKRKADVAIFLQDPDETLDSLEMTKKKQYQDRGPWSNEMTCKSPHKLIPTPEKEE 35 | HEPNPTNYSHFASLRFSPVSVSPLPRLGWANQDDVWRNMLNKDRIYLRDKNFFQKHPQLQPNMRAILLDWLMEVCEVYKL 36 | HRETFYLAQDFFDRFMATQKNVIKSRLQLIGITSLFIAAKLEEIYPPKLHQFSFITDGACTEDEITRMELIIMKDLGWCL 37 | SPMTIVSWFNVFLQVAYIRELQQFLRPQFPQEIYIQIVQLLDLCVLDICCLEYPYGVLAASAMYHFSCPELVEKVSGFKV 38 | TELQGCIKWLVPFAMAIKEGGKSKLNFFKGVDIEDAHNIQTHSGCLELMEKVYINQALLEEQNRTSPIPTGVLTPPQSNK 39 | KQKSDRAD 40 | 41 | -------------------------------------------------------------------------------- /tests/marker_genes/OMAGroup_974829.fa: -------------------------------------------------------------------------------- 1 | >MNELE00836 | OMA974829 | ML01593a | [Mnemiopsis leidyi] 2 | MDLVQVGITTDELDRYAHDLIVQHAAYPAPLNYRGYPKSICTSVNNVLCHGIPNSRELQDGDIISIDVSIFYKGVFGDCC 3 | STRVVGEGDSTAHKLAKVTRDSTLAAIETCKPGTRLSSVGNTISKYAKEAGLSICKEFIGHGIGSYFHGLPEVYHYANSH 4 | GPTLRPGMVFTIEPILMEGRDTMAILADGWTAVSADSKRAAQFEHTILITDSEPEILSPHR 5 | 6 | >HUMAN65562 | OMA974829 | MAP12_HUMAN | [Homo sapiens] 7 | MAAPSGVHLLVRRGSHRIFSSPLNHIYLHKQSSSQQRRNFFFRRQRDISHSIVLPAAVSSAHPVPKHIKKPDYVTTGIVP 8 | DWGDSIEVKNEDQIQGLHQACQLARHVLLLAGKSLKVDMTTEEIDALVHREIISHNAYPSPLGYGGFPKSVCTSVNNVLC 9 | HGIPDSRPLQDGDIINIDVTVYYNGYHGDTSETFLVGNVDECGKKLVEVARRCRDEAIAACRAGAPFSVIGNTISHITHQ 10 | NGFQVCPHFVGHGIGSYFHGHPEIWHHANDSDLPMEEGMAFTIEPIITEGSPEFKVLEDAWTVVSLDNQRSAQFEHTVLI 11 | TSRGAQILTKLPHEA 12 | 13 | >RATNO27947 | OMA974829 | G3V670 | [Rattus norvegicus] 14 | 
# ---------------------------------------------------------------------------
# /tests/test_aligner.py
# ---------------------------------------------------------------------------
import unittest
import os
import gzip
import argparse
from Bio import SeqIO
from read2tree.Reads import Reads
from read2tree.FastxReader import FastxReader
dirname = os.path.dirname(__file__)


class ReadTest(unittest.TestCase):
    """Fixture helper for aligner tests; no ``test_*`` methods are defined."""

    def setup_reads_paired(self, sampling=False):
        """Parse a fixed paired-end command line and build an ``Aligner``.

        A local ``argparse`` parser is declared with the options below and
        fed a hard-coded ``argv``; the parsed namespace is handed to
        ``Aligner``.

        :param sampling: accepted but not used by this helper.
        :return: the object returned by ``Aligner(args, ogset.ogs, load=True)``.
        """
        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument(
            '--standalone_path', default='.',
            help='[Default is current directory] Path to '
                 'oma standalone directory.')

        arg_parser.add_argument(
            '--reads', nargs='+', default=None,
            help='Reads to be mapped to reference. If paired '
                 'end add separated by space.')

        arg_parser.add_argument(
            '--read_type', default='short',
            help='[Default is short reads] Type of reads to '
                 'use for mapping. Either ngm for short reads or '
                 'ngmlr for long will be used.')

        arg_parser.add_argument(
            '--dna_reference', default='',
            help='Reference file that contains nucleotide '
                 'sequences (fasta, hdf5). If not given it will use'
                 'the RESTapi and retrieve sequences '
                 'from http://omabrowser.org directly. '
                 'NOTE: internet connection required!')

        arg_parser.add_argument(
            '--keep_all_ogs', action='store_true',
            help='Keep all orthologs after addition of '
                 'mapped seq, which means also the groups that '
                 'have no mapped sequence. Otherwise only groups '
                 'are used that have the mapped sequence for '
                 'alignment and tree inference.')

        arg_parser.add_argument(
            '-r', '--reference', action='store_true',
            help='Just generate the reference dataset for '
                 'mapping.')

        arg_parser.add_argument(
            '--remove_species_ogs', default=None,
            help='[Default is none] Remove species present '
                 'in data set after mapping step completed to '
                 'build OGs. Input is comma separated list '
                 'without spaces, e.g. XXX,YYY,AAA.')

        arg_parser.add_argument(
            '-s', '--species_name', default=None,
            help='[Default is name of read] Name of species '
                 'for mapped sequence.')

        arg_parser.add_argument(
            '--output_path', default='.', required=True,
            help='[Default is current directory] Path to '
                 'output directory.')

        argv = ['--standalone_path', 'tests/data/marker_genes/',
                '--dna_reference', 'tests/data/dna.fa', '--reads',
                'tests/data/mapper/test3/test_1b.fq',
                'tests/data/mapper/test3/test_2b.fq',
                '--output_path', 'tests/data/output', '--read_type',
                'short', '--keep_all_ogs', '--reference',
                '--remove_species_ogs', 'CIOIN', '--species_name', 'ass']

        args = arg_parser.parse_args(argv)
        # The original line was ``return alignments = Aligner(...)``, which is
        # a SyntaxError and prevented this module from being imported at all.
        # NOTE(review): neither ``Aligner`` nor ``ogset`` is imported or
        # defined in this module, so calling this helper still raises
        # NameError until both are wired up -- confirm the intended source.
        alignments = Aligner(args, ogset.ogs, load=True)
        return alignments


# ---------------------------------------------------------------------------
# /tests/test_og.py
# ---------------------------------------------------------------------------
import unittest
import os
from Bio import SeqIO
from read2tree.OGSet import OG

dirname = os.path.dirname(__file__)


class OGTest(unittest.TestCase):
    """Tests for ``OG`` driven by the ``data/OG4`` FASTA fixtures."""

    def setup(self):
        """Return an ``OG`` whose ``aa``/``dna`` lists come from the fixtures.

        This is a plain helper (lower-case name), not unittest's ``setUp``
        hook; each test calls it explicitly.  The fixture paths are relative
        to the current working directory.
        """
        og = OG()
        og.aa = list(SeqIO.parse('data/OG4.aa', format='fasta'))
        og.dna = list(SeqIO.parse('data/OG4.dna', format='fasta'))
        return og

    def test_init(self):
        """The first DNA record keeps its FASTA identifier."""
        og = self.setup()
        self.assertEqual(og.dna[0].id, 'MOUSE21964_OG4')

    def test_get_og_dict(self):
        """``_get_og_dict`` exposes the record under the key 'MOUSE21964'."""
        og = self.setup()
        dna_dict = og._get_og_dict(og)
        self.assertEqual(dna_dict['MOUSE21964'].name, 'MOUSE21964_OG4')

    def test_remove_species_records(self):
        """Removing 'MOUSE' leaves four entries in each returned collection."""
        og = self.setup()
        og_wo_mouse = og.remove_species_records('MOUSE')
        self.assertEqual(len(og_wo_mouse[0]), 4)
        self.assertEqual(len(og_wo_mouse[1]), 4)

    def test_get_species_id(self):
        """Both the DNA and the AA record resolve to species id 'MOUSE'."""
        og = self.setup()
        dna = og.dna[0]
        aa = og.aa[0]
        self.assertEqual(og._get_species_id(dna), 'MOUSE')
        self.assertEqual(og._get_species_id(aa), 'MOUSE')


if __name__ == "__main__":
    unittest.main()


# ---------------------------------------------------------------------------
# /tests/test_ogset.py
# ---------------------------------------------------------------------------
import argparse  # was missing: setUp uses argparse.ArgumentParser
import os
import unittest
from read2tree import OGSet

API_URL = 'http://omabrowser.org/api'


class OGSetTest(unittest.TestCase):
    """Placeholder test-suite for ``OGSet``; most tests are unimplemented."""

    def setUp(self):
        """Build the ``OGSet`` fixture from a fixed command line.

        unittest discards the return value of ``setUp``, so the fixture is
        stored on ``self.ogset`` for the tests to use.
        """
        arg_parser = argparse.ArgumentParser(prog='read2tree')

        arg_parser.add_argument(
            '--reads', nargs='+', default=None,
            help='Reads to be mapped to reference. If paired end '
                 'add separated by space.')
        arg_parser.add_argument(
            '--read_split_length', type=int, default=400,
            help='Set read split length.')
        arg_parser.add_argument(
            '--read_split_overlap', type=int, default=50,
            help='Set read split length overlap.')
        arg_parser.add_argument(
            '-s', '--species_name', default=None,
            help='[Default is name of read] Name of species '
                 'for mapped sequence.')

        argv = ['--reads', 'tests/data/reads/test.fq']

        args = arg_parser.parse_args(argv)
        # NOTE(review): ``from read2tree import OGSet`` may bind the
        # ``read2tree.OGSet`` module rather than a class, in which case this
        # call fails -- confirm against the package's ``__init__``.
        self.ogset = OGSet(args)

    def test_OGSet(self):
        raise NotImplementedError

    def test_marker_genes_input(self):
        raise NotImplementedError

    def test_omastandalone_input(self):
        raise NotImplementedError

    def test_output_folder_structure(self):
        raise NotImplementedError

    def test_species_removal(self):
        raise NotImplementedError

    def test_species_removal_after_mapping(self):
        raise NotImplementedError

    def test_rest_api_connection(self):
        # NOTE(review): bare attribute access with no assertion; it only
        # checks that ``OGSet._read`` can be looked up.
        OGSet._read

    def test_rest_api_dna_downlaod(self):
        raise NotImplementedError


if __name__ == "__main__":
    unittest.main()
# ---------------------------------------------------------------------------
# /tests/test_reads.py
# ---------------------------------------------------------------------------
import unittest
import os
import gzip
import argparse
from Bio import SeqIO
from read2tree.Reads import Reads
from read2tree.FastxReader import FastxReader
from read2tree.main import parse_args
from read2tree._utils import exe_name
dirname = os.path.dirname(__file__)


class ReadTest(unittest.TestCase):
    """Tests for ``Reads`` using the fixtures under ``data/reads``.

    All fixture paths are relative to the current working directory.
    """

    def _reads_from_argv(self, argv):
        """Parse ``argv`` with read2tree's own parser and wrap it in ``Reads``."""
        args = parse_args(argv, exe_name(), '')
        return Reads(args)

    def setup_long_reads(self, split=False):
        """Return a ``Reads`` fixture for the single-file ``test.fq.gz`` input.

        :param split: when true, the command line additionally carries the
            split and sampling options listed below.
        """
        argv = ['--output_path', 'data/output',
                '--reads', 'data/reads/test.fq.gz']
        if split:
            argv += ['--split_reads', '--split_overlap', '50',
                     '--split_len', '400', '--sample_reads',
                     '--coverage', '10', '--genome_len', '1000']
        return self._reads_from_argv(argv)

    def setup_reads_paired(self, sampling=False):
        """Return a ``Reads`` fixture for the paired ``test_1a``/``test_2a`` input.

        :param sampling: when true, the command line additionally carries the
            sampling options listed below.
        """
        argv = ['--output_path', 'data/output',
                '--reads', 'data/reads/test_1a.fq.gz',
                'data/reads/test_2a.fq.gz']
        if sampling:
            argv += ['--sample_reads', '--coverage', '10',
                     '--genome_len', '1000']
        return self._reads_from_argv(argv)

    def test_split(self):
        """``_split_len_overlap`` with window 10 and overlap 0."""
        test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGT'
        expected = ['ACGTTTTTTG', 'GAAGAGTTAG', 'AGATTTTTAG', 'AGAGGAGGGG',
                    'GAGGAGGGGT']
        reads = self.setup_long_reads()
        obtained = reads._split_len_overlap(test_seq, 10, 0)
        self.assertEqual(expected, obtained)

    def test_splitOverlap(self):
        """``_split_len_overlap`` with window 10 and overlap 5."""
        test_seq = 'ACGTTTTTTGGAAGAGTTAGAGATTTTTAGAGAGGAGGGGTTT'
        expected = ['ACGTTTTTTG', 'TTTTGGAAGA', 'GAAGAGTTAG', 'GTTAGAGATT',
                    'AGATTTTTAG', 'TTTAGAGAGG', 'AGAGGAGGGG', 'GGAGGGGTTT']
        reads = self.setup_long_reads()
        obtained = reads._split_len_overlap(test_seq, 10, 5)
        self.assertEqual(expected, obtained)

    def test_get_4_line_fastq_string(self):
        """A read id, sequence and quality string render as a 4-line record."""
        reads = self.setup_long_reads()
        expected = '@SRR00001 length=16\nACGTTTGGGAAGGTTT\n+SRR00001 ' \
                   'length=16\n????????????????\n'
        read_id = 'SRR00001'
        seq = 'ACGTTTGGGAAGGTTT'
        qual = '????????????????'
        name = reads._get_4_line_fastq_string(read_id, seq, qual, x=0)
        self.assertEqual(name, expected)

    def test_read_num_split(self):
        """The long-read fixture is counted as 18 reads."""
        reads = self.setup_long_reads(split=True)
        num_reads = reads._get_num_reads('data/reads/test.fq.gz')
        self.assertEqual(num_reads, 18)

    def test_read_len_split(self):
        """``_get_read_len`` on the long-read fixture yields 400."""
        reads = self.setup_long_reads(split=True)
        len_reads = reads._get_read_len('data/reads/test.fq.gz', 1000)
        self.assertEqual(len_reads, 400)

    def test_read_num_paired(self):
        """The first paired fixture file is counted as 1000 reads."""
        reads = self.setup_reads_paired()
        num_reads = reads._get_num_reads('data/reads/test_1a.fq.gz')
        self.assertEqual(num_reads, 1000)

    def test_read_len_paired(self):
        """``_get_read_len`` on the first paired fixture file yields 151.0."""
        reads = self.setup_reads_paired()
        read_len = reads._get_read_len('data/reads/test_1a.fq.gz', 1000)
        self.assertEqual(read_len, 151.0)

    def test_read_num_by_coverage_paired(self):
        """Coverage-based read count for the paired fixture is 34."""
        reads = self.setup_reads_paired(sampling=True)
        num_reads = reads._get_num_reads_by_coverage(
            'data/reads/test_1a.fq.gz', 1000)
        self.assertEqual(num_reads, 34)

    def test_read_num_by_coverage_split(self):
        """Coverage-based read count for the long-read fixture is 25."""
        reads = self.setup_long_reads(split=True)
        # NOTE(review): this call passes a list while the paired variant
        # passes a plain string -- confirm which form the method expects.
        num_reads = reads._get_num_reads_by_coverage(
            ['data/reads/test.fq.gz'], 1000)
        self.assertEqual(num_reads, 25)

    def test_read_vec_paired(self):
        """The random-read vector for the paired fixture has 34 entries."""
        reads = self.setup_reads_paired(sampling=True)
        read_vector = reads._get_vector_random_reads(
            'data/reads/test_1a.fq.gz')
        self.assertEqual(len(read_vector), 34)


if __name__ == "__main__":
    unittest.main()


# ---------------------------------------------------------------------------
# /tests/test_use.py
# ---------------------------------------------------------------------------
import unittest
import argparse
import warnings
warnings.filterwarnings('ignore')
from read2tree.Progress import Progress
from read2tree.stats.Coverage import Coverage
from read2tree.stats.SeqCompleteness import SeqCompleteness
import os


class Use(unittest.TestCase):
    """Placeholder end-to-end tests.

    The original methods had no body at all, which made the file fail to
    parse.  Each now raises ``NotImplementedError``, the same placeholder
    convention used in tests/test_ogset.py.
    """

    def test_OGSet(self):
        raise NotImplementedError

    def test_write_progress(self):
        raise NotImplementedError

    def test_read_progress(self):
        raise NotImplementedError


if __name__ == "__main__":
    unittest.main()