├── LICENSE ├── MANIFEST.in ├── README.md ├── _config.yml ├── _data └── navigation.yml ├── bin └── aTRAM-master │ ├── .gitignore │ ├── .travis.yml │ ├── LICENSE.txt │ ├── README.md │ ├── atram.py │ ├── atram_framer.py │ ├── atram_preprocessor.py │ ├── atram_stitcher.py │ ├── doc │ ├── atram_preprocessor_reference.md │ ├── atram_preprocessor_tutorial.md │ ├── atram_reference.md │ ├── atram_stitcher_reference.md │ ├── atram_stitcher_tutorial.md │ ├── atram_tutorial.md │ ├── images │ │ ├── atram_preprocessor.odg │ │ ├── atram_preprocessor.png │ │ ├── atram_step_1.odg │ │ ├── atram_step_1.png │ │ ├── atram_step_2.odg │ │ ├── atram_step_2.png │ │ ├── atram_step_3.odg │ │ ├── atram_step_3.png │ │ ├── atram_step_4.odg │ │ ├── atram_step_4.png │ │ ├── atram_stitcher.odg │ │ └── atram_stitcher.png │ ├── introduction.md │ ├── tips.md │ └── troubleshooting.md │ ├── environment.yml │ ├── lib │ ├── __init__.py │ ├── assembler.py │ ├── assemblers │ │ ├── __init__.py │ │ ├── abyss.py │ │ ├── base.py │ │ ├── none.py │ │ ├── spades.py │ │ ├── trinity.py │ │ └── velvet.py │ ├── bio.py │ ├── blast.py │ ├── core_atram.py │ ├── core_framer.py │ ├── core_preprocessor.py │ ├── core_stitcher.py │ ├── db.py │ ├── db_atram.py │ ├── db_preprocessor.py │ ├── db_stitcher.py │ ├── exonerate.py │ ├── log.py │ └── util.py │ ├── requirements.txt │ ├── setup.py │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── lib │ │ ├── __init__.py │ │ ├── assemblers │ │ │ ├── __init__.py │ │ │ └── test_base.py │ │ ├── test_bio.py │ │ ├── test_blast.py │ │ ├── test_core_atram.py │ │ ├── test_core_preprocessor.py │ │ ├── test_db.py │ │ └── test_log.py │ └── requirements-test.txt │ ├── util_atram_db_to_fasta.py │ └── util_check_requirements.py ├── docs ├── .DS_Store ├── .ipynb_checkpoints │ ├── align_contigs-checkpoint.ipynb │ ├── align_paralogs-checkpoint.ipynb │ ├── cleaning_trimming-checkpoint.ipynb │ ├── contig_assembly-checkpoint.ipynb │ ├── extract_contigs-checkpoint.ipynb │ ├── locus_selection-checkpoint.ipynb │ ├── phasing-checkpoint.ipynb │ ├── phylogeny_msc-checkpoint.ipynb │ └── reference_assembly-checkpoint.ipynb ├── _config.yml ├── _data │ └── navigation.yml ├── documentation │ ├── .DS_Store │ ├── main_doc.html │ ├── subdocs │ │ ├── align_contigs.html │ │ ├── align_paralogs.html │ │ ├── cleaning_trimming.html │ │ ├── contig_assembly.html │ │ ├── extract_contigs.html │ │ ├── locus_selection.html │ │ ├── phasing.html │ │ ├── phylogeny_msc.html │ │ └── reference_assembly.html │ └── tutorial.html ├── index.md ├── markdown │ ├── main_doc.html │ ├── main_doc.ipynb │ ├── main_doc.md │ ├── tutorial.html │ ├── tutorial.md │ └── tutorial.pdf └── notebook │ ├── .ipynb_checkpoints │ └── main_doc-checkpoint.ipynb │ ├── main_doc.ipynb │ └── subdocs │ ├── align_contigs.ipynb │ ├── align_paralogs.ipynb │ ├── cleaning_trimming.ipynb │ ├── contig_assembly.ipynb │ ├── extract_contigs.ipynb │ ├── locus_selection.ipynb │ ├── phasing.ipynb │ ├── phylogeny_msc.ipynb │ └── reference_assembly.ipynb ├── images ├── .DS_Store ├── exon_vs_contig_based_assembly.pdf ├── exon_vs_contig_based_assembly.png ├── paralog_contig_alignment.png ├── reads_to_contig.jpg ├── secapr_logo.png ├── secapr_logo_small.png ├── secapr_workflow.png ├── stacey_phylogeny.jpg ├── stacey_phylogeny.pdf └── wrong_contig.JPG ├── recipe ├── install_secapr_env.sh ├── meta.yaml ├── secapr.yml └── secapr │ ├── channeldata.json │ ├── index.html │ ├── meta.yaml │ └── noarch │ ├── current_repodata.json │ ├── current_repodata.json.bz2 │ ├── index.html │ ├── repodata.json │ ├── 
repodata.json.bz2 │ ├── repodata_from_packages.json │ └── repodata_from_packages.json.bz2 ├── secapr ├── .DS_Store ├── __init__.py ├── __main__.py ├── _version.py ├── add_missing_sequences.py ├── align_sequences.py ├── assemble_reads.py ├── automate_all.py ├── clean_reads.py ├── clean_reads_old.py ├── concatenate_alignments.py ├── create_consensus_from_alleles.py ├── extract_alignments_from_phyluce_get_inf_sites_output.py ├── find_target_contigs.py ├── helpers.py ├── join_exons.py ├── locus_selection.py ├── merge_probes.py ├── mpileup_fasta.py ├── paralogs_to_ref.py ├── phase_alleles.py ├── phase_alleles_provide_reference_fasta.py ├── plot_sequence_yield.py ├── process_pileup.py ├── quality_check.py ├── reference_assembly.py ├── remove_uninformative_seqs.py ├── utils.py └── varscan_vcf_2_fasta.py ├── setup.cfg ├── setup.py ├── src ├── align_paralogs.py ├── apply_read_thres_select_best_loci.py ├── check_avg_contig_length.py ├── check_target_contig_matches.py ├── estimate_pop_parameters.py ├── extract_longest_contig.py ├── fastqc_visualization.r ├── find_good_loci_for_each_sample_from_readcov_info.py ├── get_stats_from_log_files.py ├── heatmap_plot.py ├── merge_baits_for_each_locus.py ├── plot_contig_data_function.py ├── plot_contig_length_overview.py ├── plot_exon_alignment_yield.py ├── plot_exon_contig_yield.py ├── plot_exon_read_coverage.py ├── plot_exon_yield_all_datatypes.py ├── plot_quality_test_results.py ├── plotting_function_final.py ├── remove_short_contigs.py └── simmatrix_geonoma_allele_data.R └── versioneer.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2017 Tobias Hofmann 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include secapr/_version.py 3 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /_data/navigation.yml: -------------------------------------------------------------------------------- 1 | main: 2 | - title: "Quick-Start Guide" 3 | url: /docs/quick-start-guide/ 4 | - title: "Posts" 5 | url: /year-archive/ 6 | - title: "Categories" 7 | url: /categories/ 8 | - title: "Tags" 9 | url: /tags/ 10 | - title: "Pages" 11 | url: /page-archive/ 12 | - title: "Collections" 13 | url: /collection-archive/ 14 | - title: "External Link" 15 | url: https://google.com 16 | -------------------------------------------------------------------------------- /bin/aTRAM-master/.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *.pyc 3 | *.swp 4 | __pycache__ 5 | 6 | admin/ 7 | args/ 8 | atram_db/ 9 | build/ 10 | data/ 11 | db/ 12 | dist/ 13 | input/ 14 | old/ 15 | output/ 16 | query/ 17 | temp/ 18 | venv/ 19 | 20 | junk* 21 | 22 | .cache 23 | .coverage 24 | .hypothesis 25 | .pytest_cache 26 | 27 | .idea 28 | .mypy_cache/ 29 | .ropeproject/ 30 | .spyproject/ 31 | .vscode 32 | 33 | .ipynb_checkpoints/ 34 | .~lock* 35 | -------------------------------------------------------------------------------- /bin/aTRAM-master/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | install: "pip install -r tests/requirements-test.txt" 5 | script: "pytest tests" 6 | -------------------------------------------------------------------------------- /bin/aTRAM-master/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Julie M. Allen 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the <organization> nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED.
IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /bin/aTRAM-master/README.md: -------------------------------------------------------------------------------- 1 | # aTRAM: automated Target Restricted Assembly Method [![Build Status](https://travis-ci.org/juliema/aTRAM.svg?branch=master)](https://travis-ci.org/juliema/aTRAM) 2 | 3 | - [Background](#Background) 4 | - [Installation](#Installation) 5 | - [Quick start](#Quick-start) 6 | - [Documentation](#Documentation) 7 | 8 | ## Background 9 | 10 | aTRAM ("automated target restricted assembly method") is an iterative assembler 11 | that performs reference-guided local de novo assemblies using a variety of 12 | available methods. It is well-suited to various tasks where Next-Generation 13 | Sequencing (NGS) data needs to be queried for gene sequences, such as 14 | phylogenomics. The design philosophy is modular and expandable, with support 15 | for four de novo assemblers to date: Velvet, Abyss, Trinity, and Spades. 16 | 17 | aTRAM 2 is a major overhaul of the aTRAM approach to assembling loci from NGS 18 | data. The new code has been reimplemented in Python, and the approach to short 19 | read library construction is completely revamped, resulting in major 20 | performance and assembly improvements. 21 | 22 | Please consult the reference below for more information about aTRAM 1.0: 23 | `Allen, JM, DI Huang, QC Cronk, KP Johnson. 2015. aTRAM automated target 24 | restricted assembly method a fast method for assembling loci across divergent 25 | taxa from next-generation sequencing data. BMC Bioinformatics 16:98 26 | DOI 10.1186/s12859-015-0515-2` 27 | 28 | The reference for aTRAM 2.0: 29 | `Allen J.M., R. LaFrance, R. A. Folk, K. P. Johnson, and R. P. Guralnick 30 | In Press. aTRAM 2.0: An improved, flexible locus assembler for NGS data. 31 | Evolutionary Informatics` 32 | 33 | ## Installation 34 | 35 | You will need to have Python3 installed, as well as pip, a package manager for 36 | Python. 37 | 38 | ```bash 39 | git clone https://github.com/juliema/aTRAM.git 40 | pip install --user --requirement aTRAM/requirements.txt 41 | ``` 42 | 43 | ### aTRAM uses these programs, so you need to install them.
44 | 45 | You will need to use a locally installed BLAST: 46 | 47 | - [BLAST]( 48 | http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download), 49 | version 2.7.1 50 | 51 | You will also need at least one of the supported assembly modules: 52 | 53 | - [Velvet](https://www.ebi.ac.uk/~zerbino/velvet/) 54 | - [Trinity](http://trinityrnaseq.github.io/), version 2.5.1 55 | - [Abyss](http://www.bcgsc.ca/platform/bioinfo/software/abyss), version 2.0.2 56 | - [SPAdes](http://cab.spbu.ru/software/spades/), version 3.11.1 57 | 58 | If you want to use the atram stitcher you will need to install exonerate: 59 | 60 | - [exonerate]( 61 | https://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate-user-guide) 62 | 63 | ### Installation using [`conda`](https://www.anaconda.com/distribution/): 64 | 65 | Alternatively, you can install both dependencies and `aTRAM` by using `conda`. Inside the `aTRAM` directory, run the following: 66 | 67 | ```bash 68 | conda env create -f environment.yml 69 | conda activate aTRAM 70 | ``` 71 | 72 | ## Quick start 73 | 74 | Note: aTRAM 2 is not backwards compatible with aTRAM 1. It is also best to 75 | rebuild any libraries after major updates. 76 | 77 | ### Library Preparation 78 | 79 | Use `atram_preprocessor.py` for this. 80 | 81 | - Define your new library name with the --blast-db option, which consists of a 82 | path and the library prefix itself. This program will add suffixes to 83 | differentiate the database files. 84 | 85 | - Then give it your fastq files. You can either list the forward and reverse 86 | read files, or put them into one file and use the --mixed-ends option. 87 | 88 | Under the hood, aTRAM is building BLAST databases and an SQLite3 database for 89 | rapid read retrieval. 90 | 91 | ```bash 92 | atram_preprocessor.py \ 93 | --blast-db=path_to_atram_library/LIBRARY_PREFIX \ 94 | --end-1=path_to_reads/read_1.fastq \ 95 | --end-2=path_to_reads/read_2.fastq 96 | ``` 97 | 98 | ### Assembling Loci 99 | 100 | `atram.py` uses the databases built by `atram_preprocessor.py` to assemble 101 | loci. 102 | 103 | - You need to give it the same --blast-db option from the preprocessor. 104 | - You also need to give it a query sequence. The query sequence is a FASTA 105 | file. 106 | - An assembler choice: one of the assemblers mentioned 107 | above (velvet, trinity, abyss, or spades). 108 | - And an output prefix. The `--output-prefix` works just like the 109 | `--blast-db` prefix, with the directory part and the library prefix itself. 110 | 111 | ```bash 112 | atram.py \ 113 | --blast-db=path_to_atram_library/LIBRARY_PREFIX \ 114 | --query=path_to_reference_loci/Locus.fasta \ 115 | --assembler=ASSEMBLER_CHOICE \ 116 | --output-prefix=path_to_output/OUTPUT_PREFIX 117 | ``` 118 | 119 | ### Stitching genes from assembled loci 120 | 121 | `atram_stitcher.py` takes the output assemblies from `atram.py` and reference 122 | amino acid targets and then stitches them together using an iterative process. 123 | 124 | - Give it a directory containing the assemblies. 125 | - A set of reference amino acid sequences in a FASTA file. 126 | - A list of taxon names, one taxon per line (a minimal example is shown below).
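As a minimal sketch, the taxon file is just a plain-text list (the names below are placeholders; substitute your own taxa):

```
sample1
sample2
sample3
```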
127 | 128 | ```bash 129 | atram_stitcher.py \ 130 | --assemblies-dir=path_to_assemblies \ 131 | --reference-genes=path_to_genes/ref_genes.fasta \ 132 | --taxa=path_to/taxon_list.txt 133 | ``` 134 | 135 | ## Documentation 136 | 137 | - [aTRAM Overview](doc/introduction.md) 138 | - [Tips for using aTRAM](doc/tips.md) 139 | - [Troubleshooting](doc/troubleshooting.md) 140 | 141 | ### Tutorials 142 | - [Pre-processing tutorial](doc/atram_preprocessor_tutorial.md) 143 | - [aTRAM tutorial](doc/atram_tutorial.md) 144 | - [Stitcher tutorial](doc/atram_stitcher_tutorial.md) 145 | 146 | ### Program Reference 147 | - [atram_preprocessor.py](doc/atram_preprocessor_reference.md) 148 | - [atram.py](doc/atram_reference.md) 149 | - [atram_stitcher.py](doc/atram_stitcher_reference.md) 150 | -------------------------------------------------------------------------------- /bin/aTRAM-master/atram_framer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Start the atram exon framer. 4 | 5 | This wrapper module parses the input arguments and passes them to the module 6 | that does the actual framing (core_framer.py). 7 | """ 8 | 9 | from os.path import join 10 | from datetime import date 11 | import argparse 12 | import textwrap 13 | import lib.db as db 14 | import lib.util as util 15 | import lib.core_framer as framer 16 | 17 | 18 | def parse_command_line(): 19 | """Process command-line arguments.""" 20 | description = """ 21 | This program will align contigs to a reference sequence and put them 22 | into the correct reading frame. 23 | """ 24 | 25 | parser = argparse.ArgumentParser( 26 | fromfile_prefix_chars='@', 27 | formatter_class=argparse.RawDescriptionHelpFormatter, 28 | description=textwrap.dedent(description)) 29 | 30 | parser.add_argument('--version', action='version', 31 | version='%(prog)s {}'.format(db.ATRAM_VERSION)) 32 | 33 | parser.add_argument( 34 | '-T', '--taxa', metavar='TAXA', required=True, 35 | help="""A text file of all of your taxon names.""") 36 | 37 | parser.add_argument( 38 | '-r', '--reference-genes', '--refs', metavar='FASTA', required=True, 39 | help="""Reference amino acid sequences in a FASTA file.""") 40 | 41 | parser.add_argument( 42 | '-a', '--assemblies-dir', metavar='PATH', required=True, 43 | help="""The path to the DNA contigs.""") 44 | 45 | parser.add_argument( 46 | '-m', '--min-length', metavar='LENGTH', default=100, type=int, 47 | help="""Remove contigs that are less than this length. The default is 48 | 100.""") 49 | 50 | parser.add_argument( 51 | '-t', '--temp-dir', metavar='DIR', 52 | help="""Place temporary files in this directory. All files will be 53 | deleted after aTRAM completes. The directory must exist.""") 54 | 55 | parser.add_argument( 56 | '--keep-temp-dir', action='store_true', 57 | help="""This flag will keep the temporary files in the --temp-dir 58 | around for debugging.""") 59 | 60 | parser.add_argument( 61 | '-l', '--log-file', 62 | help="""Log file (full path). The default is 63 | "atram_framer_<date>.log".""") 64 | parser.add_argument( 65 | '--log-level', choices=['debug', 'info', 'error'], default='info', 66 | help="""Log messages of the given level (or above). 'debug' shows the 67 | most messages and 'error' shows the least. The default is 68 | 'info'""") 69 | 70 | parser.add_argument( 71 | '-o', '--output-prefix', 72 | help="""This is the prefix of all of the output files. So you can 73 | identify different framer output file sets.
You may include a 74 | directory as part of the prefix. The framer will add suffixes to 75 | differentiate output files.""") 76 | 77 | parser.add_argument( 78 | '-f', '--file-filter', default='*.fasta', 79 | help="""Use this to filter files in the assemblies directory. For 80 | example '*filtered*.fasta' will select all fasta files in the 81 | assemblies directory with the word filtered in them. The default 82 | is to select all fasta files in the assemblies directory 83 | '*.fasta'.""") 84 | 85 | parser.add_argument( 86 | '--reference-name', action='store_true', 87 | help="""Prepend the reference name to the final assembled gene name? 88 | If false, the gene name in the reference file will just be the 89 | <gene name>; if you select this, then the assembled gene name 90 | will be <reference name>.<gene name>.""") 91 | 92 | parser.add_argument( 93 | '--long-contig', type=float, default=0.7, 94 | help="""A long contig is considered to be this fraction [0-1] of the 95 | longest contig assembled by exonerate. The default is 0.7.""") 96 | 97 | args = parser.parse_args() 98 | 99 | util.temp_dir_exists(args.temp_dir) 100 | 101 | if not args.output_prefix: 102 | args.output_prefix = join( 103 | '.', 'atram_framer_' + date.today().isoformat()) 104 | 105 | if not args.log_file and args.output_prefix[-1] == '/': 106 | args.log_file = join( 107 | args.output_prefix, 108 | 'atram_framer_' + date.today().isoformat() + '.log') 109 | elif not args.log_file: 110 | args.log_file = args.output_prefix + '.log' 111 | 112 | return args 113 | 114 | 115 | if __name__ == '__main__': 116 | ARGS = parse_command_line() 117 | framer.frame(ARGS) 118 | -------------------------------------------------------------------------------- /bin/aTRAM-master/atram_stitcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Start the atram exon stitcher. 4 | 5 | This wrapper module parses the input arguments and passes them to the module 6 | that does the actual stitching (core_stitcher.py). 7 | """ 8 | 9 | from os.path import join 10 | from datetime import date 11 | import argparse 12 | import textwrap 13 | import lib.db as db 14 | import lib.log as log 15 | import lib.util as util 16 | import lib.core_stitcher as stitcher 17 | 18 | 19 | def parse_command_line(): 20 | """Process command-line arguments.""" 21 | description = """ 22 | This program will find and stitch together exons from targeted 23 | assemblies using amino acid targets and DNA assemblies. 24 | """ 25 | 26 | parser = argparse.ArgumentParser( 27 | fromfile_prefix_chars='@', 28 | formatter_class=argparse.RawDescriptionHelpFormatter, 29 | description=textwrap.dedent(description)) 30 | 31 | parser.add_argument('--version', action='version', 32 | version='%(prog)s {}'.format(db.ATRAM_VERSION)) 33 | 34 | parser.add_argument( 35 | '-T', '--taxa', metavar='TAXA', required=True, 36 | help="""A text file of all of your taxon names.""") 37 | 38 | parser.add_argument( 39 | '-r', '--reference-genes', '--refs', metavar='FASTA', required=True, 40 | help="""Reference amino acid sequences in a FASTA file.""") 41 | 42 | parser.add_argument( 43 | '-a', '--assemblies-dir', metavar='PATH', required=True, 44 | help="""The path to the DNA contigs.""") 45 | 46 | parser.add_argument( 47 | '-O', '--overlap', type=int, default=10, 48 | help="""Contigs must overlap by this many codons before it is 49 | considered a real overlap.""") 50 | 51 | parser.add_argument( 52 | '-t', '--temp-dir', metavar='DIR', 53 | help="""Place temporary files in this directory.
All files will be 54 | deleted after aTRAM completes. The directory must exist.""") 55 | 56 | parser.add_argument( 57 | '--keep-temp-dir', action='store_true', 58 | help="""This flag will keep the temporary files in the --temp-dir 59 | around for debugging.""") 60 | 61 | parser.add_argument( 62 | '-l', '--log-file', 63 | help="""Log file (full path). The default is 64 | "atram_stitcher_<date>.log".""") 65 | parser.add_argument( 66 | '--log-level', choices=['debug', 'info', 'error'], default='info', 67 | help="""Log messages of the given level (or above). 'debug' shows the 68 | most messages and 'error' shows the least. The default is 69 | 'info'""") 70 | 71 | parser.add_argument( 72 | '-i', '--iterations', type=int, default=2, metavar='N', 73 | help="""The number of times to run the main stitcher loop. This 74 | must be either 1 or 2, the default is 2.""") 75 | 76 | parser.add_argument( 77 | '-o', '--output-prefix', 78 | help="""This is the prefix of all of the output files. So you can 79 | identify different stitcher output file sets. You may include a 80 | directory as part of the prefix. The stitcher will add suffixes to 81 | differentiate output files.""") 82 | 83 | parser.add_argument( 84 | '-f', '--file-filter', default='*.fasta', 85 | help="""Use this to filter files in the assemblies directory. For 86 | example '*filtered*.fasta' will select all fasta files in the 87 | assemblies directory with the word filtered in them. The default 88 | is to select all fasta files in the assemblies directory 89 | '*.fasta'.""") 90 | 91 | parser.add_argument( 92 | '--reference-name', action='store_true', 93 | help="""Prepend the reference name to the final assembled gene name? 94 | If false, the gene name in the reference file will just be the 95 | <gene name>; if you select this, then the assembled gene name 96 | will be <reference name>.<gene name>.""") 97 | 98 | args = parser.parse_args() 99 | 100 | util.temp_dir_exists(args.temp_dir) 101 | 102 | if not args.output_prefix: 103 | args.output_prefix = join( 104 | '.', 'atram_stitcher_' + date.today().isoformat()) 105 | 106 | if not args.log_file and args.output_prefix[-1] == '/': 107 | args.log_file = join( 108 | args.output_prefix, 109 | 'atram_stitcher_' + date.today().isoformat() + '.log') 110 | elif not args.log_file: 111 | args.log_file = args.output_prefix + '.log' 112 | 113 | if not 1 <= args.iterations <= 2: 114 | log.fatal('The iterations must be either 1 or 2.') 115 | 116 | return args 117 | 118 | 119 | if __name__ == '__main__': 120 | ARGS = parse_command_line() 121 | stitcher.stitch(ARGS) 122 | -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/atram_preprocessor_reference.md: -------------------------------------------------------------------------------- 1 | # aTRAM Preprocessor 2 | 3 | This script prepares data for use by the atram.py 4 | script. It takes fasta or fastq files of paired-end (or 5 | single-end) sequence reads and creates a set of atram 6 | databases. 7 | 8 | You need to prepare the sequence read archive files so that the 9 | header lines contain only a sequence ID with the optional 10 | paired-end suffix at the end of the header line. The separator 11 | for the optional trailing paired-end suffix may be a space, 12 | a slash "/", a dot ".", or an underscore "_". 13 | 14 | For example: 15 | 16 | >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1 17 | GATTAA... 18 | >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2 19 | ATAGCC... 20 | >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2 21 | CGAAAA...
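If your raw headers carry extra description text after the ID, you can strip them down to this form before running the preprocessor. Below is a minimal sketch (not part of aTRAM itself; the file names are placeholders) that keeps only the first whitespace-separated token of each FASTA header:

```python
#!/usr/bin/env python3
"""Minimal sketch: trim FASTA headers down to a bare sequence ID.

Not part of aTRAM; 'reads_raw.fasta' and 'reads_clean.fasta' are
placeholder file names.
"""

with open('reads_raw.fasta') as src, open('reads_clean.fasta', 'w') as dst:
    for line in src:
        if line.startswith('>'):
            # Keep only the ID token. A trailing paired-end suffix such
            # as "/1" or "_2" is attached to the ID, so it survives.
            dst.write(line.split()[0] + '\n')
        else:
            dst.write(line)
```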
22 | 23 | ## Arguments 24 | 25 | `-h, --help` 26 | 27 | Show this help message and exit. 28 | 29 | `--version` 30 | 31 | Show program's version number and exit. 32 | 33 | `--end-1 FASTA/Q [FASTA/Q ...], -1 FASTA/Q [FASTA/Q ...]` 34 | 35 | Sequence read archive files that have only end 1 36 | sequences. The sequence names do not need an end 37 | suffix, we will assume the suffix is always 1. The 38 | files are in fasta or fastq format. You may enter more 39 | than one file or you may use wildcards. 40 | 41 | If there are end 2 sequences in the file then you will want to use the 42 | `--mixed-ends` argument instead. 43 | 44 | `--end-2 FASTA/Q [FASTA/Q ...], -2 FASTA/Q [FASTA/Q ...]` 45 | 46 | Sequence read archive files that have only end 2 47 | sequences. The sequence names do not need an end 48 | suffix, we will assume the suffix is always 2. The 49 | files are in fasta or fastq format. You may enter more 50 | than one file or you may use wildcards. 51 | 52 | If there are end 1 sequences in the file then you will want to use the 53 | `--mixed-ends` argument instead. 54 | 55 | `--mixed-ends FASTA/Q [FASTA/Q ...], -m FASTA/Q [FASTA/Q ...]` 56 | 57 | Sequence read archive files that have a mix of both 58 | end 1 and end 2 sequences (or single ends). The files 59 | are in fasta or fastq format. You may enter more than 60 | one file or you may use wildcards. 61 | 62 | The sequence names must have a sequence end suffix like "/1" or "_2". 63 | 64 | `--single-ends FASTA/Q [FASTA/Q ...], -0 FASTA/Q [FASTA/Q ...]` 65 | 66 | Sequence read archive files that have only unpaired 67 | sequences. Any sequence suffix will be ignored. The 68 | files are in fasta or fastq format. You may enter more 69 | than one file or you may use wildcards. 70 | 71 | This option will ignore any sequence ends in the sequence name. 72 | 73 | `-b DB, --blast-db DB, --db DB` 74 | 75 | This is the prefix of all of the blast database files. 76 | So you can identify different blast database sets. You 77 | may include a directory as part of the prefix. The 78 | default is "./atram_". 79 | 80 | For example, if you want to keep your database in a directory called 81 | `/home/my_dir/atram_db/`, and you want to identify that these sequences are for 82 | _Canis lupus_, then you might prefix this database with 83 | `/home/my_dir/atram_db/canis_lupus`. aTRAM will create a bunch of files in that 84 | directory with names starting with 'canis_lupus', like 85 | 'canis_lupus.001.blast.nhr' or 'canis_lupus.sqlite.db', etc. This allows you to 86 | keep many aTRAM databases in the same directory. 87 | 88 | 89 | `--cpus CPUS, --processes CPUS, --max-processes CPUS` 90 | 91 | Number of CPU threads to use. On this machine the 92 | default is to use the number of CPUs on your computer minus 4. More 93 | threads can improve the speed but if you use too many you will slow down the 94 | computer for any other use. 95 | 96 | `-t DIR, --temp-dir DIR` 97 | 98 | Place temporary files in this directory. All files 99 | will be deleted after aTRAM completes. The directory 100 | must exist. 101 | 102 | Sometimes using the system temporary directory is not the right option because 103 | aTRAM is filling up that disk, or it is too slow, or you want to debug issues. 104 | Note: If you want to keep the temporary files around for debugging then you 105 | should also use the `--keep-temp-dir` option. 106 | 107 | `--keep-temp-dir` 108 | 109 | This flag will keep the temporary files in the --temp-dir around for debugging.
110 | 111 | `-l LOG_FILE, --log-file LOG_FILE` 112 | 113 | Log file (full path). The default is to use the DB and 114 | program name to come up with a name like 115 | "<DB>_atram_preprocessor.log". 116 | 117 | `-s SHARDS, --shards SHARDS, --number SHARDS` 118 | 119 | Number of blast DB shards to create. The default is to 120 | have each shard contain roughly 250MB of sequence 121 | data. 122 | 123 | `--path PATH` 124 | 125 | If makeblastdb is not in your $PATH then use this to prepend directories to 126 | your path. For instance, if you installed makeblastdb in 127 | `/home/my_dir/bin/makeblastdb`, you could use this option to find it. 128 | 129 | `--fasta` 130 | 131 | Are these fasta files? If you do not specify either --fasta or --fastq then 132 | aTRAM will guess the file type by looking at the last character of the file 133 | name. This option is most useful when you are dealing with compressed files 134 | which will defeat aTRAM's FASTA/Q format guessing algorithm. 135 | 136 | `--fastq` 137 | 138 | Are these fastq files? If you do not specify either --fasta or --fastq then 139 | aTRAM will guess the file type by looking at the last character of the file 140 | name. This option is most useful when you are dealing with compressed files 141 | which will defeat aTRAM's FASTA/Q format guessing algorithm. 142 | 143 | `--gzip` 144 | 145 | Are these gzip files? aTRAM does not try to guess if the file is compressed. 146 | 147 | `--bzip` 148 | 149 | Are these bzip files? aTRAM does not try to guess if the file is compressed. 150 | -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/atram_preprocessor_tutorial.md: -------------------------------------------------------------------------------- 1 | # A tutorial for atram_preprocessor.py 2 | 3 | - [The input FASTA files](#The-input-FASTA-files) 4 | - [Naming the database](#Naming-the-database) 5 | - [Running the preprocessor](#Running-the-preprocessor) 6 | 7 | This program reads a series of related FASTA or FASTQ files and builds an aTRAM 8 | database. This aTRAM database is actually two or more databases 9 | (typically several). 10 | 11 | - An SQLite3 database that holds the contents of the FASTA/FASTQ files in a 12 | format that can be easily and quickly queried. It takes three pieces of 13 | information from the original files: the sequence name, the sequence end 14 | (1, 2, or none), and the sequence itself. 15 | 16 | - A set of BLAST databases. atram.py uses multiple BLAST databases. This 17 | dataset division enables parallelized read queries and greatly improves 18 | performance even for serial queries. 19 | 20 | ![atram_preprocessor.py](images/atram_preprocessor.png "aTRAM pre-processor") 21 | 22 | ## The input FASTA files 23 | 24 | We have provided some input files that we can use to practice using aTRAM in 25 | the `doc/data` directory. For this tutorial, we will be using the 26 | `tutorial_end_1.fasta.gz` and `tutorial_end_2.fasta.gz` files. 27 | 28 | ## Setting up a directory for the aTRAM databases 29 | 30 | "Where do we want to store the built aTRAM databases?" I prefer to keep my 31 | built aTRAM databases in their own directory on a large disk. So, I create an 32 | `atram_db` directory and point aTRAM to that. In Linux Bash this looks like: 33 | 34 | ```bash 35 | mkdir -p /path/to/atram_db 36 | ``` 37 | 38 | ## Naming the database 39 | 40 | The next question is, "What do we want to call our new databases?"
Because the 41 | input files are called `tutorial_end_1.fasta.gz` and `tutorial_end_2.fasta.gz`, 42 | I'm going to call the database `tutorial`, but you may use whatever name you 43 | want. 44 | 45 | ## Running the preprocessor 46 | 47 | We now have everything we need to run the aTRAM preprocessor. 48 | - Where we are storing the aTRAM databases (`/path/to/atram_db/`) 49 | - What the name (prefix) of the aTRAM databases will be (`tutorial`) 50 | - Where the FASTA files are stored (`/path/to/doc/data/`) 51 | - What FASTA files are going to be used for input 52 | (`tutorial_end_1.fasta.gz` & `tutorial_end_2.fasta.gz`) 53 | - and the fact that the input FASTA files are gzipped 54 | 55 | The command becomes: 56 | 57 | ```bash 58 | ./atram_preprocessor.py \ 59 | --blast-db=/path/to/atram_db/tutorial \ 60 | --end-1=/path/to/doc/data/tutorial_end_1.fasta.gz \ 61 | --end-2=/path/to/doc/data/tutorial_end_2.fasta.gz \ 62 | --gzip 63 | ``` 64 | 65 | When you run the command you should see atram_preprocessor log output that looks similar to: 66 | 67 | ``` 68 | 2019-08-16 16:30:30 INFO: ################################################################################ 69 | 2019-08-16 16:30:30 INFO: aTRAM version: v2.2.0 70 | 2019-08-16 16:30:30 INFO: Python version: 3.7.3 (default, Apr 3 2019, 05:39:12) [GCC 8.3.0] 71 | 2019-08-16 16:30:30 INFO: ./atram_preprocessor.py --blast-db=atram_db/tutorial --end-1=doc/data/tutorial_end_1.fasta.gz --end-2=doc/data/tutorial_end_2.fasta.gz --gzip 72 | 2019-08-16 16:30:30 INFO: Loading "doc/data/tutorial_end_1.fasta.gz" into sqlite database 73 | 2019-08-16 16:30:33 INFO: Loading "doc/data/tutorial_end_2.fasta.gz" into sqlite database 74 | 2019-08-16 16:30:36 INFO: Creating an index for the sequence table 75 | 2019-08-16 16:30:36 INFO: Assigning sequences to shards 76 | 2019-08-16 16:30:36 INFO: Making blast DBs 77 | 2019-08-16 16:30:56 INFO: Finished making blast all 1 DBs 78 | ``` 79 | 80 | There is some information about the aTRAM run-time environment. Following that, 81 | we are noting when important events in the database build process happened. 82 | Finally, we timestamp when aTRAM is finished. In this case the "Finished making 83 | blast all 1 DBs" line indicates that the input FASTA files are small and 84 | only one BLAST database shard is needed. When you run this on larger 85 | datasets there will be more. 86 | 87 | ## atram_preprocessor.py output 88 | 89 | Let's look at what is created in the atram_db directory: 90 | - A set of BLAST files starting `tutorial.001.*`. The '001' is the shard count. 91 | - A log file `tutorial.atram_preprocessor.log`. This file contains more data 92 | than the output to the screen and is useful for debugging problems. 93 | - An SQLite3 database that contains the data from the FASTA files that atram.py 94 | needs. The data in this database is tightly coupled to the BLAST databases, so 95 | if there is a need to edit your data you will need to rerun the preprocessor. 96 | 97 | ## Next 98 | 99 | We are now ready to run atram itself. See the 100 | [aTRAM tutorial](atram_tutorial.md). 101 | -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/atram_stitcher_reference.md: -------------------------------------------------------------------------------- 1 | # aTRAM Stitcher 2 | 3 | This program will find and stitch together exons from targeted 4 | assemblies using amino acid targets and DNA assemblies.
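A typical invocation, mirroring the quick-start example in the main README (the paths below are placeholders), looks like:

```bash
atram_stitcher.py \
  --assemblies-dir=path_to_assemblies \
  --reference-genes=path_to_genes/ref_genes.fasta \
  --taxa=path_to/taxon_list.txt
```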
5 | 6 | ## Arguments 7 | 8 | `-h, --help` 9 | 10 | Show this help message and exit. 11 | 12 | `--version` 13 | 14 | Show program's version number and exit. 15 | 16 | `-T TAXA, --taxa TAXA` 17 | 18 | A text file of all of your taxon names. 19 | 20 | `-r FASTA, --reference-genes FASTA, --refs FASTA` 21 | 22 | Reference amino acid sequences in a FASTA file. 23 | 24 | `-a PATH, --assemblies-dir PATH` 25 | 26 | The path to the DNA contigs. 27 | 28 | `-O OVERLAP, --overlap OVERLAP` 29 | 30 | Contigs must overlap by this many codons before it is 31 | considered a real overlap. 32 | 33 | `-t DIR, --temp-dir DIR` 34 | 35 | Place temporary files in this directory. All files 36 | will be deleted after aTRAM completes. The directory 37 | must exist. 38 | 39 | `--keep-temp-dir` 40 | 41 | This flag will keep the temporary files in the --temp- 42 | dir around for debugging. 43 | 44 | `-l LOG_FILE, --log-file LOG_FILE` 45 | 46 | Log file (full path). The default is 47 | "atram_stitcher_<date>.log". 48 | 49 | `-i N, --iterations N` 50 | 51 | The number of times to run the main stitcher loop. 52 | This must be either 1 or 2, the default is 2. 53 | 54 | `-o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX` 55 | 56 | This is the prefix of all of the output files. So you 57 | can identify different stitcher output file sets. You 58 | may include a directory as part of the prefix. The 59 | stitcher will add suffixes to differentiate output 60 | files. 61 | 62 | `-f FILE_FILTER, --file-filter FILE_FILTER` 63 | 64 | Use this to filter files in the assemblies directory. 65 | For example '*filtered*.fasta' will select all fasta 66 | files in the assemblies directory with the word 67 | filtered in them. The default is to select all fasta 68 | files in the assemblies directory '*.fasta'. 69 | 70 | `--reference-name` 71 | 72 | Prepend the reference name to the final assembled gene 73 | name? If false, the gene name in the reference file 74 | will just be the <gene name>; if you select this, then 75 | the assembled gene name will be 76 | <reference name>.<gene name>. 77 | -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/atram_stitcher_tutorial.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/atram_stitcher_tutorial.md -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/atram_tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial for atram.py 2 | 3 | We assume that you have already run the 4 | [Pre-processing tutorial](atram_preprocessor_tutorial.md). If not, go back and 5 | run it because you will need the output for this tutorial.
6 | 7 | ```bash 8 | ./atram.py \ 9 | --query=/path/to/doc/data/tutorial-query.pep.fasta \ 10 | --blast-db=/path/to/atram_db/tutorial \ 11 | --output=/path/to/output/tutorial \ 12 | --assembler=velvet 13 | ``` 14 | -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_preprocessor.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_preprocessor.odg -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_preprocessor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_preprocessor.png -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_1.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_1.odg -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_1.png -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_2.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_2.odg -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_2.png -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_3.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_3.odg -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_3.png -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_step_4.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_4.odg -------------------------------------------------------------------------------- 
/bin/aTRAM-master/doc/images/atram_step_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_step_4.png -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_stitcher.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_stitcher.odg -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/images/atram_stitcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/doc/images/atram_stitcher.png -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/tips.md: -------------------------------------------------------------------------------- 1 | # Tips 2 | 3 | - [Assembling multiple genes against a library](#Assembling-multiple-genes-against-a-library) 4 | - [Example of running a shell loop](#Example-of-running-a-shell-loop) 5 | - [Creating an argument file](#Creating-an-argument-file) 6 | - [Backwards compatibility](#Backwards-compatibility) 7 | 8 | 9 | ## Assembling multiple genes against a library 10 | 11 | aTRAM 2.0 can assemble a set of genes against a single library. Create a single 12 | file with multiple fasta-formatted sequences and then simply use `-Q QUERY_SPLIT` 13 | where QUERY_SPLIT is the name of the file you created above. 14 | 15 | ## Example of running a shell loop 16 | 17 | In many cases it is convenient to run aTRAM 2 as a loop, assembling a set of 18 | genes for a set of taxa. These can be set up in two parts, as shown below. 19 | Note that aTRAM 2 has built-in functions supporting assembly of many genes 20 | against a library, as described just above. 21 | 22 | ```bash 23 | # Make aTRAM libraries 24 | array=(sample1 sample2 sample3) 25 | 26 | for a in "${array[@]}"; # Iterate through samples 27 | do 28 | atram_preprocessor.py -c 4 -b path_to_atram_library/lib_${a} path_to_input/${a}_P*.fq 29 | done 30 | ``` 31 | 32 | The part `${a}_P*.fq` will have to be modified to match the name pattern of 33 | your input fastq files. 34 | 35 | Then, supposing we have a set of genes stored in a single file and wish to use 36 | Abyss: 37 | 38 | ```bash 39 | # Assemble genes 40 | array=(sample1 sample2 sample3) 41 | 42 | for a in "${array[@]}"; # Iterate through samples 43 | do 44 | atram.py -b path_to_atram_library/lib_${a} -Q file_name -i 5 --cpus 4 --kmer 64 -o path_to_output/lib_${a}.atram2.fasta --log-file path_to_output/lib_${a}.log -a abyss 45 | done 46 | ``` 47 | 48 | ## Creating an argument file 49 | 50 | There are a lot of arguments to aTRAM and even I don't remember them all. To 51 | help with this, a lot of people create Bash scripts once they have tuned the 52 | arguments for their needs. I prefer to use a slightly different method, an 53 | argument file. This is a text file that lists the arguments to a program, one 54 | argument, in long form, per line.
55 | 56 | For the atram_preprocessor tutorial I would create a file, let's call it 57 | `atram_preprocessor.args`, like so: 58 | 59 | ``` 60 | --blast-db=/path/to/atram_db/tutorial 61 | --end-1=/path/to/doc/data/tutorial_end_1.fasta.gz 62 | --end-2=/path/to/doc/data/tutorial_end_2.fasta.gz 63 | --gzip 64 | ``` 65 | 66 | And then you would use it like this: 67 | 68 | ```bash 69 | atram_preprocessor.py @atram_preprocessor.args 70 | ``` 71 | You can still add command-line arguments. Like so: 72 | 73 | ```bash 74 | atram_preprocessor.py @atram_preprocessor.args --cpus=8 75 | ``` 76 | 77 | ## Backwards compatibility 78 | 79 | For any tools that depend on the output format of aTRAM 1.0, this script will 80 | perform the conversion of fasta headers: 81 | 82 | ``` 83 | for i in $(find . -name "*.fasta"); do 84 | sed 's/.* iteration=/>/g' ${i} | sed 's/ contig_id/.0_contigid/g' | sed 's/contigid.*length_//g' | sed 's/_cov.* score=/_/g' | sed 's/\.[0-9]*$//g' > ${i}.aTRAM1.fasta 85 | done 86 | ``` 87 | -------------------------------------------------------------------------------- /bin/aTRAM-master/doc/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | - [Checking the system configuration](#Checking-the-system-configuration) 4 | - [Log file](#Log-file) 5 | - [Saving temporary data](#Saving-temporary-data) 6 | - [Debugging assembler issues](#Debugging-assembler-issues) 7 | 8 | ## Checking the system configuration 9 | 10 | aTRAM requires some external programs and a set of pip-installed Python 11 | modules. We have provided a utility to check that you have the required 12 | programs installed. 13 | 14 | ```bash 15 | ./util_check_requirements.py 16 | ``` 17 | 18 | ## Log file 19 | 20 | aTRAM log files contain more information than the screen output. This should be 21 | your first stop for debugging aTRAM issues. The log file is appended to on each 22 | run, so you need to scroll to the end of it to see the latest error messages. 23 | 24 | If you don't specify a location for the log file via the 25 | `--log-file=/path/to/log_file.log` option then the default location will be in the 26 | same directory as your database files given by the `--blast-db` option. 27 | 28 | ## Saving temporary data 29 | 30 | It is possible to save the temporary data from an aTRAM run. To do this you 31 | need to add two arguments to atram.py or to atram_preprocessor.py: 32 | 33 | - `--temp-dir=/path/to/existing/directory` 34 | This will create subdirectories underneath the given directory. 35 | 36 | - `--keep-temp-dir` 37 | This instructs aTRAM to not delete the data after it is done. You can then 38 | examine the directory contents or use it for 39 | [further analysis](#Debugging-assembler-issues). 40 | 41 | ## Debugging assembler issues 42 | 43 | *Advanced debugging technique.* 44 | 45 | The log file will almost always have a meaningful error message, but in rare 46 | cases it can be useful to see how the assembler itself behaves. If aTRAM 47 | crashes while running an assembler, we can debug the actual assembler error if 48 | we kept the [temporary data](#Saving-temporary-data). What I do is take the 49 | assembler command that is typically displayed within single quotes `'` in the 50 | aTRAM output and run it directly from the command line. 51 | 52 | **TODO Change to use tutorial arguments** 53 | 54 | For instance, if I ran atram with the following arguments:
55 | ```bash 56 | ./atram.py \ 57 | --query=query/Phum.PHUM003340-PA.pep.fasta \ 58 | --blast-db=db/ptgor \ 59 | --output-prefix=output/ptgor \ 60 | --assembler=velvet 61 | ``` 62 | 63 | I add the temporary directory arguments and rerun atram.py: 64 | ```bash 65 | ./atram.py \ 66 | --query=query/Phum.PHUM003340-PA.pep.fasta \ 67 | --blast-db=db/ptgor \ 68 | --output-prefix=output/ptgor \ 69 | --assembler=velvet \ 70 | --temp-dir=temp \ 71 | --keep-temp-dir 72 | ``` 73 | 74 | And I see the following error message. 75 | ``` 76 | 2019-09-13 15:06:17 ERROR: The assembler failed with error: Command 77 | 'velveth temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle 31 78 | -fasta 79 | -shortPaired '/home/user/work/aTRAM/temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle/paired_1.fasta' 80 | '/home/user/work/aTRAM/temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle/paired_2.fasta' 81 | -short '/home/user/work/aTRAM/temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle/single_1.fasta'' returned non-zero exit status 1. 82 | ``` 83 | 84 | I pick up everything between the outermost single quotes 85 | (The assembler command itself): 86 | ``` 87 | velveth temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle 31 88 | -fasta 89 | -shortPaired '/home/user/work/aTRAM/temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle/paired_1.fasta' 90 | '/home/user/work/aTRAM/temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle/paired_2.fasta' 91 | -short '/home/user/work/aTRAM/temp/atram_dt0jmqy0/ptgor_Phum.PHUM003340-PA.pep.fasta_01_7xy5cnle/single_1.fasta' 92 | ``` 93 | And then paste it into the command line to see what happens to the assembler. 94 | -------------------------------------------------------------------------------- /bin/aTRAM-master/environment.yml: -------------------------------------------------------------------------------- 1 | name: aTRAM 2 | channels: 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python=3 7 | - pip 8 | - blast 9 | - velvet 10 | - trinity 11 | - abyss 12 | - spades 13 | - exonerate 14 | - pip: 15 | - git+https://github.com/juliema/aTRAM.git#egg=atram[complete] 16 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/lib/__init__.py -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assembler.py: -------------------------------------------------------------------------------- 1 | """Base class for the various assembler programs.""" 2 | 3 | import sys 4 | from shutil import which 5 | import textwrap 6 | import psutil 7 | import lib.log as log 8 | from .assemblers.abyss import AbyssAssembler 9 | from .assemblers.spades import SpadesAssembler 10 | from .assemblers.trinity import TrinityAssembler 11 | from .assemblers.velvet import VelvetAssembler 12 | from .assemblers.none import NoneAssembler 13 | 14 | 15 | ASSEMBLERS = { 16 | 'abyss': AbyssAssembler, 17 | 'trinity': TrinityAssembler, 18 | 'velvet': VelvetAssembler, 19 | 'spades': SpadesAssembler, 20 | 'none': NoneAssembler} 21 | 22 | 23 | def factory(args, cxn): 24 | """Return the assembler based upon the configuration options.""" 25 | name = args['assembler'].lower() 26 | assembler = ASSEMBLERS[name] 27 | return 
assembler(args, cxn) 28 | 29 | 30 | def command_line_args(parser): 31 | """Add command-line arguments for the assemblers.""" 32 | group = parser.add_argument_group('optional assembler arguments') 33 | 34 | group.add_argument( 35 | '--no-long-reads', action='store_true', 36 | help="""Do not use long reads during assembly. The default for the assemblers is 37 | to use long reads. So this argument will stop the following: 38 | Abyss: long='LONGREADS' LONGREADS='<file>'; 39 | Trinity: --long_reads '<file>'; 40 | Velvet: -long '<file>'.""") 41 | 42 | group.add_argument( 43 | '--kmer', type=int, default=64, 44 | help="""k-mer size. 45 | Abyss: k=<kmer> (default 64); 46 | Velvet: <kmer> (default 31).""") 47 | 48 | group.add_argument( 49 | '--mpi', action='store_true', 50 | help="""Use MPI for this assembler. The assembler must have been 51 | compiled to use MPI. Abyss: If this is true then pass --cpus 52 | into np=<cpus>.""") 53 | 54 | group.add_argument( 55 | '--abyss-paired-ends', action='store_true', 56 | help="""Abyss: Normally, we put all of the input sequences in to the 57 | -se argument. If this is true then we will put paired end sequences 58 | into the -in argument with any residual single ends into the -se 59 | argument.""") 60 | 61 | group.add_argument( 62 | '--bowtie2', action='store_true', 63 | help="""Use bowtie2 during assembly. Trinity: This will prevent 64 | --no_bowtie from being passed to Trinity.""") 65 | 66 | total_mem = psutil.virtual_memory().available >> 30 67 | max_mem = max(1.0, total_mem >> 1) 68 | group.add_argument( 69 | '--max-memory', default=max_mem, metavar='MEMORY', type=int, 70 | help="""Maximum amount of memory to use in gigabytes. We will use {} 71 | out of {} GB of free/unused memory. 72 | Trinity: --max_memory <memory>G; 73 | Spades: --memory <memory>.""".format(max_mem, total_mem)) 74 | 75 | group.add_argument( 76 | '--exp-coverage', '--expected-coverage', type=int, default=30, 77 | help="""The expected coverage of the region. Velvet: -exp_cov 78 | <coverage> (default 30).""") 79 | 80 | group.add_argument( 81 | '--ins-length', type=int, default=300, 82 | help="""The size of the fragments used in the short-read library. 83 | Velvet: -ins_length <length> (default 300).""") 84 | 85 | group.add_argument( 86 | '--min-contig-length', type=int, default=100, 87 | help="""The minimum contig length used by the assembler itself. 88 | Velvet: -min_contig_lgth <length> (default is 100).""") 89 | 90 | group.add_argument( 91 | '--careful', action='store_true', 92 | help="""Tries to reduce the number of mismatches and short indels. 93 | Spades: --careful.""") 94 | 95 | group.add_argument( 96 | '--cov-cutoff', default='off', 97 | help="""Read coverage cutoff value. Must be a positive float value, 98 | or "auto", or "off". Spades: --cov-cutoff <value>.""") 99 | 100 | group.add_argument( 101 | '--abyss-p', type=int, 102 | help="""Abyss: Minimum sequence identity of a bubble. Enter a single 103 | digit integer [0-9].""") 104 | 105 | 106 | def default_kmer(kmer, assembler): 107 | """Calculate default kmer argument.""" 108 | if assembler == 'velvet' and kmer > 31: 109 | kmer = 31 110 | 111 | return kmer 112 | 113 | 114 | def default_cov_cutoff(cov_cutoff): 115 | """Calculate default coverage cutoff argument.""" 116 | if cov_cutoff in ['off', 'auto']: 117 | return cov_cutoff 118 | 119 | err = ('Read coverage cutoff value.
Must be a positive ' 120 | 'float value, or "auto", or "off"') 121 | try: 122 | value = float(cov_cutoff) 123 | except ValueError: 124 | log.fatal(err) 125 | 126 | if value < 0: 127 | log.fatal(err) 128 | 129 | return cov_cutoff 130 | 131 | 132 | def find_program(assembler_name, program, assembler_arg, option=True): 133 | """Make sure we can find the programs needed by the assembler.""" 134 | if assembler_arg == assembler_name and option and not which(program): 135 | err = (textwrap.dedent(""" 136 | We could not find the "{}" program. You either need to 137 | install it or you need to adjust the PATH environment 138 | variable with the "--path" option so that aTRAM can 139 | find it.""")).format(program) 140 | sys.exit(err) 141 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assemblers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/lib/assemblers/__init__.py -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assemblers/abyss.py: -------------------------------------------------------------------------------- 1 | """Wrapper for the Abyss assembler.""" 2 | 3 | from os.path import realpath 4 | from shutil import copyfile 5 | from .base import BaseAssembler 6 | 7 | 8 | class AbyssAssembler(BaseAssembler): 9 | """Wrapper for the Abyss assembler.""" 10 | 11 | def __init__(self, args, cxn): 12 | """Build the assembler.""" 13 | super().__init__(args, cxn) 14 | self.steps = [self.abyss] 15 | 16 | def abyss(self): 17 | """Build the command for assembly.""" 18 | cmd = ['abyss-pe', 19 | "-C '{}'".format(self.work_path()), 20 | 'E=0', 21 | 'k={}'.format(self.args['kmer'])] 22 | 23 | if self.args.get('abyss_p') is not None: 24 | cmd.append('p={}'.format(self.args['abyss_p'])) 25 | 26 | cmd.append("name='{}'".format(self.file['output'])) 27 | 28 | if self.args['mpi']: 29 | cmd.append('np={}'.format(self.args['cpus'])) 30 | 31 | if self.args.get('abyss_paired_ends'): 32 | if self.file['paired_count']: 33 | cmd.append("in='{} {}'".format( 34 | self.file['paired_1'], self.file['paired_2'])) 35 | single_ends = self.get_single_ends() 36 | if single_ends: 37 | cmd.append("se='{}'".format(' '.join(single_ends))) 38 | else: 39 | in_files = [] 40 | if self.file['paired_count']: 41 | in_files += [self.file['paired_1'], self.file['paired_2']] 42 | in_files += self.get_single_ends() 43 | cmd.append("se='{}'".format(' '.join(in_files))) 44 | 45 | if self.file['long_reads'] and not self.args['no_long_reads']: 46 | cmd.append("long='LONGREADS'") 47 | cmd.append("LONGREADS='{}'".format(self.file['long_reads'])) 48 | 49 | return ' '.join(cmd) 50 | 51 | def post_assembly(self): 52 | """Copy the assembler output into the temp directory.""" 53 | src = realpath(self.file['output'] + '-unitigs.fa') 54 | 55 | copyfile(src, self.file['output']) 56 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assemblers/none.py: -------------------------------------------------------------------------------- 1 | """Null object for the assemblers.""" 2 | 3 | from .. import db_atram 4 | from .. 
import util 5 | from .base import BaseAssembler 6 | 7 | 8 | class NoneAssembler(BaseAssembler): 9 | """Null object for the assemblers.""" 10 | 11 | def __init__(self, args, cxn): 12 | """Build the assembler.""" 13 | super().__init__(args, cxn) 14 | self.steps = [] 15 | self.blast_only = True # Used to short-circuit the assembler 16 | 17 | def write_final_output(self, blast_db, query): 18 | """Output this file if we are not assembling the contigs.""" 19 | prefix = self.final_output_prefix(blast_db, query) 20 | 21 | file_name = '{}.fasta'.format(prefix) 22 | 23 | with open(file_name, 'w') as output_file: 24 | for row in db_atram.get_sra_blast_hits(self.state['cxn'], 1): 25 | util.write_fasta_record( 26 | output_file, row['seq_name'], row['seq'], row['seq_end']) 27 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assemblers/spades.py: -------------------------------------------------------------------------------- 1 | """Wrapper for the Spades assembler.""" 2 | 3 | from os.path import join 4 | import shutil 5 | from .base import BaseAssembler 6 | 7 | 8 | class SpadesAssembler(BaseAssembler): 9 | """Wrapper for the Spades assembler.""" 10 | 11 | def __init__(self, args, cxn): 12 | """Build the assembler.""" 13 | super().__init__(args, cxn) 14 | self.steps = [self.spades] 15 | 16 | def work_path(self): 17 | """ 18 | Create output directory name. 19 | 20 | It has unique requirements. 21 | """ 22 | return join(self.state['iter_dir'], 'spades') 23 | 24 | def spades(self): 25 | """Build the command for assembly.""" 26 | cmd = ['spades.py', 27 | '--only-assembler', 28 | '--threads {}'.format(self.args['cpus']), 29 | '--memory {}'.format(self.args['max_memory']), 30 | '--cov-cutoff {}'.format(self.args['cov_cutoff']), 31 | '-o {}'.format(self.work_path())] 32 | 33 | if self.args['careful']: 34 | cmd.append('--careful') 35 | 36 | if self.file['paired_count']: 37 | cmd.append("--pe1-1 '{}'".format(self.file['paired_1'])) 38 | cmd.append("--pe1-2 '{}'".format(self.file['paired_2'])) 39 | 40 | if self.file['single_1_count']: 41 | cmd.append("--s1 '{}'".format(self.file['single_1'])) 42 | if self.file['single_2_count']: 43 | cmd.append("--s2 '{}'".format(self.file['single_2'])) 44 | if self.file['single_any_count']: 45 | cmd.append("--s3 '{}'".format(self.file['single_any'])) 46 | 47 | return ' '.join(cmd) 48 | 49 | def post_assembly(self): 50 | """Copy the assembler output.""" 51 | src = join(self.work_path(), 'contigs.fasta') 52 | shutil.move(src, self.file['output']) 53 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assemblers/trinity.py: -------------------------------------------------------------------------------- 1 | """Wrapper for the Trinity assembler.""" 2 | 3 | from os.path import join 4 | from shutil import move 5 | from .base import BaseAssembler 6 | 7 | 8 | class TrinityAssembler(BaseAssembler): 9 | """Wrapper for the trinity assembler.""" 10 | 11 | def __init__(self, args, cxn): 12 | """Build the assembler.""" 13 | super().__init__(args, cxn) 14 | self.steps = [self.trinity] 15 | 16 | def work_path(self): 17 | """ 18 | Create output directory name. 19 | 20 | It has unique requirements.
21 | """ 22 | return join(self.state['iter_dir'], 'trinity') 23 | 24 | def trinity(self): 25 | """Build the command for assembly.""" 26 | cmd = ['Trinity', 27 | '--seqType fa', 28 | '--max_memory {}G'.format(self.args['max_memory']), 29 | '--CPU {}'.format(self.args['cpus']), 30 | "--output '{}'".format(self.work_path()), 31 | '--full_cleanup'] 32 | 33 | if not self.args['bowtie2']: 34 | cmd.append('--no_bowtie') 35 | 36 | if self.file['paired_count']: 37 | cmd.append("--left '{}'".format(self.file['paired_1'])) 38 | cmd.append("--right '{}'".format(self.file['paired_2'])) 39 | else: 40 | single_ends = self.get_single_ends() 41 | if single_ends: 42 | cmd.append("--single '{}'".format(','.join(single_ends))) 43 | 44 | if self.file['long_reads'] and not self.args['no_long_reads']: 45 | cmd.append("--long_reads '{}'".format(self.file['long_reads'])) 46 | 47 | return ' '.join(cmd) 48 | 49 | def post_assembly(self): 50 | """Copy the assembler output.""" 51 | src = join(self.state['iter_dir'], 'trinity.Trinity.fasta') 52 | move(src, self.file['output']) 53 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/assemblers/velvet.py: -------------------------------------------------------------------------------- 1 | """Wrapper for the Velvet assembler.""" 2 | 3 | import shutil 4 | from .base import BaseAssembler 5 | 6 | 7 | class VelvetAssembler(BaseAssembler): 8 | """Wrapper for the Velvet assembler.""" 9 | 10 | def __init__(self, args, cxn): 11 | """Build the assembler.""" 12 | super().__init__(args, cxn) 13 | self.steps = [self.velveth, self.velvetg] 14 | 15 | @staticmethod 16 | def parse_contig_id(header): 17 | """Given a fasta header line return the contig ID.""" 18 | return header 19 | 20 | def velveth(self): # noqa 21 | """Build the velveth for the first assembly step.""" 22 | cmd = ['velveth', 23 | '{}'.format(self.work_path()), 24 | '{}'.format(self.args['kmer']), 25 | '-fasta'] 26 | 27 | if self.file['paired_count']: 28 | cmd.append("-shortPaired '{}' '{}'".format( 29 | self.file['paired_1'], self.file['paired_2'])) 30 | 31 | single_ends = [] 32 | if self.file['single_1_count']: 33 | single_ends.append("'{}'".format(self.file['single_1'])) 34 | if self.file['single_2_count']: 35 | single_ends.append("'{}'".format(self.file['single_2'])) 36 | if self.file['single_any_count']: 37 | single_ends.append("'{}'".format(self.file['single_any'])) 38 | if single_ends: 39 | cmd.append("-short {}".format(' '.join(single_ends))) 40 | 41 | if self.file['long_reads'] and not self.args['no_long_reads']: 42 | cmd.append("-long '{}'".format(self.file['long_reads'])) 43 | 44 | return ' '.join(cmd) 45 | 46 | def velvetg(self): 47 | """Build the velvetg for the second assembly step.""" 48 | cmd = ['velvetg', 49 | '{}'.format(self.work_path()), 50 | '-ins_length {}'.format(self.args['ins_length']), 51 | '-exp_cov {}'.format(self.args['exp_coverage']), 52 | '-min_contig_lgth {}'.format(self.args['min_contig_length'])] 53 | 54 | return ' '.join(cmd) 55 | 56 | def post_assembly(self): 57 | """Copy the assembler output.""" 58 | src = self.iter_file('contigs.fa') 59 | shutil.move(src, self.file['output']) 60 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/bio.py: -------------------------------------------------------------------------------- 1 | """Utilities for working with sequences.""" 2 | 3 | import re 4 | from Bio import SeqIO 5 | 6 | CODON_LEN = 3 7 | 8 | COMPLEMENT = 
-------------------------------------------------------------------------------- /bin/aTRAM-master/lib/core_framer.py: -------------------------------------------------------------------------------- 1 | """Put exons into the correct reading frames.""" 2 | 3 | import csv 4 | from collections import defaultdict 5 | from itertools import product 6 | from . import bio 7 | from . import exonerate 8 | from . import db_stitcher as db 9 | from . import log 10 | from . import util 11 | 12 | 13 | def frame(args): 14 | """Frame the exons.""" 15 | log.stitcher_setup(args.log_file, args.log_level) 16 | iteration = 0 17 | 18 | with util.make_temp_dir( 19 | where=args.temp_dir, 20 | prefix='atram_framer_', 21 | keep=args.keep_temp_dir) as temp_dir: 22 | with db.connect(temp_dir, 'atram_framer') as cxn: 23 | cxn.row_factory = lambda c, r: { 24 | col[0]: r[idx] for idx, col in enumerate(c.description)} 25 | exonerate.create_tables(cxn) 26 | 27 | taxon_names = exonerate.get_taxa(args) 28 | exonerate.insert_reference_genes(args, temp_dir, cxn) 29 | exonerate.check_file_counts(args, cxn, taxon_names) 30 | exonerate.create_reference_files(cxn) 31 | 32 | iteration += 1 33 | exonerate.get_contigs_from_fasta( 34 | args, temp_dir, cxn, taxon_names, iteration) 35 | exonerate.contig_file_write(cxn) 36 | exonerate.run_exonerate(temp_dir, cxn, iteration) 37 | 38 | output_contigs(args, cxn) 39 | 40 | log.info('Writing output') 41 | output_summary_per_gene(args, cxn, taxon_names) 42 | output_summary_per_taxon(args, cxn, taxon_names) 43 | 44 | log.info('Finished') 45 | 46 | 47 | def output_contigs(args, cxn): 48 | """Add NNNs to align the contigs to the reference sequence.""" 49 | log.info('Framing contigs') 50 | 51 | for ref in db.select_reference_genes(cxn): 52 | ref_name = ref['ref_name'] 53 | ref_len = len(ref['ref_seq']) * bio.CODON_LEN 54 | 55 | names_seen = defaultdict(int) 56 | 57 | out_path = util.prefix_file( 58 | args.output_prefix, '{}.fasta'.format(ref_name)) 59 | 60 | with open(out_path, 'w') as out_file: 61 | 62 | for contig in db.select_exonerate_ref_gene( 63 | cxn, ref_name, args.min_length): 64 | 65 | contig_name = exonerate.handle_duplicate_name( 66 | contig['contig_name'], names_seen) 67 | 68 | seq = 'N' * (contig['beg'] * bio.CODON_LEN) 69 | seq += contig['seq'] 70 | seq += 'N' * (ref_len - len(seq)) 71 | util.write_fasta_record(out_file, contig_name, seq) 72 | 73 | 74 | def output_summary_per_gene(args, cxn, taxon_names): 75 | """Print per gene summary statistics.""" 76 | longest = max(db.select_longest(cxn), 1) 77 | lengths = db.select_seq_lengths(cxn) 78 | 79 | counts = {t: {'total': set(), 'long': set()} for t in taxon_names} 80 | 81 | for length in lengths: 82 | taxon_name = length['taxon_name'] 83 | ref_name = length['ref_name'] 84 |
counts[taxon_name]['total'].add(ref_name) 85 | fraction = length['len'] / longest 86 | if fraction >= args.long_contig: 87 | counts[taxon_name]['long'].add(ref_name) 88 | 89 | out_path = util.prefix_file( 90 | args.output_prefix, 'summary_stats_per_ref_gene.csv') 91 | with open(out_path, 'w') as out_file: 92 | writer = csv.writer(out_file) 93 | writer.writerow(['Taxon', 94 | 'Total_Genes', 95 | 'Total_Genes_>={:0.2}'.format(args.long_contig)]) 96 | for taxon, count in counts.items(): 97 | writer.writerow([taxon, len(count['total']), len(count['long'])]) 98 | 99 | 100 | def output_summary_per_taxon(args, cxn, taxon_names): 101 | """Print per taxon summary statistics.""" 102 | longest = max(db.select_longest(cxn), 1) 103 | lengths = db.select_seq_lengths(cxn) 104 | ref_names = [r['ref_name'] for r in db.select_reference_genes(cxn)] 105 | 106 | counts = {c: {'total': 0, 'long': 0} 107 | for c in product(taxon_names, ref_names)} 108 | 109 | for length in lengths: 110 | taxon_name = length['taxon_name'] 111 | ref_name = length['ref_name'] 112 | key = (taxon_name, ref_name) 113 | counts[key]['total'] += 1 114 | fraction = length['len'] / longest 115 | if fraction >= args.long_contig: 116 | counts[key]['long'] += 1 117 | 118 | out_path = util.prefix_file( 119 | args.output_prefix, 'summary_stats_per_taxon.csv') 120 | with open(out_path, 'w') as out_file: 121 | writer = csv.writer(out_file) 122 | writer.writerow(['Taxon', 123 | 'Gene', 124 | 'Total_Contigs', 125 | 'Total_Contigs_>={:0.2}'.format(args.long_contig)]) 126 | for key, count in counts.items(): 127 | writer.writerow([key[0], key[1], count['total'], count['long']]) 128 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/core_preprocessor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Format the data so that it can be used later by atram itself. 3 | 4 | It takes sequence read archive (SRA) files and converts them into coordinated 5 | blast and sqlite3 databases. 6 | """ 7 | 8 | import multiprocessing 9 | import sys 10 | from os.path import basename, join, splitext 11 | 12 | from Bio.SeqIO.FastaIO import SimpleFastaParser 13 | from Bio.SeqIO.QualityIO import FastqGeneralIterator 14 | 15 | from . import blast, db, db_preprocessor, log, util 16 | 17 | 18 | def preprocess(args): 19 | """Build the databases required by atram.""" 20 | log.setup(args['log_file'], args['log_level'], args['blast_db']) 21 | 22 | with util.make_temp_dir( 23 | where=args['temp_dir'], 24 | prefix='atram_preprocessor_', 25 | keep=args['keep_temp_dir']) as temp_dir: 26 | util.update_temp_dir(temp_dir, args) 27 | 28 | with db.connect(args['blast_db'], clean=True) as cxn: 29 | db_preprocessor.create_metadata_table(cxn, args) 30 | 31 | db_preprocessor.create_sequences_table(cxn) 32 | load_seqs(args, cxn) 33 | 34 | log.info('Creating an index for the sequence table') 35 | db_preprocessor.create_sequences_index(cxn) 36 | 37 | create_all_blast_shards(args, cxn, args['shard_count']) 38 | 39 | 40 | def load_seqs(args, cxn): 41 | """Load sequences from fasta/fastq files into the atram database.""" 42 | # We have to clamp the end suffix depending on the file type.
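# For example (editor's note): with ('end_1', '1') a title such as 'read99/1'
# or a bare 'read99' is stored with seq_end '1', while ('mixed_ends', '')
# passes an empty clamp so blast.parse_fasta_title() takes the end ('1' or
# '2') from the title itself when one is present.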
43 | for (ends, clamp) in [('mixed_ends', ''), ('end_1', '1'), 44 | ('end_2', '2'), ('single_ends', '')]: 45 | if args.get(ends): 46 | for file_name in args[ends]: 47 | load_one_file(args, cxn, file_name, ends, clamp) 48 | 49 | 50 | def load_one_file(args, cxn, file_name, ends, seq_end_clamp=''): 51 | """Load sequences from a fasta/fastq file into the atram database.""" 52 | log.info('Loading "{}" into sqlite database'.format(file_name)) 53 | 54 | parser = get_parser(args, file_name) 55 | 56 | with util.open_file(args, file_name) as sra_file: 57 | batch = [] 58 | 59 | for rec in parser(sra_file): 60 | title = rec[0].strip() 61 | seq = rec[1] 62 | seq_name, seq_end = blast.parse_fasta_title( 63 | title, ends, seq_end_clamp) 64 | 65 | batch.append((seq_name, seq_end, seq)) 66 | 67 | if len(batch) >= db.BATCH_SIZE: 68 | db_preprocessor.insert_sequences_batch(cxn, batch) 69 | batch = [] 70 | 71 | db_preprocessor.insert_sequences_batch(cxn, batch) 72 | 73 | 74 | def get_parser(args, file_name): 75 | """Get either a fasta or fastq file parser.""" 76 | is_fastq = util.is_fastq_file(args, file_name) 77 | return FastqGeneralIterator if is_fastq else SimpleFastaParser 78 | 79 | 80 | def create_all_blast_shards(args, cxn, shard_count): 81 | """ 82 | Assign processes to make the blast DBs. 83 | 84 | One process for each blast DB shard. 85 | """ 86 | log.info('Making blast DBs') 87 | db_preprocessor.aux_db(cxn, args['temp_dir']) 88 | db_preprocessor.create_seq_names_table(cxn) 89 | 90 | with multiprocessing.Pool(processes=args['cpus']) as pool: 91 | results = [] 92 | for shard_idx in range(shard_count): 93 | fasta_path = fill_blast_fasta(args, cxn, shard_count, shard_idx) 94 | results.append(pool.apply_async( 95 | create_one_blast_shard, 96 | (args, fasta_path, shard_idx))) 97 | 98 | all_results = [result.get() for result in results] 99 | db_preprocessor.aux_detach(cxn) 100 | log.info('Finished making all {} blast DBs'.format(len(all_results))) 101 | 102 | 103 | def fill_blast_fasta(args, cxn, shard_count, shard_index): 104 | """Fill the shard input files with sequences.""" 105 | exe_name, _ = splitext(basename(sys.argv[0])) 106 | fasta_name = '{}_{:03d}.fasta'.format(exe_name, shard_index + 1) 107 | fasta_path = join(args['temp_dir'], fasta_name) 108 | 109 | with open(fasta_path, 'w') as fasta_file: 110 | for row in db_preprocessor.get_sequences_in_shard( 111 | cxn, shard_count, shard_index): 112 | util.write_fasta_record(fasta_file, row[0], row[2], row[1]) 113 | 114 | return fasta_path 115 | 116 | 117 | def create_one_blast_shard(args, fasta_path, shard_index): 118 | """Create a blast DB from the shard.""" 119 | shard = '{}.{:03d}.blast'.format(args['blast_db'], shard_index + 1) 120 | blast.create_db(args['temp_dir'], fasta_path, shard) 121 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/db.py: -------------------------------------------------------------------------------- 1 | """Handle common database functions.""" 2 | 3 | import sqlite3 4 | import sys 5 | import os 6 | from os.path import basename, join, exists 7 | 8 | ATRAM_VERSION = 'v2.3.1' 9 | 10 | # DB_VERSION != ATRAM_VERSION 11 | # We don't force DB changes until required. 12 | # Therefore DB_VERSION <= ATRAM_VERSION. 
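# For example (editor's note): an aTRAM v2.3.1 install still opens a database
# stamped '2.0'; check_versions() below only aborts when the stamp differs
# from DB_VERSION.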
13 | DB_VERSION = '2.0' 14 | 15 | BATCH_SIZE = 1e6 # How many sequence records to insert at a time 16 | 17 | 18 | def connect(blast_db, check_version=False, clean=False): 19 | """Create DB connection.""" 20 | db_name = get_db_name(blast_db) 21 | 22 | if clean and exists(db_name): 23 | os.remove(db_name) 24 | 25 | if check_version and not exists(db_name): 26 | err = 'Could not find the database file "{}".'.format(db_name) 27 | sys.exit(err) 28 | 29 | if check_version: 30 | with db_setup(db_name) as cxn: 31 | check_versions(cxn) 32 | 33 | return db_setup(db_name) 34 | 35 | 36 | def get_db_name(db_prefix): 37 | """Build the SQLite DB name from the prefix.""" 38 | return '{}.sqlite.db'.format(db_prefix) 39 | 40 | 41 | def aux_db(cxn, temp_dir, blast_db, query_name): 42 | """Create & attach a temporary database to the current DB connection.""" 43 | db_dir = join(temp_dir, 'db') 44 | os.makedirs(db_dir, exist_ok=True) 45 | 46 | db_name = '{}_{}_temp.sqlite.db'.format( 47 | basename(blast_db), basename(query_name)) 48 | db_name = join(db_dir, db_name) 49 | 50 | sql = """ATTACH DATABASE '{}' AS aux""".format(db_name) 51 | cxn.execute(sql) 52 | 53 | 54 | def aux_detach(cxn): 55 | """Detach the temporary database.""" 56 | cxn.execute('DETACH DATABASE aux') 57 | 58 | 59 | def temp_db(temp_dir, db_prefix): 60 | """Create a temporary database.""" 61 | db_name = join(temp_dir, get_db_name(db_prefix)) 62 | return db_setup(db_name) 63 | 64 | 65 | def db_setup(db_name): 66 | """Database setup.""" 67 | cxn = sqlite3.connect(db_name, timeout=30.0) 68 | cxn.execute("PRAGMA page_size = {}".format(2 ** 16)) 69 | cxn.execute("PRAGMA journal_mode = WAL") 70 | return cxn 71 | 72 | 73 | # ########################### misc functions ################################# 74 | 75 | def check_versions(cxn): 76 | """Make sure the database version matches what we built it with.""" 77 | version = get_version(cxn) 78 | if version != DB_VERSION: 79 | err = ('The database was built with version {} but you are running ' 80 | 'version {}. 
You need to rebuild the atram database by ' 81 | 'running atram_preprocessor.py again.').format( 82 | version, DB_VERSION) 83 | sys.exit(err) 84 | 85 | 86 | # ########################## metadata table ################################## 87 | 88 | def get_metadata(cxn, key, default=''): 89 | """Get a metadata value from the metadata table.""" 90 | sql = """SELECT value FROM metadata WHERE label = ?""" 91 | try: 92 | result = cxn.execute(sql, (key,)) 93 | result = result.fetchone() 94 | return default if not result else result[0] 95 | except sqlite3.OperationalError: 96 | return default 97 | 98 | 99 | def get_version(cxn): 100 | """Get the current database version.""" 101 | return get_metadata(cxn, 'version', default='1.0') 102 | 103 | 104 | def is_single_end(cxn): 105 | """Was the database built for single ends?""" 106 | result = get_metadata(cxn, 'single_ends') 107 | return result != '0' 108 | 109 | 110 | # ########################## sequences table ################################## 111 | 112 | def get_sequence_ends(cxn): 113 | """Get a list of all seq_ends in the database.""" 114 | return cxn.execute('SELECT DISTINCT seq_end FROM sequences') 115 | 116 | 117 | def get_all_sequences(cxn): 118 | """Get a list of all sequences in the database.""" 119 | return cxn.execute('SELECT * FROM sequences') 120 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/db_preprocessor.py: -------------------------------------------------------------------------------- 1 | """Database functions for the preprocessor.""" 2 | 3 | import os 4 | from os.path import join 5 | 6 | from .db import DB_VERSION 7 | 8 | 9 | def aux_db(cxn, temp_dir): 10 | """Create & attach a temporary database to the current DB connection.""" 11 | db_dir = join(temp_dir, 'db') 12 | os.makedirs(db_dir, exist_ok=True) 13 | 14 | db_name = join(db_dir, 'temp.sqlite.db') 15 | 16 | sql = """ATTACH DATABASE '{}' AS aux""".format(db_name) 17 | cxn.execute(sql) 18 | 19 | 20 | def aux_detach(cxn): 21 | """Detach the temporary database.""" 22 | cxn.execute('DETACH DATABASE aux') 23 | 24 | 25 | # ########################## metadata table ################################## 26 | 27 | def create_metadata_table(cxn, args): 28 | """ 29 | Create the metadata table. 30 | 31 | Information used to tell how aTRAM was set up. 32 | """ 33 | cxn.executescript(""" 34 | DROP TABLE IF EXISTS metadata; 35 | 36 | CREATE TABLE metadata ( 37 | label TEXT, 38 | value TEXT); 39 | """) 40 | 41 | with cxn: 42 | sql = """INSERT INTO metadata (label, value) VALUES (?, ?);""" 43 | cxn.execute(sql, ('version', DB_VERSION)) 44 | cxn.execute(sql, ('single_ends', bool(args.get('single_ends')))) 45 | 46 | 47 | # ########################## sequences table ################################## 48 | 49 | def create_sequences_table(cxn): 50 | """Create a table to hold the raw input sequences.""" 51 | cxn.executescript(""" 52 | DROP TABLE IF EXISTS sequences; 53 | 54 | CREATE TABLE sequences ( 55 | seq_name TEXT, 56 | seq_end TEXT, 57 | seq TEXT); 58 | """) 59 | 60 | 61 | def create_sequences_index(cxn): 62 | """ 63 | Create the sequences index after we build the table. 64 | 65 | This speeds up the program significantly.
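Building the index once, after the bulk load, is much cheaper than
maintaining it while the records are being inserted.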
66 | """ 67 | cxn.executescript(""" 68 | CREATE INDEX sequences_index ON sequences (seq_name, seq_end); 69 | """) 70 | 71 | 72 | def insert_sequences_batch(cxn, batch): 73 | """Insert a batch of sequence records into the database.""" 74 | sql = """INSERT INTO sequences (seq_name, seq_end, seq) 75 | VALUES (?, ?, ?);""" 76 | if batch: 77 | with cxn: 78 | cxn.executemany(sql, batch) 79 | 80 | 81 | # ########################## sequence names ################################## 82 | 83 | def create_seq_names_table(cxn): 84 | """Create the sequence names table and index.""" 85 | cxn.executescript(""" 86 | CREATE TABLE aux.seq_names AS SELECT DISTINCT seq_name FROM sequences; 87 | CREATE INDEX aux.name_index ON seq_names (seq_name); 88 | """) 89 | 90 | 91 | def get_sequences_in_shard(cxn, shard_count, shard_index): 92 | """Split the sequences by row ID to shuffle them into different shards.""" 93 | sql = """ 94 | SELECT seq_name, seq_end, seq 95 | FROM sequences 96 | WHERE seq_name IN ( 97 | SELECT seq_name FROM aux.seq_names WHERE (rowid % ?) = ?); 98 | """ 99 | return cxn.execute(sql, (shard_count, shard_index)) 100 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/log.py: -------------------------------------------------------------------------------- 1 | """Common logging functions.""" 2 | 3 | from os.path import basename, splitext 4 | import sys 5 | import logging 6 | import tempfile 7 | import subprocess 8 | from . import db 9 | 10 | LOGGER = None # Global logger so we can switch between queries & blast DBs 11 | FORMATTER = logging.Formatter('%(asctime)s %(levelname)s: %(message)s', 12 | datefmt='%Y-%m-%d %H:%M:%S') 13 | NAME = 'atram_logger' 14 | 15 | 16 | def setup(log_file, log_level, blast_db, query_file=''): 17 | """Logger setup.""" 18 | log_file = file_name(log_file, blast_db, query_file) 19 | _setup(log_file, log_level) 20 | 21 | 22 | def stitcher_setup(log_file, log_level): 23 | """Build a logger for the stitcher.""" 24 | _setup(log_file, log_level) 25 | 26 | 27 | def _setup(log_file, log_level): 28 | global LOGGER # pylint: disable=global-statement 29 | 30 | if not LOGGER: 31 | handler = logging.FileHandler(log_file) 32 | handler.setFormatter(FORMATTER) 33 | handler.setLevel(logging.DEBUG) 34 | 35 | stream = logging.StreamHandler() 36 | stream.setFormatter(FORMATTER) 37 | stream.setLevel(logging.INFO) 38 | 39 | LOGGER = logging.getLogger(log_file) 40 | 41 | log_level = getattr(logging, log_level.upper()) 42 | LOGGER.setLevel(log_level) 43 | 44 | LOGGER.addHandler(handler) 45 | LOGGER.addHandler(stream) 46 | 47 | info('#' * 80) 48 | info('aTRAM version: {}'.format(db.ATRAM_VERSION)) 49 | info('Python version: {}'.format(' '.join(sys.version.split()))) 50 | info(' '.join(sys.argv[:])) 51 | 52 | 53 | def file_name(log_file, blast_db, query_file=''): 54 | """ 55 | Create the log file name for each run. 56 | 57 | Honor user's argument if given. 58 | """ 59 | if log_file: 60 | return log_file 61 | 62 | program = splitext(basename(sys.argv[0]))[0] 63 | 64 | if query_file: 65 | query_file = splitext(basename(query_file))[0] 66 | return '{}.{}.{}.log'.format(blast_db, query_file, program) 67 | 68 | return '{}.{}.log'.format(blast_db, program) 69 | 70 | 71 | def subcommand(cmd, temp_dir, timeout=None): 72 | """ 73 | Call a subprocess and log the output. 74 | 75 | Note: stdout=PIPE is blocking and large logs cause a hang. 76 | So we don't use it. 
77 | """ 78 | LOGGER.debug(cmd) 79 | 80 | with tempfile.NamedTemporaryFile(mode='w', dir=temp_dir) as log_output: 81 | try: 82 | subprocess.check_call( 83 | cmd, 84 | shell=True, 85 | timeout=timeout, 86 | stdout=log_output, 87 | stderr=log_output) 88 | except Exception as err: # pylint: disable=broad-except 89 | error('Exception: {}'.format(err)) 90 | finally: 91 | with open(log_output.name) as log_input: 92 | for line in log_input: 93 | line = line.strip() 94 | if line: 95 | LOGGER.debug(line) 96 | 97 | 98 | def info(msg): 99 | """Log an info message.""" 100 | LOGGER.info(msg) 101 | 102 | 103 | def error(msg): 104 | """Log an error message.""" 105 | LOGGER.error(msg) 106 | 107 | 108 | def fatal(msg): 109 | """Log an error message and exit.""" 110 | error(msg) 111 | sys.exit(1) 112 | -------------------------------------------------------------------------------- /bin/aTRAM-master/lib/util.py: -------------------------------------------------------------------------------- 1 | """Misc. utilities.""" 2 | 3 | import os 4 | from os.path import exists, getsize, join, split 5 | import io 6 | import re 7 | import sys 8 | from shutil import rmtree 9 | import gzip 10 | import bz2 11 | from contextlib import contextmanager 12 | from tempfile import mkdtemp 13 | from Bio.SeqIO.FastaIO import SimpleFastaParser 14 | 15 | 16 | def shorten(text): 17 | """Collapse whitespace in a string.""" 18 | return ' '.join(text.split()) 19 | 20 | 21 | def set_blast_batch_size(batch_size): 22 | """Use this to control blast memory usage & query concatenation.""" 23 | if batch_size: 24 | os.environ['BATCH_SIZE'] = str(batch_size) 25 | 26 | 27 | def write_fasta_record(out_file, seq_name, seq, seq_end=None): 28 | """Write a fasta record to the file.""" 29 | out_file.write('>') 30 | out_file.write(seq_name) 31 | if seq_end: 32 | out_file.write('/') 33 | out_file.write(seq_end) 34 | out_file.write('\n') 35 | 36 | out_file.write(seq) 37 | out_file.write('\n') 38 | 39 | 40 | def temp_dir_exists(temp_dir, debug_dir=None): 41 | """Make sure the temporary directory exits.""" 42 | if temp_dir and not exists(temp_dir): 43 | sys.exit('The temporary directory must exist.') 44 | if debug_dir and not exists(debug_dir): 45 | sys.exit('The temporary debug directory must exist.') 46 | 47 | 48 | def update_temp_dir(temp_dir, args): 49 | """Handle the new temporary directory name.""" 50 | args['temp_dir'] = str(temp_dir) 51 | os.environ['SQLITE_TMPDIR'] = temp_dir 52 | 53 | 54 | @contextmanager 55 | def make_temp_dir(where=None, prefix=None, keep=False): 56 | """Handle creation and deletion of temporary directory.""" 57 | temp_dir = mkdtemp(prefix=prefix, dir=where) 58 | try: 59 | yield temp_dir 60 | finally: 61 | if not keep or not where: 62 | rmtree(temp_dir) 63 | 64 | 65 | @contextmanager 66 | def open_file(args, file_name): 67 | """Handle creation and deletion of temporary directory.""" 68 | if args.get('gzip'): 69 | stream = gzip.open(file_name, 'rt') 70 | elif args.get('bzip'): 71 | stream = bz2.open(file_name, 'rt') 72 | else: 73 | stream = open(file_name) 74 | 75 | try: 76 | yield stream 77 | finally: 78 | stream.close() 79 | 80 | 81 | def clean_name(name): 82 | """Replace problem characters in file names.""" 83 | return re.sub(r'[^\w.]+', '_', name.strip()) 84 | 85 | 86 | def as_word(number): 87 | """Convert a number in a word. 88 | 89 | If this gets complex we will add the inflect module instead. 
90 | """ 91 | ordinal = { 92 | 1: 'First', 93 | 2: 'Second', 94 | 3: 'Third'} 95 | return ordinal.get(number, '{}th'.format(number)) 96 | 97 | 98 | def fasta_file_is_empty(fasta_path): 99 | """Check if a fasta file is either empty or does not have a sequence.""" 100 | if os.stat(fasta_path).st_size == 0: 101 | return True 102 | 103 | with open(fasta_path) as fasta_file: 104 | _, seq = next(SimpleFastaParser(fasta_file)) 105 | 106 | if not seq: 107 | return True 108 | 109 | return False 110 | 111 | 112 | def is_fastq_file(args, file_name): 113 | """Check if this a FASTQ file.""" 114 | if args.get('fasta'): 115 | return False 116 | if args.get('fastq'): 117 | return True 118 | 119 | parts = file_name.lower().split('.') 120 | index = -2 if re.search(r'[zp2]$', parts[-1]) and len(parts) > 2 else -1 121 | return parts[index].startswith('f') and parts[index].endswith('q') 122 | 123 | 124 | def shard_file_size(args, file_name): 125 | """Calculate shard file size for FASTA/Q files in raw or zipped format.""" 126 | file_size = getsize(file_name) 127 | 128 | if args.get('gzip'): 129 | with gzip.open(file_name, 'rb') as zippy: 130 | file_size = zippy.seek(0, io.SEEK_END) 131 | elif args.get('bzip'): 132 | with bz2.open(file_name, 'rb') as zippy: 133 | file_size = zippy.seek(0, io.SEEK_END) 134 | 135 | if is_fastq_file(args, file_name): 136 | file_size /= 2 # Guessing that fastq files ~2x fasta files 137 | 138 | return file_size 139 | 140 | 141 | def prefix_file(prefix, name): 142 | """Calculate the output path.""" 143 | dir_, file_ = split(prefix) 144 | file_ += '.' if file_ and file_[-1] != '.' else '' 145 | return join(dir_, file_ + name) 146 | -------------------------------------------------------------------------------- /bin/aTRAM-master/requirements.txt: -------------------------------------------------------------------------------- 1 | biopython>=1.74 2 | numpy>=1.17.3 3 | psutil>=5.6.3 4 | -------------------------------------------------------------------------------- /bin/aTRAM-master/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Setup the aTRAM environment.""" 3 | 4 | # Test pip 5 | # 1) Clean the /dist directory 6 | # 2) python3 setup.py sdist bdist_wheel 7 | # 3) pip install --index-url https://test.pypi.org/simple/ 8 | # --extra-index-url https://pypi.org/simple atram 9 | # 4) twine upload --repository-url https://test.pypi.org/legacy/ dist/* 10 | 11 | import re 12 | from setuptools import setup, find_packages 13 | 14 | 15 | def readme(): 16 | """Get README.md content.""" 17 | with open("README.md", 'r') as f: 18 | return f.read() 19 | 20 | 21 | def license_(): 22 | """Get LICENSE.txt content.""" 23 | with open("LICENSE.txt", 'r') as f: 24 | return f.read() 25 | 26 | 27 | def find_version(): 28 | """Read version from db.py.""" 29 | regex = r"^ATRAM_VERSION = ['\"]v?([^'\"]*)['\"]" 30 | with open("./lib/db.py", 'r') as f: 31 | match = re.search(regex, f.read(), re.M) 32 | if match: 33 | return match.group(1) 34 | 35 | raise RuntimeError("Unable to find version string.") 36 | 37 | 38 | def find_requirements(): 39 | """Read requirements.txt file and returns list of requirements.""" 40 | with open("requirements.txt", 'r') as f: 41 | return f.read().splitlines() 42 | 43 | 44 | setup( 45 | name="atram", 46 | version=find_version(), 47 | packages=find_packages(), 48 | install_requires=find_requirements(), 49 | description="""atram ("automated target restricted assembly method") is 50 | an iterative assembler that 
-------------------------------------------------------------------------------- /bin/aTRAM-master/requirements.txt: -------------------------------------------------------------------------------- 1 | biopython>=1.74 2 | numpy>=1.17.3 3 | psutil>=5.6.3 4 | -------------------------------------------------------------------------------- /bin/aTRAM-master/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Setup the aTRAM environment.""" 3 | 4 | # Test pip 5 | # 1) Clean the /dist directory 6 | # 2) python3 setup.py sdist bdist_wheel 7 | # 3) pip install --index-url https://test.pypi.org/simple/ 8 | # --extra-index-url https://pypi.org/simple atram 9 | # 4) twine upload --repository-url https://test.pypi.org/legacy/ dist/* 10 | 11 | import re 12 | from setuptools import setup, find_packages 13 | 14 | 15 | def readme(): 16 | """Get README.md content.""" 17 | with open("README.md", 'r') as f: 18 | return f.read() 19 | 20 | 21 | def license_(): 22 | """Get LICENSE.txt content.""" 23 | with open("LICENSE.txt", 'r') as f: 24 | return f.read() 25 | 26 | 27 | def find_version(): 28 | """Read version from db.py.""" 29 | regex = r"^ATRAM_VERSION = ['\"]v?([^'\"]*)['\"]" 30 | with open("./lib/db.py", 'r') as f: 31 | match = re.search(regex, f.read(), re.M) 32 | if match: 33 | return match.group(1) 34 | 35 | raise RuntimeError("Unable to find version string.") 36 | 37 | 38 | def find_requirements(): 39 | """Read the requirements.txt file and return a list of requirements.""" 40 | with open("requirements.txt", 'r') as f: 41 | return f.read().splitlines() 42 | 43 | 44 | setup( 45 | name="atram", 46 | version=find_version(), 47 | packages=find_packages(), 48 | install_requires=find_requirements(), 49 | description="""atram ("automated target restricted assembly method") is 50 | an iterative assembler that performs reference-guided 51 | local de novo assemblies using a variety of available 52 | methods""", 53 | long_description=readme(), 54 | license=license_(), 55 | url="https://github.com/juliema/aTRAM", 56 | python_requires='>=3.4', 57 | scripts=[ 58 | 'atram.py', 59 | 'atram_preprocessor.py', 60 | 'atram_stitcher.py', 61 | 'atram_framer.py', 62 | ]) 63 | -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/tests/__init__.py -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Setup the test environment.""" 2 | 3 | import sys 4 | from os.path import dirname, abspath, join 5 | 6 | 7 | ROOT_DIR = join(dirname(dirname(abspath(__file__))), '.', 'atram') 8 | sys.path.append(ROOT_DIR) 9 | -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/tests/lib/__init__.py -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/lib/assemblers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/bin/aTRAM-master/tests/lib/assemblers/__init__.py -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/lib/test_bio.py: -------------------------------------------------------------------------------- 1 | """Testing functions in lib/bio.""" 2 | 3 | import lib.bio as bio 4 | 5 | 6 | def test_reverse_complement_01(): 7 | """It complements then reverses the string.""" 8 | seq = 'ACGTUWSMKRYBDHVNXacgtuwsmkrybdhvnx' 9 | actual = bio.reverse_complement(seq) 10 | assert actual == 'TGCAAWSKMYRVHDBNXtgcaawskmyrvhdbnx'[::-1] 11 | 12 | 13 | def test_reverse_complement_02(): 14 | """A reverse complement twice yields the original sequence, except U.""" 15 | seq = 'ACGTUWSMKRYBDHVNXacgtuwsmkrybdhvnx' 16 | actual = bio.reverse_complement(bio.reverse_complement(seq)) 17 | assert actual == 'ACGTTWSMKRYBDHVNXacgttwsmkrybdhvnx' 18 | 19 | 20 | def test_is_protein_no(): 21 | """DNA, RNA, and wildcards are not protein.""" 22 | seq = 'ACGTUWSMKRYBDHVNXacgtuwsmkrybdhvnx' 23 | assert not bio.is_protein(seq) 24 | 25 | 26 | def test_is_protein_yes(): 27 | """Any protein character makes the whole sequence a protein.""" 28 | seq = 'ACGTUWSMKRYBeDHVNXacgtuwsmkrybdhvnx' 29 | assert bio.is_protein(seq) 30 | -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/lib/test_blast.py: -------------------------------------------------------------------------------- 1 | """Testing functions in lib/blast.""" 2 | 3 | import lib.blast as blast 4 | 5 | 6 | # for (ends, clamp) in [('mixed_ends', ''), ('end_1', '1'), 7 | # ('end_2', '2'), ('single_ends', '')]: 8 | 9 | 10 | def test_parse_fasta_title_01(): 11 | """It handles empty strings.""" 12 | actual_seq_name, actual_seq_end =
blast.parse_fasta_title('', '', '') 13 | assert actual_seq_name == '' 14 | assert actual_seq_end == '' 15 | 16 | 17 | def test_parse_fasta_title_02(): 18 | """It handles a 1 or 2 at the end of the title.""" 19 | seq_name, seq_end = blast.parse_fasta_title( 20 | 'title1 1 after', 'end_1', '1') 21 | assert seq_name == 'title1' 22 | assert seq_end == '1' 23 | 24 | 25 | def test_parse_fasta_title_03(): 26 | """It handles a 1 or 2 at the end of the title.""" 27 | seq_name, seq_end = blast.parse_fasta_title( 28 | 'title1/2 after', 'end_2', '2') 29 | assert seq_name == 'title1' 30 | assert seq_end == '2' 31 | 32 | 33 | def test_parse_fasta_title_04(): 34 | """It handles a slash delimited end.""" 35 | seq_name, seq_end = blast.parse_fasta_title( 36 | 'title/2 after', 'end_2', '2') 37 | assert seq_name == 'title' 38 | assert seq_end == '2' 39 | 40 | 41 | def test_parse_fasta_title_05(): 42 | """It handles an underscore delimited end.""" 43 | seq_name, seq_end = blast.parse_fasta_title( 44 | 'title_1', 'end_1', '1') 45 | assert seq_name == 'title' 46 | assert seq_end == '1' 47 | 48 | 49 | def test_parse_fasta_title_06(): 50 | """It handles a dot delimited end.""" 51 | seq_name, seq_end = blast.parse_fasta_title( 52 | 'title.1 after', 'end_1', '1') 53 | assert seq_name == 'title' 54 | assert seq_end == '1' 55 | 56 | 57 | def test_parse_fasta_title_07(): 58 | """It handles mixed ends with no sequence end.""" 59 | seq_name, seq_end = blast.parse_fasta_title( 60 | 'title', 'mixed_ends', '') 61 | assert seq_name == 'title' 62 | assert seq_end == '' 63 | 64 | 65 | def test_parse_fasta_title_08(): 66 | """It handles mixed ends with a delimited sequence end.""" 67 | seq_name, seq_end = blast.parse_fasta_title( 68 | 'title_1', 'mixed_ends', '') 69 | assert seq_name == 'title' 70 | assert seq_end == '1' 71 | 72 | 73 | def test_parse_fasta_title_09(): 74 | """It handles mixed ends with a space delimited sequence end.""" 75 | seq_name, seq_end = blast.parse_fasta_title( 76 | 'title 2 after', 'mixed_ends', '') 77 | assert seq_name == 'title' 78 | assert seq_end == '2' 79 | 80 | 81 | def test_parse_fasta_title_10(): 82 | """It handles single ends with no sequence end.""" 83 | seq_name, seq_end = blast.parse_fasta_title( 84 | 'title after', 'single_ends', '') 85 | assert seq_name == 'title' 86 | assert seq_end == '' 87 | 88 | 89 | def test_parse_fasta_title_11(): 90 | """It handles single ends with a delimited sequence end.""" 91 | seq_name, seq_end = blast.parse_fasta_title( 92 | 'title_1', 'single_ends', '') 93 | assert seq_name == 'title_1' 94 | assert seq_end == '' 95 | 96 | 97 | def test_parse_fasta_title_12(): 98 | """It handles single ends with a space delimited sequence end.""" 99 | seq_name, seq_end = blast.parse_fasta_title( 100 | 'title 2 words', 'single_ends', '') 101 | assert seq_name == 'title 2' 102 | assert seq_end == '' 103 | -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/lib/test_db.py: -------------------------------------------------------------------------------- 1 | """Testing functions in lib/db.""" 2 | 3 | import sqlite3 4 | import lib.db as db 5 | import lib.db_atram as db_atram 6 | import lib.db_preprocessor as db_preprocessor 7 | 8 | 9 | CXN = sqlite3.connect(':memory:') 10 | 11 | 12 | def setUpModule(): 13 | """Build the database for testing.""" 14 | CXN.execute("""ATTACH DATABASE ':memory:' AS aux""") 15 | db_preprocessor.create_metadata_table(CXN, {}) 16 | db_preprocessor.create_sequences_table(CXN) 17 | 
db_atram.create_sra_blast_hits_table(CXN) 18 | db_atram.create_contig_blast_hits_table(CXN) 19 | db_atram.create_assembled_contigs_table(CXN) 20 | 21 | 22 | def test_get_db_name_01(): 23 | """It prepends the blast_db name to the database name.""" 24 | assert db.get_db_name('test_db') == 'test_db.sqlite.db' 25 | 26 | 27 | def test_get_version_01(): 28 | """It returns the current DB version.""" 29 | assert db.get_version(CXN) == '2.0' 30 | 31 | 32 | def test_get_version_02(): 33 | """It returns a default version if there is no metadata table.""" 34 | CXN.execute("""DROP TABLE IF EXISTS metadata""") 35 | assert db.get_version(CXN) == '1.0' 36 | -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/lib/test_log.py: -------------------------------------------------------------------------------- 1 | """Testing functions in lib/log.""" 2 | 3 | # pylint: disable=invalid-name 4 | 5 | 6 | from os.path import basename, splitext 7 | import lib.log as log 8 | 9 | 10 | log_file = 'my_log_file' 11 | blast_db = 'my_blast_db' 12 | query_file = 'my_query_file' 13 | 14 | 15 | def test_file_name_01(): 16 | """It returns the given log file name.""" 17 | actual = log.file_name(log_file, blast_db, query_file=query_file) 18 | assert log_file == actual 19 | 20 | 21 | def test_file_name_02(): 22 | """It returns a default log file name.""" 23 | actual = log.file_name('', blast_db) 24 | expect = '{}.pytest.log'.format(blast_db) 25 | assert expect == actual 26 | 27 | 28 | def test_file_name_03(): 29 | """It adds the query file to the log file name.""" 30 | actual = log.file_name('', blast_db, query_file=query_file) 31 | query = splitext(basename(query_file))[0] 32 | expect = '{}.{}.pytest.log'.format(blast_db, query) 33 | assert expect == actual 34 | -------------------------------------------------------------------------------- /bin/aTRAM-master/tests/requirements-test.txt: -------------------------------------------------------------------------------- 1 | atomicwrites>=1.3.0 2 | attrs>=19.3.0 3 | biopython>=1.74 4 | importlib-metadata>=0.23 5 | more-itertools>=7.2.0 6 | numpy>=1.17.3 7 | packaging>=19.2 8 | pluggy>=0.13.0 9 | psutil>=5.6.3 10 | py>=1.8.0 11 | pyparsing>=2.4.2 12 | pytest>=5.2.1 13 | six>=1.12.0 14 | wcwidth>=0.1.7 15 | zipp>=0.6.0 16 | -------------------------------------------------------------------------------- /bin/aTRAM-master/util_atram_db_to_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Convert data in an atram sqlite database into fasta files.""" 3 | 4 | import sys 5 | from os.path import exists, splitext 6 | import argparse 7 | import textwrap 8 | import lib.db as db 9 | import lib.blast as blast 10 | import lib.util as util 11 | 12 | 13 | def create_fasta_files(args): 14 | """Convert data in an atram sqlite database into fasta files.""" 15 | if not exists(db.get_db_name(args['blast_db'])): 16 | sys.exit('Could not find the database.') 17 | 18 | with db.connect(args['blast_db'], check_version=True) as cxn: 19 | files = open_fasta_files(args, cxn) 20 | try: 21 | 22 | for rec in db.get_all_sequences(cxn): 23 | util.write_fasta_record(files[rec[1]], rec[0], rec[2], rec[1]) 24 | 25 | finally: 26 | close_fasta_files(files) 27 | 28 | 29 | def open_fasta_files(args, cxn): 30 | """Open one fasta file for each sequence end.""" 31 | files = {} 32 | for end in [e[0] for e in db.get_sequence_ends(cxn)]: 33 | name = '{}{}.{}'.format(args['fasta_root'], end,
args['fasta_ext']) 34 | files[end] = open(name, 'w') 35 | return files 36 | 37 | 38 | def close_fasta_files(files): 39 | """Close all fasta files.""" 40 | for file in files.values(): 41 | file.close() 42 | 43 | 44 | def parse_command_line(): 45 | """Process command-line arguments.""" 46 | description = """ 47 | This will read through the aTRAM SQLite database and create fasta 48 | files. One for each end. So end 1 will be named <fasta root>1.fasta 49 | etc. If there is no end in the DB (i.e. the DB was built with the 50 | --single-ends option) then the file name will be <fasta root>.fasta. 51 | """ 52 | parser = argparse.ArgumentParser( 53 | fromfile_prefix_chars='@', 54 | formatter_class=argparse.RawDescriptionHelpFormatter, 55 | description=textwrap.dedent(description)) 56 | 57 | parser.add_argument('--version', action='version', 58 | version='%(prog)s {}'.format(db.ATRAM_VERSION)) 59 | 60 | parser.add_argument('-b', '--blast-db', '--sra', '--db', '--database', 61 | required=True, metavar='DB', 62 | help="""This needs to match the DB prefix you 63 | entered for atram_preprocessor.py.""") 64 | 65 | parser.add_argument('-f', '--fasta', required=True, 66 | help="""What to name the output fasta files without 67 | the end indicator.""") 68 | 69 | args = vars(parser.parse_args()) 70 | 71 | args['blast_db'] = blast.touchup_blast_db_names([args['blast_db']])[0] 72 | 73 | (args['fasta_root'], args['fasta_ext']) = splitext(args['fasta']) 74 | args['fasta_ext'] = args['fasta_ext'].lstrip('.') if args['fasta_ext'] else 'fasta' 75 | 76 | return args 77 | 78 | 79 | if __name__ == '__main__': 80 | ARGS = parse_command_line() 81 | create_fasta_files(ARGS) 82 | -------------------------------------------------------------------------------- /bin/aTRAM-master/util_check_requirements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Check that we have the minimum requirements for running atram.""" 3 | 4 | import re 5 | import sys 6 | import subprocess 7 | from functools import reduce 8 | from distutils.version import LooseVersion 9 | from shutil import which 10 | 11 | 12 | RESULTS = {} 13 | 14 | 15 | def test_format(name, value): 16 | """Format test results.""" 17 | RESULTS[name] = value 18 | value = 'OK' if value else 'FAIL' 19 | print(name.ljust(40, '.'), value) 20 | 21 | 22 | def parse_requirements(requirements): 23 | """Parse a requirement into a module and version parts.""" 24 | reqs = {} 25 | for req in requirements.split(): 26 | match = re.match(r'^([^>=<]+)([>=<]+)([^>=<]+)$', req) 27 | module = match.group(1) 28 | compare = match.group(2) 29 | version = LooseVersion(match.group(3)).version 30 | reqs[module] = {'compare': compare, 'version': version} 31 | return reqs 32 | 33 | 34 | def check_modules(): 35 | """Check installed python modules against requirements.txt.""" 36 | modules = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']) 37 | installed_list = parse_requirements(modules.decode('utf-8')) 38 | 39 | with open('requirements.txt') as requirements: 40 | required_list = parse_requirements(requirements.read()) 41 | 42 | for module, required in required_list.items(): 43 | installed = installed_list[module] 44 | 45 | cmp = required['compare'] 46 | i_version = installed['version'] 47 | r_version = required['version'] 48 | 49 | if cmp == '==' and i_version != r_version: 50 | test_format(module, False) 51 | elif cmp == '>=' and i_version >= r_version: 52 | test_format(module, True) 53 | elif i_version < r_version: 54 | test_format(module, False) 55 | 56 |
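# Editor's sketch (not part of the aTRAM source) of what parse_requirements()
# above produces:
#
#     parse_requirements('numpy>=1.17.3')
#     # -> {'numpy': {'compare': '>=', 'version': [1, 17, 3]}}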
57 | def check_programs(): 58 | """Verify that external programs can be found.""" 59 | test_format('makeblastdb', which('makeblastdb')) 60 | test_format('tblastn', which('tblastn')) 61 | test_format('blastn', which('blastn')) 62 | test_format('abyss', which('abyss-pe')) 63 | test_format('trinity', which('Trinity')) 64 | test_format('velvet', which('velveth') and which('velvetg')) 65 | test_format('spades', which('spades.py')) 66 | test_format('bwa', which('bwa')) 67 | test_format('bowtie2', which('bowtie2')) 68 | test_format('exonerate', which('exonerate')) 69 | 70 | 71 | def requires(module, because, program=None): 72 | """Show that aTRAM will not work without the given program/module.""" 73 | if not program: 74 | program = 'atram.py and atram_preprocessor.py' 75 | if not RESULTS[module]: 76 | print(' {} will not work because {}'.format(program, because)) 77 | 78 | 79 | def assembler(module, but): 80 | """Report limited aTRAM functionality.""" 81 | if not RESULTS[module]: 82 | print(' atram.py will work but {}'.format(but)) 83 | 84 | 85 | def report_results(): 86 | """Show the user what they can and cannot do.""" 87 | print('\nResults:') 88 | 89 | if reduce(lambda a, b: a and b, RESULTS.values()): 90 | print(' aTRAM is ready and all features are available.\n') 91 | return 92 | 93 | requires('Python version', 'we need Python version 3.6 or above') 94 | requires('makeblastdb', "BLAST's makeblastdb is not installed") 95 | requires('tblastn', "BLAST's tblastn is not installed") 96 | requires('blastn', "BLAST's blastn is not installed") 97 | requires('biopython', 'the biopython module is missing (install with pip)') 98 | requires('psutil', 'the psutil module is missing (install it with pip)') 99 | requires('numpy', 'the numpy module is missing (install it with pip)') 100 | requires('exonerate', 'exonerate is not installed', 'atram_stitcher.py') 101 | 102 | assembler('abyss', 'you are missing the abyss assembler') 103 | assembler('trinity', 'you are missing the trinity assembler') 104 | assembler('velvet', 'you are missing the velvet assembler') 105 | assembler('spades', 'you are missing the spades assembler') 106 | assembler( 107 | 'bowtie2', 108 | 'you are missing bowtie2 and cannot use it with the trinity assembler') 109 | assembler( 110 | 'bwa', 111 | ('you are missing bwa and will not be able to assemble ' 112 | 'long reads with the abyss assembler')) 113 | 114 | 115 | if __name__ == '__main__': 116 | test_format( 117 | 'Python version', 118 | sys.version_info.major == 3 and sys.version_info.minor >= 6) 119 | 120 | check_programs() 121 | check_modules() 122 | report_results() 123 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/docs/.DS_Store -------------------------------------------------------------------------------- /docs/.ipynb_checkpoints/phylogeny_msc-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Estimating phylogenies from sequence capture data\n", 8 | "Here we show one example of how the phased multiple sequence alignments (MSAs) assembled with the SECAPR pipeline can be used for phylogeny estimation.
In the example below we use the **Multi Species Coalescent (MSC)** program BEAST2.\n", 9 | "\n", 10 | "1. Locus selection:\n", 11 | "We used the `secapr locus_selection` function in order to choose the 50 exon loci with the highest read coverage across all samples (see [locus selection workflow here](locus_selection.ipynb)).\n", 12 | "2. Phasing:\n", 13 | "We generated phased allele sequence MSAs for the selected 50 loci (see [phasing workflow here](phasing.ipynb)).\n", 14 | "3. Generating BEAST xml file with BEAUTI:\n", 15 | "We loaded the 50 phased allele sequence MSAs into BEAUTI v2.4.4. We chose the **STACEY**-specific **BirthDeathCollapse** species tree model with a collapse height of 1e-5. This tree model allows taxon-assignment-free analyses under the MSC model, which makes it possible to observe every sequence as an individual tip in the species tree (rather than having to assign sequences to expected clusters (= species) prior to analysis). Further priors were: bdcGrowthRate = log normal(M=4.6, S=1.5); collapseWeight = beta (alpha=2, beta=2); popPriorScale = log normal(M=-7, S=2); relativeDeathRate = beta (alpha=1.0, beta=1.0).\n", 16 | "4. Run BEAST v2.4.4:\n", 17 | "We set the MCMC to run for 1 billion generations, logging every 100,000 generations. After approximately 500 million generations all parameters had reached convergence (assessed with Tracer v.1.6, Rambaut et al. 2013) and the MCMC was stopped (approximately 80 hours on a Mac Pro, Late 2013, 3.5 GHz 6-Core Intel Xeon E5 processor).\n", 18 | "5. Summarizing posterior species tree distribution:\n", 19 | "The resulting distribution of species trees was summarized with TreeAnnotator (v2.4.4), using mean heights and excluding 10% burn-in. **Command:** `treeannotator -burnin 10 -heights mean species.trees summary_tree_mean_heights.tre`\n", 20 | "6. Generating similarity matrix from the posterior:\n", 21 | "We then used the SpeciesDelimitationAnalyser (part of the STACEY distribution) to calculate the posterior probability, for each pair of species, of belonging to the same cluster. This probability was calculated from the complete posterior tree distribution (excl. 10% burn-in), using a user-defined collapse-height value of 1e-5. **Command:** `java -jar speciesDA.jar -burnin 557 -collapseheight 1e-5 species.trees species_da_results_1e-5.txt`\n", 22 | "7. Plot the similarity matrix:\n", 23 | "We applied our custom-made R-script for plotting the similarity matrix.
The [plotting-script can be found here](../../src/simmatrix_geonoma_allele_data.R).\n", 24 | "\n", 25 | "![alt text](../../images/stacey_phylogeny.jpg \"Stacey species tree\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "source": [ 34 | "[Previous page](phasing.ipynb)" 35 | ] 36 | } 37 | ], 38 | "metadata": { 39 | "kernelspec": { 40 | "display_name": "Python 3", 41 | "language": "python", 42 | "name": "python3" 43 | }, 44 | "language_info": { 45 | "codemirror_mode": { 46 | "name": "ipython", 47 | "version": 3 48 | }, 49 | "file_extension": ".py", 50 | "mimetype": "text/x-python", 51 | "name": "python", 52 | "nbconvert_exporter": "python", 53 | "pygments_lexer": "ipython3", 54 | "version": "3.6.0" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } 60 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate 2 | defaults: 3 | # _posts 4 | - scope: 5 | path: "" 6 | type: posts 7 | values: 8 | author_profile: true 9 | -------------------------------------------------------------------------------- /docs/_data/navigation.yml: -------------------------------------------------------------------------------- 1 | main: 2 | - title: "Quick-Start Guide" 3 | url: /docs/quick-start-guide/ 4 | - title: "Posts" 5 | url: /year-archive/ 6 | - title: "Categories" 7 | url: /categories/ 8 | - title: "Tags" 9 | url: /tags/ 10 | - title: "Pages" 11 | url: /page-archive/ 12 | - title: "Collections" 13 | url: /collection-archive/ 14 | - title: "External Link" 15 | url: https://google.com 16 | -------------------------------------------------------------------------------- /docs/documentation/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/docs/documentation/.DS_Store -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | author_profile: true 3 | --- 4 | 5 | 6 | # SEquence CApture PRocessor (SECAPR) 7 | 8 | ![SECAPR](https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/master/images/secapr_logo_small.png) 9 | 10 | [![downloads](https://anaconda.org/bioconda/secapr/badges/downloads.svg)](http://bioconda.github.io/recipes/secapr/README.html) 11 | 12 | **Original Publication: [https://doi.org/10.7717/peerj.5175](https://doi.org/10.7717/peerj.5175)** 13 | 14 | ___ 15 | 16 | *We are now teaching a **1-week intensive course** on target enrichment data, including practical exercises for all functionalities of the SECAPR pipeline. Check the [ForBio webpage](https://www.forbio.uio.no/events/courses/2020/target_capture.html) for the next open course dates and for information about past courses.* 17 | 18 | ___ 19 | 20 | ## Real data tutorial (incl. 
installation) [click here](http://htmlpreview.github.io/?https://github.com/AntonelliLab/seqcap_processor/blob/master/docs/documentation/tutorial.html) 21 | ___ 22 | 23 | ## Detailed documentation of all functions [click here](http://htmlpreview.github.io/?https://github.com/AntonelliLab/seqcap_processor/blob/master/docs/documentation/main_doc.html) 24 | ___ 25 | 26 | 27 | ## Overview 28 | 29 | This semi-automated pipeline aims to make the processing and analysis of sequence capture (= target enrichment) data simple and straightforward for all users. The detailed documentation and simple installation make this pipeline accessible also to users with limited bioinformatic knowledge, while enabling user-defined processing options for the more experienced users. 30 | 31 | We included an empirical data tutorial in the [pipeline documentation](http://htmlpreview.github.io/?https://github.com/AntonelliLab/seqcap_processor/blob/master/docs/documentation/main_doc.html), which covers the processing from raw Illumina read data into multiple sequence alignments (MSAs) for phylogenetic analyses, including the compiling of allele sequence MSAs. This workflow can be applied to any Illumina dataset, independently of the underlying bait set and organism group. 32 | 33 | Some functions in this pipeline are inspired by scripts from the [Phyluce pipeline](https://github.com/faircloth-lab/phyluce) by Brant Faircloth, which is aimed at the processing of Ultraconserved Elements (UCEs). To honour Brant Faircloth's ideas and his generous open-source sharing of all his code, we ask you to cite the Phyluce pipeline (Faircloth 2016) alongside ours (Andermann et al. 2018) when using SECAPR. 34 | 35 | 36 | ## Workflow 37 | 38 | 39 | 40 | 41 | **SECAPR analytical workflow.** The flowchart shows the basic SECAPR functions, which are divided into two separate steps (colored boxes). Blue box (1. reference library from raw data): in this step the raw reads are cleaned and assembled into contigs (de novo assembly); Orange box (2. reference based assembly with custom reference library): the contigs from the previous step are used for reference-based assembly, enabling allele phasing and additional quality control options, e.g. concerning read-coverage. Black boxes show SECAPR commands and white boxes represent the input and output data of the respective function. Boxes marked in grey represent multiple sequence alignments (MSAs) generated with SECAPR, which can be used for phylogenetic inference. 42 | 43 | 44 | 45 | ## Please cite: 46 | 47 | **Andermann T.**, Cano Á., Zizka A., Bacon C., Antonelli A. 2018. SECAPR—a bioinformatics pipeline for the rapid and user-friendly processing of targeted enriched Illumina sequences, from raw reads to alignments.
45 | ## Please cite: 46 | 47 | **Andermann T.**, Cano Á., Zizka A., Bacon C., Antonelli A. 2018. SECAPR—a bioinformatics pipeline for the rapid and user-friendly processing of targeted enriched Illumina sequences, from raw reads to alignments. PeerJ 6:e5175 [https://doi.org/10.7717/peerj.5175](https://doi.org/10.7717/peerj.5175) 48 | 49 | Faircloth B.C. 2016. PHYLUCE is a software package for the analysis of conserved genomic loci. Bioinformatics 32(5):786–788 [https://doi.org/10.1093/bioinformatics/btv646](https://doi.org/10.1093/bioinformatics/btv646) 50 | -------------------------------------------------------------------------------- /docs/markdown/tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/docs/markdown/tutorial.pdf -------------------------------------------------------------------------------- /docs/notebook/subdocs/phasing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Phasing allele sequences" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "After mapping your reads against the reference library during the reference-based assembly step, you are ready to phase your reads into the two different allele sequences (in the case of diploid organisms). This step is simple to execute, since the function only requires the path to the reference-based assembly output and the user-set minimal read depth for generating the consensus sequence:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "usage: secapr phase_alleles [-h] --input INPUT --output OUTPUT\n", 27 | " [--min_coverage MIN_COVERAGE]\n", 28 | "\n", 29 | "Phase remapped reads form reference-based assembly into two separate alleles.\n", 30 | "Then produce consensus sequence for each allele.\n", 31 | "\n", 32 | "optional arguments:\n", 33 | " -h, --help show this help message and exit\n", 34 | " --input INPUT Call the folder that contains the results of the\n", 35 | " reference based assembly (output of reference_assembly\n", 36 | " function, containing the bam-files).\n", 37 | " --output OUTPUT The output directory where results will be safed.\n", 38 | " --min_coverage MIN_COVERAGE\n", 39 | " Set the minimum read coverage.
Only positions that are\n", 40 | " covered by this number of reads will be called in the\n", 41 | " consensus sequence, otherwise the program will add an\n", 42 | " ambiguity at this position.\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "%%bash\n", 48 | "source activate secapr_env\n", 49 | "secapr phase_alleles -h" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "We can run the command simply like this:\n", 57 | "\n", 58 | " secapr phase_alleles --input ../../data/processed/remapped_reads/ --output ../../data/processed/allele_sequences --min_coverage 3\n", 59 | " \n", 60 | "We can also choose to phase only the [selected loci](locus_selection.ipynb) that were produced with the `secapr locus_selection` function:\n", 61 | "\n", 62 | " secapr phase_alleles --input ../../data/processed/selected_loci --output ../../data/processed/allele_sequences_selected_loci --min_coverage 3" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Producing allele alignments\n", 70 | "Now all we need to do is run the `secapr align_sequences` function in order to align the extracted allele sequences of all samples for each locus. We can run the command like this:\n", 71 | "\n", 72 | " secapr align_sequences --sequences ../../data/processed/allele_sequences/joined_allele_fastas.fasta --output ../../data/processed/alignments/allele_alignments --aligner mafft --output-format fasta --no-trim --ambiguous\n", 73 | " \n", 74 | "Or like this if we want to instead build allele alignments from only the selected loci:\n", 75 | "\n", 76 | " secapr align_sequences --sequences ../../data/processed/allele_sequences_selected_loci/joined_allele_fastas.fasta --output ../../data/processed/alignments/selected_loci_allele_alignments --aligner mafft --output-format fasta --no-trim --ambiguous\n", 77 | " \n", 78 | "### Adding missing sequences\n", 79 | "Before using these alignments for phylogenetic analyses it is usually a good idea to make sure that all taxa contain the same number of sequences. At this point, some alignments may be missing one of the two allele sequences for some samples, because not enough reads supporting both haplotypes were present (controlled by the `--min_coverage` flag in the `phase_alleles` command). In order to add missing sequences as dummy sequences containing n's we can use the `secapr add_missing_sequences` function:\n", 80 | "\n", 81 | " secapr add_missing_sequences --input ../../data/processed/alignments/selected_loci_allele_alignments/ --output ../../data/processed/alignments/selected_loci_allele_alignments_complete\n", 82 | " \n", 83 | "### Phylogeny estimation with phased allele alignments\n", 84 | "We provide a tutorial on how to use the generated allele sequence alignments for [phylogeny estimation under the Multispecies Coalescent (MSC) model](phylogeny_msc.ipynb).\n", 85 | "\n", 86 | "### SNP extraction from phased allele alignments\n", 87 | "We provide a tool for extracting SNPs from phased allele alignments. The tool can be applied to any type of alignment, but is usually most useful for fully phased allele alignments, which contain the heterozygous information.
See [here for more information](https://github.com/tobiashofmann88/snp_extraction_from_alignments).\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "source": [ 96 | "[Previous page](locus_selection.ipynb) | [Next page](phylogeny_msc.ipynb)" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.6.4" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /docs/notebook/subdocs/phylogeny_msc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Estimating phylogenies from sequence capture data\n", 8 | "Here we show one example of how the phased multiple sequence alignments (MSAs) assembled with the SECAPR pipeline can be used for phylogeny estimation. In the example below we use the **Multispecies Coalescent (MSC)** program BEAST2.\n", 9 | "\n", 10 | "1. Locus selection:\n", 11 | "We used the `secapr locus_selection` function in order to choose the 50 exon loci with the highest read coverage across all samples (see [locus selection workflow here](locus_selection.ipynb)).\n", 12 | "2. Phasing:\n", 13 | "We generated phased allele sequence MSAs for the selected 50 loci (see [phasing workflow here](phasing.ipynb)).\n", 14 | "3. Generating BEAST xml file with BEAUTI:\n", 15 | "We loaded the 50 phased allele sequence MSAs into BEAUTI v2.4.4. We chose the **STACEY**-specific **BirthDeathCollapse** species tree model with a collapse height of 1e-5. This tree model allows taxon-assignment-free analyses under the MSC model, in which every sequence is observed as an individual tip in the species tree (rather than having to assign sequences to expected clusters (= species) prior to analysis). Further priors were: bdcGrowthRate = log normal(M=4.6, S=1.5); collapseWeight = beta (alpha=2, beta=2); popPriorScale = log normal(M=-7, S=2); relativeDeathRate = beta (alpha=1.0, beta=1.0).\n", 16 | "4. Run BEAST v2.4.4:\n", 17 | "We set the MCMC for 1 billion generations, logging every 100,000 generations. After approximately 500 million generations all parameters had reached convergence (assessed with Tracer v.1.6, Rambaut et al. 2013) and the MCMC was stopped (approximately 80 hours on a Mac Pro, Late 2013, 3.5 GHz 6-Core Intel Xeon E5 processor).\n", 18 | "5. Summarizing posterior species tree distribution:\n", 19 | "The resulting distribution of species trees was summarized with TreeAnnotator (v2.4.4), using mean heights and excluding 10% burn-in. **Command:** `treeannotator -burnin 10 -heights mean species.trees summary_tree_mean_heights.tre`\n", 20 | "6. Generating similarity matrix from the posterior:\n", 21 | "We then used the SpeciesDelimitationAnalyser (part of the STACEY distribution) to calculate the posterior probability for each pair of species belonging to the same cluster. This probability was calculated from the complete posterior tree distribution (excl.
10% burn-in), using a user-defined collapse-height value of 1e-5. **Command:** `java -jar speciesDA.jar -burnin 557 -collapseheight 1e-5 species.trees species_da_results_1e-5.txt`\n", 22 | "7. Plot the similarity matrix:\n", 23 | "We applied our custom-made R-script for plotting the similarity matrix. The [plotting-script can be found here](https://github.com/AntonelliLab/seqcap_processor/blob/master/src/simmatrix_geonoma_allele_data.R). A nice and user-friendly R-script for plotting similarity matrices can be found here: https://github.com/scrameri/smtools/tree/master/SpeciesDelimitation\n", 24 | "\n", 25 | "![alt text](https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/master/images/stacey_phylogeny.jpg \"Stacey species tree\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "source": [ 34 | "[Previous page](phasing.ipynb)" 35 | ] 36 | } 37 | ], 38 | "metadata": { 39 | "kernelspec": { 40 | "display_name": "Python 3", 41 | "language": "python", 42 | "name": "python3" 43 | }, 44 | "language_info": { 45 | "codemirror_mode": { 46 | "name": "ipython", 47 | "version": 3 48 | }, 49 | "file_extension": ".py", 50 | "mimetype": "text/x-python", 51 | "name": "python", 52 | "nbconvert_exporter": "python", 53 | "pygments_lexer": "ipython3", 54 | "version": "3.6.4" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/.DS_Store -------------------------------------------------------------------------------- /images/exon_vs_contig_based_assembly.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/exon_vs_contig_based_assembly.pdf -------------------------------------------------------------------------------- /images/exon_vs_contig_based_assembly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/exon_vs_contig_based_assembly.png -------------------------------------------------------------------------------- /images/paralog_contig_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/paralog_contig_alignment.png -------------------------------------------------------------------------------- /images/reads_to_contig.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/reads_to_contig.jpg -------------------------------------------------------------------------------- /images/secapr_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/secapr_logo.png -------------------------------------------------------------------------------- /images/secapr_logo_small.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/secapr_logo_small.png -------------------------------------------------------------------------------- /images/secapr_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/secapr_workflow.png -------------------------------------------------------------------------------- /images/stacey_phylogeny.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/stacey_phylogeny.jpg -------------------------------------------------------------------------------- /images/stacey_phylogeny.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/stacey_phylogeny.pdf -------------------------------------------------------------------------------- /images/wrong_contig.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/images/wrong_contig.JPG -------------------------------------------------------------------------------- /recipe/install_secapr_env.sh: -------------------------------------------------------------------------------- 1 | conda config --add channels defaults 2 | conda config --add channels conda-forge 3 | conda config --add channels bioconda 4 | conda create -y -n secapr_env 5 | sleep 5 6 | source activate secapr_env 7 | sleep 5 8 | conda activate secapr_env 9 | sleep 5 10 | conda install -y python=3.8 11 | conda install -y pandas 12 | conda install -y matplotlib-base 13 | conda install -y biopython 14 | conda install -y fastqc 15 | conda install -y fastp=0.23 16 | conda install -y spades=3.15.2 17 | conda install -y blast 18 | conda install -y mafft 19 | conda install -y muscle 20 | conda install -y emboss 21 | conda install -y bwa 22 | conda install -y samtools==1.3.1 23 | conda install -y trimal 24 | conda install -y secapr 25 | #pip install https://github.com/AntonelliLab/seqcap_processor/archive/refs/tags/v2.2.5.tar.gz 26 | -------------------------------------------------------------------------------- /recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = "2.2.6" %} 2 | 3 | package: 4 | name: secapr 5 | version: {{ version }} 6 | 7 | source: 8 | url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz 9 | sha256: 63a67d6bc5d139b2910e562dc7969e7c8374c01720af5319a0f7a9d06c5983c0 10 | 11 | build: 12 | noarch: generic 13 | number: 0 14 | script: python -m pip install --no-deps --ignore-installed . 
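  # (The entry point below exposes the `secapr` console command and dispatches
  # to secapr.__main__:main, mirroring the console_scripts entry in setup.py.)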
15 | entry_points: 16 | - secapr = secapr.__main__:main 17 | 18 | requirements: 19 | host: 20 | - python 21 | - pip 22 | 23 | run: 24 | - python=3.8 25 | - pandas 26 | - matplotlib-base 27 | - biopython 28 | - trimmomatic 29 | - fastqc 30 | - fastp=0.23 31 | - spades=3.15.2 32 | - blast 33 | - mafft 34 | - muscle 35 | - emboss 36 | - bwa 37 | - samtools==1.3.1 38 | - trimal 39 | 40 | test: 41 | imports: 42 | - secapr 43 | commands: 44 | - secapr --version 45 | 46 | about: 47 | home: 'https://github.com/AntonelliLab/seqcap_processor' 48 | license: MIT 49 | license_file: LICENSE 50 | summary: 'Process sequence-capture FASTQ files into alignments for phylogenetic analyses. Integrates allele phasing.' 51 | -------------------------------------------------------------------------------- /recipe/secapr.yml: -------------------------------------------------------------------------------- 1 | name: secapr_env 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - abyss=2.3.1=h98b32fd_0 8 | - biopython=1.79=py39h89e85a6_0 9 | - blast=2.6.0=boost1.64_2 10 | - bwa=0.7.17=h188c3c3_8 11 | - bzip2=1.0.8=h0d85af4_4 12 | - ca-certificates=2021.5.30=h033912b_0 13 | - certifi=2021.5.30=py39h6e9494a_0 14 | - cycler=0.10.0=py_2 15 | - emboss=6.6.0=h6debe1e_0 16 | - expat=2.4.1=he49afe7_0 17 | - fastqc=0.11.9=hdfd78af_1 18 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 19 | - fontconfig=2.13.1=h10f422b_1005 20 | - freetype=2.10.4=h4cff582_1 21 | - giflib=5.2.1=hbcb3906_2 22 | - htslib=1.11=h422799e_2 23 | - icu=68.1=h74dc148_0 24 | - jbig=2.1=h0d85af4_2003 25 | - jpeg=9d=hbcb3906_0 26 | - kiwisolver=1.3.1=py39hf018cea_1 27 | - krb5=1.17.2=h60d9502_0 28 | - lcms2=2.12=h577c468_0 29 | - lerc=2.2.1=h046ec9c_0 30 | - libblas=3.9.0=9_openblas 31 | - libcblas=3.9.0=9_openblas 32 | - libcurl=7.71.1=he6690cf_1 33 | - libcxx=11.1.0=habf9029_0 34 | - libdeflate=1.7=h35c211d_5 35 | - libedit=3.1.20191231=h0678c8f_2 36 | - libffi=3.3=h046ec9c_2 37 | - libgd=2.3.2=h4e7a7ea_0 38 | - libgfortran=5.0.0=9_3_0_h6c81a4c_22 39 | - libgfortran5=9.3.0=h6c81a4c_22 40 | - libiconv=1.16=haf1e3a3_0 41 | - liblapack=3.9.0=9_openblas 42 | - libopenblas=0.3.15=openmp_h5e1b9a4_1 43 | - libpng=1.6.37=h7cec526_2 44 | - libssh2=1.9.0=h52ee1ee_6 45 | - libtiff=4.3.0=h1167814_1 46 | - libwebp=1.2.0=h1648767_0 47 | - libwebp-base=1.2.0=h0d85af4_2 48 | - libxml2=2.9.12=h93ec3fd_0 49 | - llvm-openmp=11.1.0=hda6cdc1_1 50 | - lz4-c=1.9.3=h046ec9c_0 51 | - mafft=7.480=hb4d813b_0 52 | - make=4.3=h22f3db7_1 53 | - matplotlib-base=3.4.2=py39hb07454d_0 54 | - mpi=1.0=openmpi 55 | - muscle=3.8.1551=hb280591_6 56 | - ncurses=6.2=h2e338ed_4 57 | - numpy=1.21.0=py39h7eed0ac_0 58 | - olefile=0.46=pyh9f0ad1d_1 59 | - openjdk=11.0.9.1=hcf210ce_1 60 | - openjpeg=2.4.0=h6e7aa92_1 61 | - openmpi=4.1.1=hd3cd54c_0 62 | - openssl=1.1.1k=h0d85af4_0 63 | - pandas=1.3.0=py39h4d6be9b_0 64 | - perl=5.32.1=0_h0d85af4_perl5 65 | - pillow=8.3.1=py39he9bb72f_0 66 | - pip=21.1.3=pyhd8ed1ab_0 67 | - pyparsing=2.4.7=pyh9f0ad1d_0 68 | - python=3.9.6=hd187cdc_1_cpython 69 | - python-dateutil=2.8.1=py_0 70 | - python_abi=3.9=2_cp39 71 | - pytz=2021.1=pyhd8ed1ab_0 72 | - readline=8.1=h05e3726_0 73 | - samtools=1.11=h725deca_0 74 | - secapr=2.1.0=pyh3252c3a_0 75 | - setuptools=49.6.0=py39h6e9494a_3 76 | - six=1.16.0=pyh6c4a22f_0 77 | - spades=3.15.2=he641558_0 78 | - sqlite=3.36.0=h23a322b_0 79 | - tk=8.6.10=h0419947_1 80 | - tornado=6.1=py39h89e85a6_1 81 | - trimal=1.4.1=hb280591_5 82 | - trimmomatic=0.39=hdfd78af_2 83 | - tzdata=2021a=he74cb21_1 84 | - 
wheel=0.36.2=pyhd3deb0d_0 85 | - xz=5.2.5=haf1e3a3_1 86 | - zlib=1.2.11=h7795811_1010 87 | - zstd=1.5.0=h582d3a0_0 88 | -------------------------------------------------------------------------------- /recipe/secapr/channeldata.json: -------------------------------------------------------------------------------- 1 | { 2 | "channeldata_version": 1, 3 | "packages": {}, 4 | "subdirs": [ 5 | "noarch" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /recipe/secapr/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | secapr 4 | 67 | 68 | 69 |

secapr

70 |

RSS Feed   channeldata.json

71 | noarch    72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 |
PackageLatest VersionDocDevLicensenoarch Summary
81 |
Updated: 2021-07-13 09:22:00 +0000 - Files: 0
82 | 83 | -------------------------------------------------------------------------------- /recipe/secapr/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = "2.1.0" %} 2 | 3 | package: 4 | name: secapr 5 | version: {{ version }} 6 | 7 | source: 8 | url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz 9 | sha256: bc69147dbc452acf56dc75452c5cadcdaeeeda1d0a3a81c5c3667e0018924031 10 | 11 | build: 12 | noarch: python 13 | number: 0 14 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv" 15 | entry_points: 16 | - secapr = secapr.__main__:main 17 | 18 | requirements: 19 | host: 20 | - python 21 | - pip 22 | - setuptools 23 | 24 | run: 25 | - python 26 | - pandas 27 | - matplotlib-base 28 | - biopython 29 | - trimmomatic 30 | - fastqc 31 | - abyss 32 | - spades 33 | - blast 34 | - mafft 35 | - muscle 36 | - emboss 37 | - bwa 38 | - samtools=1.9 39 | - trimal 40 | 41 | test: 42 | imports: 43 | - secapr 44 | commands: 45 | - secapr --version 46 | 47 | about: 48 | home: 'https://github.com/AntonelliLab/seqcap_processor' 49 | license: MIT 50 | license_file: LICENSE 51 | summary: 'Process sequence-capture FASTQ files into alignments for phylogenetic analyses. Integrates allele phasing.' -------------------------------------------------------------------------------- /recipe/secapr/noarch/current_repodata.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "subdir": "noarch" 4 | }, 5 | "packages": {}, 6 | "packages.conda": {}, 7 | "removed": [], 8 | "repodata_version": 1 9 | } 10 | -------------------------------------------------------------------------------- /recipe/secapr/noarch/current_repodata.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/recipe/secapr/noarch/current_repodata.json.bz2 -------------------------------------------------------------------------------- /recipe/secapr/noarch/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | secapr/noarch 4 | 44 | 45 | 46 |

secapr/noarch

47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 |
FilenameSizeLast ModifiedSHA256MD5
repodata.json127 B2021-07-13 09:22:00 +0000b546412dc20b790c5f9c223e394ff0e39a27ba12a99631ca4d1bb6c0ca3bd05c917338f97423c09c911618661fef3056
repodata.json.bz2126 B2021-07-13 09:22:00 +000032d48c11b6d5ee61a54a9076f8c06a62ba795f69b671db3e44b1301fa0efae0e1ecb5b301b9a2ba831f0a68a7ace5f00
repodata_from_packages.json127 B2021-07-13 09:22:00 +0000b546412dc20b790c5f9c223e394ff0e39a27ba12a99631ca4d1bb6c0ca3bd05c917338f97423c09c911618661fef3056
repodata_from_packages.json.bz2126 B2021-07-13 09:22:00 +000032d48c11b6d5ee61a54a9076f8c06a62ba795f69b671db3e44b1301fa0efae0e1ecb5b301b9a2ba831f0a68a7ace5f00
80 |
Updated: 2021-07-13 09:22:00 +0000 - Files: 0
81 | 82 | -------------------------------------------------------------------------------- /recipe/secapr/noarch/repodata.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "subdir": "noarch" 4 | }, 5 | "packages": {}, 6 | "packages.conda": {}, 7 | "removed": [], 8 | "repodata_version": 1 9 | } 10 | -------------------------------------------------------------------------------- /recipe/secapr/noarch/repodata.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/recipe/secapr/noarch/repodata.json.bz2 -------------------------------------------------------------------------------- /recipe/secapr/noarch/repodata_from_packages.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "subdir": "noarch" 4 | }, 5 | "packages": {}, 6 | "packages.conda": {}, 7 | "removed": [], 8 | "repodata_version": 1 9 | } 10 | -------------------------------------------------------------------------------- /recipe/secapr/noarch/repodata_from_packages.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/recipe/secapr/noarch/repodata_from_packages.json.bz2 -------------------------------------------------------------------------------- /secapr/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AntonelliLab/seqcap_processor/a2f960498519a1e65a7a901665e879ad7c6e9f40/secapr/.DS_Store -------------------------------------------------------------------------------- /secapr/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_versions 2 | __version__ = get_versions()['version'] 3 | del get_versions 4 | -------------------------------------------------------------------------------- /secapr/__main__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | #author: Tobias Andermann, tobias.andermann@bioenv.gu.se 4 | #__main__.py created by Estelle, based on IgDiscover (https://bitbucket.org/igdiscover/igdiscover) 5 | 6 | import os 7 | import sys 8 | from argparse import ArgumentParser 9 | import logging 10 | import warnings 11 | from . import __version__ 12 | import importlib 13 | 14 | 15 | __author__ = "Tobias Andermann" 16 | 17 | # List of all subcommands. A module of the given name must exist and define 18 | # add_arguments() and main() functions. 19 | 20 | COMMANDS = [ 21 | 'quality_check', 22 | 'clean_reads', 23 | 'assemble_reads', 24 | 'find_target_contigs', 25 | 'align_sequences', 26 | 'join_exons', 27 | 'reference_assembly', 28 | 'phase_alleles', 29 | 'add_missing_sequences', 30 | 'locus_selection', 31 | 'automate_all', 32 | 'concatenate_alignments', 33 | 'paralogs_to_ref', 34 | 'plot_sequence_yield' 35 | ] 36 | 37 | 38 | def main(arguments=None): 39 | logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') 40 | parser = ArgumentParser(description=__doc__, prog='secapr') 41 | parser.add_argument('--version', action='version', version='%(prog)s ' + __version__) 42 | 43 | subparsers = parser.add_subparsers() 44 | for command_name in COMMANDS: 45 | module = importlib.import_module('.' 
+ command_name, 'secapr') 46 | subparser = subparsers.add_parser(command_name, 47 | help=module.__doc__.split('\n')[1], description=module.__doc__) 48 | subparser.set_defaults(func=module.main) 49 | module.add_arguments(subparser) 50 | 51 | args = parser.parse_args(arguments) 52 | if not hasattr(args, 'func'): 53 | parser.error('Please provide the name of a subcommand to run') 54 | else: 55 | args.func(args) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /secapr/add_missing_sequences.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | #author: Tobias Hofmann, tobias.andermann@bioenv.gu.se 3 | """ 4 | This script will add dummy sequences (consisting of 'n') for missing taxa in each alignment, making sure that all alignments in the input folder contain the same taxa (as required for e.g. *BEAST) 5 | """ 6 | 7 | import os 8 | import sys 9 | import glob 10 | import shutil 11 | import configparser 12 | import pickle 13 | from .utils import CompletePath 14 | 15 | def add_arguments(parser): 16 | parser.add_argument( 17 | '--input', 18 | required=True, 19 | action=CompletePath, 20 | default=None, 21 | help='The directory containing fasta alignments' 22 | ) 23 | parser.add_argument( 24 | '--output', 25 | required=True, 26 | action=CompletePath, 27 | default=None, 28 | help='The output directory where results will be saved' 29 | ) 30 | 31 | 32 | def read_fasta(fasta): 33 | name, seq = None, [] 34 | for line in fasta: 35 | line = line.rstrip() 36 | if line.startswith(">"): 37 | if name: yield (name, ''.join(seq)) 38 | name, seq = line, [] 39 | else: 40 | seq.append(line) 41 | if name: yield (name, ''.join(seq)) 42 | 43 | 44 | def main(args): 45 | # Set working directory 46 | work_dir = args.input 47 | out_dir = args.output 48 | if not os.path.exists(out_dir): 49 | os.makedirs(out_dir) 50 | 51 | # Create a dictionary with the name-pattern as key and all file-names sharing that name-pattern as values 52 | fasta_dict = {} 53 | for fasta in os.listdir(work_dir): 54 | if fasta.endswith(".fasta") or fasta.endswith(".fa"): 55 | fasta_dict.setdefault("all",[]).append(fasta) 56 | 57 | 58 | 59 | # Get the list of taxa names (headers) for each locus, key is out-file, values are in-files 60 | for key, value in fasta_dict.items(): 61 | # Creates a list of all headers that are present in the concatenated alignments, accounting for differences in the taxon composition of each alignment 62 | list_headers=[] 63 | # Each k is a separate fasta input file belonging to the same locus (to be joined) 64 | for k in sorted(value): 65 | with open("%s/%s" %(work_dir,k)) as f: 66 | for name, seq in read_fasta(f): 67 | if not name in list_headers: 68 | list_headers.append(name) 69 | 70 | # "value" is a list of all fasta files to be concatenated 71 | # Find the missing taxa in each fasta input file and simulate a sequence of correct length (only "n") 72 | for k in sorted(value): 73 | taxa_names_single = [] 74 | present_seq = [] 75 | length_alignment = 0 76 | with open("%s/%s" %(work_dir,k)) as f: 77 | for name, seq in read_fasta(f): 78 | taxa_names_single.append(name) 79 | present_seq.append((name,seq)) 80 | length_alignment = len(seq) 81 | # Make a list of all missing taxa in each fasta input file 82 | missing_taxa = [] 83 | for header in list_headers: 84 | if header not in taxa_names_single: 85 | missing_taxa.append(header) 86 | simulated_seq = [] 87 | for mistax in missing_taxa: 88 | fake_string = "n"
* length_alignment 89 | simulated_seq.append((mistax,fake_string)) 90 | all_seq = sorted(simulated_seq+present_seq) 91 | out_fasta = open(os.path.join(out_dir, k), 'w') 92 | for seqname, sequence in all_seq: 93 | out_fasta.write(seqname+"\n") 94 | out_fasta.write(sequence+"\n") 95 | out_fasta.close() 96 | try: 97 | pickle_in = os.path.join(args.input,'.secapr_files/sequence_origin.pickle') 98 | with open(pickle_in, 'rb') as handle: 99 | sequence_origin = pickle.load(handle) 100 | pickle_path = os.path.join(args.output,'.secapr_files/sequence_origin.pickle') 101 | with open(pickle_path, 'wb') as handle: 102 | pickle.dump(sequence_origin, handle, protocol=pickle.HIGHEST_PROTOCOL) 103 | except: 104 | pass -------------------------------------------------------------------------------- /secapr/concatenate_alignments.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | Concatenate multiple alignments (MSAs) into one supermatrix 4 | ''' 5 | 6 | import os 7 | import sys 8 | import glob 9 | import shutil 10 | import argparse 11 | import configparser 12 | import subprocess 13 | 14 | import re 15 | #from cogent import LoadSeqs, DNA 16 | from .utils import CompletePath 17 | 18 | 19 | # Get arguments 20 | def add_arguments(parser): 21 | parser.add_argument( 22 | '--input', 23 | required=True, 24 | action=CompletePath, 25 | default=None, 26 | help='The directory containing the fasta-alignment-files' 27 | ) 28 | parser.add_argument( 29 | '--output', 30 | required=True, 31 | action=CompletePath, 32 | default=None, 33 | help='The output directory where results will be saved' 34 | ) 35 | 36 | 37 | def read_fasta(fasta): 38 | name, seq = None, [] 39 | for line in fasta: 40 | line = line.rstrip() 41 | if line.startswith(">"): 42 | if name: yield (name, ''.join(seq)) 43 | name, seq = line, [] 44 | else: 45 | seq.append(line) 46 | if name: yield (name, ''.join(seq)) 47 | 48 | 49 | def main(args): 50 | # Set working directory 51 | work_dir = args.input 52 | out_dir = args.output 53 | if not os.path.exists(out_dir): 54 | os.makedirs(out_dir) 55 | # Create a dictionary with the name-pattern as key and all file-names sharing that name-pattern 56 | fasta_dict = {} 57 | for fasta in os.listdir(work_dir): 58 | if fasta.endswith(".fasta") or fasta.endswith(".fa"): 59 | name_pattern = 'all_fastas' 60 | fasta_dict.setdefault(name_pattern,[]).append(fasta) 61 | #else: 62 | # print "didn't work for", fasta 63 | print('Found %i alignments.
Concatenating all...'%len(fasta_dict['all_fastas'])) 64 | # Get the list of taxa names (headers) for each locus, key is out-file, values are in-files 65 | for key, value in fasta_dict.items(): 66 | list_headers=[] 67 | # Each k is a separate fasta input file belonging to the same locus (to be joined) 68 | for k in sorted(value): 69 | with open("%s/%s" %(work_dir,k)) as f: 70 | for name, seq in read_fasta(f): 71 | if not name in list_headers: 72 | list_headers.append(name) 73 | 74 | 75 | # Find the missing taxa in each fasta input file and simulate a sequence of correct length (only "n") 76 | 77 | # Each k is a separate fasta input file belonging to the same locus (to be joined) 78 | all_seq_dict = {} 79 | for k in sorted(value): 80 | taxa_names_single = [] 81 | present_seq = [] 82 | length_alignment = "" 83 | with open("%s/%s" %(work_dir,k)) as f: 84 | for name, seq in read_fasta(f): 85 | taxa_names_single.append(name) 86 | present_seq.append((name,seq)) 87 | length_alignment = len(seq) 88 | # Make a list of all missing taxa in each fasta input file 89 | missing_taxa = [] 90 | for header in list_headers: 91 | if header not in taxa_names_single: 92 | missing_taxa.append(header) 93 | simulated_seq = [] 94 | for mistax in missing_taxa: 95 | fake_string = "n" * length_alignment 96 | simulated_seq.append((mistax,fake_string)) 97 | all_seq = sorted(simulated_seq+present_seq) 98 | 99 | for seq_header, sequence in all_seq: 100 | all_seq_dict.setdefault(seq_header,[]).append(sequence) 101 | 102 | out_fasta = open(os.path.join(out_dir, "%s.fasta" %key), 'w') 103 | for seqname, sequences in all_seq_dict.items(): 104 | final_sequence = "".join(sequences) 105 | final_sequence = final_sequence.replace('\n','') 106 | out_fasta.write(seqname+"\n") 107 | out_fasta.write(final_sequence+"\n") 108 | 109 | out_fasta.close() 110 | print('Concatenation finished. Supermatrix printed to %s'%os.path.join(out_dir, "%s.fasta" %key)) 111 | -------------------------------------------------------------------------------- /secapr/create_consensus_from_alleles.py: -------------------------------------------------------------------------------- 1 | #author: Tobias Andermann, tobias.andermann@bioenv.gu.se 2 | 3 | import os 4 | import sys 5 | import re 6 | import glob 7 | import shutil 8 | import argparse 9 | import csv 10 | import random 11 | 12 | 13 | from .utils import CompletePath 14 | 15 | 16 | # Get arguments 17 | def get_args(): 18 | parser = argparse.ArgumentParser( 19 | description="This script will create consensus sequences from pairs of allele sequences, thereby turning allele alignments into consensus alignments.", 20 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 21 | ) 22 | parser.add_argument( 23 | '--input', 24 | required=True, 25 | action=CompletePath, 26 | default=None, 27 | help='The directory containing fasta alignments' 28 | ) 29 | parser.add_argument( 30 | '--config', 31 | required=True, 32 | help='A configuration file containing the full paths to the following programs: samtools, bcftools, vcfutils, emboss, picard.
Also the paths to either clc-assembly-cell or bwa, depending on which of the two mapping programs is chosen (see --mapper)' 33 | ) 34 | parser.add_argument( 35 | '--output', 36 | required=True, 37 | action=CompletePath, 38 | default=None, 39 | help='The output directory where results will be saved' 40 | ) 41 | 42 | return parser.parse_args() 43 | 44 | 45 | # Get arguments 46 | args = get_args() 47 | # Set working directory 48 | work_dir = args.input 49 | out_dir = args.output 50 | if not os.path.exists(out_dir): 51 | os.makedirs(out_dir) 52 | config = args.config 53 | 54 | 55 | 56 | 57 | 58 | #XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 59 | #%%% Functions %%% 60 | 61 | 62 | def read_fasta(fasta): 63 | name, seq = None, [] 64 | for line in fasta: 65 | line = line.rstrip() 66 | if line.startswith(">"): 67 | if name: yield (name, ''.join(seq)) 68 | name, seq = line, [] 69 | else: 70 | seq.append(line) 71 | if name: yield (name, ''.join(seq)) 72 | 73 | 74 | 75 | #XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 76 | #%%% Workflow %%% 77 | 78 | # Read in config file 79 | with open(config, 'r') as c: 80 | conf_dict = {} 81 | reader = csv.reader(c, delimiter='\t') 82 | reader = list(reader) 83 | for row in reader: 84 | conf_dict.setdefault(row[1],[]) 85 | conf_dict[row[1]].append(row[0]) 86 | 87 | 88 | # Create a list of all fasta files 89 | fasta_files = [] 90 | for fasta in os.listdir(work_dir): 91 | if fasta.endswith(".fasta") or fasta.endswith(".fa"): 92 | fasta_files.append(fasta) 93 | 94 | 95 | for fasta in fasta_files: 96 | 97 | # Create an output consensus fasta file for each allele alignment 98 | fasta_cons_name = re.sub("allele","consensus",fasta) 99 | out_fasta = open(os.path.join(out_dir, fasta_cons_name), 'w') 100 | 101 | # Create a dictionary for each fasta file, where both allele sequences are assigned to the same key for each sample 102 | seq_dict = {} 103 | for seq_pair in list(conf_dict.values()): 104 | key = list(conf_dict.keys())[list(conf_dict.values()).index(seq_pair)] 105 | #print seq_pair[0], "=", key 106 | #print seq_pair[1], "=", key 107 | with open("%s/%s" %(work_dir,fasta)) as f: 108 | for name, seq in read_fasta(f): 109 | name = re.sub('>', '', name) 110 | if name in seq_pair: 111 | name = key 112 | seq_dict.setdefault(name,[]) 113 | seq_dict[name].append(seq) 114 | # Create a consensus dict for each fasta file with the correct new header name as key and the consensus sequence of the two alleles as value 115 | consensus_dict = {} 116 | for header in seq_dict: 117 | consensus_dict.setdefault(header,[]) 118 | sequence = seq_dict[header] 119 | allele0 = sequence[0] 120 | allele1 = sequence[1] 121 | # Find those positions where the two alleles differ from each other and make a random pick of one of the versions, simulating a consensus sequence 122 | for id, base in enumerate(allele0): 123 | if base != allele1[id]: 124 | variation = [base,allele1[id]] 125 | base = random.choice(variation) 126 | consensus_dict[header].append(base) 127 | # Write the consensus dictionary into a fasta output file 128 | for cons_header in consensus_dict: 129 | cons_sequence = "".join(consensus_dict[cons_header]) 130 | cons_header = ">%s" %cons_header 131 | out_fasta.write(cons_header+"\n") 132 | out_fasta.write(cons_sequence+"\n") 133 | 134 | out_fasta.close() 135 | 136 | 137 | 138 | 139 | --------------------------------------------------------------------------------
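The per-site logic of the consensus step above can be isolated into a small, self-contained sketch (illustration only, not part of the package; it assumes two equally long allele strings, whereas the real script additionally reads a config table that maps the two allele headers of each sample to a shared name):

    import random

    def consensus_from_alleles(allele0, allele1):
        """Randomly resolve heterozygous sites into a single pseudo-consensus."""
        out = []
        for base0, base1 in zip(allele0, allele1):
            if base0 == base1:
                out.append(base0)  # homozygous site: keep the shared base
            else:
                out.append(random.choice([base0, base1]))  # heterozygous site: random pick
        return ''.join(out)

    # e.g. consensus_from_alleles('ACGT', 'ACTT') returns 'ACGT' or 'ACTT'
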
/secapr/extract_alignments_from_phyluce_get_inf_sites_output.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import csv 4 | import random 5 | 6 | from .utils import CompletePath 7 | 8 | 9 | # Get arguments 10 | def get_args(): 11 | parser = argparse.ArgumentParser(description="Use the phyluce_align_get_informative_sites output to extract UCE alignments with a certain number of informative sites (or random ones)", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 12 | 13 | parser.add_argument('--input',required=True,action=CompletePath,default=None,help='The phyluce_align_get_informative_sites screen output as text file') 14 | 15 | parser.add_argument('--output',required=True,action=CompletePath,default=None,help='The name of the output file, which will be a list of the alignments fulfilling the requirement') 16 | 17 | parser.add_argument('--mode',choices=["top","bottom","cutoff","random"],default="top",help='Choose which alignments you want to extract: top = the x most informative alignments; bottom = the x least informative alignments; cutoff = all alignments with more than x informative sites; random = randomly chooses x alignments. x is specified with the --threshold flag') 18 | 19 | parser.add_argument('--threshold',type=int,default=15,help='The number x used by --mode: the number of alignments to extract (for top, bottom, and random) or the minimum number of informative sites (for cutoff)') 20 | 21 | return parser.parse_args() 22 | 23 | # Preparation for calling input variables and files 24 | args = get_args() 25 | 26 | 27 | input_file = args.input 28 | output = args.output 29 | out_file = output.split("/")[-1] 30 | out_dir = '/'.join(output.split("/")[:-1]) 31 | mode = args.mode 32 | threshold = args.threshold 33 | 34 | def getkey(string): 35 | locus, misc, values = string.partition('\t') 36 | x, y, values = values.partition('\t') 37 | insites, miscx, miscy = values.partition('\t') 38 | return int(insites) 39 | 40 | 41 | output_file = open("%s/%s_%s_%s" %(out_dir,mode,threshold,out_file), "w") 42 | uce_list = csv.writer(output_file) 43 | 44 | with open(input_file) as f: 45 | content = [x.strip('\n') for x in f.readlines()] 46 | header = content[0] 47 | tail = content[-1] 48 | body = content[1:-1] 49 | body = sorted(body, key=getkey) 50 | if mode == 'cutoff': 51 | for line in body: 52 | element = line.split('\t') 53 | if int(element[2]) >= int(threshold): 54 | print(line) 55 | uce_list.writerow([element[0]]) 56 | elif mode == 'top': 57 | for line in body[-threshold:]: 58 | element = line.split('\t') 59 | print(line) 60 | uce_list.writerow([element[0]]) 61 | elif mode == 'bottom': 62 | for line in body[:threshold]: 63 | element = line.split('\t') 64 | print(line) 65 | uce_list.writerow([element[0]]) 66 | elif mode == 'random': 67 | random_body = random.sample(body, threshold) 68 | for line in random_body: 69 | element = line.split('\t') 70 | print(line) 71 | uce_list.writerow([element[0]]) 72 | -------------------------------------------------------------------------------- /secapr/join_exons.py: -------------------------------------------------------------------------------- 1 | '''Join exon-alignment files belonging to the same gene 2 | ''' 3 | 4 | #author: Tobias Andermann, tobias.andermann@bioenv.gu.se 5 | 6 | import os 7 | import re 8 | #from cogent import LoadSeqs, DNA 9 | 10 | from .utils import CompletePath 11 | 12 | 13 | def add_arguments(parser): 14 | parser.add_argument( 15 | '--input', 16 | required=True, 17 | action=CompletePath, 18 | default=None, 19
| help='The directory containing the fasta-alignment-files' 20 | ) 21 | parser.add_argument( 22 | '--output', 23 | required=True, 24 | action=CompletePath, 25 | default=None, 26 | help='The output directory where results will be saved' 27 | ) 28 | 29 | 30 | def read_fasta(fasta): 31 | name, seq = None, [] 32 | for line in fasta: 33 | line = line.rstrip() 34 | if line.startswith(">"): 35 | if name: 36 | yield (name, ''.join(seq)) 37 | name, seq = line, [] 38 | else: 39 | seq.append(line) 40 | if name: 41 | yield (name, ''.join(seq)) 42 | 43 | 44 | def main(args): 45 | work_dir = args.input 46 | out_dir = args.output 47 | if not os.path.exists(out_dir): 48 | os.makedirs(out_dir) 49 | 50 | # Create a dictionary with the name-pattern as key and all file-names sharing that name-pattern 51 | fasta_dict = {} 52 | for fasta in os.listdir(work_dir): 53 | if fasta.endswith(".fasta") or fasta.endswith(".fa"): 54 | fasta_split = re.split("_", fasta) 55 | name_pattern = "%s_%s" %(fasta_split[0], fasta_split[1]) 56 | fasta_dict.setdefault(name_pattern,[]).append(fasta) 57 | 58 | else: 59 | print("didn't work for", fasta) 60 | 61 | # Get the list of taxa names (headers) for each locus, key is out-file, values are in-files 62 | for key, value in fasta_dict.items(): 63 | print(key) 64 | list_headers = [] 65 | # Each k is a separate fasta input file belonging to the same locus (to be joined) 66 | for k in sorted(value): 67 | with open("%s/%s" %(work_dir, k)) as f: 68 | for name, seq in read_fasta(f): 69 | if name not in list_headers: 70 | list_headers.append(name) 71 | 72 | # Find the missing taxa in each fasta input file and simulate a sequence of correct length (only "?") 73 | in_fasta = os.path.join(work_dir, fasta) 74 | # Each k is a separate fasta input file belonging to the same locus (to be joined) 75 | all_seq_dict = {} 76 | for k in sorted(value): 77 | taxa_names_single = [] 78 | present_seq = [] 79 | length_alignment = "" 80 | with open("%s/%s" %(work_dir,k)) as f: 81 | for name, seq in read_fasta(f): 82 | taxa_names_single.append(name) 83 | present_seq.append((name,seq)) 84 | length_alignment = len(seq) 85 | # Make a list of all missing taxa in each fasta input file 86 | missing_taxa = [] 87 | for header in list_headers: 88 | if header not in taxa_names_single: 89 | missing_taxa.append(header) 90 | simulated_seq = [] 91 | for mistax in missing_taxa: 92 | fake_string = "?" 
* length_alignment 93 | simulated_seq.append((mistax,fake_string)) 94 | all_seq = sorted(simulated_seq+present_seq) 95 | 96 | for seq_header, sequence in all_seq: 97 | all_seq_dict.setdefault(seq_header,[]).append(sequence) 98 | 99 | with open(os.path.join(out_dir, "%s.fasta" %key), 'w') as out_fasta: 100 | for seqname, sequences in all_seq_dict.items(): 101 | final_sequence = "".join(sequences) 102 | out_fasta.write(seqname+"\n") 103 | out_fasta.write(final_sequence+"\n") 104 | -------------------------------------------------------------------------------- /secapr/merge_probes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import shutil 5 | import argparse 6 | import subprocess 7 | 8 | 9 | from .utils import CompletePath 10 | 11 | 12 | # Get arguments 13 | def get_args(): 14 | parser = argparse.ArgumentParser( 15 | description="Merge bait sequences from a sequence capture file (fasta) into full sequences", 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 17 | ) 18 | parser.add_argument( 19 | '--probe_file', 20 | required=True, 21 | action=CompletePath, 22 | help='the probe/bait file in fasta format' 23 | ) 24 | parser.add_argument( 25 | '--output', 26 | required=True, 27 | action=CompletePath, 28 | help='The output file with the new fasta sequences' 29 | ) 30 | return parser.parse_args() 31 | 32 | 33 | def read_fasta(fasta): 34 | name, seq = None, [] 35 | for line in fasta: 36 | line = line.rstrip() 37 | if line.startswith(">"): 38 | if name: yield (name, ''.join(seq)) 39 | name, seq = line, [] 40 | else: 41 | seq.append(line) 42 | if name: yield (name, ''.join(seq)) 43 | 44 | 45 | def longest_common_substring(s1, s2): 46 | m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))] 47 | longest, x_longest = 0, 0 48 | for x in range(1, 1 + len(s1)): 49 | for y in range(1, 1 + len(s2)): 50 | if s1[x - 1] == s2[y - 1]: 51 | m[x][y] = m[x - 1][y - 1] + 1 52 | if m[x][y] > longest: 53 | longest = m[x][y] 54 | x_longest = x 55 | else: 56 | m[x][y] = 0 57 | return s1[x_longest - longest: x_longest] 58 | 59 | 60 | args = get_args() 61 | fasta_file = args.probe_file 62 | out_file = args.output 63 | 64 | header_list = [] 65 | sequence_list = [] 66 | 67 | with open(fasta_file) as f: 68 | for name, seq in read_fasta(f): 69 | header_list.append(name) 70 | sequence_list.append(seq) 71 | 72 | 73 | sequence_dictionary = {} 74 | for seq in sequence_list: 75 | place_in_list = sequence_list.index(seq) 76 | header = header_list[place_in_list] 77 | if place_in_list > 0: 78 | match = longest_common_substring(sequence_list[place_in_list],sequence_list[place_in_list-1]) 79 | if not sequence_list[place_in_list-1].endswith(match): 80 | # sometimes there are cases where the matching substring is not at the end of the previous sequence, 81 | # since there are some slightly different bases in the sequence of the following probe, even though the
In that case we want to remove everything following the match from 83 | # the previous probe and add this complete probe sequence into the final dictionary 84 | if len(match) > 20: 85 | splitstring = sequence_list[place_in_list-1].rsplit(match, 1) 86 | substring = match+splitstring[1] 87 | sequence_dictionary[new_header][-1] = sequence_dictionary[new_header][-1].replace(substring,'') 88 | sequence_dictionary[new_header].append(seq) 89 | else: 90 | new_header = header 91 | sequence_dictionary.setdefault(new_header,[]) 92 | sequence_dictionary[new_header].append(seq) 93 | 94 | elif sequence_list[place_in_list-1].endswith(match) and sequence_list[place_in_list].startswith(match): 95 | if len(match)> 9: 96 | seq = seq.replace(match,'') 97 | sequence_dictionary[new_header].append(seq) 98 | else: 99 | print(header, seq, match, '\n') 100 | sequence_dictionary.setdefault(new_header,[]) 101 | sequence_dictionary[new_header].append(seq) 102 | else: 103 | print('unidentifiable probe:', header, seq) 104 | elif place_in_list == 0: 105 | new_header = header 106 | sequence_dictionary.setdefault(new_header,[]) 107 | sequence_dictionary[new_header].append(seq) 108 | 109 | 110 | 111 | out_fasta = open(out_file, 'w') 112 | for header in sorted(sequence_dictionary): 113 | sequence = ''.join(sequence_dictionary[header]) 114 | out_fasta.write(header+"\n") 115 | out_fasta.write(sequence+"\n") 116 | out_fasta.close() 117 | -------------------------------------------------------------------------------- /secapr/mpileup_fasta.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | from .utils import CompletePath 5 | 6 | 7 | # Get arguments 8 | def get_args(): 9 | parser = argparse.ArgumentParser(description="Convert a mpileup file generated with samtools into fasta format", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 10 | 11 | parser.add_argument('--input',required=True,action=CompletePath,default=None,help='The mpileup file that you want to transform into fasta format') 12 | 13 | parser.add_argument('--fasta_out',required=True,action=CompletePath,default=None,help='The name of the output fasta file') 14 | 15 | parser.add_argument('--coverage',type=int,default=3,help='Minimum coverage-support that is required for making a base call. Anything below this threshold will be masked as ambiguity.') 16 | 17 | return parser.parse_args() 18 | 19 | # Preparation for calling input variables and files 20 | args = get_args() 21 | 22 | 23 | pileup = args.input 24 | out_fasta = args.fasta_out 25 | cov = args.coverage 26 | with open(pileup) as f: 27 | content = [x.strip('\n') for x in f.readlines()] 28 | 29 | loci = [] 30 | seq_dict = {} 31 | for line in content: 32 | if '#' not in line: 33 | # Split the tab delimited lines into their segments 34 | element = line.split('\t') 35 | # Create a list of all loci names 36 | seq_name = element[0] 37 | if seq_name not in loci: 38 | loci.append(seq_name) 39 | # By default call every position a uncertainty 40 | basecall = "N" 41 | 42 | # Turn all lower case values in upper case 43 | sample = element[4].upper() 44 | # make a directory with all different basecalls and count their occurences 45 | calls = dict((letter,sample.count(letter)) for letter in set(sample)) 46 | 47 | # The basecall in the reference 48 | reference = element[2] 49 | # These characters signal a match with the reference 50 | match_ref = "." 
"," 51 | # List of base characters 52 | bases = "A" "G" "C" "T" 53 | 54 | # find out how many agreements with reference are among the basecalls. These are all . and , basecalls listed in match_ref. 55 | # reset the counter before every round (every line in file aka position in sequence) 56 | list_matches = 0 57 | for key,value in list(calls.items()): 58 | if key in match_ref: 59 | list_matches += value 60 | if list_matches >= cov: 61 | basecall = reference 62 | 63 | # find if there are any well supported SNPs and make the most prominent call 64 | for key in sorted(calls, key=calls.get, reverse=True): 65 | if key in bases: 66 | if int(calls[key]) >= cov: 67 | if int(calls[key]) >= list_matches: 68 | basecall = key 69 | break 70 | # add the final basecall to the dictionary and to the respective key if it already exists, otherwise create new key 71 | seq_dict.setdefault(element[0],[]) 72 | seq_dict[element[0]].append(basecall) 73 | # Join all basecalls for each key (=locus) into one sequence and deposit in new dictionary 74 | concat_basecalls = {} 75 | for key, value in list(seq_dict.items()): 76 | concat_basecalls[key] = "".join(value) 77 | #print concat_basecalls 78 | 79 | with open(out_fasta, "wb") as f: 80 | for k, v in list(concat_basecalls.items()): 81 | f.write(">" + k+ "\n") 82 | f.write(v+ "\n") 83 | -------------------------------------------------------------------------------- /secapr/process_pileup.py: -------------------------------------------------------------------------------- 1 | #author: Tobias Andermann, tobias.andermann@bioenv.gu.se 2 | 3 | import csv 4 | import os 5 | import sys 6 | import re 7 | import glob 8 | import shutil 9 | import argparse 10 | import configparser 11 | import subprocess 12 | import subprocess 13 | from Bio import SeqIO 14 | 15 | from .utils import CompletePath 16 | 17 | 18 | # Get arguments 19 | def get_args(): 20 | parser = argparse.ArgumentParser( 21 | description="Mask all positions with low read coverage or strange coverage (many reads beginning or ending at same position) as uncertainties.", 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 23 | ) 24 | parser.add_argument( 25 | '--pileup', 26 | required=True, 27 | action=CompletePath, 28 | default=None, 29 | help='The name of the file containing the samtools mpileup output' 30 | ) 31 | parser.add_argument( 32 | '--cutoff', 33 | type=int, 34 | default=6, 35 | help='The minimum read depth that you want to accept' 36 | ) 37 | return parser.parse_args() 38 | 39 | args = get_args() 40 | 41 | file = args.pileup 42 | 43 | 44 | def count_letters(word): 45 | GOOD_LETTERS = "actgACTG" 46 | return len([letter for letter in word if letter in GOOD_LETTERS]) 47 | 48 | 49 | with open(file, 'r') as f: 50 | reader = csv.reader(f, delimiter='\t') 51 | for row in reader: 52 | if count_letters(row[4]) < args.cutoff: 53 | row[4] = "N" * count_letters(row[4]) 54 | print(row) 55 | -------------------------------------------------------------------------------- /secapr/remove_uninformative_seqs.py: -------------------------------------------------------------------------------- 1 | #author: Tobias Andermann, tobias.andermann@bioenv.gu.se 2 | 3 | import os 4 | import sys 5 | import re 6 | import glob 7 | import shutil 8 | import argparse 9 | from Bio import SeqIO 10 | 11 | from .utils import CompletePath 12 | 13 | 14 | # Get arguments 15 | def get_args(): 16 | parser = argparse.ArgumentParser( 17 | description="Set the maximum fraction of missing data that you want to allow in an alignment and drop all 
sequences above this threshold.", 18 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 19 | ) 20 | parser.add_argument( 21 | '--alignment', 22 | required=True, 23 | action=CompletePath, 24 | default=None, 25 | help='The alignment in fasta format.' 26 | ) 27 | parser.add_argument( 28 | '--maximum_missing', 29 | type=float, 30 | default=0.8, 31 | help='Define the maximal fraction of missing data that you want to allow. All sequences below this threshold will be exported into a new alignment.' 32 | ) 33 | parser.add_argument( 34 | '--output', 35 | required=True, 36 | action=CompletePath, 37 | default=None, 38 | help='The output directory where results will be saved.' 39 | ) 40 | return parser.parse_args() 41 | args = get_args() 42 | 43 | # Set working directory 44 | out_dir = args.output 45 | if not os.path.exists(out_dir): 46 | os.makedirs(out_dir) 47 | 48 | # Get other input variables 49 | alignment = args.alignment 50 | max_mis = args.maximum_missing 51 | 52 | 53 | def manage_homozygous_samples(fasta,threshold,output): 54 | fasta_alignment = SeqIO.parse(open(fasta),'fasta') 55 | with open('%s/cleaned_alignment_all_sequences_less_than_%f_missing_data.fasta' %(output,threshold), 'w') as outfile: 56 | final_seqs = {} 57 | for sample in fasta_alignment: 58 | header = sample.description 59 | sequence = sample.seq 60 | chars = list(sequence) 61 | bad_chars = [] 62 | for char in chars: 63 | if char not in ['A','C','T','G','a','c','t','g']: 64 | bad_chars.append(char) 65 | sequence_length = float(len(chars)) 66 | count_bad_chars = float(len(bad_chars)) 67 | fraction = float(count_bad_chars/sequence_length) 68 | if fraction <= threshold: 69 | final_seqs.setdefault(header,[]).append(sequence) 70 | else: 71 | print("Dropped sequence for", header) 72 | for seqname, seq in final_seqs.items(): 73 | sequence = str(seq[0]) 74 | outfile.write(">"+seqname+"\n") 75 | outfile.write(sequence+"\n") 76 | 77 | 78 | manage_homozygous_samples(alignment,max_mis,out_dir) 79 | -------------------------------------------------------------------------------- /secapr/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Oct 15 17:08:10 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import numpy as np 10 | np.set_printoptions(suppress=True) 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import os 14 | import argparse 15 | 16 | 17 | class CompletePath(argparse.Action): 18 | """give the full path of an input file/folder""" 19 | def __call__(self, parser, namespace, values, option_string=None): 20 | setattr(namespace, self.dest, os.path.abspath(os.path.expanduser(values))) 21 | 22 | 23 | -------------------------------------------------------------------------------- /secapr/varscan_vcf_2_fasta.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | from .utils import CompletePath 6 | 7 | 8 | # Get arguments 9 | def get_args(): 10 | parser = argparse.ArgumentParser( 11 | description="Convert a vcf-file generated with Varscan into fasta format", 12 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 13 | ) 14 | parser.add_argument( 15 | '--vcf', 16 | required=True, 17 | action=CompletePath, 18 | default=None, 19 | help='The vcf file that you want to transform into fasta format' 20 | ) 21 | parser.add_argument( 22 | '--fasta_out', 23 | required=True, 24
| action=CompletePath, 25 | default=None, 26 | help='The name of the output fasta file' 27 | ) 28 | return parser.parse_args() 29 | 30 | # Preparation for calling input variables and files 31 | args = get_args() 32 | 33 | 34 | vcf = args.vcf 35 | out_fasta = args.fasta_out 36 | 37 | with open(vcf) as f: 38 | content = [x.strip('\n') for x in f.readlines()] 39 | 40 | loci = [] 41 | seq_dict = {} 42 | for line in content: 43 | if not line.startswith('#'): # skip vcf header lines 44 | element = line.split('\t') 45 | # Create a list of all loci names 46 | seq_name = element[0] 47 | if seq_name not in loci: 48 | loci.append(seq_name) 49 | basecall = element[3] 50 | if element[4] != '.': 51 | basecall = element[4] 52 | seq_dict.setdefault(element[0],[]) 53 | seq_dict[element[0]].append(basecall) 54 | 55 | # Join all basecalls for each key (=locus) into one sequence and deposit in new dictionary 56 | concat_basecalls = {} 57 | for key, value in list(seq_dict.items()): 58 | concat_basecalls[key] = "".join(value) 59 | #print(concat_basecalls) 60 | 61 | with open(out_fasta, "w") as f: 62 | for k, v in list(concat_basecalls.items()): 63 | f.write(">" + k + "\n") 64 | f.write(v + "\n") 65 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = secapr/_version.py 5 | versionfile_build = secapr/_version.py 6 | tag_prefix = v 7 | parentdir_prefix = secapr- 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup 3 | import versioneer 4 | 5 | if sys.version_info < (3, 0): 6 | sys.stdout.write("At least Python 3.0 is required.\n") 7 | sys.exit(1) 8 | 9 | with open('README.md') as f: 10 | long_description = f.read() 11 | 12 | 13 | setup( 14 | name = 'secapr', 15 | version = versioneer.get_version(), 16 | cmdclass = versioneer.get_cmdclass(), 17 | author = 'Tobias Andermann', 18 | author_email = 'tobias.andermann@bioenv.gu.se', 19 | url = 'https://github.com/AntonelliLab/seqcap_processor', 20 | description = 'Process sequence-capture fastq files into alignments for phylogenetic analyses', 21 | long_description = long_description, 22 | license = 'MIT', 23 | entry_points = {'console_scripts': ['secapr = secapr.__main__:main']}, 24 | packages = ['secapr'], 25 | install_requires = [ 26 | # No dependencies listed here since we need to rely on conda anyway 27 | ], 28 | classifiers = [ 29 | "Development Status :: 4 - Beta", 30 | "Environment :: Console", 31 | "Intended Audience :: Science/Research", 32 | "License :: OSI Approved :: MIT License", 33 | "Natural Language :: English", 34 | "Programming Language :: Python :: 3", 35 | "Topic :: Scientific/Engineering :: Bio-Informatics" 36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /src/align_paralogs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Apr 24 10:46:57 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import os 10 | import shutil 11 | import numpy as np 12 | import pandas as pd 13 | from numpy import genfromtxt 14 | from Bio import SeqIO 15 | from Bio.Align.Applications import MafftCommandline 16 | import glob 17 | import sys 18
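# minimal usage sketch for the MafftCommandline wrapper imported above ('seqs.fasta' is a placeholder
# input file): calling the wrapper object runs the external mafft binary and returns a (stdout, stderr)
# tuple, with stdout holding the alignment in fasta format, exactly as used further down in this script:
#   cline = MafftCommandline(input='seqs.fasta')
#   stdout, stderr = cline()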
| #help(MafftCommandline) 19 | 20 | def fix_line_wrap(alignment_file): 21 | file_in = open(alignment_file) 22 | final = {} 23 | for line in file_in: 24 | line = line.strip() 25 | if line.startswith(">"): 26 | id = line 27 | final[id] = "" 28 | else: 29 | final[id] += line 30 | file_out = open(alignment_file, "w") 31 | for key, val in final.items(): 32 | file_out.write(key) 33 | file_out.write("\n") 34 | file_out.write(val) 35 | file_out.write("\n") 36 | 37 | # the following paths need to be provided 38 | contig_folder = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs' 39 | ref_file = '/Users/tobias/GitHub/seqcap_processor/data/raw/palm_reference_sequences.fasta' 40 | root_dir = '/Users/tobias/GitHub/seqcap_processor/data/processed/target_contigs_paralogs_info' 41 | outdir = '/Users/tobias/Desktop/test' 42 | 43 | if os.path.exists(outdir): 44 | shutil.rmtree(outdir) 45 | os.makedirs(outdir) 46 | ref_index_info = os.path.join(root_dir,'reference_fasta_header_info.txt') 47 | 48 | subdirs = list(os.walk(root_dir))[0][1] 49 | for sample in subdirs: 50 | print('\nProcessing sample %s'%sample) 51 | # get the paths for the sample 52 | sample_path = os.path.join(root_dir,sample) 53 | contig_orientation = os.path.join(sample_path,'contig_orientation.txt') 54 | para_info = os.path.join(sample_path,'info_paralogous_loci.txt') 55 | contig_file = os.path.join(contig_folder,'%s.fa'%sample) 56 | # read the data 57 | ref_index_df = pd.read_csv(ref_index_info,sep='\t',header=None) 58 | keys = ref_index_df[0].values 59 | values = ref_index_df[1].values 60 | id_ref_dict = dict(zip(keys,values)) 61 | ref_seqs = list(SeqIO.parse(ref_file, "fasta")) 62 | contig_seqs = list(SeqIO.parse(contig_file, "fasta")) 63 | contig_orientation_df = pd.read_csv(contig_orientation,sep='\t') 64 | para_data = genfromtxt(para_info, delimiter='\t') 65 | print('%i paralogous loci found.'%len(para_data)) 66 | sample_out_dir = os.path.join(outdir,sample) 67 | if not os.path.exists(sample_out_dir): 68 | os.makedirs(sample_out_dir) 69 | sample_sequence_outdir = os.path.join(sample_out_dir,'paralog_seq_collections') 70 | if not os.path.exists(sample_sequence_outdir): 71 | os.makedirs(sample_sequence_outdir) 72 | # print ref and contig sequences for each paralogous locus into a separate sequence collection 73 | for counter,i in enumerate(para_data): 74 | records = [] 75 | reference = int(i[0]) 76 | reference_id = id_ref_dict[reference] 77 | ref_seq = [ref for ref in ref_seqs if reference_id==ref.id][0] 78 | records.append(ref_seq) 79 | contig_list_tmp = i[1:] 80 | contig_list = np.unique(contig_list_tmp[~np.isnan(contig_list_tmp)].astype(int).astype(str)) 81 | for contig_id in contig_list: 82 | contig_seq = [contig for contig in contig_seqs if contig_id == contig.id][0] 83 | orientation = contig_orientation_df[contig_orientation_df.contig_id == int(contig_id)].orientation.values[0] 84 | if orientation == '+': 85 | pass 86 | else: 87 | contig_seq.seq = contig_seq.seq.reverse_complement() 88 | records.append(contig_seq) 89 | sys.stdout.write('\rPrinting sequence collections %i/%i '%(int(counter+1),len(para_data))) 90 | SeqIO.write(records, os.path.join(sample_sequence_outdir,'paralog_contigs_collection_locus_%i.fasta'%reference),'fasta') 91 | print('\rPrinting sequence collections %i/%i '%(int(counter+1),len(para_data))) 92 | # align the sequences and print as fasta alignment file 93 | sample_alignment_outdir = os.path.join(sample_out_dir,'paralog_alignments') 94 | if not os.path.exists(sample_alignment_outdir): 95 |
os.makedirs(sample_alignment_outdir) 96 | seq_colls = glob.glob(os.path.join(sample_sequence_outdir,'*')) 97 | for counter, sequence_collection in enumerate(seq_colls): 98 | filename = sequence_collection.split('/')[-1].replace('paralog_contigs_collection_','alignment_') 99 | cline = MafftCommandline(input=sequence_collection,op=6.,ep=1.) 100 | stdout, stderr = cline() 101 | alignment_out = os.path.join(sample_alignment_outdir,filename) 102 | sys.stdout.write('\rAligning sequence collections %i/%i '%(int(counter+1),len(seq_colls))) 103 | with open(alignment_out, "w") as handle: 104 | handle.write(stdout) 105 | fix_line_wrap(alignment_out) 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /src/apply_read_thres_select_best_loci.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 31 10:27:32 2017 5 | 6 | @author: tobias 7 | """ 8 | 9 | import pandas as pd 10 | 11 | # Define a threshold of read-depth (anything below will be considered as a locus not present in the data) 12 | threshold = 3 13 | 14 | # Load the input file with all read-coverage values 15 | cov_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/selected_loci/average_cov_per_locus.txt' 16 | coverage_df = pd.read_csv(cov_file,sep='\t') 17 | 18 | # Return boolean for every field, depending on whether it is greater than the threshold 19 | thres_test = coverage_df.iloc[:,1:]>threshold 20 | 21 | # Extract only those rows for which all fields returned 'True' and store in a list 22 | selected_rows = [] 23 | for line in thres_test.iterrows(): 24 | line = line[1] 25 | if line.all(): 26 | selected_rows.append(line) 27 | 28 | # Store all indices of the selected data (selected_rows) in a list 29 | indices = [row.name for row in selected_rows] 30 | 31 | # Use indices to extract rows from original df and create new one from it 32 | loci_passing_test = coverage_df.iloc[indices,:].copy() 33 | list_of_good_loci = list(loci_passing_test.locus) 34 | 35 | # Calculate the read-depth sum across all samples for each locus and store as new column in df 36 | loci_passing_test['sum'] = loci_passing_test.iloc[:,1:].sum(axis=1) 37 | 38 | # Sort the df by the 'sum' column to have the best covered loci on top 39 | loci_passing_test.sort_values('sum', axis=0, ascending=False, inplace=True) 40 | 41 | # select the best n rows (loci) 42 | n=15 43 | selection = loci_passing_test[:n].copy() 44 | 45 | # Write the sorted df to a csv file 46 | loci_passing_test.to_csv('/Users/tobias/GitHub/seqcap_processor/data/processed/selected_loci/loci_passing_threshold_%s.txt' %str(threshold), sep = '\t', index = False) 47 | -------------------------------------------------------------------------------- /src/check_avg_contig_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 17 14:08:28 2020 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import sys,os,glob 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | np.set_printoptions(suppress=True) 15 | np.random.seed(1234) 16 | 17 | from Bio import SeqIO 18 | file = '/Users/tobias/GitHub/seqcap_processor/data/test/target_sequences/extracted_target_contigs_all_samples.fasta' 19 | contigs_file_content =
SeqIO.parse(open(file),'fasta') 20 | seqlengths = [] 21 | for i in contigs_file_content: 22 | seqlength = len(str(i.seq)) 23 | seqlengths.append(seqlength) 24 | 25 | 26 | np.mean(np.array(seqlengths)) 27 | -------------------------------------------------------------------------------- /src/extract_longest_contig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 6 11:16:59 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | 15 | #contig_headers = ['109189', '156474', '490477', '183356', '244827', '249589', '376762', '473401', '506353', '552484'] # alternative test set (unused) 16 | #contig_headers = ['549799', '593734'] # alternative test set (unused) 17 | contig_headers = ['252306', '271392', '399821', '180012', '383994', '497380', '573398'] 18 | lastz_data = pd.read_csv('/Users/tobias/Desktop/target_contigs_test/1061/1061.lastz',sep='\t') 19 | contig_header_values = np.array([i.split(' ')[0].replace('>','') for i in lastz_data.name1.values if i.split(' ')[0].replace('>','') in contig_headers]).astype(int) 20 | contig_length_values = np.array([i.split(' ')[1] for i in lastz_data.name1.values if i.split(' ')[0].replace('>','') in contig_headers]).astype(int) 21 | longest_contig = contig_header_values[list(contig_length_values).index(np.max(contig_length_values))] 22 | -------------------------------------------------------------------------------- /src/fastqc_visualization.r: -------------------------------------------------------------------------------- 1 | if(!require(optparse)){install.packages("optparse",repos = "http://cran.us.r-project.org")} 2 | if(!require(tidyverse)){install.packages("tidyverse",repos = "http://cran.us.r-project.org")} 3 | 4 | 5 | #load libraries 6 | if (!requireNamespace("optparse", quietly = TRUE)) { 7 | write("R package 'optparse' needed for qc visualization. Please install it.\n", 8 | stderr()) 9 | install.packages("optparse") 10 | } 11 | if (!requireNamespace("tidyverse", quietly = TRUE)) { 12 | write("R package 'tidyverse' needed for qc visualization.
Please install it.\n", 13 | stderr()) 14 | install.packages("tidyverse") 15 | } 16 | 17 | 18 | ##optparser options 19 | option_list <- list( 20 | make_option(c("-i", "--input_folder"), type="character", default=getwd(), 21 | help="The path to the folder with the fastqc results"), 22 | make_option(c("-o", "--output_file"), type="character", default="QC_plots.pdf", 23 | help="Give the name of the pdf file where the plots are to be saved."), 24 | make_option(c("-p", "--print"), type="logical", default=TRUE, 25 | help="Print sample ids of samples that failed QC.") 26 | 27 | ) 28 | 29 | opt <- parse_args(OptionParser(option_list=option_list)) 30 | 31 | #load fastQC summaries and create per test table 32 | inp <- list.files(opt$input_folder, pattern = ".zip") 33 | 34 | 35 | fastqc_results <- lapply(inp, function(k){ 36 | unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder) 37 | inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), 38 | "summary.txt", sep = "/"), delim = "\t") 39 | out <- as_data_frame(t(inpu[, 1])) %>% 40 | mutate(sample.id = names(inpu)[3]) 41 | names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id") 42 | unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), 43 | recursive = T, force = T) 44 | 45 | return(out) 46 | }) 47 | 48 | outp <- do.call("rbind.data.frame", fastqc_results)%>% 49 | select(ID = sample_id, 50 | PBQ = Per_base_sequence_quality, 51 | PTQ = Per_tile_sequence_quality, 52 | PSQ = Per_sequence_quality_scores, 53 | PBC = Per_base_sequence_content, 54 | SGC = Per_sequence_GC_content, 55 | PBN = Per_base_N_content, 56 | SLD = Sequence_Length_Distribution, 57 | SDL = Sequence_Duplication_Levels, 58 | ORS = Overrepresented_sequences, 59 | AdC = Adapter_Content, 60 | KmC = Kmer_Content) 61 | 62 | #change table format 63 | ret <- outp %>% 64 | group_by(ID) %>% 65 | gather(test, status, PBQ:KmC) 66 | 67 | #plot how many samples failed the test 68 | qc.fail <- ggplot()+ 69 | geom_bar(data = ret, aes(x = test, fill = status), stat = 'count', position = 'dodge')+ 70 | theme_bw() 71 | 72 | #plot which sample failed which test 73 | qc.samples <- ggplot()+ 74 | geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+ 75 | scale_fill_discrete(name = "status")+ 76 | xlab("FastQC test")+ 77 | ylab("Samples")+ 78 | theme_bw()+ 79 | theme( 80 | axis.text.y = element_blank() 81 | ) 82 | 83 | #plot pdf 84 | pdf(opt$output_file) 85 | print(qc.fail) 86 | print(qc.samples) 87 | dev.off() 88 | 89 | png(gsub(".pdf", "1.png", opt$output_file)) 90 | print(qc.fail) 91 | dev.off() 92 | 93 | png(gsub(".pdf", "2.png", opt$output_file)) 94 | print(qc.samples) 95 | dev.off() 96 | 97 | #table with samples that failed a test 98 | fail <- ret %>% 99 | filter(status == "FAIL") 100 | 101 | #get the ID number of the failed samples 102 | fail.samp <- fail %>% 103 | filter(!duplicated(ID)) %>% 104 | select(ID)%>% 105 | unlist() %>% 106 | parse_number()%>% 107 | unique() %>% 108 | sort() 109 | 110 | if(opt$print){ 111 | write(sprintf("The following samples failed at least one test: %s\n", paste(fail.samp, collapse = ", ")), stdout()) 112 | } 113 | -------------------------------------------------------------------------------- /src/find_good_loci_for_each_sample_from_readcov_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 27 10:47:27 2018 5 | 6 | @author: tobias 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd
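# note on the reduce/intersect1d combination used further below: reduce applies np.intersect1d
# pairwise, left to right, so the result is the set of loci shared by every sample; a minimal
# sketch with placeholder locus names:
#   reduce(np.intersect1d, [np.array(['l1','l2']), np.array(['l2','l3'])])  # -> array(['l2'])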
11 | from functools import reduce 12 | 13 | read_cov_overview = '/Users/tobias/GitHub/seqcap_processor/data/processed/remapped_reads/average_cov_per_locus.txt' 14 | read_cov_overview_df = pd.read_csv(read_cov_overview,sep='\t') 15 | 16 | loci_names = read_cov_overview_df['locus'].values 17 | good_exons = [] 18 | sample_exon_count_dict = {} 19 | for sample in read_cov_overview_df.columns: 20 | if not sample =='locus': 21 | values = read_cov_overview_df[sample].values 22 | num_loci_high_coverage = len(values[values>3]) 23 | sample_exon_count_dict.setdefault(sample,num_loci_high_coverage) 24 | good_loci_names = loci_names[values>3] 25 | good_exons.append(good_loci_names) 26 | 27 | loci_present_in_all_samples = reduce(np.intersect1d, (good_exons)) 28 | 29 | 30 | exons_per_sample = list(sample_exon_count_dict.values()) 31 | np.mean(exons_per_sample) 32 | np.std(exons_per_sample) 33 | -------------------------------------------------------------------------------- /src/get_stats_from_log_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 22 10:33:17 2018 5 | 6 | @author: tobias 7 | """ 8 | 9 | import pandas as pd 10 | df_1 = '/Users/tobias/GitHub/seqcap_processor/data/processed/cleaned_trimmed_reads_test/sample_overview.txt' 11 | df_a = pd.read_csv(df_1,sep='\t') 12 | 13 | 14 | match_table = '/Users/tobias/GitHub/seqcap_processor/data/processed/target_contigs/match_table.txt' 15 | table = pd.read_csv(match_table,sep='\t',index_col=0) 16 | 17 | 18 | sample_count_dict = {} 19 | for column in table.columns: 20 | sample_count_dict.setdefault(column.replace('sample_',''),sum(table[column])) 21 | 22 | 23 | 24 | 25 | 26 | test_str = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs_test/stats/1063.fa' 27 | '/'.join(test_str.split('/')[:-2]) 28 | 29 | import os 30 | import subprocess 31 | # return the last n lines of a file using the unix 'tail' command 32 | def tail(file, lines=2): 33 | tail_out = subprocess.Popen(['tail','-n',str(lines),file],stdout=subprocess.PIPE) 34 | stdout = tail_out.communicate()[0] 35 | return stdout.decode().splitlines() 36 | 37 | contig_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs/sample_1061.fa' 38 | tail(contig_file,3) 39 | 40 | 41 | 42 | 43 | stats_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/cleaned_trimmed_reads/1085_clean/1085_stats.txt' 44 | def get_read_count_from_stats_file(stats_file): 45 | F = open(stats_file,'r') 46 | for line in F: 47 | if line.startswith('Input'): 48 | reads_before = line.split(' ')[3] 49 | reads_after = line.split(' ')[6] 50 | return(reads_before,reads_after) 51 | -------------------------------------------------------------------------------- /src/merge_baits_for_each_locus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Nov 27 16:51:25 2017 5 | 6 | @author: tobias 7 | """ 8 | import os 9 | import re 10 | import random 11 | import subprocess 12 | from Bio import SeqIO 13 | from Bio import AlignIO 14 | from Bio.SeqRecord import SeqRecord 15 | from Bio.Align import MultipleSeqAlignment 16 | #from Bio.Alphabet import generic_dna # Bio.Alphabet was removed in Biopython 1.78 and generic_dna is not used below 17 | 18 | def read_fasta(fasta): 19 | name, seq = None, [] 20 | for line in fasta: 21 | line = line.rstrip() 22 | if line.startswith(">"): 23 | if name: yield (name, ''.join(seq)) 24 | name, seq = line, [] 25 | else: 26 | seq.append(line) 27 | if name: yield (name, ''.join(seq)) 28 | 29
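# minimal usage sketch for read_fasta() above ('baits.fasta' is a placeholder file name):
# the generator yields (header, sequence) tuples from an open fasta handle, re-joining
# wrapped sequence lines into one string per record:
#   with open('baits.fasta') as f:
#       for name, seq in read_fasta(f):
#           print(name, len(seq))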
| def get_seq_dict(dictionary, fasta_path): 30 | new_dict = {} 31 | for locus in dictionary.keys(): 32 | new_dict.setdefault(locus,[]) 33 | alignment = SeqIO.parse(open(fasta_path),'fasta') 34 | for seq in alignment: 35 | ref_loc = str(seq.id).split('_')[0] 36 | new_dict[ref_loc].append(seq) 37 | seq.name='' 38 | seq.description='' 39 | return new_dict 40 | 41 | 42 | def create_reference_fasta(out_dir,path_to_alignments): 43 | # Create a list of fasta files from the input directory 44 | file_list = [fn for fn in os.listdir(path_to_alignments) if fn.endswith(".fasta")] 45 | reference_list = [] 46 | for fasta_alignment in file_list: 47 | sequence_name = re.sub(".fasta","",fasta_alignment) 48 | orig_aln = os.path.join(path_to_alignments,fasta_alignment) 49 | sep_reference = "%s/%s" %(out_dir,fasta_alignment) 50 | reference_list.append(sep_reference) 51 | cons_cmd = "cons -sequence %s -outseq %s -name %s -plurality 0.1 -setcase 0.1" %(orig_aln,sep_reference,sequence_name) # EMBOSS 'cons' consensus command 52 | os.system(cons_cmd) 53 | reference = os.path.join(out_dir,"joined_fasta_library.fasta") 54 | join_fastas = "cat %s/*.fasta > %s" %(out_dir,reference) 55 | os.system(join_fastas) 56 | return reference 57 | 58 | 59 | 60 | fasta_file = '/Users/tobias/Desktop/cos2.fasta' 61 | 62 | locus_bait_dict = {} 63 | with open(fasta_file) as f: 64 | for name, seq in read_fasta(f): 65 | locus_name = re.sub('>','',name.split('_')[0]) 66 | locus_bait_dict.setdefault(locus_name,[]) 67 | locus_bait_dict[locus_name].append(seq) 68 | 69 | locus_fasta_dict = get_seq_dict(locus_bait_dict,fasta_file) 70 | 71 | 72 | 73 | out_path = '/Users/tobias/Desktop/merging_probes/sequence_files' 74 | for locus in locus_fasta_dict: 75 | filename = '%s_sequences.fasta' %locus 76 | with open(os.path.join(out_path,filename), "w") as out_file: 77 | seq_list = locus_fasta_dict[locus] 78 | index = 0 79 | for sequence in seq_list: 80 | sequence.id = '%s_%i' %(locus,index) 81 | sequence.name='' 82 | sequence.description='' 83 | index += 1 84 | out_file.write(sequence.format('fasta')) 85 | 86 | # align the sequence fasta files 87 | aln_path = '/Users/tobias/Desktop/merging_probes/alignments' 88 | for fasta in os.listdir(out_path): 89 | fasta_file = os.path.join(out_path,fasta) 90 | new_file_name = re.sub('_sequences.fasta','_sequence_alignment.fasta',fasta) 91 | aln = os.path.join(aln_path,new_file_name) 92 | aln_stdout = open(aln, 'w') 93 | # run MAFFT on the temp file 94 | cmd = ["mafft","--maxiterate", "1000", fasta_file] 95 | # just pass all ENV params 96 | proc = subprocess.Popen(cmd,stderr=subprocess.PIPE,stdout=aln_stdout) 97 | stderr = proc.communicate()[1] # stdout goes to the alignment file; keep only stderr 98 | aln_stdout.close() 99 | 100 | 101 | output = '/Users/tobias/Desktop/merging_probes/new_reference' 102 | create_reference_fasta(output,aln_path) 103 | 104 | -------------------------------------------------------------------------------- /src/plot_contig_length_overview.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Mar 15 09:46:32 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | 13 | old_contigs = '/Users/tobias/Desktop/extracted_target_contigs_all_samples_old.fasta' 14 | new_contigs = '/Users/tobias/Desktop/extracted_target_contigs_all_samples_new.fasta' 15 | 16 | from Bio import AlignIO 17 | from Bio import SeqIO 18 | 19 | contig_lengths_old = [] 20 | for record
in SeqIO.parse(old_contigs, "fasta"): 21 | contig_lengths_old.append(len(record.seq)) 22 | contig_lengths_new = [] 23 | for record in SeqIO.parse(new_contigs, "fasta"): 24 | contig_lengths_new.append(len(record.seq)) 25 | 26 | len(contig_lengths_old) 27 | len(contig_lengths_new) 28 | 29 | np.mean(contig_lengths_old) 30 | np.mean(contig_lengths_new) 31 | 32 | plt.hist(contig_lengths_old,100) 33 | plt.title('Old contigs') 34 | plt.figure() # open a new figure so the two histograms don't overlay 35 | plt.hist(contig_lengths_new,100) 36 | plt.title('New contigs') -------------------------------------------------------------------------------- /src/plot_exon_alignment_yield.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Nov 23 13:55:09 2017 5 | 6 | @author: tobias 7 | """ 8 | 9 | import os 10 | import re 11 | import glob 12 | import numpy as np 13 | import pandas as pd 14 | import matplotlib.pyplot as plt 15 | 16 | # Get the alignment files and make list of loci with alignments 17 | alignment_folder = '/Users/tobias/GitHub/seqcap_processor/data/processed/alignments/contig_alignments' 18 | alignment_files = glob.glob(os.path.join(alignment_folder, '*.fa*')) 19 | list_of_loci_with_alignments = [re.sub('.fasta','',al.split('/')[-1]) for al in alignment_files] 20 | 21 | # Get the list of all exon loci from the match table (x-axis) 22 | match_table = '/Users/tobias/GitHub/seqcap_processor/data/processed/target_contigs/match_table.txt' 23 | matrix = pd.read_csv(match_table,sep='\t',index_col=0) 24 | x_labels = np.array(matrix.index) 25 | num_x_labels = range(len(x_labels)) 26 | 27 | # (the split into thirds happens below, once the presence/absence matrix is built) 28 | 29 | 30 | # Create 1-dimensional matrix and fill with info which loci have alignment data 31 | presence_absence_df = pd.DataFrame({'loci':x_labels,'presence':0}) 32 | for locus in list_of_loci_with_alignments: 33 | row_index = presence_absence_df[presence_absence_df.loci == locus].index 34 | presence_absence_df.loc[row_index,'presence'] = 1 35 | 36 | # Split presence data and x-axis into thirds for better readability of plots 37 | third_data = np.split(np.matrix(presence_absence_df.presence), 3, axis=1) 38 | third_num_x_labels = np.split(np.matrix(num_x_labels),3,axis=1) 39 | # Plot the data 40 | for i in range(len(third_data)): 41 | fig = plt.figure() 42 | ax = fig.add_subplot(111) 43 | ax.set_aspect(1) 44 | res = ax.imshow(np.array(third_data[i]), cmap='binary') 45 | height,width = third_data[i].shape 46 | plt.xlabel('exon index',fontsize=7) 47 | plt.ylabel('sample index',fontsize=7) 48 | xlabels = list(np.array(third_num_x_labels[i])[0]) 49 | plt.xticks(np.arange(width)[::30],xlabels[::30],fontsize=8) 50 | plt.yticks(fontsize=8) 51 | #ax.tick_params(left=False,labelleft=False) -------------------------------------------------------------------------------- /src/plot_exon_contig_yield.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Nov 23 11:40:16 2017 5 | 6 | @author: tobias 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | # Read the input data 15 | input_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/target_contigs/match_table.txt' 16 | workdir = '/'.join(input_file.split('/')[:-1]) 17 | matrix = pd.read_csv(input_file,sep='\t',index_col=0) 18 | data = np.matrix(matrix).T 19 | y_labels = matrix.columns 20 | x_labels = np.array(matrix.index) 21 | num_x_labels = range(len(x_labels)) 22 | 23 | # Split dataset into
thirds for better readability 24 | third_data = np.split(data, 3,axis=1) 25 | third_x_labels = np.split(np.matrix(x_labels), 3,axis=1) 26 | third_num_x_labels = np.split(np.matrix(num_x_labels),3,axis=1) 27 | 28 | # Plot the matrices 29 | for i in range(len(third_data)): 30 | fig = plt.figure() 31 | plt.clf() 32 | ax = fig.add_subplot(111) 33 | ax.set_aspect(1) 34 | res = ax.imshow(np.array(third_data[i]), cmap='GnBu') 35 | height,width = third_data[i].shape 36 | #cb = fig.colorbar(res) 37 | plt.xlabel('exon index',fontsize=7) 38 | plt.ylabel('sample index',fontsize=7) 39 | xlabels = list(np.array(third_num_x_labels[i])[0]) 40 | plt.xticks(np.arange(width)[::30],xlabels[::30],fontsize=8) 41 | plt.yticks(fontsize=8) 42 | #ax.tick_params(left=False,labelleft=False) 43 | fig.savefig(os.path.join(workdir,'contig_exon_matrix_%i.png'%i), dpi = 500) 44 | 45 | # Write overview of exon indices 46 | key_to_exon_index = pd.DataFrame({'index':num_x_labels,'locus_name': x_labels}) 47 | key_to_exon_index.to_csv(os.path.join(workdir,'key_to_exon_index.txt'),index=False,sep='\t') 48 | 49 | # Write overview of sample indices 50 | key_to_sample_index = pd.DataFrame({'index':range(len(y_labels)),'sample_ID': y_labels}) 51 | key_to_sample_index.to_csv(os.path.join(workdir,'key_to_sample_index.txt'),index=False,sep='\t') 52 | -------------------------------------------------------------------------------- /src/plot_exon_read_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Nov 23 13:48:44 2017 5 | 6 | @author: tobias 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | # Get the data as pandas dataframe 15 | log_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/selected_loci/average_cov_per_locus.txt' 16 | data_input = pd.read_csv(log_file, sep = '\t') 17 | -------------------------------------------------------------------------------- /src/plot_exon_yield_all_datatypes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Nov 23 11:40:16 2017 5 | 6 | @author: tobias 7 | """ 8 | 9 | import os 10 | import re 11 | import glob 12 | import numpy as np 13 | import pandas as pd 14 | import matplotlib as mpl 15 | import matplotlib.pyplot as plt 16 | 17 | # Get data for axes 18 | contig_input_file = '/Users/tobias/Desktop/target_contigs/match_table.txt' 19 | workdir = '/'.join(contig_input_file.split('/')[:-1]) 20 | contig_matrix = pd.read_csv(contig_input_file,sep='\t',index_col=0) 21 | 22 | x_labels = np.array(contig_matrix.index) 23 | num_x_labels = range(len(x_labels)) 24 | 25 | 26 | #______________________________Contig Data_____________________________________ 27 | # Read the contig data 28 | data_1_contig_present = np.matrix(contig_matrix).T 29 | data_1_y_labels = contig_matrix.columns 30 | # replace substring in sample name 31 | data_1_y_labels = np.core.defchararray.replace(np.array(data_1_y_labels,dtype=str), 'sample_', 'contigs ') 32 | 33 | 34 | #_______________________________Contig Alignment Data__________________________ 35 | # Get the alignment files and make list of loci with alignments 36 | alignment_folder = '/Users/tobias/Desktop/target_contigs/msa_alignments' 37 | alignment_files = glob.glob(os.path.join(alignment_folder, '*.fa*')) 38 | list_of_loci_with_alignments =
[re.sub('.fasta','',al.split('/')[-1]) for al in alignment_files] 39 | # Create 1-dimensional matrix and fill with info which loci have alignment data 40 | presence_absence_df = pd.DataFrame({'loci':x_labels,'presence':0}) 41 | for locus in list_of_loci_with_alignments: 42 | row_index = presence_absence_df[presence_absence_df.loci == locus].index 43 | presence_absence_df.loc[row_index,'presence'] = 1 44 | 45 | data_2_contig_alignment = np.matrix(presence_absence_df.presence) 46 | data_2_y_labels = np.array('contig alignment') 47 | 48 | 49 | #_______________________________Reference-assembly Data__________________________ 50 | # Get the data as pandas dataframe 51 | read_cov_file = '/Users/tobias/GitHub/target_contigs/remapped_reads/average_cov_per_locus.txt' 52 | unsorted_read_cov_data = pd.read_csv(read_cov_file, sep = '\t',index_col=0) 53 | # sort columns in df 54 | temp_read_cov_data = unsorted_read_cov_data[sorted(unsorted_read_cov_data.columns)].sort_index() 55 | # add row of 0's for all missing loci 56 | loci_in_df = list(temp_read_cov_data.index) 57 | for locus in list(x_labels): 58 | if locus not in loci_in_df: 59 | temp_read_cov_data.loc[locus] = [0.0]*len(temp_read_cov_data.columns) 60 | # sort by index again 61 | read_cov_data = temp_read_cov_data.sort_index() 62 | 63 | # turn df into matrix 64 | data_3_read_cov = np.matrix(read_cov_data).T 65 | # let's use the same labels as for the contig data 66 | data_3_y_labels = np.core.defchararray.replace(data_1_y_labels, 'contigs ', 'coverage ') 67 | 68 | 69 | 70 | #___________________________Combine all Data___________________________________ 71 | combined_data = np.vstack([data_1_contig_present, data_2_contig_alignment,data_3_read_cov]) 72 | tmp_combined_y_labels = np.append(data_1_y_labels,data_2_y_labels) 73 | combined_y_labels = np.append(tmp_combined_y_labels,data_3_y_labels) 74 | 75 | height,width = combined_data.shape 76 | 77 | 78 | #_______________________________Plot Combined Data_____________________________ 79 | fig = plt.figure(figsize=(20,8)) 80 | #fig.subplots_adjust(top=1, bottom=0.0, left=0.2, right=0.99) 81 | for i,m in enumerate(combined_data): 82 | ax = plt.subplot(height, 1, i+1) 83 | ax.tick_params(left=False,bottom=False,labelleft=False) 84 | # Only plot x-axis for last row 85 | if not i == height-1: 86 | ax.xaxis.set_major_formatter(plt.NullFormatter()) 87 | #plt.axis("off") 88 | if combined_y_labels[i] == 'contig alignment': 89 | plt.imshow(combined_data[i], aspect='auto', cmap='binary', origin='lower') 90 | elif 'contigs' in combined_y_labels[i]: 91 | plt.imshow(combined_data[i], aspect='auto', cmap='GnBu', origin='lower') 92 | else: 93 | plt.imshow(combined_data[i], aspect='auto', cmap='hot_r', origin='lower',clim=(0.0, 10)) 94 | pos = list(ax.get_position().bounds) 95 | fig.text(pos[0] - 0.01, pos[1], combined_y_labels[i], horizontalalignment='right') 96 | plt.xlabel('exon index') 97 | #plt.colorbar() 98 | plt.show() 99 | fig.savefig(os.path.join(workdir,'exon_yield_all_datatypes.png'), dpi = 500) 100 | 101 | 102 | 103 | #________________________________Plot Legend___________________________________ 104 | # Make a figure and axes with dimensions as desired. 105 | fig = plt.figure(figsize=(1, 8)) 106 | # the values stand for [x0,x1,width,height] --> all in relation to total size as given by 'figsize=' 107 | ax1 = fig.add_axes([0.1,0.05,.4,.9]) 108 | # Set the colormap and norm to correspond to the data for which 109 | # the colorbar will be used.
110 | cmap = mpl.cm.hot_r 111 | norm = mpl.colors.Normalize(vmin=0, vmax=10) 112 | # plot a basic continuous colorbar with ticks and labels. 113 | cb1 = mpl.colorbar.ColorbarBase(ax1, cmap=cmap, 114 | norm=norm, 115 | orientation='vertical') 116 | cb1.set_label('Read coverage') 117 | #plt.show() 118 | fig.savefig(os.path.join(workdir,'legend.png'), dpi = 500) 119 | -------------------------------------------------------------------------------- /src/plot_quality_test_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Feb 26 09:39:51 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from matplotlib import colors 12 | import glob 13 | import zipfile 14 | import collections 15 | 16 | def get_test_results(fastqc_log_content): 17 | test_results = [i for i in fastqc_log_content if i.startswith('>>')] 18 | test_names = [string.split('\t')[0].replace('>>','') for string in test_results if not string == '>>END_MODULE'] 19 | test_results = [string.split('\t')[-1] for string in test_results if not string == '>>END_MODULE'] 20 | return test_names,test_results 21 | 22 | 23 | input_dir = '/Users/tobias/GitHub/seqcap_processor/data/raw/test_folder_quality_check/' 24 | zip_files = glob.glob('%s*.zip'%input_dir) 25 | sample_test_results_dict = {} 26 | for file in zip_files: 27 | sample_name = file.split('/')[-1].replace('_fastqc.zip','') 28 | archive = zipfile.ZipFile(file,'r') 29 | target_file = [i for i in archive.namelist() if i.endswith('fastqc_data.txt')][0] 30 | fastqc_log = archive.read(target_file) 31 | fastqc_log_formatted = str(fastqc_log).replace('\\t','\t').split('\\n') 32 | labels,results = get_test_results(fastqc_log_formatted) 33 | num_results = [0 if i == 'pass' else i for i in results] 34 | num_results = [1 if i == 'warn' else i for i in num_results] 35 | num_results = [2 if i == 'fail' else i for i in num_results] 36 | sample_test_results_dict[sample_name] = num_results 37 | 38 | label_abbreviations = [] 39 | for i in labels: 40 | split_string = i.split(' ') 41 | abbreviation = [] 42 | for j in split_string: 43 | letter = j[0] 44 | abbreviation.append(letter) 45 | abbreviation = ''.join(abbreviation) 46 | label_abbreviations.append(abbreviation) 47 | 48 | 49 | # plot the sample overview 50 | ordered_dict = collections.OrderedDict(sorted(sample_test_results_dict.items())) 51 | samples = list(ordered_dict.keys()) 52 | values = np.array(list(ordered_dict.values())) 53 | 54 | fig = plt.figure(figsize=(8,len(samples))) 55 | plt.imshow(values, interpolation='nearest', cmap=colors.ListedColormap(['green','yellow','red'])) 56 | plt.yticks(range(values.shape[0]), samples) 57 | plt.xticks(range(values.shape[1]), label_abbreviations) 58 | plt.xlabel('FastQC test (abbreviated names)') 59 | plt.ylabel('Sample name') 60 | plt.title('FastQC results by sample') 61 | fig.savefig('/Users/tobias/Desktop/test1.pdf', dpi = 500,transparent=True)#bbox_inches='tight', 62 | 63 | # plot the test overview 64 | all_pass_counts = [list(col).count(0) for col in values.T] 65 | all_warn_counts = [list(col).count(1) for col in values.T] 66 | all_fail_counts = [list(col).count(2) for col in values.T] 67 | 68 | barWidth=0.3 69 | r2 = np.arange(len(all_pass_counts)) 70 | r1 = [x - barWidth for x in r2] 71 | r3 = [x + barWidth for x in r2] 72 | 73 | fig = plt.figure(figsize=(8,len(samples))) 74 | plt.bar(r1,
all_pass_counts, color='green', width=barWidth, edgecolor='black', label='pass') 75 | plt.bar(r2, all_warn_counts, color='yellow', width=barWidth, edgecolor='black', label='warn') 76 | plt.bar(r3, all_fail_counts, color='red', width=barWidth, edgecolor='black', label='fail') 77 | plt.xticks(range(values.shape[1]), label_abbreviations) 78 | for border in np.array(r3)+0.66*barWidth: 79 | plt.axvline(border,color='black',linestyle='--',alpha=0.5) 80 | plt.yticks(range(len(samples)+1), range(len(samples)+1)) 81 | plt.xlim(0-barWidth-0.75*barWidth,) 82 | plt.xlabel('FastQC test (abbreviated names)') 83 | plt.ylabel('number of samples') 84 | plt.title('FastQC results by test type') 85 | plt.legend() 86 | fig.savefig('/Users/tobias/Desktop/test.pdf', dpi = 500,transparent=True)#bbox_inches='tight', 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /src/plotting_function_final.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Apr 25 16:40:29 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import os 10 | import re 11 | import glob 12 | import numpy as np 13 | import pandas as pd 14 | import matplotlib as mpl 15 | import matplotlib.pyplot as plt 16 | plt.switch_backend('agg') 17 | 18 | 19 | 20 | contig_input_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/target_contigs/match_table.txt' 21 | outdir = '/Users/tobias/Desktop/' 22 | alignment_folder = '/Users/tobias/GitHub/seqcap_processor/data/processed/alignments/contig_alignments' 23 | read_cov_file = '/Users/tobias/GitHub/seqcap_processor/data/processed/remapped_reads/average_cov_per_locus.txt' 24 | 25 | 26 | #def plot_contig_yield(contig_input_file,outdir): 27 | workdir = '/'.join(contig_input_file.split('/')[:-1]) 28 | contig_matrix = pd.read_csv(contig_input_file,sep='\t',index_col=0) 29 | x_labels = np.array(contig_matrix.index) 30 | num_x_labels = range(len(x_labels)) 31 | #______________________________Contig Data_____________________________________ 32 | # Read the contig data 33 | data_1_contig_present = np.matrix(contig_matrix).T 34 | data_1_y_labels = contig_matrix.columns 35 | # make sure the sample names are plain strings 36 | data_1_y_labels = np.array(data_1_y_labels, dtype=str) 37 | # print a text file with the loci indices and the corresponding loci names 38 | new_locus_list = x_labels 39 | locus_index_overview = pd.DataFrame({'loci':new_locus_list}) 40 | locus_index_overview.to_csv(os.path.join(workdir,'locus_index_overview.txt'),sep='\t',header=False) 41 | 42 | #_______________________________Contig Alignment Data__________________________ 43 | # Get the alignment files and make list of loci with alignments 44 | alignment_files = glob.glob(os.path.join(alignment_folder, '*.fasta')) 45 | list_of_loci_with_alignments = [re.sub('.fasta','',al.split('/')[-1]) for al in alignment_files] 46 | # Create 1-dimensional matrix and fill with info which loci have alignment data 47 | presence_absence_df = pd.DataFrame({'loci':x_labels,'presence':0}) 48 | for locus in list_of_loci_with_alignments: 49 | row_index = presence_absence_df[presence_absence_df.loci == int(locus)].index 50 | presence_absence_df.loc[row_index,'presence'] = 1 51 | data_2_contig_alignment = np.matrix(presence_absence_df.presence) 52 | data_2_y_labels = np.array('Contig alignment') 53 | 54 | 55 | #_______________________________3.
Reference-assembly Data__________________________ 56 | # Get the data as pandas dataframe 57 | unsorted_read_cov_data = pd.read_csv(read_cov_file, sep = '\t',index_col=0) 58 | locus_selection=False 59 | if 'sum_per_locus' in unsorted_read_cov_data.columns: 60 | unsorted_read_cov_data = unsorted_read_cov_data.iloc[:,:-1] 61 | locus_selection=True 62 | # sort columns in df 63 | temp_read_cov_data = unsorted_read_cov_data[sorted(unsorted_read_cov_data.columns)].sort_index() 64 | # add row of 0's for all missing loci 65 | loci_in_df = list(temp_read_cov_data.index) 66 | for locus in list(x_labels): 67 | if locus not in loci_in_df: 68 | temp_read_cov_data.loc[locus] = [0.0]*len(temp_read_cov_data.columns) 69 | # sort by index again 70 | read_cov_data = temp_read_cov_data.sort_index() 71 | # turn df into matrix 72 | data_3_read_cov = np.matrix(read_cov_data).T 73 | # lets use the same labels as for the contig data 74 | data_3_y_labels = np.core.defchararray.replace(data_1_y_labels, 'contigs', 'coverage') 75 | #___________________________Combine all Data___________________________________ 76 | 77 | combined_data = np.vstack([data_1_contig_present, data_2_contig_alignment,data_3_read_cov]) 78 | tmp_combined_y_labels = np.append(data_1_y_labels,data_2_y_labels) 79 | combined_y_labels = np.append(tmp_combined_y_labels,data_3_y_labels) 80 | 81 | 82 | norm_value = 10 83 | #___________________________Plotting settings___________________________________ 84 | height,width = combined_data.shape 85 | norm=None 86 | if norm_value: 87 | norm = mpl.colors.Normalize(vmin=0, vmax=norm_value) 88 | switch = 'off' 89 | 90 | fig, axes2d = plt.subplots(nrows=height, ncols=1,sharex=True, sharey=True,figsize=(width/40,height/2)) 91 | for i, ax in enumerate(axes2d): 92 | ax = plt.subplot(height, 1, i+1) 93 | ax.tick_params(left=False,bottom=False,labelleft=True) 94 | # Only plot x-axis for last row 95 | if not i == height-1: 96 | ax.xaxis.set_major_formatter(plt.NullFormatter()) 97 | #plt.axis("off") 98 | if combined_y_labels[i] == 'Contig alignment': 99 | ax.imshow(combined_data[i], aspect='auto', cmap='Greens', origin='lower') 100 | switch = 'on' 101 | else: 102 | if switch == 'off': 103 | ax.imshow(combined_data[i], aspect='auto', cmap='GnBu', origin='lower') 104 | else : 105 | ax.imshow(combined_data[i], aspect='auto', cmap='hot_r',norm=norm, origin='lower')#,clim=(0.0, 10)) 106 | plt.yticks([0],[combined_y_labels[i]]) 107 | ax.set_xlabel('Exon index') 108 | fig.add_subplot(211, frameon=False) 109 | # hide tick and tick label of the big axis 110 | plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False) 111 | plt.ylabel("Contig present (yes/no)",labelpad=50) 112 | fig.add_subplot(212, frameon=False) 113 | # hide tick and tick label of the big axis 114 | plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False) 115 | plt.ylabel("Read coverage (# of reads)",labelpad=50) 116 | 117 | fig.savefig(os.path.join(outdir,'contig_yield_overview.png'),bbox_inches='tight', dpi = 500) 118 | 119 | #plot_contig_yield(contig_input_file,outdir) 120 | -------------------------------------------------------------------------------- /src/remove_short_contigs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Feb 26 15:21:03 2019 5 | 6 | @author: Tobias Andermann (tobias.andermann@bioenv.gu.se) 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import 
matplotlib.pyplot as plt 12 | import glob 13 | import os 14 | contig_folder = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs/' 15 | contig_files = glob.glob(os.path.join(contig_folder,'*.fa')) 16 | for contig_file in contig_files: 17 | #contig_file = '/Users/tobias/Desktop/1063.fa' 18 | #contig_file_new = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs/1063_removed_short_contigs.fa' 19 | min_length = 200 20 | fasta = open(contig_file,'r') 21 | fasta_content = list(fasta) 22 | fasta.close() 23 | indices_to_keep = [] 24 | for i,line in enumerate(fasta_content): 25 | if not line.startswith('>'): # assumes each sequence sits on a single line 26 | contig_length = len(line.replace('\n','')) 27 | if contig_length < min_length: 28 | pass 29 | else: 30 | # line number of header 31 | indices_to_keep.append(i-1) 32 | # line number of sequence 33 | indices_to_keep.append(i) 34 | new_fasta_content = list(np.array(fasta_content)[indices_to_keep]) 35 | new_fasta = open(contig_file,'w') # overwrites the input file in place 36 | for line in new_fasta_content: 37 | new_fasta.write(line) 38 | new_fasta.close() 39 | 40 | 41 | -------------------------------------------------------------------------------- /src/simmatrix_geonoma_allele_data.R: -------------------------------------------------------------------------------- 1 | setwd("/Users/tobias/GitHub/seqcap_processor/data/processed/stacey_analysis/run_on_spacemule/") 2 | workdir<-getwd() 3 | workdir 4 | dir() 5 | x<-read.table("species_da_results_1e-3.txt", sep="", header=TRUE) 6 | x 7 | #x$Florisuga <- NULL 8 | y<-names(x) 9 | y 10 | renames <- matrix(c( 11 | "X1166_allele1", "1166_1", 12 | "X1166_allele0", "1166_0", 13 | "X1140_allele1", "1140_1", 14 | "X1140_allele0", "1140_0", 15 | "X1087_allele1", "1087_1", 16 | "X1087_allele0", "1087_0", 17 | "X1086_allele1", "1086_1", 18 | "X1086_allele0", "1086_0", 19 | "X1085_allele1", "1085_1", 20 | "X1085_allele0", "1085_0", 21 | "X1083_allele1", "1083_1", 22 | "X1083_allele0", "1083_0", 23 | "X1082_allele1", "1082_1", 24 | "X1082_allele0", "1082_0", 25 | "X1080_allele1", "1080_1", 26 | "X1080_allele0", "1080_0", 27 | "X1079_allele1", "1079_1", 28 | "X1079_allele0", "1079_0", 29 | "X1074_allele1", "1074_1", 30 | "X1074_allele0", "1074_0", 31 | "X1073_allele1", "1073_1", 32 | "X1073_allele0", "1073_0", 33 | "X1070_allele1", "1070_1", 34 | "X1070_allele0", "1070_0", 35 | "X1068_allele1", "1068_1", 36 | "X1068_allele0", "1068_0", 37 | "X1065_allele1", "1065_1", 38 | "X1065_allele0", "1065_0", 39 | "X1064_allele1", "1064_1", 40 | "X1064_allele0", "1064_0", 41 | "X1063_allele1", "1063_1", 42 | "X1063_allele0", "1063_0", 43 | "X1061_allele1", "1061_1", 44 | "X1061_allele0", "1061_0"), 45 | nrow=34, ncol=2, byrow=TRUE) 46 | renames 47 | # define the columns that should be kept (remove the 'count', 'fraction', 'similarity' and 'nclusters' columns) 48 | mincl.names<-colnames(x)[-(1:4)] 49 | for (i in 1:length(mincl.names)) { 50 | stopifnot(mincl.names[i] == renames[i,1]) 51 | } 52 | mincl.names[1] 53 | renames[1,1] 54 | #make similarity matrix 55 | displaynames <- renames[,2] 56 | nmincls <- length(displaynames) 57 | sim <- matrix(0, ncol=nmincls, nrow=nmincls, dimnames=list(displaynames, displaynames)) 58 | for (i in 1:nmincls) { 59 | for (j in 1:nmincls) { 60 | coli <- x[,mincl.names[i]] 61 | colj <- x[,mincl.names[j]] 62 | w <- coli == colj 63 | sim[i,j] <- sum(x[w,"fraction"]) 64 | } 65 | } 66 | sim <- pmin(sim,1) 67 | neworder <- c(34,33,26,25,22,21,18,17,20,19,32,31,30,29,8,7,28,27,24,23,10,9,12,11,16,15,4,3,2,1,14,13,6,5) 68 | dividers<-c(0,34) 69 | plot.rectangle <-
function(v1,v2,...) 70 | { 71 | polygon(c(v1[1],v2[1],v2[1],v1[1]), c(v1[2],v1[2],v2[2],v2[2]), ...) 72 | } 73 | plot.simmatrix <- function() { 74 | par(mar= c(0,5,5,0)+.1) 75 | plot(NULL, xlim=c(0,nmincls), ylim=c(nmincls,0), axes=FALSE, ylab="", xlab="") 76 | axis(3, at=(1:nmincls)-.5, displaynames[neworder], tick=FALSE, las=2, line=-1) 77 | axis(2, at=(1:nmincls)-.5, displaynames[neworder], tick=FALSE, las=2, line=-1) 78 | for (i in 1:nmincls) { 79 | for (j in 1:nmincls) { 80 | d <- 1 - sim[neworder[i],neworder[j]] 81 | plot.rectangle(c(i-1,j-1), c(i,j), col=rgb(d,d,d), border="white") 82 | } 83 | } 84 | for (b in dividers) { 85 | lines(x=c(-.5,nmincls), y=c(b,b)) 86 | lines(x=c(b,b), y=c(-.5,nmincls)) 87 | } 88 | #legend(nmincls-4,0,legend = c("0%","25%","50%","75%","100%"),col=c(rgb(1,1,1),rgb(.75,.75,.75),rgb(.5,.5,.5),rgb(.25,.25,.25),rgb(0,0,0)),bg="white", lwd=15, cex=2, box.lty = 1) 89 | } 90 | print(sim[neworder,neworder], digits=2) 91 | #plot.simmatrix() 92 | pdf(file=paste('/Users/tobias/GitHub/seqcap_processor/data/processed/stacey_analysis/run_on_spacemule/', "simmatrix_geonoma_allele_1e3.pdf", sep="")) 93 | plot.simmatrix() 94 | dev.off() 95 | 96 | --------------------------------------------------------------------------------