├── .gitignore ├── Authors.rst ├── LICENSE.txt ├── MANIFEST.in ├── NEWS ├── README.md ├── _config.yml ├── maegatk ├── .DS_Store ├── __init__.py ├── bin │ ├── .DS_Store │ ├── R │ │ └── toRDS.R │ ├── anno │ │ ├── .DS_Store │ │ └── fasta │ │ │ ├── hg19.fasta │ │ │ ├── hg19.fasta.amb │ │ │ ├── hg19.fasta.ann │ │ │ ├── hg19.fasta.bwt │ │ │ ├── hg19.fasta.pac │ │ │ ├── hg19.fasta.sa │ │ │ ├── rCRS.fasta │ │ │ ├── rCRS.fasta.amb │ │ │ ├── rCRS.fasta.ann │ │ │ ├── rCRS.fasta.bwt │ │ │ ├── rCRS.fasta.pac │ │ │ └── rCRS.fasta.sa │ ├── fgbio.jar │ ├── picard.jar │ ├── python │ │ ├── filterClipBam.py │ │ ├── find_barcodes.py │ │ ├── oneSample_maegatk.py │ │ ├── split_barcoded_bam.py │ │ └── sumstatsBP.py │ └── snake │ │ ├── Snakefile.maegatk.Gather │ │ └── Snakefile.maegatk.Scatter ├── cli.py ├── cliindel.py └── maegatkHelp.py ├── setup.cfg ├── setup.py └── tests ├── README.md ├── data ├── BT_K_variants.rda ├── test_maester.bam └── test_maester.bam.bai ├── make_mixing_plot.R └── test_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /Authors.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | * Caleb Lareau (@caleblareau) 8 | 9 | Major Contributors 10 | ------------------ 11 | * Kseniia Safina (@noranekonobokkusu) 12 | 13 | Code Contributors 14 | --------------- 15 | * Vincent Liu 16 | 17 | Intellectual contributors 18 | --------------- 19 | * Peter van Galen 20 | * Tyler Miller 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 caleblareau 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Prune tests, data, and other stuff 2 | prune maegatk.egg-info/ 3 | prune tests/ 4 | prune *.DS_Store 5 | 6 | 7 | # Scripts 8 | recursive-include maegatk * 9 | 10 | # Misc 11 | include Authors.rst 12 | include LICENSE.txt 13 | include NEWS 14 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | 2 | # 0.2.1 3 | - Fix error from yaml deprecation 4 | 5 | # 0.2.0 6 | - Documentation refactoring thanks to @noranekonobokkusu 7 | 8 | # 0.1.0 9 | - Alpha version uploaded to PyPi 10 | 11 | # 0.0.1 12 | - Added NEWS 13 | 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # maegatk | Mitochondrial Alteration Enrichment and Genome Analysis Toolkit 2 | 3 | [![PyPI version](https://badge.fury.io/py/maegatk.svg)](https://pypi.python.org/pypi/mgaeatk) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 5 | [![Downloads](https://pepy.tech/badge/maegatk/month)](https://pepy.tech/project/maegatk) 6 | 7 | [Source code is made freely available](http://github.com/caleblareau/maegatk) 8 | and a packaged install version is provided through [PyPi](https://pypi.python.org/pypi/maegatk/). 9 |
10 | 11 | ## About 12 | This repository houses the **maegatk** package, a python-based command line interface for processing `.bam` files with mitochondrial reads and generating high-quality heteroplasmy estimation from sequencing data. This package places a special emphasis on [MAESTER](https://www.biorxiv.org/content/10.1101/2021.03.08.434450v1) data but is applicable to any UMI-based scRNA-seq dataset. **The key feature present in this package is the consensus base inference by collapsing reads with identical insert positions and UMIs**. This allows `maegatk` to produce robust, error corrected genotype calls in single cells. 13 |
14 | 15 | ## Install maegatk 16 | 17 | **Recommended:** 18 | First, create a `python` virtual environment in some working directory to keep things tidy: 19 | 20 | ``` 21 | python3 -m venv venv3 22 | source venv3/bin/activate 23 | ``` 24 | 25 | Next, install `maegatk` from [PyPi](https://pypi.org/project/maegatk/): 26 | 27 | ``` 28 | pip3 install maegatk 29 | ``` 30 | 31 | This should be all that you need. To verify: 32 | 33 | ``` 34 | maegatk --version 35 | ``` 36 | 37 | Available options: 38 |
 39 |   maegatk --help
 40 | Usage: maegatk [OPTIONS] [bcall|support]
 41 | 
 42 |   maegatk: a Maester genome toolkit.
 43 | 
 44 |   MODE = ['bcall', 'support']
 45 | 
 46 | Options:
 47 |   --version                       Show the version and exit.
 48 |   -i, --input TEXT                Input; a singular, indexed bam file.
 49 |                                   [required]
 50 | 
 51 |   -o, --output TEXT               Output directory for genotypes.
 52 |   -n, --name TEXT                 Prefix for project name
 53 |   -g, --mito-genome TEXT          mitochondrial genome configuration. Requires
 54 |                                   bwa indexed fasta file or `rCRS` (built-in)
 55 |                                   [required]
 56 | 
 57 |   -c, --ncores TEXT               Number of cores to run the main job in
 58 |                                   parallel.
 59 | 
 60 |   --cluster TEXT                  Message to send to Snakemake to execute jobs
 61 |                                   on cluster interface; see documentation.
 62 | 
 63 |   --jobs TEXT                     Max number of jobs to be running
 64 |                                   concurrently on the cluster interface.
 65 | 
 66 |   -bt, --barcode-tag TEXT         Read tag (generally two letters) to separate
 67 |                                   single cells; valid and required only in
 68 |                                   `bcall` mode.
 69 | 
 70 |   -b, --barcodes TEXT             File path to barcodes that will be
 71 |                                   extracted; useful only in `bcall` mode.
 72 | 
 73 |   -mb, --min-barcode-reads INTEGER
 74 |                                   Minimum number of mitochondrial reads for a
 75 |                                   barcode to be genotyped; useful only in
 76 |                                   `bcall` mode; will not overwrite the
 77 |                                   `--barcodes` logic.
 78 | 
 79 |   --NHmax INTEGER                 Maximum number of read alignments allowed as
 80 |                                   governed by the NH flag. Default = 2.
 81 | 
 82 |   --NMmax INTEGER                 Maximum number of paired mismatches allowed
 83 |                                   represented by the NM/nM tags. Default = 15.
 84 | 
 85 |   -mr, --min-reads INTEGER        Minimum number of supporting reads to call a
 86 |                                   consensus UMI/rread. Default = 1.
 87 | 
 88 |   -ub, --umi-barcode TEXT         Read tag (generally two letters) to specify
 89 |                                   the UMI tag when removing duplicates for
 90 |                                   genotyping.
 91 | 
 92 |   -jm, --max-javamem TEXT         Maximum memory for java for running
 93 |                                   duplicate removal. Default = 4000m.
 94 | 
 95 |   -q, --base-qual INTEGER         Minimum base quality for inclusion in the
 96 |                                   genotype count. Default = 0.
 97 | 
 98 |   -aq, --alignment-quality INTEGER
 99 |                                   Minimum alignment quality to include read in
100 |                                   genotype. Default = 0.
101 | 
102 |   -ns, --nsamples INTEGER         The number of samples / cells to be
103 |                                   processed per iteration; default is all.
104 | 
105 |   -k, --keep-samples TEXT         Comma separated list of sample names to
106 |                                   keep; ALL (special string) by default.
107 |                                   Sample refers to basename of .bam file
108 | 
109 |   -x, --ignore-samples TEXT       Comma separated list of sample names to
110 |                                   ignore; NONE (special string) by default.
111 |                                   Sample refers to basename of .bam file
112 | 
113 |   -z, --keep-temp-files           Keep all intermediate files.
114 |   -sr, --skip-R                   Generate plain-text only output. Otherwise,
115 |                                   this generates a .rds obejct that can be
116 |                                   immediately read into R for downstream
117 |                                   analysis.
118 | 
119 |   -sb, --skip-barcodesplit        Skip the time consuming barcode-splitting
120 |                                   step if it finished successfully before
121 |   
122 |   -so, --snake-stdout             Write snakemake log to sdout rather than a
123 |                                   file.
124 | 
125 |   --help                          Show this message and exit.
126 | 
127 | 128 | ### Dependencies 129 | `java`, `snakemake`, `bwa` (tested with v0.7.17-r1188), `samtools` (tested with v1.15.1), `freebayes` (for indel calling), `R` should be available in the environment. 130 | `dplyr`, `data.table`, `Matrix`, `GenomicRanges`, and `SummarizedExperiment` packages should be installed in R. **Note**: if you specify the flag `--skip-R`, you can avoid the internal R execution but will have plain text enumerations of the mitochondrial genetic data. 131 | 132 | ### fgbio 133 | We use [fgbio](https://github.com/fulcrumgenomics/fgbio) for PCR duplicate removal. Thus, `java` is by default a required dependency. While not recommended, you can avoid this dependency by throwing the `--keep-duplicates` flag, which will circumvent the `java` call (but retain likely PCR duplicates, which we've found decreases the interpretability of variants by introducing additional false positives). If you retain duplicates, then maegatk isn't doing anything for you, and you should consider running [mgatk](https://github.com/caleblareau/mgatk). 134 | 135 | > [!IMPORTANT] 136 | > We recommend specifying a custom `tmp` directory for **fgbio**, as the default directory can easily get overflown on your system. This can be done by modifying the **fgbio** command in [maegatk/bin/python/oneSample_maegatk.py](https://github.com/caleblareau/maegatk/blob/master/maegatk/bin/python/oneSample_maegatk.py) (located in `~/.local/lib/python3.9/site-packages/maegatk/bin/python/`) by adding the `-Djava.io.tmpdir` option: 137 |
 fgbio = java + " -Djava.io.tmpdir=/some/directory/"  + " -Xmx" + max_javamem + " -jar " + script_dir + "/bin/fgbio.jar" 
138 | 139 | ## Test run 140 |
maegatk bcall -i tests/data/test_maester.bam -o tests/test_maester -z
141 | 142 | ## Output files 143 | The ultimate result of **maegatk** is an `.rds` file in `final/` which represents a SingleCellExperiment object with multiple assays, containing information on the support of every possible single-nucleotide variant at every possible genome position. The same information is contained in the five 'txt.gz' files in `final`, which are the final output files if `--skip-R` is used. 144 | 145 | The entire pipeline is coordinated in [maegatk/cli.py](https://github.com/caleblareau/maegatk/blob/master/maegatk/cli.py). The input BAM file is first split into smaller .bam files corresponding to individual cell barcodes in `temp/barcoded_bams/`. The first snakemake file, [Snakefile.maegatk.Scatter](https://github.com/caleblareau/maegatk/blob/master/maegatk/bin/snake/Snakefile.maegatk.Scatter), is then executed for each cell independently; it runs `oneSample_maegatk.py` and creates a series of files per cell barcode in `temp/temp_bam/`, `temp/ready_bam/` and `temp/sparse_matrices`. Once all the cell barcodes have been processed, the second snakemake file, [Snakefile.maegatk.Gather](https://github.com/caleblareau/maegatk/blob/master/maegatk/bin/snake/Snakefile.maegatk.Gather), combines `temp/sparse_matrices` results into five `.txt.gz` files in `final`. Finally, [toRDS.R](https://github.com/caleblareau/maegatk/blob/master/maegatk/bin/R/toRDS.R) creates a SingleCellExperiment object out of `.txt.gz` files. 146 | 147 | An error at any stage of the pipeline will result in a generic R error. It is recommended to keep intermediate files with option `-z` and explore snakemake logs in `logs/` and intermediate files in `temp_bam,ready_bam,sparse_matrices` to troubleshoot the case of error. 148 | 149 | ## BAM file preparation 150 | Input `.bam` files should be modified to contain extra tags corresponding to cell barcode and UMI (see [test_maester.bam](https://github.com/caleblareau/maegatk/blob/master/tests/data/test_maester.bam)). If non-standard, these tags should be specified to `maegatk` through `-bt` and `-ub` options. 151 | 152 | ## Should I use maegatk or mgatk? 153 | We previously developed the [mgatk package](https://github.com/caleblareau/mgatk) for genotyping single-cell datasets. The key feature distinctly present in `maegatk` is the consensus collapsing of sequencing reads using [fgbio's CallMolecularConsensusRead](http://fulcrumgenomics.github.io/fgbio/tools/latest/CallMolecularConsensusReads.html). Thus, if you have multiple PCR duplicates per unique molecule (defined by position x UMI x cell), maegatk provides a unique processing workflow to determine the molecular consensus of bases across these duplicate sequencing reads. In contrast, `mgatk` utilizing [picard's MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-), which selects the singular read with the best mean base quality, which may be suboptimal particularly with deep-sequencing data. Otherwise, the tools produce virtually identical results. One note: `mgatk` has a optimized workflow for large (>10,000 cells) datasets in the `tenx` mode. If you have exceptionally large datasets, mgatk may be the better tool for computational feasibility out-of-the-box. Otherwise, consider just pre-splitting your `.bam` file into smaller pieces. 154 | 155 | ## Indel calling 156 | ### Step 1 - normal maegatk execution 157 | After successfully running `maegatk` with the `-qc` flag, all quality-controlled per-cell bam files will be retained in the output folder. 158 | 159 | ### Step 2 - indel calling 160 | `maegatk-indel` can be called on the folder containing all per-cell bam files to call indels for each cell. Under the hood, `maegatk-indel` calls [freebayes](https://github.com/freebayes/freebayes) on each bam file to generate a vcf file. It then collects indel information from all per-cell vcf files and merges them into the final `indel_summary.csv` file in the user-specified output directory. User can run `maegatk-indel` with `-k` flag to keep the intermediate vcf files, which by default will be removed after execution. The `-m` option specifies minimal number of reads in each cell required to support an indel, which by default is 5 and passed to freebayes. 161 | 162 | ### Step 3 - output interpretation 163 | In the output `indel_summary.csv` file, each row corresponds to a cell/indel combination and contains information specific to that combination. The quality score comes from freebayes. 164 | 165 | ## Interpreting variants 166 | maegatk provides matrices of mtDNA variant counts from which the user can select informative variants to reconstruct cellular relationships. The functional impact of mtDNA variants that are selected for further analysis can be assessed using predictions and annotations: amino acid changes (if any), which features these changes affect, predicted consequences (with SIFT and PolyPhen scores), associated diseases, and the frequency of these variants in the general population. This information can simply be determined from subsetting `rev_table.txt` for [the selected variants](https://github.com/EDePasquale/Mitochondrial_variants). Thoughtful interpretation of variants that are used to establish clonal relationships requires assessment of variant frequencies and their potential impact on fitness of the host cell. 167 | 168 | ## Contact 169 | Raise an issue on the repository with any issues getting this toolkit working. 170 |

171 | 172 | 173 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | baseurl: /maegatk 2 | -------------------------------------------------------------------------------- /maegatk/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/.DS_Store -------------------------------------------------------------------------------- /maegatk/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /maegatk/bin/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/.DS_Store -------------------------------------------------------------------------------- /maegatk/bin/R/toRDS.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | suppressMessages(suppressWarnings(library(tools))) 3 | suppressMessages(suppressWarnings(library(Matrix))) 4 | suppressMessages(suppressWarnings(library(SummarizedExperiment))) 5 | suppressMessages(suppressWarnings(library(GenomicRanges))) 6 | suppressMessages(suppressWarnings(library(data.table))) 7 | 8 | options(warn=-1) 9 | 10 | if(FALSE){ 11 | 12 | } 13 | 14 | # Explicit import of mgatk output files 15 | importMito.explicit <- function(Afile, Cfile, Gfile, Tfile, 16 | coverageFile, depthFile, referenceAlleleFile, 17 | mitoChr = "chrM"){ 18 | 19 | variantFiles <- list(Afile, Cfile, Gfile, Tfile) 20 | metaFiles <- list(coverageFile, depthFile, referenceAlleleFile) 21 | 22 | nullout <- lapply(c(variantFiles, metaFiles), function(file){ 23 | stopifnot(length(file) == 1) 24 | }) 25 | 26 | # Set up downstream processing including robust ordering 27 | # The coverage file could have slightly more variants / 28 | # individual samples depending on the calls, so base it 29 | # of of them 30 | importDT <- function(file){ 31 | if(tools::file_ext(file) == "gz"){ 32 | cov <- suppressMessages(data.table::fread(paste0("zcat < ", file), stringsAsFactors = TRUE)) 33 | } else if(tools::file_ext(file) %in% c("txt", "csv", "tsv")){ 34 | cov <- suppressMessages(data.table::fread(paste0(file), stringsAsFactors = TRUE)) 35 | } else{ 36 | stop("Provide a valid file format for the file (.gz, .txt, .csv, or .tsv)") 37 | } 38 | } 39 | 40 | cov <- importDT(coverageFile) 41 | 42 | # Make a long matrix of BAQ and Counts for non-reference alleles 43 | ref <- importDT(referenceAlleleFile) 44 | maxpos <- max(ref[[1]]) 45 | 46 | samplesOrder <- levels(cov[[2]]) 47 | maxsamples <- length(samplesOrder) 48 | 49 | # make coverage a sparse matrix 50 | covmat <- Matrix::sparseMatrix( 51 | i = c(cov[[1]], maxpos), 52 | j = c(as.numeric(cov[[2]]), maxsamples), 53 | x = c(cov[[3]], 0) 54 | ) 55 | remove(cov) 56 | 57 | # Import Counts and qualities 58 | importSMs <- function(file){ 59 | # fread the individual variant calls in 60 | if(tools::file_ext(file) == "gz"){ 61 | dt <- suppressMessages(data.table::fread(paste0("zcat < ", file), stringsAsFactors = TRUE)) 62 | } else if(tools::file_ext(file) %in% c("txt", "csv", "tsv")){ 63 | dt <- suppressMessages(data.table::fread(paste0(file), stringsAsFactors = TRUE)) 64 | } else{ 65 | stop("Provide a valid file format for the variant call file (.gz, .txt, .csv, or .tsv)") 66 | } 67 | 68 | dt$sample <- factor(dt$sample, levels = samplesOrder) 69 | 70 | counts_fw <- Matrix::sparseMatrix( 71 | i = c(dt[[1]],maxpos), 72 | j = c(as.numeric(dt[[2]]), maxsamples), 73 | x = c(dt[[3]],0) 74 | ) 75 | 76 | qual_fw <- Matrix::sparseMatrix( 77 | i = c(dt[[1]],maxpos), 78 | j = c(as.numeric(dt[[2]]), maxsamples), 79 | x = c(dt[[4]],0) 80 | ) 81 | 82 | counts_rev <- Matrix::sparseMatrix( 83 | i = c(dt[[1]],maxpos), 84 | j = c(as.numeric(dt[[2]]), maxsamples), 85 | x = c(dt[[5]],0) 86 | ) 87 | 88 | qual_rev <- Matrix::sparseMatrix( 89 | i = c(dt[[1]],maxpos), 90 | j = c(as.numeric(dt[[2]]), maxsamples), 91 | x = c(dt[[6]],0) 92 | ) 93 | remove(dt) 94 | return(list("counts_fw" = counts_fw, "qual_fw" = qual_fw, 95 | "counts_rev" = counts_rev, "qual_rev" = qual_rev)) 96 | } 97 | 98 | ACGT <- lapply(variantFiles, importSMs) 99 | names(ACGT) <- c("A", "C", "G", "T") 100 | 101 | # Create colData 102 | depth <- data.frame(importDT(depthFile)) 103 | sdf <- merge(data.frame(sample = samplesOrder), depth, by.x = "sample", by.y = "V1") 104 | rownames(sdf) <- samplesOrder 105 | colnames(sdf) <- c("sample", "depth") 106 | 107 | # Make row Ranges for each object 108 | row_g_cov <- GenomicRanges::GRanges(seqnames = mitoChr, 109 | IRanges::IRanges(1:maxpos, width = 1)) 110 | GenomicRanges::mcols(row_g_cov) <- data.frame(refAllele = toupper(ref[[2]][1:maxpos])) 111 | 112 | # Make summarized experiments and 113 | SE <- SummarizedExperiment::SummarizedExperiment( 114 | assays = list( 115 | "A_counts_fw" = ACGT[["A"]][["counts_fw"]], "A_counts_rev" = ACGT[["A"]][["counts_rev"]], "A_qual_fw" = ACGT[["A"]][["qual_fw"]], "A_qual_rev" = ACGT[["A"]][["qual_rev"]], 116 | "C_counts_fw" = ACGT[["C"]][["counts_fw"]], "C_counts_rev" = ACGT[["C"]][["counts_rev"]], "C_qual_fw" = ACGT[["C"]][["qual_fw"]], "C_qual_rev" = ACGT[["C"]][["qual_rev"]], 117 | "G_counts_fw" = ACGT[["G"]][["counts_fw"]], "G_counts_rev" = ACGT[["G"]][["counts_rev"]], "G_qual_fw" = ACGT[["G"]][["qual_fw"]], "G_qual_rev" = ACGT[["G"]][["qual_rev"]], 118 | "T_counts_fw" = ACGT[["T"]][["counts_fw"]], "T_counts_rev" = ACGT[["T"]][["counts_rev"]], "T_qual_fw" = ACGT[["T"]][["qual_fw"]], "T_qual_rev" = ACGT[["T"]][["qual_rev"]], 119 | "coverage" = covmat 120 | ), 121 | colData = S4Vectors::DataFrame(sdf), 122 | rowData = row_g_cov 123 | ) 124 | 125 | return(SE) 126 | } 127 | 128 | #--------------------------------------- 129 | # Function to parse the folder hierarchy 130 | #--------------------------------------- 131 | 132 | importMito <- function(folder, ...){ 133 | 134 | files <- list.files(folder, full.names = TRUE) 135 | 136 | checkGrep <- function(hit){ 137 | if(length(hit) != 1){ 138 | stop("Improper folder specification; file missing / extra file present. See documentation") 139 | } else { 140 | return(hit) 141 | } 142 | } 143 | 144 | # Set up file paths 145 | Afile <- files[checkGrep(grep(".A.txt", files))] 146 | Cfile <- files[checkGrep(grep(".C.txt", files))] 147 | Gfile <- files[checkGrep(grep(".G.txt", files))] 148 | Tfile <- files[checkGrep(grep(".T.txt", files))] 149 | coverageFile <- files[checkGrep(grep(".coverage.txt", files))] 150 | depthFile <- files[checkGrep(grep(".depthTable.txt", files))] 151 | referenceAlleleFile <- files[checkGrep(grep("refAllele.txt", files))] 152 | 153 | # Parse out the mitochondrial genome name from the file name 154 | sv <- strsplit(gsub("_refAllele.txt", "", basename(referenceAlleleFile)), split = "[.]")[[1]] 155 | mitoChr <- sv[length(sv)] 156 | 157 | SE <- importMito.explicit(Afile, Cfile, Gfile, Tfile, 158 | coverageFile, depthFile, referenceAlleleFile, mitoChr, ...) 159 | return(SE) 160 | } 161 | 162 | 163 | #----------------- 164 | # Command line i/o 165 | #----------------- 166 | args <- commandArgs(trailingOnly = TRUE) 167 | folder <- args[1] 168 | name <- args[2] 169 | 170 | SE <- importMito(folder) 171 | saveRDS(SE, file = paste0(folder, "/", name, ".rds")) 172 | -------------------------------------------------------------------------------- /maegatk/bin/anno/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/.DS_Store -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/hg19.fasta: -------------------------------------------------------------------------------- 1 | >chrM 2 | GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTT 3 | CGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTC 4 | GCAGTATCTGTCTTTGATTCCTGCCTCATTCTATTATTTATCGCACCTACGTTCAATATT 5 | ACAGGCGAACATACCTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATA 6 | ACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA 7 | AACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAA 8 | AAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTAGGCGGTATGC 9 | ACTTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTA 10 | ATCTCATCAATACAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCA 11 | TACCCCGAACCAACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCA 12 | AAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATAGGTTTGG 13 | TCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGA 14 | GTTCACCCTCTAAATCACCACGATCAAAAGGGACAAGCATCAAGCACGCAGCAATGCAGC 15 | TCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAAT 16 | AAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACC 17 | GCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCC 18 | CCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAG 19 | ACTACGAAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTA 20 | GATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAG 21 | AACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGA 22 | GGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATA 23 | TACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAA 24 | AGACGTTAGGTCAAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCC 25 | AGAAAACTACGATAGCCCTTATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTG 26 | AGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCC 27 | TCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGAGGAGACAA 28 | GTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTAGCTTAACA 29 | CAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGAGCTAAACC 30 | TAGCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAAT 31 | AAAGTATAGGCGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGA 32 | TGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCTGCATAATG 33 | AATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAACCAGACGAG 34 | CTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTA 35 | TAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTT 36 | AGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAG 37 | TCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATT 38 | TAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACC 39 | CACTACCTAAAAAATCCCAAACATATAACTGAACTCCTCACACCCAATTGGACCAATCTA 40 | TCACCCTATAGAAGAACTAATGTTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAA 41 | GCCTGCGTCAGATCAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCA 42 | ACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGAAAGGTTAA 43 | AAAAAGTAAAAGGAACTCGGCAAACCTTACCCCGCCTGTTTACCAAAAACATCACCTCTA 44 | GCATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACC 45 | CTAACCGTGCAAAGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCT 46 | CCACGAGGGTTCAGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGG 47 | CGGGCATGACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAATGCAAACAG 48 | TACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGGGC 49 | GACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCG 50 | AACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAA 51 | CAGCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGAT 52 | CAGGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCT 53 | ACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCTATCTACTTCAAATTCCTC 54 | CCTGTACGAAAGGACAAGAGAAATAAGGCCTACTTCACAAAGCGCCTTCCCCCGTAAATG 55 | ATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGAACAGGGTTTGTTAAGATGG 56 | CAGAGCCCGGTAATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAATTCCTCTTCT 57 | TAACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGC 58 | ATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATACAACTACGCAAAGGCCCCAA 59 | CGTTGTAGGCCCCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAA 60 | AGAGCCCCTAAAACCCGCCACATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGC 61 | TCTCACCATCGCTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCT 62 | CAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTCAATCCTCTG 63 | ATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGCAGTAGCCCA 64 | AACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGG 65 | CTCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATC 66 | ATGACCCTTGGCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTT 67 | CGACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGG 68 | CCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTATTATAATAAACACCCTCAC 69 | CACTACAATCTTCCTAGGAACAACATATGACGCACTCTCCCCTGAACTCTACACAACATA 70 | TTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGAACAGCATACCC 71 | CCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCT 72 | AGCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAAC 73 | CTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGCTTAAACC 74 | CCCTTATTTCTAGGACTATGAGAATCGAACCCATCCCTGAGAATCCAAAATTCTCCGTGC 75 | CACCTATCACACCCCATCCTAAAGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCC 76 | GAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCTGGCCCAACCCGTCATCTAC 77 | TCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTTTTTACCTGA 78 | GTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCT 79 | CGTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCATAATCCTT 80 | CTAATAGCTATCCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACTACC 81 | AATCAATACTCATCATTAATAATCATAATGGCTATAGCAATAAAACTAGGAATAGCCCCC 82 | TTTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGGCCTGCTTCTT 83 | CTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCAAATCTCTCCCTCACTAAAC 84 | GTAAGCCTTCTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGAGGTGGATTA 85 | AACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATA 86 | ATAGCAGTTCTACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATC 87 | CTAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACCCTACTACTA 88 | TCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAATTCCATCCACCCTCCTCTCC 89 | CTAGGAGGCCTGCCCCCGCTAACCGGCTTTTTGCCCAAATGGGCCATTATCGAAGAATTC 90 | ACAAAAAACAATAGCCTCATCATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTC 91 | TACTTCTACCTACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAAC 92 | GTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCCACACTCATC 93 | GCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATAATCTTATAGAAATTT 94 | AGGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTG 95 | CAACAGCTAAGGACTGCAAAACCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTA 96 | ATTAAGCTAAGCCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGC 97 | TAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAAAAAGGCGGGA 98 | GAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAAATCACCT 99 | CGGAGCTGGTAAAAAGAGGCCTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTC 100 | AGCCATTTTACCTCACCCCCACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCAC 101 | AAAGACATTGGAACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCT 102 | CTAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCAC 103 | ATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAATCTTCTTCATAGTAATACCC 104 | ATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAATAATCGGTGCCCCCGATATG 105 | GCGTTTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTG 106 | CTCGCATCTGCTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTA 107 | GCAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCCTTACACCTA 108 | GCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCACAACAATTATCAATATAAAA 109 | CCCCCTGCCATAACCCAATACCAAACGCCCCTCTTCGTCTGATCCGTCCTAATCACAGCA 110 | GTCCTACTTCTCCTATCTCTCCCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGAC 111 | CGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAA 112 | CACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCAGGCTTCGGA 113 | ATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGT 114 | ATGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATA 115 | TTTACAGTAGGAATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATC 116 | GCTATCCCCACCGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATG 117 | AAATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCACCGTAGGTGGC 118 | CTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGTACTACACGACACGTACTAC 119 | GTTGTAGCTCACTTCCACTATGTCCTATCAATAGGAGCTGTATTTGCCATCATAGGAGGC 120 | TTCATTCACTGATTTCCCCTATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATC 121 | CATTTCACTATCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGC 122 | CTATCCGGAATGCCCCGACGTTACTCGGACTACCCCGATGCATACACCACATGAAACATC 123 | CTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAATATTAATAATTTTCATGATT 124 | TGAGAAGCCTTCGCTTCGAAGCGAAAAGTCCTAATAGTAGAAGAACCCTCCATAAACCTG 125 | GAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGAAGAACCCGTATACATAAAA 126 | TCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAACCCCATGGC 127 | CTCCATGACTTTTTCAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAA 128 | TTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACG 129 | CTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTT 130 | TCCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTA 131 | ATACTAACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCA 132 | TCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACG 133 | ATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACGAGTACACCG 134 | ACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCG 135 | ACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTA 136 | TAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA 137 | CAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTAT 138 | ACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAG 139 | AATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCT 140 | ACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAA 141 | GAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCA 142 | TAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAA 143 | ACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAG 144 | AACCAAAATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACC 145 | CGCCGCAGTACTGATCATTCTATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCAT 146 | CAACAACCGACTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGAT 147 | AGCCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTTAATCATTTT 148 | TATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACT 149 | ATCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCGCAGTGATTATAGGCTTTCG 150 | CTCTAAGATTAAAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTAT 151 | CCCCATACTAGTTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGT 152 | ACGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGGAAGCGCCAC 153 | CCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCATCTTCACAATTCTAATTCT 154 | ACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCACACTTCTAGT 155 | AAGCCTCTACCTGCACGACAACACATAATGACCCACCAATCACATGCCTATCATATAGTA 156 | AAACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTA 157 | GCCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACA 158 | CTAACCATATACCAATGGTGGCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCAC 159 | ACACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAAGTT 160 | TTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAA 161 | CTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTC 162 | CTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTA 163 | ATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTC 164 | TATTTTACCCTCCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGC 165 | ATCTACGGCTCAACATTTTTTGTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGC 166 | TCAACTTTCCTCACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACAT 167 | CACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGTTTGACTATTT 168 | CTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAGTATAAATAGTACCGTTAAC 169 | TTCCAATTAACTAGTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGCCTTAATTTTA 170 | ATAATCAACACCCTCCTAGCCTTACTACTAATAATTATTACATTTTGACTACCACAACTC 171 | AACGGCTACATAGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCC 172 | CGCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATTATTTGATCTA 173 | GAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAACAACTAACCTGCCACTAATA 174 | GTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTATGAGTGACTA 175 | CAAAAAGGATTAGACTGAGCCGAATTGGTATATAGTTTAAACAAAACGAATGATTTCGAC 176 | TCATTAAATTATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTA 177 | GCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTA 178 | CTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAAC 179 | ACCCACTCCCTCTTAGCCAATATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAA 180 | GCAGCGGTGGGCCTAGCCCTACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTA 181 | CATAACCTAAACCTACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCAC 182 | TGACATGACTTTCCAAAAAGCACATAATTTGAATCAACACAACCACCCACAGCCTAATTA 183 | TTAGCATCATCCCCCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCC 184 | CAACCTTTTCCTCCGACCCCCTAACAACCCCCCTCCTAATACTAACTACCTGACTCCTAC 185 | CCCTCACAATCATGGCAAGCCAACGCCACTTATCCAGCGAACCACTATCACGAAAAAAAC 186 | TCTACCTCTCTATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAG 187 | AACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTGGCTATCATCA 188 | CCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACATACTTCCTATTCTACACCC 189 | TAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACAACACCCTAGGCTCAC 190 | TAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACT 191 | TAATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACT 192 | TATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAG 193 | TACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGA 194 | CAAAACACATAGCCTACCCCTTCCTTGTACTATCCCTATGAGGCATAATTATAACAAGCT 195 | CCATCTGCCTACGACAAACAGACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACA 196 | TAGCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCA 197 | TTCTCATAATCGCCCACGGACTCACATCCTCATTACTATTCTGCCTAGCAAACTCAAACT 198 | ACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCAC 199 | TAATAGCTTTTTGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTA 200 | ACCTACTGGGAGAACTCTCTGTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCC 201 | TACTTACAGGACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAA 202 | CACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCACACGAGAAA 203 | ACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACCCCGACATCA 204 | TTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACA 205 | ACAGAGGCTTACGACCCCTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCC 206 | CCATGTCTAACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTA 207 | GGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACACTACTATAAC 208 | CACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAA 209 | AAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCT 210 | CTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTG 211 | AGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAAT 212 | ATTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATAT 213 | AAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTACTCATTTTCCTAATTACCAT 214 | ACTAATCTTAGTTACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGG 215 | AATTATATCCTTCTTGCTCATCAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGC 216 | CATTCAAGCAGTCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATG 217 | ATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAACGCTAATCC 218 | AAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGG 219 | TCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCA 220 | CTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGA 221 | AAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGC 222 | AGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAG 223 | TCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCA 224 | CATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCA 225 | CAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCT 226 | CACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGG 227 | TTTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGC 228 | CCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCT 229 | TCTCACCCTAACAGGTCAACCTCGCTTCCCCACCCTTACTAACATTAACGAAAATAACCC 230 | CACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCAT 231 | TACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACT 232 | CACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAAC 233 | CAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATT 234 | CTACCCTAGCATCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCT 235 | GCCCCTACTCCTCCTAGACCTAACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACA 236 | GCACCAAATCTCCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTT 237 | CCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAACCTATTCCCCC 238 | GAGCAATCTCAATTACAATATATACACCAACAAACAATGTTCAACCAGTAACCACTACTA 239 | ATCAACGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATCAACCCTG 240 | ACCCCTCTCCTTCATAAATTATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCA 241 | CCCCATCATACTCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAA 242 | CACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCG 243 | CTGTAGTATATCCAAAGACAACCATCATTCCCCCTAAATAAATTAAAAAAACTATTAAAC 244 | CCATATAACCTCCCCCAAAATTCAGAATAATAACACACCCGACCACACCGCTAACAATCA 245 | GTACTAAACCCCCATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTA 246 | AACCCACACTCAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGA 247 | CCAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGACCCCAATAC 248 | GCAAAATTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCAT 249 | CCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAA 250 | TCACCACAGGACTATTCCTAGCCATACACTACTCACCAGACGCCTCAACCGCCTTTTCAT 251 | CAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCA 252 | ATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCCTATATTACG 253 | GATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAG 254 | CAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGAGGGGCCACAG 255 | TAATTACAAACTTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCT 256 | GAGGAGGCTACTCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCT 257 | TACCCTTCATTATTGCAGCCCTAGCAGCACTCCACCTCCTATTCTTGCACGAAACGGGAT 258 | CAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATCACCTTCCACCCTTACTACA 259 | CAATCAAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACATTAACACTAT 260 | TCTCACCAGACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCC 261 | CTCCCCACATCAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCC 262 | CTAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCC 263 | CCATCCTCCATATATCCAAACAACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTT 264 | ATTGACTCCTAGCCGCAGACCTCCTCATTCTAACCTGAATCGGAGGACAACCAGTAAGCT 265 | ACCCTTTTACCATCATTGGACAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCC 266 | TAATACCAACTATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTA 267 | TAAACTAATACACCAGTCTTGTAAACCGGAGACGAAAACCTTTTTCCAAGGACAAATCAG 268 | AGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTC 269 | TCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAAC 270 | AACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAA 271 | TACTTGACCACCTGTAGTACATAAAAACCCAACCCACATCAAACCCCCCCCCCCCATGCT 272 | TACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCA 273 | CCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAA 274 | GTCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCC 275 | CCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTG 276 | CTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGAC 277 | ATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAG 278 | ACATCACGATG -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/hg19.fasta.amb: -------------------------------------------------------------------------------- 1 | 16571 1 0 2 | -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/hg19.fasta.ann: -------------------------------------------------------------------------------- 1 | 16571 1 11 2 | 0 chrM (null) 3 | 0 16571 0 4 | -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/hg19.fasta.bwt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/fasta/hg19.fasta.bwt -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/hg19.fasta.pac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/fasta/hg19.fasta.pac -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/hg19.fasta.sa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/fasta/hg19.fasta.sa -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/rCRS.fasta: -------------------------------------------------------------------------------- 1 | >chrM 2 | GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT 3 | TTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTG 4 | GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATC 5 | CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTA 6 | AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT 7 | GTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCA 8 | AACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA 9 | AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTT 10 | TATCTTTTGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACATT 11 | ATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCCGC 12 | CCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCA 13 | ACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAA 14 | GCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAAT 15 | AGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA 16 | GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGA 17 | ACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCAC 18 | ACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTT 19 | AACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGC 20 | GGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTA 21 | GATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACT 22 | CCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACAC 23 | ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCC 24 | TAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGC 25 | CACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGG 26 | AGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTC 27 | AGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAA 28 | GCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTG 29 | GCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT 30 | GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGC 31 | TTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTC 32 | AAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGA 33 | GGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAA 34 | CCAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCA 35 | ACTTAACTTGACCGCTCTGAGCTAAACCTAGCCCCAAACCCACTCCACCT 36 | TACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG 37 | ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATG 38 | AAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCT 39 | GCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGA 40 | CCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGT 41 | CTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTAC 42 | CGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTA 43 | AATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC 44 | CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGT 45 | AAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAA 46 | GCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGA 47 | ACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATG 48 | TTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGA 49 | TTAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCAAC 50 | AAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA 51 | AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTA 52 | CCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCA 53 | GTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAaaggtagcata 54 | atcacttgttccttaaatagggacctgtatgaatggctccacgagggttc 55 | agctgtctcttacttttaaccagtgaaattgacctgcccgtgaagaggcg 56 | ggcataacacagcaagacgagaagaccctatggagctttaatttaTTAAT 57 | GCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT 58 | AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGT 59 | ACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACTCAATTGATCC 60 | AATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCT 61 | ATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCA 62 | GGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATT 63 | AAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCT 64 | ATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT 65 | ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATT 66 | ATACCCACACCCACCCAAGAACAGGGTTTgttaagatggcagagcccggt 67 | aatcgcataaaacttaaaactttacagtcagaggttcaattcctcttctt 68 | aacaacaTACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAAT 69 | CGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATAC 70 | AACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCC 71 | TTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC 72 | ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCG 73 | CTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCTC 74 | AACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTC 75 | AATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCG 76 | CACTGCGAGCAGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATC 77 | ATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCT 78 | TATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG 79 | CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTC 80 | GACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATA 81 | CGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTA 82 | TTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGAC 83 | GCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACT 84 | TCTAACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGCT 85 | ACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA 86 | GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCC 87 | CCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTA 88 | AATAATAGGAGCTTAAACCCCCTTATTTctaggactatgagaatcgaacc 89 | catccctgagaatccaaaattctccgtgccacctatcacaccccatccta 90 | AAGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGG 91 | TTATACCCTTCCCGTACTAATTAATCCCCTGGCCCAACCCGTCATCTACT 92 | CTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT 93 | TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCT 94 | AACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTCC 95 | TCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTATCCTCTTCAAC 96 | AATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTC 97 | ATCATTAATAATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCT 98 | TTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGGC 99 | CTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA 100 | AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTAT 101 | CCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAAA 102 | ATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTCT 103 | ACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCC 104 | TAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACC 105 | CTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAAT 106 | TCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT 107 | TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATC 108 | ATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCT 109 | ACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAACG 110 | TAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCC 111 | ACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACT 112 | AATAATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGC 113 | CCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA 114 | CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAG 115 | CCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGCT 116 | AAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAAA 117 | AAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAA 118 | TTCAATATGAAAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGT 119 | CTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCTCACCCCCA 120 | CTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG 121 | AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTC 122 | TAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGGT 123 | AACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAAT 124 | CTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAG 125 | TTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAAC 126 | ATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTGC 127 | TATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG 128 | CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCC 129 | TTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCAC 130 | AACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCCC 131 | TCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTC 132 | CCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAA 133 | CACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAAC 134 | ACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA 135 | GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGA 136 | ACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATTGGCTTCC 137 | TAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGTA 138 | GACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCAC 139 | CGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGA 140 | AATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCACC 141 | GTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT 142 | ACTACACGACACGTACTACGTTGTAGCCCACTTCCACTATGTCCTATCAA 143 | TAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATTTCCCCTA 144 | TTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCACTAT 145 | CATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCC 146 | TATCCGGAATGCCCCGACGTTACTCGGACTACCCCGATGCATACACCACA 147 | TGAAACATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAAT 148 | ATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC 149 | TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCC 150 | CCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTAGACAaaa 151 | aaggaaggaatcgaaccccccaaagctggtttcaagccaaccccatggcc 152 | tccatgactttttcAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTC 153 | AAAGTTAAATTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCG 154 | CAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCAC 155 | CTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC 156 | TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATC 157 | TCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCAT 158 | CCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACG 159 | AGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTAC 160 | TGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACAT 161 | ACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTG 162 | ACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA 163 | TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAAC 164 | AGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGAC 165 | CGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGT 166 | TTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGG 167 | GCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGT 168 | AAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAACA 169 | CCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT 170 | AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATAT 171 | TAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAA 172 | AATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCA 173 | TTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCT 174 | ATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGAC 175 | TAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATA 176 | ACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT 177 | AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCAT 178 | TTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTTA 179 | TGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCCT 180 | AGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAG 181 | TTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTA 182 | CGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGG 183 | AAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA 184 | TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTA 185 | ATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAA 186 | CACATAATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCC 187 | ATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAG 188 | CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTA 189 | ACCAACACACTAACCATATACCAATGATGGCGCGATGTAACACGAGAAAG 190 | CACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG 191 | GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTC 192 | TGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAATTAGGAGGGCA 193 | CTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTCC 194 | TAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCAC 195 | CATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTAT 196 | TACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTACT 197 | TCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT 198 | GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCT 199 | CACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACATC 200 | ACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGTT 201 | TGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAG 202 | TATAAATAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAA 203 | AAAGAGTAATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGCC 204 | TTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT 205 | AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCC 206 | GCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATTA 207 | TTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAAC 208 | AACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCC 209 | TAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACC 210 | GAATTGGTATATAGTTTAAACAAAACGAATGATTTCGACTCATTAAATTA 211 | TGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG 212 | CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATA 213 | TCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGC 214 | TACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCTA 215 | TTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTA 216 | CTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAA 217 | CCTACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCACT 218 | GACATGACTTTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA 219 | GCCTAATTATTAGCATCATCCCTCTACTATTTTTTAACCAAATCAACAAC 220 | AACCTATTTAGCTGTTCCCCAACCTTTTCCTCCGACCCCCTAACAACCCC 221 | CCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGCC 222 | AACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCT 223 | ATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGA 224 | ACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTGG 225 | CTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA 226 | TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACT 227 | AATTTACACTCACAACACCCTAGGCTCACTAAACATTCTACTACTCACTC 228 | TCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTTAATATGACTA 229 | GCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTT 230 | ATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTAC 231 | TTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCACA 232 | CTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT 233 | ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAG 234 | ACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCGTA 235 | GTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCAT 236 | TCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAA 237 | ACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGA 238 | CTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCT 239 | CGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG 240 | TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGA 241 | CTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAAC 242 | ACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCA 243 | CACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTA 244 | TCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTT 245 | AACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTACGACCCCTTA 246 | TTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC 247 | AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAG 248 | GCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACAC 249 | TACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCA 250 | CCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCC 251 | ATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCAT 252 | GTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCC 253 | AAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA 254 | TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACT 255 | GTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTAC 256 | TCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATTC 257 | CAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCAT 258 | CAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAA 259 | TCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGA 260 | TTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA 261 | CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAG 262 | GCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAA 263 | GGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGC 264 | AGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCAC 265 | TAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCA 266 | GCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTC 267 | CACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAAC 268 | CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATA 269 | CTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGA 270 | TATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCT 271 | CCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGT 272 | TTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAA 273 | CGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCT 274 | ATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCC 275 | ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAA 276 | ACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACA 277 | TTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTC 278 | ACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAA 279 | CTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATT 280 | TCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCC 281 | TATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT 282 | AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCT 283 | CCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTC 284 | CTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAACC 285 | TATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTT 286 | CAACCAGTAACTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGC 287 | ACCAATAGGATCCTCCCGAATCAACCCTGACCCCTCTCCTTCATAAATTA 288 | TTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC 289 | TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAAC 290 | ACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCAA 291 | TAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCCCCCTAAATAA 292 | ATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAAT 293 | AACACACCCGACCACACCGCTAACAATCAATACTAAACCCCCATAAATAG 294 | GAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTAAACCCACACTC 295 | AACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC 296 | CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGA 297 | CCCCAATACGCAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTC 298 | ATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTC 299 | ACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAG 300 | CCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCAC 301 | ATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAA 302 | TGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC 303 | TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATC 304 | CTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTG 305 | AGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAACTTACTATCCG 306 | CCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTAC 307 | TCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTT 308 | GCCCTTCATTATTGCAGCCCTAGCAACACTCCACCTCCTATTCTTGCACG 309 | AAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC 310 | ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTT 311 | CCTTCTCTCCTTAATGACATTAACACTATTCTCACCAGACCTCCTAGGCG 312 | ACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACATC 313 | AAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCC 314 | TAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAG 315 | CAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCGC 316 | CCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT 317 | AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGAC 318 | AAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAACT 319 | ATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTAT 320 | AAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGG 321 | ACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAG 322 | ATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGT 323 | ACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA 324 | TTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCAC 325 | CTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTA 326 | CAAGcaagtacagcaatcaaccctcaactatcacacatcaactgcaactC 327 | CAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAA 328 | CAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAA 329 | ATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGAC 330 | CACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG 331 | CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACAT 332 | CTGGTTCCTACTTCAGGGTCATAAAGCCTAAATAGCCCACACGTTCCCCT 333 | TAAATAAGACATCACGATG 334 | -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/rCRS.fasta.amb: -------------------------------------------------------------------------------- 1 | 16569 1 1 2 | 3106 1 N 3 | -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/rCRS.fasta.ann: -------------------------------------------------------------------------------- 1 | 16569 1 11 2 | 0 chrM (null) 3 | 0 16569 1 4 | -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/rCRS.fasta.bwt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/fasta/rCRS.fasta.bwt -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/rCRS.fasta.pac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/fasta/rCRS.fasta.pac -------------------------------------------------------------------------------- /maegatk/bin/anno/fasta/rCRS.fasta.sa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/anno/fasta/rCRS.fasta.sa -------------------------------------------------------------------------------- /maegatk/bin/fgbio.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/fgbio.jar -------------------------------------------------------------------------------- /maegatk/bin/picard.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/maegatk/bin/picard.jar -------------------------------------------------------------------------------- /maegatk/bin/python/filterClipBam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################### 4 | # Filter reads / clip .bam files 5 | # Don't print() anything!!!! writing to STDOUT 6 | ############################################## 7 | 8 | import sys 9 | import re 10 | import pysam 11 | 12 | bamfile = sys.argv[1] 13 | logfile = sys.argv[2] 14 | mtchr = sys.argv[3] 15 | proper_pair = sys.argv[4] 16 | NHmax = sys.argv[5] 17 | NMmax = sys.argv[6] 18 | 19 | # https://github.com/pysam-developers/pysam/issues/509 20 | bam = pysam.AlignmentFile(bamfile, "rb") 21 | out = pysam.AlignmentFile("-", "wb", template = bam) 22 | 23 | keepCount = 0 24 | filtCount = 0 25 | #proper_pair = "False" 26 | 27 | def filterReadTags(intags): 28 | ''' 29 | Checks for aligner-specific read tags and filters 30 | ''' 31 | 32 | for tg in intags: 33 | if(('NH' == tg[0] and int(tg[1]) > int(NHmax)) or \ 34 | (('NM' == tg[0] or 'nM' == tg[0]) and int(tg[1]) > int(NMmax))): 35 | return(False) 36 | return(True) 37 | 38 | def pairing(read): 39 | ''' 40 | Check if read is paired, properly paired, etc. 41 | ''' 42 | 43 | if(proper_pair != "True"): # then user doesn't care to filter it 44 | return(True) 45 | else: 46 | return(read.is_proper_pair()) 47 | 48 | def processRead(read): 49 | global keepCount 50 | global filtCount 51 | if(filterReadTags(read.tags) and read.reference_name == mtchr and pairing(read)): 52 | keepCount += 1 53 | out.write(read) 54 | else: 55 | filtCount += 1 56 | 57 | for read in bam: 58 | processRead(read) 59 | 60 | with open(logfile , 'w') as outfile: 61 | outfile.write("Kept "+ str(keepCount) + "\n" + "Removed " + str(filtCount)+ "\n") 62 | 63 | -------------------------------------------------------------------------------- /maegatk/bin/python/find_barcodes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import re 5 | import pysam 6 | import os 7 | from collections import Counter 8 | from contextlib import contextmanager 9 | 10 | bamfile = sys.argv[1] 11 | outfolder = sys.argv[2] 12 | barcodeTag = sys.argv[3] 13 | min_barcodes = int(sys.argv[4]) 14 | mtchr = sys.argv[5] 15 | quant_file = sys.argv[6] 16 | passing_file = sys.argv[7] 17 | 18 | base=os.path.basename(bamfile) 19 | basename=os.path.splitext(base)[0] 20 | 21 | def getBarcode(intags): 22 | ''' 23 | Parse out the barcode per-read 24 | ''' 25 | for tg in intags: 26 | if(barcodeTag == tg[0]): 27 | return(tg[1]) 28 | return("NA") 29 | 30 | def quantifyBarcodes(mtchr): 31 | ''' 32 | Make a giant dictionary of observed barcodes at the mitochondrial chr 33 | ''' 34 | barcodes_all = dict() 35 | bam = pysam.AlignmentFile(bamfile,'rb') 36 | Itr = bam.fetch(str(mtchr),multiple_iterators=False) 37 | 38 | for read in Itr: 39 | read_barcode = getBarcode(read.tags) 40 | barcodes_all[read_barcode] = barcodes_all.get(read_barcode, 0) + 1 41 | bam.close() 42 | return(barcodes_all) 43 | 44 | 45 | # Quant barcodes and write it out 46 | barcodes = quantifyBarcodes(mtchr) 47 | barcodes = {x : barcodes[x] for x in barcodes if barcodes[x] >= min_barcodes and x != "NA"} 48 | bc = list(barcodes.keys()) 49 | 50 | quant_file_o = open(quant_file, "w") 51 | for k, v in barcodes.items(): 52 | quant_file_o.write(k +","+ str(v)+"\n") 53 | quant_file_o.close() 54 | 55 | passing_file_o = open(passing_file, "w") 56 | for k, v in barcodes.items(): 57 | passing_file_o.write(k +"\n") 58 | passing_file_o.close() 59 | -------------------------------------------------------------------------------- /maegatk/bin/python/oneSample_maegatk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from os.path import join 4 | import os 5 | import subprocess 6 | import sys 7 | import shutil 8 | import pysam 9 | from ruamel import yaml 10 | 11 | configFile = sys.argv[1] 12 | inputbam = sys.argv[2] 13 | outputbam = sys.argv[3] 14 | sample = sys.argv[4] 15 | 16 | with open(configFile, 'r') as stream: 17 | config = yaml.load(stream, Loader=yaml.Loader) 18 | 19 | # Parse the configuration variables 20 | indir = config["input_directory"] 21 | outdir = config["output_directory"] 22 | script_dir = config["script_dir"] 23 | 24 | mito_genome = config["mito_chr"] 25 | mito_length = str(config["mito_length"]) 26 | fasta_file = config["fasta_file"] 27 | 28 | umi_barcode = config["umi_barcode"] 29 | 30 | base_qual = str(config["base_qual"]) 31 | alignment_quality = config["alignment_quality"] 32 | NHmax = config["NHmax"] 33 | NMmax = config["NMmax"] 34 | min_reads = str(config["min_reads"]) 35 | 36 | max_javamem = config["max_javamem"] 37 | 38 | # Software paths 39 | java = "java" 40 | python = "python" 41 | 42 | # Script locations 43 | filtclip_py = script_dir + "/bin/python/filterClipBam.py" 44 | detailedcall_py = script_dir + "/bin/python/detailedCalls.py" 45 | sumstatsBP_py = script_dir + "/bin/python/sumstatsBP.py" 46 | fgbio = java + " -Xmx"+max_javamem+" -jar " + script_dir + "/bin/fgbio.jar" 47 | 48 | # Prepare filepath locations 49 | rmlog = outputbam.replace(".qc.bam", ".rmdups.log").replace("/temp/ready_bam/", "/logs/rmdupslogs/") 50 | filtlog = outputbam.replace(".qc.bam", ".filter.log").replace("/temp/ready_bam/", "/logs/filterlogs/") 51 | temp_bam0 = outputbam.replace(".qc.bam", ".temp0.bam").replace("/temp/ready_bam/", "/temp/temp_bam/") 52 | temp_bam1 = outputbam.replace(".qc.bam", ".temp1.bam").replace("/temp/ready_bam/", "/temp/temp_bam/") 53 | temp_sam15 = outputbam.replace(".qc.bam", ".temp1.5.sam").replace("/temp/ready_bam/", "/temp/temp_bam/") 54 | temp_bam15 = outputbam.replace(".qc.bam", ".temp1.5.bam").replace("/temp/ready_bam/", "/temp/temp_bam/") 55 | temp_bam2 = outputbam.replace(".qc.bam", ".temp2.bam").replace("/temp/ready_bam/", "/temp/temp_bam/") 56 | temp_fastq = outputbam.replace(".qc.bam", ".temp0.fastq").replace("/temp/ready_bam/", "/temp/temp_bam/") 57 | 58 | 59 | prefixSM = outdir + "/temp/sparse_matrices/" + sample 60 | outputdepth = outdir + "/qc/depth/" + sample + ".depth.txt" 61 | 62 | # 1) Filter bam files 63 | proper_paired = "False" 64 | pycall = " ".join([python, filtclip_py, inputbam, filtlog, mito_genome, proper_paired, NHmax, NMmax]) + " > " + temp_bam0 65 | os.system(pycall) 66 | 67 | # 2) Sort the filtered bam file 68 | fgcallone = fgbio + " GroupReadsByUmi -s Identity -e 0 -i " + temp_bam0 + " -o " + temp_bam1 + " -t "+ umi_barcode 69 | os.system('echo "'+fgcallone+'"') 70 | os.system(fgcallone) 71 | 72 | # 2.5) Modify the UB tag 73 | samtoolscall = 'samtools view -H ' + temp_bam1 + '> ' + temp_sam15 + '; samtools view ' + temp_bam1 + '| awk \'OFS="\t" {$13=$13""$4; print $0}\' >> ' + temp_sam15 + '; samtools view -b ' + temp_sam15 + '> ' + temp_bam15 74 | os.system('echo "'+samtoolscall+'"') 75 | os.system(samtoolscall) 76 | 77 | # 3) Call consensus reads 78 | fgcalltwo = fgbio + " CallMolecularConsensusReads -t "+umi_barcode+" -i "+temp_bam15+" -o " + temp_bam2 +" -M " + min_reads 79 | os.system(fgcalltwo) 80 | print(fgcalltwo) 81 | 82 | # 4) Convert consensus bam to fastq 83 | # bedtools_call = "bedtools bamtofastq -i "+ temp_bam2 +" -fq " + temp_fastq # Bedtools stopped working for some reason, replacing it with samtools fastq 84 | samtoolscall2 = 'samtools fastq -T cM ' + temp_bam2 + " | sed 's/\tcM:i:/_/g' > " + temp_fastq 85 | os.system(samtoolscall2) 86 | 87 | # 5) Remap + sort bam files 88 | bwa_call = "bwa mem " + fasta_file + " " + temp_fastq + " | samtools sort -o "+ outputbam +" -" 89 | os.system(bwa_call) 90 | pysam.index(outputbam) 91 | 92 | # 6) Get allele counts per sample / base pair and per-base quality scores 93 | alleleCountcall = " ".join([python, sumstatsBP_py, outputbam, prefixSM, mito_genome, mito_length, base_qual, sample, fasta_file, alignment_quality]) 94 | os.system(alleleCountcall) 95 | 96 | # 7) Get depth from the coverage sparse matrix 97 | with open(prefixSM + ".coverage.txt", 'r') as coverage: 98 | depth = 0 99 | for row in coverage: 100 | s = row.split(",") 101 | depth += int(s[2].strip()) 102 | with open(outputdepth, 'w') as d: 103 | d.write(sample + "\t" + str(round(float(depth)/float(mito_length),2)) + "\n") 104 | -------------------------------------------------------------------------------- /maegatk/bin/python/split_barcoded_bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import re 5 | import pysam 6 | import os 7 | from collections import Counter 8 | from contextlib import contextmanager 9 | 10 | bamfile = sys.argv[1] 11 | outfolder = sys.argv[2] 12 | barcodeTag = sys.argv[3] 13 | bcfile = sys.argv[4] 14 | mtchr = sys.argv[5] 15 | 16 | base=os.path.basename(bamfile) 17 | basename=os.path.splitext(base)[0] 18 | 19 | def getBarcode(intags): 20 | ''' 21 | Parse out the barcode per-read 22 | ''' 23 | for tg in intags: 24 | if(barcodeTag == tg[0]): 25 | return(tg[1]) 26 | return("NA") 27 | 28 | 29 | def writePassingReads(bc, mtchr): 30 | ''' 31 | Write out reads to their corresponding files based on a barcode index 32 | ''' 33 | bam = pysam.AlignmentFile(bamfile,'rb') 34 | Itr = bam.fetch(str(mtchr),multiple_iterators=False) 35 | for read in Itr: 36 | read_barcode = getBarcode(read.tags) 37 | 38 | # If read barcode is in whitelist, then write it out 39 | if read_barcode in bc: 40 | idx = bc.index(read_barcode) 41 | file = fopen[idx] 42 | file.write(read) 43 | 44 | # Read in the barcodes 45 | with open(bcfile) as barcode_file_handle: 46 | content = barcode_file_handle.readlines() 47 | bc = [x.strip() for x in content] 48 | 49 | # Open up a bunch of files and write out reads for valid barcodes 50 | @contextmanager 51 | def multi_file_manager(files, mode='rt'): 52 | """ 53 | Open multiple files and make sure they all get closed. 54 | """ 55 | temp = pysam.AlignmentFile(bamfile, "rb") 56 | files = [pysam.AlignmentFile(file, "wb", template = temp) for file in files] 57 | temp.close() 58 | yield files 59 | for file in files: 60 | file.close() 61 | 62 | # Final loop to write out passing reads 63 | bambcfiles = [outfolder + "/" + bc1 + ".bam" for bc1 in bc] 64 | with multi_file_manager(bambcfiles) as fopen: 65 | writePassingReads(bc, mtchr) 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /maegatk/bin/python/sumstatsBP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ################################################### 4 | # Summarizes the total number of reads per position 5 | ################################################### 6 | 7 | import sys 8 | import re 9 | import os 10 | import pysam 11 | 12 | bamfile = sys.argv[1] 13 | outpre = sys.argv[2] 14 | mito_genome = sys.argv[3] 15 | maxBP = sys.argv[4] 16 | base_qual = float(sys.argv[5]) 17 | sample = sys.argv[6] 18 | fasta_file = sys.argv[7] 19 | alignment_quality = float(sys.argv[8]) 20 | 21 | # Export Functions 22 | def writeSparseMatrix(mid, vec): 23 | with open(outpre + "."+mid+".txt","w") as V: 24 | for i in range(0,int(maxBP)): 25 | if(vec[i] > 0): 26 | V.write(str(i+1)+","+sample+","+str(vec[i])+"\n") 27 | 28 | 29 | def writeSparseMatrix2(mid, vec1, vec2): 30 | with open(outpre + "."+mid+".txt","w") as V: 31 | for i in range(0,int(maxBP)): 32 | if(vec1[i] > 0 or vec2[i] > 0): 33 | V.write(str(i+1)+","+sample+","+str(vec1[i])+","+str(vec2[i])+"\n") 34 | 35 | def writeSparseMatrix4(mid, vec1, vec2, vec3, vec4): 36 | with open(outpre + "."+mid+".txt","w") as V: 37 | for i in range(0,int(maxBP)): 38 | if(vec1[i] > 0 or vec3[i] > 0): 39 | V.write(str(i+1)+","+sample+","+str(vec1[i])+","+str(vec2[i])+","+str(vec3[i])+","+str(vec4[i])+"\n") 40 | 41 | 42 | n = int(maxBP) 43 | 44 | # initialize with a pseudo count to avoid dividing by zero 45 | countsA_fw = [0.00000001] * n 46 | countsC_fw = [0.00000001] * n 47 | countsG_fw = [0.00000001] * n 48 | countsT_fw = [0.00000001] * n 49 | 50 | qualA_fw = [0.0] * n 51 | qualC_fw = [0.0] * n 52 | qualG_fw = [0.0] * n 53 | qualT_fw = [0.0] * n 54 | 55 | countsA_rev = [0.00000001] * n 56 | countsC_rev = [0.00000001] * n 57 | countsG_rev = [0.00000001] * n 58 | countsT_rev = [0.00000001] * n 59 | 60 | qualA_rev = [0.0] * n 61 | qualC_rev = [0.0] * n 62 | qualG_rev = [0.0] * n 63 | qualT_rev = [0.0] * n 64 | 65 | bam2 = pysam.AlignmentFile(bamfile, "rb") 66 | for read in bam2: 67 | seq = read.seq 68 | reverse = read.is_reverse 69 | quality = read.query_qualities 70 | align_qual_read = read.mapping_quality 71 | for qpos, refpos in read.get_aligned_pairs(True): 72 | if qpos is not None and refpos is not None and align_qual_read > alignment_quality: 73 | if(seq[qpos] == "A" and quality[qpos] > base_qual): 74 | if(reverse): 75 | qualA_fw[refpos] += quality[qpos] 76 | countsA_fw[refpos] += 1 77 | else: 78 | qualA_rev[refpos] += quality[qpos] 79 | countsA_rev[refpos] += 1 80 | elif(seq[qpos] == "C" and quality[qpos] > base_qual): 81 | if(reverse): 82 | qualC_fw[refpos] += quality[qpos] 83 | countsC_fw[refpos] += 1 84 | else: 85 | qualC_rev[refpos] += quality[qpos] 86 | countsC_rev[refpos] += 1 87 | elif(seq[qpos] == "G" and quality[qpos] > base_qual): 88 | if(reverse): 89 | qualG_fw[refpos] += quality[qpos] 90 | countsG_fw[refpos] += 1 91 | else: 92 | qualG_rev[refpos] += quality[qpos] 93 | countsG_rev[refpos] += 1 94 | elif(seq[qpos] == "T" and quality[qpos] > base_qual): 95 | if(reverse): 96 | qualT_fw[refpos] += quality[qpos] 97 | countsT_fw[refpos] += 1 98 | else: 99 | qualT_rev[refpos] += quality[qpos] 100 | countsT_rev[refpos] += 1 101 | 102 | meanQualA_fw = [round(x/y,1) for x, y in zip(qualA_fw, countsA_fw)] 103 | meanQualC_fw = [round(x/y,1) for x, y in zip(qualC_fw, countsC_fw)] 104 | meanQualG_fw = [round(x/y,1) for x, y in zip(qualG_fw, countsG_fw)] 105 | meanQualT_fw = [round(x/y,1) for x, y in zip(qualT_fw, countsT_fw)] 106 | 107 | countsA_fw = [ int(round(elem)) for elem in countsA_fw ] 108 | countsC_fw = [ int(round(elem)) for elem in countsC_fw ] 109 | countsG_fw = [ int(round(elem)) for elem in countsG_fw ] 110 | countsT_fw = [ int(round(elem)) for elem in countsT_fw ] 111 | 112 | meanQualA_rev = [round(x/y,1) for x, y in zip(qualA_rev, countsA_rev)] 113 | meanQualC_rev = [round(x/y,1) for x, y in zip(qualC_rev, countsC_rev)] 114 | meanQualG_rev = [round(x/y,1) for x, y in zip(qualG_rev, countsG_rev)] 115 | meanQualT_rev = [round(x/y,1) for x, y in zip(qualT_rev, countsT_rev)] 116 | 117 | countsA_rev = [ int(round(elem)) for elem in countsA_rev ] 118 | countsC_rev = [ int(round(elem)) for elem in countsC_rev ] 119 | countsG_rev = [ int(round(elem)) for elem in countsG_rev ] 120 | countsT_rev = [ int(round(elem)) for elem in countsT_rev ] 121 | 122 | # Allele Counts 123 | bam = pysam.AlignmentFile(bamfile, "rb") 124 | 125 | writeSparseMatrix4("A", countsA_fw, meanQualA_fw, countsA_rev, meanQualA_rev) 126 | writeSparseMatrix4("C", countsC_fw, meanQualC_fw, countsC_rev, meanQualC_rev) 127 | writeSparseMatrix4("G", countsG_fw, meanQualG_fw, countsG_rev, meanQualG_rev) 128 | writeSparseMatrix4("T", countsT_fw, meanQualT_fw, countsT_rev, meanQualT_rev) 129 | 130 | zipped_list = zip(list(countsA_fw),list(countsC_fw),list(countsG_fw),list(countsT_fw), list(countsA_rev),list(countsC_rev),list(countsG_rev),list(countsT_rev)) 131 | sums = [sum(item) for item in zipped_list] 132 | writeSparseMatrix("coverage", sums) 133 | -------------------------------------------------------------------------------- /maegatk/bin/snake/Snakefile.maegatk.Gather: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import shutil 4 | import pysam 5 | from os.path import join 6 | 7 | configfile: config["cfp"] 8 | maegatk_directory = config["maegatk_directory"] 9 | name = config["name"] 10 | 11 | # A Snakemake regular expression matching the bam file paths 12 | SAMPLES, = glob_wildcards(join(maegatk_directory, "temp/ready_bam/{sample}.qc.bam")) 13 | 14 | rule all: 15 | input: 16 | maegatk_directory + "/final/" + name + ".depthTable.txt", 17 | maegatk_directory + "/final/" + name + ".A.txt.gz", 18 | maegatk_directory + "/final/" + name + ".C.txt.gz", 19 | maegatk_directory + "/final/" + name + ".G.txt.gz", 20 | maegatk_directory + "/final/" + name + ".T.txt.gz", 21 | maegatk_directory + "/final/" + name + ".coverage.txt.gz", 22 | 23 | rule make_depth_table: 24 | input: 25 | depths = expand(maegatk_directory + "/qc/depth/{sample}.depth.txt", sample=SAMPLES) 26 | output: 27 | depthtable = maegatk_directory + "/final/" + name + ".depthTable.txt" 28 | run: 29 | with open(output.depthtable, 'w') as f: 30 | for file in input.depths: 31 | os.system("cat " + file + " >> " + output.depthtable) 32 | 33 | rule make_final_sparse_matrices: 34 | input: 35 | As = expand(maegatk_directory + "/temp/sparse_matrices/{sample}.A.txt", sample=SAMPLES), 36 | Cs = expand(maegatk_directory + "/temp/sparse_matrices/{sample}.C.txt", sample=SAMPLES), 37 | Gs = expand(maegatk_directory + "/temp/sparse_matrices/{sample}.G.txt", sample=SAMPLES), 38 | Ts = expand(maegatk_directory + "/temp/sparse_matrices/{sample}.T.txt", sample=SAMPLES), 39 | Covs = expand(maegatk_directory + "/temp/sparse_matrices/{sample}.coverage.txt", sample=SAMPLES) 40 | 41 | output: 42 | A = maegatk_directory + "/final/" + name + ".A.txt.gz", 43 | C = maegatk_directory + "/final/" + name + ".C.txt.gz", 44 | G = maegatk_directory + "/final/" + name + ".G.txt.gz", 45 | T = maegatk_directory + "/final/" + name + ".T.txt.gz", 46 | Cov = maegatk_directory + "/final/" + name + ".coverage.txt.gz" 47 | 48 | run: 49 | def makeSM(iterableThing, l): 50 | for i in iterableThing: 51 | os.system("cat " + i + " >> " + maegatk_directory + "/final/" + name + "."+l+".txt") 52 | os.system("gzip " + maegatk_directory + "/final/" + name + "."+l+".txt") 53 | 54 | makeSM(input.As, "A") 55 | makeSM(input.Cs, "C") 56 | makeSM(input.Gs, "G") 57 | makeSM(input.Ts, "T") 58 | makeSM(input.Covs, "coverage") 59 | 60 | -------------------------------------------------------------------------------- /maegatk/bin/snake/Snakefile.maegatk.Scatter: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import shutil 4 | import pysam 5 | from os.path import join 6 | 7 | configfile: config["cfp"] 8 | outdir = config["output_directory"] 9 | script_dir = config["script_dir"] 10 | 11 | # A Snakemake regular expression matching the bam file paths 12 | SAMPLES, = glob_wildcards(join(outdir, ".internal/samples/{sample}.bam.txt")) 13 | bamtxtin = '{sample}.bam.txt' 14 | 15 | oneSample_py = script_dir + "/bin/python/oneSample_maegatk.py" 16 | 17 | rule all: 18 | input: 19 | outdir + "/temp/scattered.allSamples.txt" 20 | 21 | rule process_one_sample: 22 | input: 23 | txtin = join(outdir + "/.internal/samples", bamtxtin) 24 | output: 25 | bam = outdir + "/temp/ready_bam/{sample}.qc.bam", 26 | bai = outdir + "/temp/ready_bam/{sample}.qc.bam.bai", 27 | depth = outdir + "/qc/depth/{sample}.depth.txt", 28 | A = outdir + "/temp/sparse_matrices/{sample}.A.txt", 29 | C = outdir + "/temp/sparse_matrices/{sample}.C.txt", 30 | G = outdir + "/temp/sparse_matrices/{sample}.G.txt", 31 | T = outdir + "/temp/sparse_matrices/{sample}.T.txt", 32 | cov = outdir + "/temp/sparse_matrices/{sample}.coverage.txt", 33 | run: 34 | # Get sample information 35 | sample = output.bam.replace(outdir + "/temp/ready_bam/", "").replace(".qc.bam", "") 36 | with open(input.txtin) as f: 37 | inputbam = f.read() 38 | 39 | # Process one samle 40 | pycall = " ".join(['python', oneSample_py, config["cfp"], inputbam, output.bam, sample]) 41 | print(pycall) 42 | os.system(pycall) 43 | 44 | # Collate everything 45 | rule make_sample_list: 46 | input: 47 | depths = expand(outdir + "/qc/depth/{sample}.depth.txt", sample=SAMPLES) 48 | output: 49 | allSamplesFile = outdir + "/temp/scattered.allSamples.txt" 50 | run: 51 | for file in input.depths: 52 | sample = file.replace(outdir + "/qc/depth/", "").replace(".depth.txt", "") 53 | os.system("echo " + sample + " >> " + output.allSamplesFile) 54 | -------------------------------------------------------------------------------- /maegatk/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import os.path 4 | import sys 5 | import shutil 6 | import random 7 | import string 8 | import itertools 9 | import time 10 | import pysam 11 | 12 | from pkg_resources import get_distribution 13 | from subprocess import call, check_call 14 | from .maegatkHelp import * 15 | from ruamel.yaml import YAML 16 | from ruamel.yaml.scalarstring import SingleQuotedScalarString as sqs 17 | from multiprocessing import Pool 18 | 19 | @click.command() 20 | @click.version_option() 21 | @click.argument('mode', type=click.Choice(['bcall', 'support'])) 22 | @click.option('--input', '-i', default = ".", required=True, help='Input; a singular, indexed bam file. ') 23 | @click.option('--output', '-o', default="maegatk_out", help='Output directory for genotypes.') 24 | @click.option('--name', '-n', default="maegatk", help='Prefix for project name') 25 | 26 | @click.option('--mito-genome', '-g', default = "rCRS", required=True, help='mitochondrial genome configuration. Requires bwa indexed fasta file or `rCRS` (built-in)') 27 | @click.option('--ncores', '-c', default = "detect", help='Number of cores to run the main job in parallel.') 28 | 29 | @click.option('--cluster', default = "", help='Message to send to Snakemake to execute jobs on cluster interface; see documentation.') 30 | @click.option('--jobs', default = "0", help='Max number of jobs to be running concurrently on the cluster interface.') 31 | 32 | @click.option('--barcode-tag', '-bt', default = "CB", help='Read tag (generally two letters) to separate single cells; valid and required only in `bcall` mode.') 33 | @click.option('--barcodes', '-b', default = "", help='File path to barcodes that will be extracted; useful only in `bcall` mode.') 34 | @click.option('--min-barcode-reads', '-mb', default = 100, help='Minimum number of mitochondrial reads for a barcode to be genotyped; useful only in `bcall` mode; will not overwrite the `--barcodes` logic.') 35 | 36 | @click.option('--NHmax', default = 2, help='Maximum number of read alignments allowed as governed by the NH flag. Default = 2.') 37 | @click.option('--NMmax', default = 15, help='Maximum number of paired mismatches allowed represented by the NM/nM tags. Default = 15.') 38 | @click.option('--min-reads', '-mr', default = 1, help='Minimum number of supporting reads to call a consensus UMI/rread. Default = 1.') 39 | @click.option('--keep-qc-bams', '-qc', is_flag=True, help='Add this flag to keep the quality-controlled bams after processing.') 40 | 41 | @click.option('--umi-barcode', '-ub', default = "UB", help='Read tag (generally two letters) to specify the UMI tag when removing duplicates for genotyping.') 42 | 43 | @click.option('--max-javamem', '-jm', default = "4000m", help='Maximum memory for java for running duplicate removal. Default = 4000m.') 44 | 45 | @click.option('--base-qual', '-q', default = 0, help='Minimum base quality for inclusion in the genotype count. Default = 0.') 46 | @click.option('--alignment-quality', '-aq', default = 0, help='Minimum alignment quality to include read in genotype. Default = 0.') 47 | 48 | @click.option('--nsamples', '-ns', default = 6000, help='The number of samples / cells to be processed per iteration; default is all.') 49 | 50 | @click.option('--keep-samples', '-k', default="ALL", help='Comma separated list of sample names to keep; ALL (special string) by default. Sample refers to basename of .bam file') 51 | @click.option('--ignore-samples', '-x', default="NONE", help='Comma separated list of sample names to ignore; NONE (special string) by default. Sample refers to basename of .bam file') 52 | 53 | @click.option('--keep-temp-files', '-z', is_flag=True, help='Keep all intermediate files.') 54 | 55 | @click.option('--skip-R', '-sr', is_flag=True, help='Generate plain-text only output. Otherwise, this generates a .rds obejct that can be immediately read into R for downstream analysis.') 56 | @click.option('--skip-barcodesplit', '-sb', is_flag=True, help='Skip the time consuming barcode-splitting step if it finished successfully before') 57 | @click.option('--snake-stdout', '-so', is_flag=True, help='Write snakemake log to sdout rather than a file.') 58 | 59 | def main(mode, input, output, name, mito_genome, ncores, 60 | cluster, jobs, barcode_tag, barcodes, min_barcode_reads, 61 | nhmax, nmmax, min_reads, keep_qc_bams, umi_barcode, max_javamem, 62 | base_qual, alignment_quality, 63 | nsamples, keep_samples, ignore_samples, 64 | keep_temp_files, skip_r, skip_barcodesplit, snake_stdout): 65 | 66 | """ 67 | maegatk: a Maester genome toolkit. \n 68 | MODE = ['bcall', 'support'] \n 69 | """ 70 | 71 | script_dir = os.path.dirname(os.path.realpath(__file__)) 72 | cwd = os.getcwd() 73 | __version__ = get_distribution('maegatk').version 74 | click.echo(gettime() + "maegatk v%s" % __version__) 75 | 76 | # Determine cores 77 | if(ncores == "detect"): 78 | ncores = str(available_cpu_count()) 79 | else: 80 | ncores = str(ncores) 81 | 82 | 83 | # Determine which genomes are available 84 | rawsg = os.popen('ls ' + script_dir + "/bin/anno/fasta/*.fasta").read().strip().split("\n") 85 | supported_genomes = [x.replace(script_dir + "/bin/anno/fasta/", "").replace(".fasta", "") for x in rawsg] 86 | 87 | if(mode == "support"): 88 | click.echo(gettime() + "List of built-in genomes supported in maegatk:") 89 | click.echo(gettime() + str(supported_genomes)) 90 | sys.exit(gettime() + 'Specify one of these genomes or provide your own .fasta file with the --mito-genome flag') 91 | 92 | # Remember that I started off as bcall as this will become overwritten 93 | wasbcall = False 94 | if(mode == "bcall"): 95 | if(barcode_tag == "X"): 96 | sys.exit('ERROR: in `bcall` mode, must specify a valid read tag ID (generally two letters).') 97 | 98 | # Input argument is assumed to be a .bam file 99 | filename, file_extension = os.path.splitext(input) 100 | if(file_extension != ".bam"): 101 | sys.exit('ERROR: in `bcall` mode, the input should be an individual .bam file.') 102 | if not os.path.exists(input): 103 | sys.exit('ERROR: No file found called "' + input + '"; please specify a valid .bam file.') 104 | if not os.path.exists(input + ".bai"): 105 | sys.exit('ERROR: index your input .bam file for `bcall` mode.') 106 | click.echo(gettime() + "Found bam file: " + input + " for genotyping.") 107 | 108 | # Determine whether or not we have been supplied barcodes 109 | barcode_known = False 110 | if (os.path.exists(barcodes)) and (barcodes != ""): 111 | click.echo(gettime() + "Found file of barcodes to be parsed: " + barcodes) 112 | barcode_known = True 113 | else: 114 | click.echo(gettime() + "Will determine barcodes with at least: " + str(min_barcode_reads) + " mitochondrial reads.") 115 | 116 | # Make temporary directory of inputs 117 | of = output; tf = of + "/temp"; bcbd = tf + "/barcoded_bams" # bcdb = barcoded bam directory 118 | folders = [of, tf, bcbd, of + "/final"] 119 | mkfolderout = [make_folder(x) for x in folders] 120 | 121 | # Handle fasta requirements 122 | fastaf, mito_chr, mito_length = handle_fasta_inference(mito_genome, supported_genomes, script_dir, mode, of) 123 | idxs = pysam.idxstats(input).split("\n") 124 | 125 | # Handle common mtDNA reference genome errors 126 | bam_length = 0 127 | for i in idxs: 128 | if(i.split("\t")[0] == mito_chr): 129 | bam_length = int(i.split("\t")[1]) 130 | 131 | if(mito_length == bam_length): 132 | click.echo(gettime() + "User specified mitochondrial genome matches .bam file") 133 | elif(bam_length == 16569): 134 | click.echo(gettime() + "User specified mitochondrial genome does NOT match .bam file; using rCRS instead (length == 16569)") 135 | fastaf, mito_chr, mito_length = handle_fasta_inference("rCRS", supported_genomes, script_dir, mode, of) 136 | elif(bam_length == 16571): 137 | click.echo(gettime() + "User specified mitochondrial genome does NOT match .bam file; using hg19 instead (length == 16571)") 138 | fastaf, mito_chr, mito_length = handle_fasta_inference("hg19", supported_genomes, script_dir, mode, of) 139 | else: 140 | click.echo(gettime() + "User specified mitochondrial genome does NOT match .bam file; correctly specify reference genome or .fasta file") 141 | quit() 142 | 143 | # Actually call the external script based on user input 144 | if(not barcode_known): 145 | barc_quant_file = of + "/final/barcodeQuants.tsv" 146 | passing_barcode_file = of + "/final/passingBarcodes.tsv" 147 | find_barcodes_py = script_dir + "/bin/python/find_barcodes.py" 148 | 149 | pycall = " ".join(['python', find_barcodes_py, input, bcbd, barcode_tag, str(min_barcode_reads), mito_chr, barc_quant_file, passing_barcode_file]) 150 | os.system(pycall) 151 | barcodes = passing_barcode_file 152 | 153 | # Potentially split the valid barcodes into smaller files if we need to 154 | if not skip_barcodesplit: 155 | barcode_files = split_barcodes_file(barcodes, nsamples, output) 156 | split_barcoded_bam_py = script_dir + "/bin/python/split_barcoded_bam.py" 157 | 158 | # Loop over the split sample files 159 | for i in range(len(barcode_files)): 160 | one_barcode_file = barcode_files[i] 161 | pycall = " ".join(['python', split_barcoded_bam_py, input, bcbd, barcode_tag, one_barcode_file, mito_chr]) 162 | os.system(pycall) 163 | 164 | click.echo(gettime() + "Finished determining/splitting barcodes for genotyping.") 165 | 166 | # Update everything to appear like we've just set `call` on the set of bams 167 | input = bcbd 168 | wasbcall = True 169 | 170 | 171 | if not skip_r: 172 | check_software_exists("R") 173 | check_R_packages(["dplyr"]) 174 | 175 | # ------------------------------- 176 | # Determine samples for analysis 177 | # ------------------------------- 178 | if(mode == "bcall"): 179 | 180 | bams = [] 181 | bams = os.popen('ls ' + input + '/*.bam').read().strip().split("\n") 182 | 183 | if bams[0] == '': 184 | sys.exit('ERROR: Could not import any samples from the user specification; check flags, logs and input configuration; QUITTING') 185 | 186 | samples = [] 187 | samplebams = [] 188 | 189 | if(not wasbcall): 190 | fastaf, mito_chr, mito_length = handle_fasta_inference(mito_genome, supported_genomes, script_dir, mode, output, write_files = False) 191 | 192 | # Loop over bam files 193 | for bam in bams: 194 | base=os.path.basename(bam) 195 | basename=os.path.splitext(base)[0] 196 | samples.append(basename) 197 | samplebams.append(bam) 198 | 199 | # parallel process to ensure we have .bai files for each bam 200 | pool = Pool(processes=int(ncores)) 201 | pm = pool.map(verify_bai, samplebams) 202 | pool.close() 203 | 204 | samples_fail = [] 205 | for i in range(len(samples)): 206 | sample = samples[i] 207 | bam = samplebams[i] 208 | if( not verify_sample_mitobam(bam, mito_chr, mito_length)): 209 | samples_fail.append(sample) 210 | 211 | if(keep_samples != "ALL"): 212 | keeplist = keep_samples.split(",") 213 | click.echo(gettime() + "Intersecting detected samples with user-retained ones: " + keep_samples) 214 | keepidx = findIdx(samples, keeplist) 215 | samples = [samples[i] for i in keepidx] 216 | samplebams = [samplebams[i] for i in keepidx] 217 | 218 | if(ignore_samples != "NONE"): 219 | iglist = ignore_samples.split(",") 220 | click.echo(gettime() + "Will remove samples from processing:" + ignore_samples) 221 | rmidx = findIdx(samples, iglist) 222 | for index in sorted(rmidx, reverse=True): 223 | del samples[index] 224 | del samplebams[index] 225 | 226 | if(len(samples_fail) > 0): 227 | click.echo(gettime() + "NOTE: the samples below either have 0 mtDNA reads at the specified chromosome or are mapped to an incorrectly specified reference mitochondrial genome") 228 | click.echo(gettime() + "Will remove samples from processing:" ) 229 | rmidx = findIdx(samples, samples_fail) 230 | for index in sorted(rmidx, reverse=True): 231 | print("REMOVED: ", samples[index]) 232 | del samples[index] 233 | del samplebams[index] 234 | 235 | if not len(samples) > 0: 236 | sys.exit('ERROR: Could not import any samples from the user specification. \nERROR: check flags, logs, and input configuration (including reference mitochondrial genome); \nQUITTING') 237 | 238 | nsamplesNote = "maegatk will process " + str(len(samples)) + " samples" 239 | 240 | 241 | if(mode == "bcall"): 242 | 243 | # Make all of the output folders if necessary 244 | of = output; tf = of + "/temp"; qc = of + "/qc"; logs = of + "/logs" 245 | folders = [logs, of + "/logs/filterlogs", of + "/fasta", of + "/.internal", 246 | of + "/.internal/parseltongue", of + "/.internal/samples", of + "/final", 247 | tf, tf + "/ready_bam", tf + "/temp_bam", tf + "/sparse_matrices", tf + "/quality", 248 | qc, qc + "/quality", qc + "/depth"] 249 | 250 | mkfolderout = [make_folder(x) for x in folders] 251 | 252 | #------------------- 253 | # Handle .fasta file 254 | #------------------- 255 | if(mode == "bcall"): 256 | # Logging 257 | logf = open(output + "/logs" + "/base.maegatk.log", 'a') 258 | click.echo(gettime() + "Starting analysis with maegatk", logf) 259 | click.echo(gettime() + nsamplesNote, logf) 260 | make_folder(of + "/logs/rmdupslogs") 261 | 262 | # Create internal README files 263 | if not os.path.exists(of + "/.internal/README"): 264 | with open(of + "/.internal/README" , 'w') as outfile: 265 | outfile.write("This folder creates important (small) intermediate; don't modify it.\n\n") 266 | if not os.path.exists(of + "/.internal/parseltongue/README"): 267 | with open(of + "/.internal/parseltongue/README" , 'w') as outfile: 268 | outfile.write("This folder creates intermediate output to be interpreted by Snakemake; don't modify it.\n\n") 269 | if not os.path.exists(of + "/.internal/samples/README"): 270 | with open(of + "/.internal" + "/samples" + "/README" , 'w') as outfile: 271 | outfile.write("This folder creates samples to be interpreted by Snakemake; don't modify it.\n\n") 272 | 273 | # Set up sample bam plain text file 274 | for i in range(len(samples)): 275 | with open(of + "/.internal/samples/" + samples[i] + ".bam.txt" , 'w') as outfile: 276 | outfile.write(samplebams[i]) 277 | 278 | click.echo(gettime() + "Genotyping samples with "+ncores+" threads") 279 | 280 | # add sqs to get .yaml to play friendly https://stackoverflow.com/questions/39262556/preserve-quotes-and-also-add-data-with-quotes-in-ruamel 281 | dict1 = {'input_directory' : sqs(input), 'output_directory' : sqs(output), 'script_dir' : sqs(script_dir), 282 | 'fasta_file' : sqs(fastaf), 'mito_chr' : sqs(mito_chr), 'mito_length' : sqs(mito_length), 283 | 'base_qual' : sqs(base_qual), 'umi_barcode' : sqs(umi_barcode), 284 | 'alignment_quality' : sqs(alignment_quality), 285 | 'NHmax' : sqs(nhmax), 'NMmax' : sqs(nmmax), 'min_reads' : sqs(min_reads),'max_javamem' : sqs(max_javamem)} 286 | 287 | if(mode == "bcall"): 288 | 289 | # Potentially submit jobs to cluster 290 | snakeclust = "" 291 | njobs = int(jobs) 292 | if(njobs > 0 and cluster != ""): 293 | snakeclust = " --jobs " + jobs + " --cluster '" + cluster + "' " 294 | click.echo(gettime() + "Recognized flags to process jobs on a computing cluster.", logf) 295 | 296 | click.echo(gettime() + "Processing samples with "+ncores+" threads", logf) 297 | 298 | y_s = of + "/.internal/parseltongue/snake.scatter.yaml" 299 | with open(y_s, 'w') as yaml_file: 300 | yaml=YAML() 301 | yaml.default_flow_style = False 302 | yaml.dump(dict1, yaml_file) 303 | 304 | cp_call = "cp " + y_s + " " + logs + "/" + name + ".parameters.txt" 305 | os.system(cp_call) 306 | 307 | # Execute snakemake 308 | snake_stats = logs + "/" + name + ".snakemake_scatter.stats" 309 | snake_log = logs + "/" + name + ".snakemake_scatter.log" 310 | 311 | snake_log_out = "" 312 | if not snake_stdout: 313 | snake_log_out = ' &>' + snake_log 314 | 315 | snakecmd_scatter = 'snakemake'+snakeclust+' --snakefile ' + script_dir + '/bin/snake/Snakefile.maegatk.Scatter --cores '+ncores+' --config cfp="' + y_s + '" --stats '+snake_stats + snake_log_out 316 | os.system(snakecmd_scatter) 317 | click.echo(gettime() + "maegatk successfully processed the supplied .bam files", logf) 318 | 319 | #------- 320 | # Gather 321 | #------- 322 | if(mode == "bcall"): 323 | 324 | if(mode == "bcall"): 325 | maegatk_directory = output 326 | 327 | 328 | dict2 = {'maegatk_directory' : sqs(maegatk_directory), 'name' : sqs(name), 'script_dir' : sqs(script_dir)} 329 | y_g = maegatk_directory + "/.internal/parseltongue/snake.gather.yaml" 330 | with open(y_g, 'w') as yaml_file: 331 | yaml=YAML() 332 | yaml.default_flow_style = False 333 | yaml.dump(dict2, yaml_file) 334 | 335 | 336 | # Snakemake gather 337 | snake_stats = logs + "/" + name + ".snakemake_gather.stats" 338 | snake_log = logs + "/" + name + ".snakemake_gather.log" 339 | 340 | snake_log_out = "" 341 | if not snake_stdout: 342 | snake_log_out = ' &>' + snake_log 343 | 344 | snakecmd_gather = 'snakemake --snakefile ' + script_dir + '/bin/snake/Snakefile.maegatk.Gather --cores '+ncores+' --config cfp="' + y_g + '" --stats '+snake_stats + snake_log_out 345 | os.system(snakecmd_gather) 346 | 347 | if not skip_r: 348 | # Make .rds file from the output 349 | Rcall = "Rscript " + script_dir + "/bin/R/toRDS.R " + maegatk_directory + "/final " + name 350 | os.system(Rcall) 351 | 352 | click.echo(gettime() + "Successfully created final output files", logf) 353 | 354 | #-------- 355 | # Cleanup 356 | #-------- 357 | if(mode == "bcall" ): 358 | 359 | if keep_qc_bams: 360 | click.echo(gettime() + "Final bams retained since --keep-qc-bams was specified.", logf) 361 | dest = shutil.move(of + "/temp/ready_bam", of + "/qc_bam") 362 | if keep_temp_files: 363 | click.echo(gettime() + "Temporary files not deleted since --keep-temp-files was specified.", logf) 364 | else: 365 | if(mode == "bcall"): 366 | byefolder = of 367 | 368 | shutil.rmtree(byefolder + "/fasta") 369 | shutil.rmtree(byefolder + "/.internal") 370 | shutil.rmtree(byefolder + "/temp") 371 | click.echo(gettime() + "Intermediate files successfully removed.", logf) 372 | 373 | # Suspend logging 374 | logf.close() 375 | 376 | -------------------------------------------------------------------------------- /maegatk/cliindel.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import os.path 4 | import sys 5 | import glob 6 | import shutil 7 | import random 8 | import string 9 | import itertools 10 | import time 11 | import pysam 12 | import pandas as pd 13 | 14 | from pkg_resources import get_distribution 15 | from subprocess import call, check_call 16 | from .maegatkHelp import * 17 | from ruamel import yaml 18 | from ruamel.yaml.scalarstring import SingleQuotedScalarString as sqs 19 | from joblib import Parallel, delayed 20 | 21 | @click.command() 22 | @click.version_option() 23 | @click.option('--input-dir', '-i', default = ".", required=True, help='Input; a directory containing individual .bam files') 24 | @click.option('--output-dir', '-o', default="mgatk_out", help='Output directory for analysis.') 25 | 26 | @click.option('--mito-genome', '-g', default = "rCRS", required=True, help='mitochondrial genome configuration. Requires bwa indexed fasta file or `rCRS` (built-in)') 27 | @click.option('--min-reads-per-indel', '-m', default='5', required=True, help='minimum # of reads supporting a called indel by freebayes') 28 | @click.option('--keep-intermediate', '-k', is_flag=True, help='whether to keep intermediate per-cell vcf files.') 29 | @click.option('--ncores', '-c', default = "detect", help='Number of cores to run the main job in parallel.') 30 | 31 | def main(input_dir, output_dir, mito_genome, min_reads_per_indel, ncores, keep_intermediate): 32 | 33 | """ 34 | maegatk: a Maester genome toolkit. \n 35 | INDEL calling \n 36 | """ 37 | 38 | script_dir = os.path.dirname(os.path.realpath(__file__)) 39 | cwd = os.getcwd() 40 | __version__ = get_distribution('maegatk').version 41 | click.echo(gettime() + 'maegatk v%s' % __version__) 42 | 43 | # Determine which genomes are available 44 | rawsg = os.popen('ls ' + script_dir + '/bin/anno/fasta/*.fasta').read().strip().split("\n") 45 | supported_genomes = [x.replace(script_dir + '/bin/anno/fasta/', '').replace('.fasta', '') for x in rawsg] 46 | 47 | # Determine cores 48 | if(ncores == 'detect'): 49 | ncores = str(available_cpu_count()) 50 | else: 51 | ncores = str(ncores) 52 | 53 | # ------------------------------- 54 | # Determine samples for analysis 55 | # ------------------------------- 56 | bams = glob.glob(input_dir + '/*.bam') 57 | 58 | if len(bams) == 0: 59 | sys.exit('ERROR: Could not import any samples from the user specification; check --input parameter; QUITTING') 60 | 61 | # ------------------------------- 62 | # Locate the mito genome 63 | # ------------------------------- 64 | if any(mito_genome in s for s in supported_genomes): 65 | mito_fastaf = script_dir + '/bin/anno/fasta/' + mito_genome + '.fasta' 66 | elif os.path.exists(mito_genome): 67 | mito_fastaf = mito_genome 68 | else: 69 | sys.exit('ERROR: Could not find file ' + mito_genome + '; QUITTING') 70 | 71 | # ------------------------------- 72 | # Run freebayes on all bam files 73 | # ------------------------------- 74 | if not os.path.exists(output_dir): 75 | os.mkdir(output_dir) 76 | if not os.path.exists(output_dir+'/vcf'): 77 | os.mkdir(output_dir+'/vcf') 78 | 79 | def run_freebayes(cell_bam_file): 80 | cell_name = cell_bam_file.split('/')[-1].replace('.qc.bam', '') 81 | cur_out_file = output_dir + '/vcf/{}.vcf'.format(cell_name) 82 | os.system('freebayes -C {} -f {} {} > {}'.format(min_reads_per_indel, mito_fastaf, cell_bam_file, cur_out_file)) 83 | 84 | Parallel(n_jobs=int(ncores))(delayed(run_freebayes)(cbf) for cbf in bams) 85 | 86 | # very janky, could use parallelization if speed becomes an issue 87 | #for cell_bam_file in bams: 88 | # cell_name = cell_bam_file.split('/')[-1].replace('.qc.bam', '') 89 | # cur_out_file = output_dir + '/vcf/{}.vcf'.format(cell_name) 90 | # os.system('freebayes -C {} -f {} {} > {}'.format(min_reads_per_indel, mito_fastaf, cell_bam_file, cur_out_file)) 91 | 92 | # ------------------------------- 93 | # Collapse the vcf files into one summary file 94 | # ------------------------------- 95 | vcfs = glob.glob(output_dir + '/vcf/*.vcf') 96 | vcf_summary = [] 97 | for cell_vcf_file in vcfs: 98 | cell_name = cell_vcf_file.split('/')[-1].replace('.vcf', '') 99 | cur_content = [x.split('\t') for x in open(cell_vcf_file, 'r').readlines() if x[0] != '#'] # remove documentation lines 100 | 101 | # keep only position x[1], ref x[3], alt x[4], quality score x[5], and other supporting info x[-2] 102 | cur_content = [[x[1], x[3], x[4], x[5], x[-3]] for x in cur_content if len(x[3]) != len(x[4]) and ',' not in x[4]] # choose indels only 103 | extra_info = [] 104 | for cur_entry in cur_content: 105 | extra_info.append(dict([x.split('=') for x in cur_entry[-1].split(';')])) 106 | summarized_entry = [[cell_name] + x[:-1] + [extra_info[i]['RO'], extra_info[i]['AO']] for i, x in enumerate(cur_content)] 107 | vcf_summary += summarized_entry 108 | vcf_summary = pd.DataFrame(vcf_summary) 109 | vcf_summary.columns = ['cell_barcode', 'pos', 'ref', 'alt', 'score', 'ref_reads', 'alt_reads'] 110 | vcf_summary.to_csv(output_dir + '/indel_summary.csv') 111 | 112 | if not keep_intermediate: 113 | os.system('rm -r {}/vcf'.format(output_dir)) 114 | -------------------------------------------------------------------------------- /maegatk/maegatkHelp.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import time 3 | import shutil 4 | import re 5 | import os 6 | import sys 7 | import subprocess 8 | import pysam 9 | import filecmp 10 | import math 11 | import pysam 12 | 13 | def string_hamming_distance(str1, str2): 14 | """ 15 | Fast hamming distance over 2 strings known to be of same length. 16 | In information theory, the Hamming distance between two strings of equal 17 | length is the number of positions at which the corresponding symbols 18 | are different. 19 | eg "karolin" and "kathrin" is 3. 20 | """ 21 | return sum(itertools.imap(operator.ne, str1, str2)) 22 | 23 | 24 | def rev_comp(seq): 25 | """ 26 | Fast Reverse Compliment 27 | """ 28 | tbl = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'} 29 | return ''.join(tbl[s] for s in seq[::-1]) 30 | 31 | def gettime(): 32 | """ 33 | Matches `date` in Linux 34 | """ 35 | return(time.strftime("%a ") + time.strftime("%b ") + time.strftime("%d ") + time.strftime("%X ") + 36 | time.strftime("%Z ") + time.strftime("%Y")+ ": ") 37 | 38 | def findIdx(list1, list2): 39 | """ 40 | Return the indices of list1 in list2 41 | """ 42 | return [i for i, x in enumerate(list1) if x in list2] 43 | 44 | def check_R_packages(required_packages): 45 | """ 46 | Determines whether or not R packages are properly installed 47 | """ 48 | R_path = shutil.which("R") 49 | installed_packages = os.popen(R_path + ''' -e "installed.packages()" | awk '{print $1}' | sort | uniq''').read().strip().split("\n") 50 | if(not set(required_packages) < set(installed_packages)): 51 | sys.exit("ERROR: cannot find the following R package: " + str(set(required_packages) - set(installed_packages)) + "\n" + 52 | "Install it in your R console and then try rerunning maegatk (but there may be other missing dependencies).") 53 | 54 | def check_software_exists(tool): 55 | tool_path = shutil.which(tool) 56 | if(str(tool_path) == "None"): 57 | sys.exit("ERROR: cannot find "+tool+" in environment; add it to user PATH environment") 58 | 59 | 60 | def parse_fasta(filename): 61 | """ 62 | Imports specified .fasta file 63 | """ 64 | f = open(filename) 65 | sequences = {} 66 | for line in f: 67 | if line.startswith('>'): 68 | name = line[1:].strip() 69 | sequences[name] = '' 70 | else: 71 | sequences[name] = sequences[name] + line.strip() 72 | f.close() 73 | return sequences 74 | 75 | def verify_bai(bamfile): 76 | ''' 77 | Function that indexes bam file from input if missing 78 | ''' 79 | bai_file = bamfile + ".bai" 80 | if(not os.path.exists(bai_file)): 81 | pysam.index(bamfile) 82 | 83 | def verify_sample_mitobam(bam, mito_chr, mito_length): 84 | idxs = pysam.idxstats(bam).split("\n") 85 | 86 | # Pull out essentials from idxstats 87 | for i in idxs: 88 | if(i.split("\t")[0] == mito_chr): 89 | bam_length = int(i.split("\t")[1]) 90 | nReads = int(i.split("\t")[2]) 91 | return(bam_length == mito_length and nReads > 0) 92 | 93 | 94 | def handle_fasta_inference(mito_genome, supported_genomes, script_dir, mode, of, write_files = False): 95 | """ 96 | Determines what's going on with the mitochondrial genome 97 | based on user input / existing data 98 | """ 99 | if any(mito_genome in s for s in supported_genomes): 100 | fastaf = script_dir + "/bin/anno/fasta/" + mito_genome + ".fasta" 101 | else: 102 | if os.path.exists(mito_genome): 103 | fastaf = mito_genome 104 | else: 105 | sys.exit('ERROR: Could not find file ' + mito_genome + '; QUITTING') 106 | fasta = parse_fasta(fastaf) 107 | 108 | if(len(fasta.keys()) != 1): 109 | sys.exit('ERROR: .fasta file has multiple chromosomes; supply file with only 1; QUITTING') 110 | 111 | mito_genome, mito_seq = list(fasta.items())[0] 112 | mito_length = len(mito_seq) 113 | 114 | if(write_files): 115 | make_folder(of + "/fasta/") 116 | make_folder(of + "/final/") 117 | 118 | newfastaf = of + "/fasta/" + mito_genome + ".fasta" 119 | 120 | # Need a special logic to potentially over-write the existing 121 | # mito fasta file if they wind up not being the same from guess/checking hg19 122 | writeFA = False 123 | if os.path.exists(newfastaf): 124 | if filecmp.cmp(fastaf, newfastaf,shallow=False): 125 | pass 126 | else: 127 | writeFA = True 128 | else: 129 | writeFA = True 130 | 131 | if writeFA and write_files: 132 | 133 | shutil.copyfile(fastaf, newfastaf) 134 | fastaf = newfastaf 135 | pysam.faidx(fastaf) 136 | 137 | with open(of + "/final/" + mito_genome + "_refAllele.txt", 'w') as f: 138 | b = 1 139 | for base in mito_seq: 140 | f.write(str(b) + "\t" + base + "\n") 141 | b += 1 142 | f.close() 143 | return(fastaf, mito_genome, mito_length) 144 | 145 | 146 | def make_folder(folder): 147 | """ 148 | Function to only make a given folder if it does not already exist 149 | """ 150 | if not os.path.exists(folder): 151 | os.makedirs(folder) 152 | 153 | def file_len(fname): 154 | with open(fname) as f: 155 | for i, l in enumerate(f): 156 | pass 157 | return i + 1 158 | 159 | def split_barcodes_file(barcode_file, nsamples, output): 160 | """ 161 | Function to only make a given folder if it does not already exist 162 | """ 163 | n_samples_observed = file_len(barcode_file) 164 | 165 | # See if we need to do anything... if user didn't specify or we have relatively few samples, just return the barcode file back 166 | if(n_samples_observed < nsamples or nsamples == 0): 167 | return([barcode_file]) 168 | else: 169 | # Need to split files into a maximum of n samples 170 | total_files = math.ceil(n_samples_observed / nsamples) 171 | lines_per_file = nsamples 172 | 173 | # Set up a temporary output folder to route files to 174 | full_output_folder = output + "/temp" + "/barcode_files" 175 | make_folder(full_output_folder) 176 | smallfile = None 177 | counter = 0 178 | with open(barcode_file) as bigfile: 179 | for lineno, line in enumerate(bigfile): 180 | if lineno % lines_per_file == 0: 181 | if smallfile: 182 | smallfile.close() 183 | counter = counter + 1 184 | small_filename = full_output_folder + "/barcodes." + str(counter) + ".txt" 185 | smallfile = open(small_filename, "w") 186 | smallfile.write(line) 187 | if smallfile: 188 | smallfile.close() 189 | barcodes_files = [full_output_folder + "/barcodes." + str(x) + ".txt" for x in list(range(1, total_files + 1))] 190 | return(barcodes_files) 191 | 192 | 193 | 194 | # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python 195 | def available_cpu_count(): 196 | """ 197 | Number of available virtual or physical CPUs on this system, i.e. 198 | user/real as output by time(1) when called with an optimally scaling 199 | userspace-only program 200 | """ 201 | # cpuset 202 | # cpuset may restrict the number of *available* processors 203 | try: 204 | m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', 205 | open('/proc/self/status').read()) 206 | if m: 207 | res = bin(int(m.group(1).replace(',', ''), 16)).count('1') 208 | if res > 0: 209 | return res 210 | except IOError: 211 | pass 212 | 213 | # Python 2.6+ 214 | try: 215 | import multiprocessing 216 | return multiprocessing.cpu_count() 217 | except (ImportError, NotImplementedError): 218 | pass 219 | 220 | # http://code.google.com/p/psutil/ 221 | try: 222 | import psutil 223 | return psutil.cpu_count() # psutil.NUM_CPUS on old versions 224 | except (ImportError, AttributeError): 225 | pass 226 | 227 | # POSIX 228 | try: 229 | res = int(os.sysconf('SC_NPROCESSORS_ONLN')) 230 | 231 | if res > 0: 232 | return res 233 | except (AttributeError, ValueError): 234 | pass 235 | 236 | # Windows 237 | try: 238 | res = int(os.environ['NUMBER_OF_PROCESSORS']) 239 | 240 | if res > 0: 241 | return res 242 | except (KeyError, ValueError): 243 | pass 244 | 245 | # jython 246 | try: 247 | from java.lang import Runtime 248 | runtime = Runtime.getRuntime() 249 | res = runtime.availableProcessors() 250 | if res > 0: 251 | return res 252 | except ImportError: 253 | pass 254 | 255 | # BSD 256 | try: 257 | sysctl = subprocess.Popen(['sysctl', '-n', 'hw.ncpu'], 258 | stdout=subprocess.PIPE) 259 | scStdout = sysctl.communicate()[0] 260 | res = int(scStdout) 261 | 262 | if res > 0: 263 | return res 264 | except (OSError, ValueError): 265 | pass 266 | 267 | # Linux 268 | try: 269 | res = open('/proc/cpuinfo').read().count('processor\t:') 270 | 271 | if res > 0: 272 | return res 273 | except IOError: 274 | pass 275 | 276 | # Solaris 277 | try: 278 | pseudoDevices = os.listdir('/devices/pseudo/') 279 | res = 0 280 | for pd in pseudoDevices: 281 | if re.match(r'^cpuid@[0-9]+$', pd): 282 | res += 1 283 | 284 | if res > 0: 285 | return res 286 | except OSError: 287 | pass 288 | 289 | # Other UNIXes (heuristic) 290 | try: 291 | try: 292 | dmesg = open('/var/run/dmesg.boot').read() 293 | except IOError: 294 | dmesgProcess = subprocess.Popen(['dmesg'], stdout=subprocess.PIPE) 295 | dmesg = dmesgProcess.communicate()[0] 296 | 297 | res = 0 298 | while '\ncpu' + str(res) + ':' in dmesg: 299 | res += 1 300 | 301 | if res > 0: 302 | return res 303 | except OSError: 304 | pass 305 | 306 | raise Exception('Can not determine number of CPUs on this system') 307 | 308 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [bdist_wheel] 5 | universal=1 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | maegatk: a maester genotyping tool kit 3 | """ 4 | from setuptools import find_packages, setup 5 | 6 | dependencies = ['click', 'pysam', 'pytest', 'snakemake', 'biopython', 'optparse-pretty', 'regex', 'ruamel.yaml'] 7 | 8 | setup( 9 | name='maegatk', 10 | version='0.2.1', 11 | url='https://github.com/caleblareau/maegatk', 12 | license='MIT', 13 | author='Caleb Lareau', 14 | author_email='caleb.lareau@gmail.com', 15 | description='Processing and quality control of mitochondrial genome variants from MAESTER data.', 16 | long_description=__doc__, 17 | packages=find_packages(exclude=['tests']), 18 | include_package_data=True, 19 | zip_safe=False, 20 | platforms='any', 21 | install_requires=dependencies, 22 | entry_points={ 23 | 'console_scripts': [ 24 | 'maegatk = maegatk.cli:main', 25 | 'maegatk-indel = maegatk.cliindel:main', 26 | ], 27 | }, 28 | classifiers=[ 29 | # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers 30 | # 'Development Status :: 1 - Planning', 31 | # 'Development Status :: 2 - Pre-Alpha', 32 | 'Development Status :: 3 - Alpha', 33 | # 'Development Status :: 4 - Beta', 34 | # 'Development Status :: 5 - Production/Stable', 35 | # 'Development Status :: 6 - Mature', 36 | # 'Development Status :: 7 - Inactive', 37 | 'Environment :: Console', 38 | 'Intended Audience :: Developers', 39 | 'License :: OSI Approved :: MIT License', 40 | 'Operating System :: POSIX', 41 | 'Operating System :: MacOS', 42 | 'Operating System :: Unix', 43 | 'Programming Language :: Python', 44 | 'Programming Language :: Python :: 3', 45 | 'Topic :: Software Development :: Libraries :: Python Modules', 46 | ] 47 | ) 48 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Sample genotyping 2 | 3 | ``` 4 | maegatk bcall -i data/test_maester.bam -o test_maester -z 5 | ``` 6 | 7 | compare to mgatk 8 | 9 | ``` 10 | mgatk bcall -i data/test_maester.bam -o test_mgatk -ub UB -bt CB -g rCRS --NHmax 2 --NMmax 15 -z 11 | ``` 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/data/BT_K_variants.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/tests/data/BT_K_variants.rda -------------------------------------------------------------------------------- /tests/data/test_maester.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/tests/data/test_maester.bam -------------------------------------------------------------------------------- /tests/data/test_maester.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caleblareau/maegatk/a61f7da9bbc5baeaeca531e39f43b43a08ada603/tests/data/test_maester.bam.bai -------------------------------------------------------------------------------- /tests/make_mixing_plot.R: -------------------------------------------------------------------------------- 1 | library(BuenColors) 2 | library(stringr) 3 | library(SummarizedExperiment) 4 | library(Matrix) 5 | library(dplyr) 6 | 7 | substrRight <- function(x, n){ 8 | substr(x, nchar(x)-n+1, nchar(x)) 9 | } 10 | load("data/BT_K_variants.rda") 11 | 12 | vdf <- str_split_fixed(c(BT142_var, K562_var), "_", 3) 13 | df2 <- data.frame( 14 | idx = as.numeric(vdf[,1]), 15 | BT142 = c(vdf[1:4,3], vdf[5:33,2]), 16 | K562 = c(vdf[1:4,2], vdf[5:33,3]) 17 | ) 18 | 19 | extractme <- function(cell, letter, SE){ 20 | idxx <- df2[which(df2[,cell] == letter), "idx"] 21 | colSums(assays(SE)[[paste0(letter, "_counts_fw")]][idxx, ] + assays(SE)[[paste0(letter, "_counts_rev")]][idxx, ]) 22 | } 23 | 24 | SE_old <- readRDS("test_mgatk/final/mgatk.rds") 25 | SE_new <- readRDS("test_maester/final/maegtk.rds") 26 | 27 | SE_old <- SE_new 28 | df3 <- data.frame( 29 | barcode = colnames(SE_old), 30 | BT142 = extractme("BT142", "A", SE_old) + extractme("BT142", "C", SE_old) + extractme("BT142", "G", SE_old) + extractme("BT142", "T", SE_old), 31 | K562 = extractme("K562", "A", SE_old) + extractme("K562", "C", SE_old) + extractme("K562", "G", SE_old) + extractme("K562", "T", SE_old) 32 | ) 33 | df3 <- df3 %>% mutate(minor_population = pmin(K562/(BT142 + K562 + 0.001)*100 ,BT142/(BT142 + K562 + 0.001)*100), 34 | minor_population_cut = pmin(minor_population, 10)) 35 | 36 | p1 <- ggplot(df3 , aes(x = BT142, y = K562, color = minor_population_cut)) + 37 | pretty_plot() + L_border() + theme(legend.position = "bottom") + 38 | geom_point() + labs(x = "BT142 homoplasmic", y = "K562 homoplasmic", color = "Minor Pop %") + 39 | scale_color_gradientn(colors = jdb_palette("brewer_spectra")) 40 | 41 | p2 <- ggplot(df3 , aes(x = log10(BT142 + 1), y = log10(K562 + 1), color = minor_population_cut)) + 42 | pretty_plot() + L_border() + theme(legend.position = "bottom") + 43 | geom_point() + labs(x = "log BT142 homoplasmic ", y = "log K562 homoplasmic", color = "Minor Pop %") + 44 | scale_color_gradientn(colors = jdb_palette("brewer_spectra")) 45 | 46 | cowplot::ggsave(cowplot::plot_grid(p1, p2, nrow = 1), file = "MAESTER_mixing_plot.png", width =7, height = 4) 47 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from click.testing import CliRunner 3 | from mgatk import cli 4 | import md5 5 | 6 | 7 | def file_checksums_equal(file1, file2): 8 | with open(file1) as f: 9 | checksum1 = md5.new(f.read()).digest() 10 | with open(file2) as f: 11 | checksum2 = md5.new(f.read()).digest() 12 | return checksum1==checksum2 13 | 14 | 15 | def test_check(): 16 | runner = CliRunner() 17 | result = runner.invoke(cli.main, ['check', '-i', 'intput', '-o', 'output', '-n', 'name']) 18 | print(result.output) 19 | #assert file_checksums_equal('p.s3_1.trim.fastq', 'correct_output/p.s3_1.trim.fastq') 20 | --------------------------------------------------------------------------------