├── 1kg-cnv ├── README.md └── scripts │ └── truth-xls-to-tsv.py ├── LICENSE ├── README.rst ├── annoj ├── static │ ├── aj2.js │ ├── config.js │ └── index.html ├── templates │ ├── config.mako.js │ └── index.mako └── wsgi │ ├── at.wsgi │ ├── bottle.py │ └── index.wsgi ├── fileindex ├── README.rst ├── examples │ ├── bench.py │ ├── fastq_file.py │ └── sam_file.py └── fileindex.py ├── hts-lua ├── LICENSE ├── README.md ├── hts.lua ├── hts_concat.h └── vt.norm.vcf.gz ├── igv └── igv.py ├── kscalc ├── README.rst ├── kscalc.py └── score_guess.py ├── libsvm-tools ├── README.rst └── libsvm-grid.py ├── lowess ├── README.rst ├── _lowess.c ├── lowess.c ├── lowess.pyx └── setup.py ├── methstuffs └── bed-merge.py ├── mosaic ├── README.md ├── filter-functional.py └── mosaic.py ├── motif-pattern ├── README.rst ├── motif_counts.txt ├── motif_significance.py ├── patterns.txt └── run.sh ├── ngs-notes ├── README.rst ├── bowtie-dnaa-dist.rst ├── bowtie-e.md ├── dalliance.rst ├── images │ └── insert-size.png ├── realignment.rst ├── variant-calling.rst └── variant-filtering-and-annotation.rst ├── nim-stuffs ├── edalign │ ├── README.md │ ├── ed.nim │ ├── edlib.cpp │ ├── edlib.h │ └── edlib_c.nim └── fermil-nim │ ├── .gitignore │ ├── LICENSE │ ├── LICENSE-fermi-lite.txt │ ├── Makefile │ ├── README-fermi-lite.md │ ├── README.md │ ├── bfc.c │ ├── bfc.c.sav │ ├── bseq.c │ ├── bubble.c │ ├── example.c │ ├── fermil.nim │ ├── fermil.nimble │ ├── fml.h │ ├── htab.c │ ├── htab.h │ ├── internal.h │ ├── khash.h │ ├── kmer.h │ ├── kseq.h │ ├── ksort.h │ ├── kstring.h │ ├── ksw.c │ ├── ksw.h │ ├── kthread.c │ ├── kvec.h │ ├── mag.c │ ├── mag.h │ ├── misc.c │ ├── mrope.c │ ├── mrope.h │ ├── rld0.c │ ├── rld0.h │ ├── rle.c │ ├── rle.h │ ├── rope.c │ ├── rope.h │ ├── test │ └── MT-simu.fq.gz │ └── unitig.c ├── plots ├── README.rst ├── images │ ├── manhattan.png │ └── manhattan.rgbk.png └── manhattan-plot.py ├── reads-utils ├── README.rst ├── color │ └── convert-cs.py ├── fastq.cpp ├── fastq_pair_filter.py ├── guess-encoding.py └── select-random-pairs.py ├── solidstuff ├── bfastq-to-bwa.py ├── color-qual-replace.py ├── solid-trimmer.py ├── test.csfasta └── test.qual ├── subject_genes_from_query └── subject_genes_from_query.py ├── superbed ├── README.rst ├── superanno.py └── superbed.py ├── utils ├── README.rst ├── find-peaks.py ├── join.py ├── list_overlap_p.py ├── partsort.py ├── pathwayapi-python │ ├── README.rst │ └── pathwayapi.py └── primers │ └── methylation-primers.py ├── vcf └── vcf-to-matrix.py └── werelate ├── README.md ├── test.sh ├── weconcur.py └── werelate.py /1kg-cnv/README.md: -------------------------------------------------------------------------------- 1 | Copy-number values and exome bams for 1kg. 2 | 3 | http://www.nature.com/nature/journal/v464/n7289/extref/nature08516-s4.xls 4 | 5 | Given the truth sets in the XLS and the bams from 1KG, we can evaluate a CNV caller by recovery of those from the truth 6 | set. 
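For reference, the reciprocal-overlap test sketched in Go below, written out in Python (a sketch: the function name and the 0.5 cutoff in the trailing comment are our own choices, not anything fixed by the scripts here):

```python
def proportion_overlap(a_start, a_end, b_start, b_end):
    """Overlap of two intervals as a fraction of their combined length.

    >>> proportion_overlap(0, 100, 50, 150)
    0.5
    """
    ovl = min(a_end, b_end) - max(a_start, b_start)
    if ovl <= 0:  # disjoint intervals share nothing
        return 0.0
    return 2.0 * ovl / ((a_end - a_start) + (b_end - b_start))

# a call would match a truth interval for the same sample when, e.g.,
# proportion_overlap(...) >= 0.5
```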
7 | 
8 | Go
9 | ==
10 | 
11 | ```Go
12 | 
13 | type interval struct {
14 | 	start  int
15 | 	end    int
16 | 	sample string
17 | }
18 | 
19 | type Truthy struct {
20 | 	ProportionOverlap float64
21 | 	// tree will return an interval for each sample
22 | 	tree *IntervalTree
23 | }
24 | 
25 | func imin(a, b int) int {
26 | 	if a < b {
27 | 		return a
28 | 	}
29 | 	return b
30 | }
31 | 
32 | func imax(a, b int) int {
33 | 	if a > b {
34 | 		return a
35 | 	}
36 | 	return b
37 | }
38 | 
39 | func overlapp(a, b *interval) float64 {
40 | 	total := a.end - a.start + b.end - b.start
41 | 	//    -----
42 | 	//      ------
43 | 	ovl := imin(a.end, b.end) - imax(a.start, b.start)
44 | 	if ovl < 0 { return 0 }
45 | 	return float64(2*ovl) / float64(total) // convert before dividing; integer division would truncate.
46 | }
47 | 
48 | func (t *Truthy) Has(i *interval) bool {
49 | 	values := t.tree.Get(i)
50 | 	for _, v := range values {
51 | 		if v.sample == i.sample {
52 | 			if overlapp(v, i) >= t.ProportionOverlap {
53 | 				return true
54 | 			}
55 | 		}
56 | 	}
57 | 	return false
58 | }
59 | 
60 | type Evaluator struct {
61 | 	t *Truthy
62 | 	FP int
63 | 	TP int
64 | 	// calculate true and false negatives by tracking what's touched in the tree.
65 | 	FN int
66 | 	TN int
67 | }
68 | 
69 | func (e Evaluator) Precision() float64 { return float64(e.TP) / float64(e.TP+e.FP) }
70 | func (e Evaluator) Recall() float64 { return float64(e.TP) / float64(e.TP+e.FN) }
71 | func (e *Evaluator) Clear() {
72 | 	e.TP, e.FP, e.TN, e.FN = 0, 0, 0, 0
73 | }
74 | 
75 | 
76 | func (e *Evaluator) LoadTruth(bedpath string) error { return nil } // TODO: fill t.tree from the truth bed.
77 | func (e *Evaluator) Evaluate(bedpath string) error { return nil } // TODO: stream calls, update TP/FP/FN/TN.
78 | 
79 | ```
80 | 
81 | 
-------------------------------------------------------------------------------- /1kg-cnv/scripts/truth-xls-to-tsv.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import toolshed as ts
4 | 
5 | xl = pd.ExcelFile('data/nature08516-s4.xls')
6 | 
7 | 
8 | gm = xl.parse("Genotype Map", index_col=0)
9 | gm = gm[~np.isnan(gm.start)]
10 | 
11 | gm.chr = gm.chr.astype(int).astype(str)
12 | gm.chr[gm.chr == "23"] = "X"
13 | gm.start = gm.start.astype(int)
14 | gm.end = gm.end.astype(int)
15 | gm.drop('source', axis=1, inplace=True)
16 | gm.drop('cn', axis=1, inplace=True)
17 | gm.columns = (['#chrom'] + list(gm.columns[1:]))
18 | print(gm.head())
19 | 
20 | j = gm
21 | 
22 | def get_bam_lookup(p="data/bam-lookups-from-1kg-site.tsv"):
23 |     l = {}
24 |     for d in ts.reader(p):
25 |         if 'low_coverage' in d['url']: continue
26 |         if 'chr20' in d['url']: continue
27 |         if 'chrom20' in d['url']: continue
28 |         if 'chrom11' in d['url']: continue
29 |         if 'unmapped' in d['url']: continue
30 |         # NOTE: we could also get some samples with cram.
31 |         if not d['url'].endswith('.bam'): continue
32 |         if d['Sample'] in l:
33 |             print "XXX:", d['url']
34 |             print "YYY:", l[d['Sample']]
35 |         l[d['Sample']] = d['url']
36 |     return l
37 | 
38 | 
39 | samples = get_bam_lookup()
40 | 
41 | url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/{sample}/exome_alignment/{sample}.mapped.ILLUMINA.bwa.{pop}.exome.20120522.bam"
42 | 
43 | bamfh = open('data/samples.bams.txt', 'w')
44 | for p in ('CEU', 'CHB+JPT', 'YRI'):
45 |     pop = xl.parse(p, index_col=0)
46 |     j = j.join(pop, how="inner")
47 | 
48 |     for s in pop.columns[1:]:
49 |         if s in samples:
50 |             bamfh.write("%s\t%s\n" % (s, samples[s]))
51 | bamfh.close()
52 | 
53 | 
54 | j.sort_values(by=['#chrom', 'start'], inplace=True)
55 | 
56 | j.to_csv('data/copy-numbers.hg18.wide.bed', index=False,
57 |          float_format="%.0f", sep="\t", na_rep='nan')
58 | 
59 | jlong = pd.melt(j, id_vars=('#chrom', 'start', 'end'),
60 |                 value_vars=list(j.columns[4:]), var_name='sample', value_name='cn')
61 | 
62 | print jlong.shape
63 | jlong = jlong.ix[jlong.cn != 2, :]
64 | jlong.sort_values(by=['#chrom', 'start'], inplace=True)
65 | print jlong.shape
66 | print jlong.head()
67 | jlong.to_csv('data/copy-numbers.hg18.long.bed', index=False,
68 |              float_format="%.0f", sep="\t", na_rep='nan')
69 | 
70 | grouped = jlong.groupby(['#chrom','start', 'end', 'cn'], axis=0,
71 |                         as_index=False)
72 | short = grouped.agg(lambda col: ",".join(col))
73 | print short.__class__
74 | short.sort_values(by=['#chrom', 'start', 'cn'], inplace=True)
75 | 
76 | short.to_csv('data/copy-numbers.hg18.samples.bed', index=False,
77 |              float_format="%.0f", sep="\t", na_rep='nan')
78 | 
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2009-2011 Brent Pedersen, Haibao Tang
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | .. contents::
2 | 
3 | Miscellaneous scripts for bioinformatics that don't merit their own repo.
4 | All under MIT License unless otherwise specified.
5 | 
6 | Ks Calc
7 | --------
8 | Abnormal nucleotide frequencies tend to throw off the usual procedures for estimating `evolutionary models `_. A practical case is calculating Ks values for grass genes, a significant portion of which are high-GC (see details `here `_). In high-GC genes most substitutions will be to G or C, so the Jukes-Cantor correction under-estimates Ks; the codon models in PAML, on the contrary, tend to over-estimate it. The Ks calculator implemented here skips model inference entirely (which is difficult in this setting anyway, since there are very few sites from which to estimate the model parameters). Instead we ask: **given the observed biased substitutions and sequence lengths, run simulations and fit an evolutionary model from those simulations.**
9 | 
10 | .. image:: http://chart.apis.google.com/chart?cht=lc&chls=8|8&chd=t2:65,65,65|75,75,75|40,50,80&chs=300x200&chm=V,FFFFFF,0,,25|@tObserved+alignment,,0,.05:.87,10|@twith+difference+D,,0,.05:.8,10|@tSimulate+alignments,,0,.55:.87,10|@twith+various+Ks,,0,.55:.8,10|@tProb(D)=0.3,,0,.3:.45,10|@tProb(D)=0.6,ff0000,0,.3:.37,10|@tProb(D)=0.4,,0,.3:.3,10|@tKs=0.1,,0,.15:.45,10|@tKs=0.2,ff0000,0,.15:.37,10|@tKs=0.3,,0,.15:.3,10|@tKs=...,808080,0,.15:.23,10|@tMaximum+Likelihood+Estimate,ff0000,0,.5:.37,10|a,990066,2,1,9.0&chma=0,0,30,0
11 |    :alt: method
12 | 
13 | 
14 | 
-------------------------------------------------------------------------------- /annoj/static/config.js: --------------------------------------------------------------------------------
1 | //The Anno-J configuration object
2 | AnnoJ.config = {
3 | 
4 |     //List of configurations for all tracks in the Anno-J instance
5 |     tracks : [
6 | 
7 |         //Example config for a ModelsTrack
8 |         {
9 |             id   : 'models',
10 |             name : 'Gene Models',
11 |             type : 'ModelsTrack',
12 |             path : 'Annotation models',
13 | 
14 |             //Pointing to a local service
15 |             data : '/bed/brachy',
16 |             height : 180,
17 |             showControls : true
18 |         },
19 |         // http://www.annoj.org/instances/configure.shtml
20 | 
21 |     ],
22 | 
23 |     //A list of tracks that will be active by default (use the ID of the track)
24 |     active : [
25 |         'models'
26 |     ],
27 | 
28 |     //Address of service that provides information about this genome
29 |     //genome : '/proxy/arabidopsis_thaliana.php',
30 |     genome : '/bed/genome/brachy',
31 | 
32 |     //Address of service that stores / loads user bookmarks
33 |     //bookmarks : '/bed/genome',
34 | 
35 |     //A list of stylesheets that a user can select between (optional)
36 |     stylesheets : [
37 |         {
38 |             id : 'css1',
39 |             name : 'Plugins CSS',
40 |             href : 'http://www.annoj.org/css/plugins.css',
41 |             active : true
42 |         },{
43 |             id : 'css2',
44 |             name : 'SALK CSS',
45 |             href : 'http://www.annoj.org/css/salk.css',
46 |             active : true
47 |         }
48 |     ],
49 | 
50 |     //The default 'view'. In this example, chr1, position 1, zoom ratio 20:1.
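// (note) the stock Anno-J example comment above does not match the values
// below: this config actually opens assembly Bd1 at position 1, at an
// 80:1 zoom (80 bases per pixel).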
51 | location : { 52 | assembly : 'Bd1', 53 | position : 1, 54 | bases : 80, 55 | pixels : 1 56 | }, 57 | 58 | //Site administrator contact details (optional) 59 | admin : { 60 | name : 'Ju', 61 | email : 'to', 62 | notes : 'Pe' 63 | } 64 | }; 65 | -------------------------------------------------------------------------------- /annoj/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Anno-J 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /annoj/templates/config.mako.js: -------------------------------------------------------------------------------- 1 | //The Anno-J configuration object 2 | AnnoJ.config = { 3 | 4 | //List of configurations for all tracks in the Anno-J instance 5 | tracks : [ 6 | 7 | //Example config for a ModelsTrack 8 | { 9 | id : 'models', 10 | name : 'Gene Models', 11 | type : 'ModelsTrack', 12 | path : 'Annotation models', 13 | 14 | //Pointing to a local service 15 | data : '/organism/${organism}', 16 | height : 180, 17 | showControls : true 18 | }, 19 | // http://www.annoj.org/instances/configure.shtml 20 | 21 | ], 22 | 23 | //A list of tracks that will be active by default (use the ID of the track) 24 | active : [ 25 | 'models' 26 | ], 27 | 28 | //Address of service that provides information about this genome 29 | //genome : '/proxy/arabidopsis_thaliana.php', 30 | genome : '/organism/${organism}', 31 | 32 | //Address of service that stores / loads user bookmarks 33 | //bookmarks : '/bed/genome', 34 | 35 | //A list of stylesheets that a user can select between (optional) 36 | stylesheets : [ 37 | { 38 | id : 'css1', 39 | name : 'Plugins CSS', 40 | href : 'http://www.annoj.org/css/plugins.css', 41 | active : true 42 | },{ 43 | id : 'css2', 44 | name : 'SALK CSS', 45 | href : 'http://www.annoj.org/css/salk.css', 46 | active : true 47 | } 48 | ], 49 | 50 | //The default 'view'. In this example, chr1, position 1, zoom ratio 20:1. 
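// (note) unlike static/config.js, the view below is template-driven:
// ${seqid} and ${position} (like ${organism} above) are Mako placeholders
// filled in when the template is rendered; wsgi/index.wsgi shows the
// rendering pattern used for these templates.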
51 | location : { 52 | assembly : '${seqid}', 53 | position : '${position}', 54 | bases : 80, 55 | pixels : 1 56 | }, 57 | 58 | //Site administrator contact details (optional) 59 | admin : { 60 | name : 'Ju', 61 | email : 'to', 62 | notes : 'Pe' 63 | } 64 | }; 65 | -------------------------------------------------------------------------------- /annoj/templates/index.mako: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Anno-J 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /annoj/wsgi/at.wsgi: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | from flatfeature import Bed 5 | import simplejson 6 | import sys 7 | import os.path as op 8 | sys.path = [op.dirname(__file__)] + sys.path 9 | from bottle import route, request, response, default_app 10 | 11 | PATHS = { 12 | "brachy": ("/opt/src/flatfeature/data/brachy_v1.bed", '/home/brentp/work/bio-me/flank/data/rice_v5_brachy_v1/brachy_v1.fasta'), 13 | "sorghum": ("/home/brentp/work/cnspipeline/data2/brachy_v1_sorghum_v1.4/sorghum_v1.4.bed", '/home/brentp/work/cnspipeline/data2/brachy_v1_sorghum_v1.4/sorghum_v1.4.fasta'), 14 | } 15 | 16 | 17 | @route('/:organism', method='POST') 18 | def index(organism): 19 | bed = Bed(*PATHS[organism]) 20 | action = request.POST.get('action') 21 | response.headers['Content-Type'] = 'text/plain' 22 | if action == 'lookup': 23 | q = request.POST.get('query') 24 | return q 25 | 26 | @route('/:organism/syndicate', method='GET') 27 | def syndicate(organism): 28 | bed = Bed(*PATHS[organism]) 29 | response.headers['Content-Type'] = 'text/javascript' 30 | chrs = [{"id": seqid, "size": len(bed.fasta[seqid])} for seqid in bed.fasta.keys() if len(bed.fasta[seqid]) > 20000] 31 | d = {'success': True, 'data': {'institution': {'name': "UCB", "url": 'http://arabidopsis.org/', "logo": "http://arabidopsis.org/i/logo2.gif"}, 32 | "service": {"title": "Brachypodium distachyon", "version": 1}, "genome": {"assemblies": chrs }}} 33 | return simplejson.dumps(d) 34 | 35 | 36 | 37 | 38 | @route('/:organism/range', method='GET') 39 | def qrange(organism): 40 | bed = Bed(*PATHS[organism]) 41 | l = int(request.GET['left']) 42 | r = int(request.GET['right']) 43 | seqid = request.GET['assembly'] 44 | feats = bed.get_features_in_region(seqid, l, r) 45 | data = [] 46 | 47 | for feat in feats: 48 | parent = feat['accn'] 49 | s = int(feat['start']) 50 | row = [None, feat['accn'], feat['strand'], "mRNA", s, int(feat['end']) - s] 51 | data.append(row) 52 | for i, (start, end) in enumerate(feat['locs']): 53 | row = [parent, str(i), feat['strand'], "CDS", start, end - start] 54 | data.append(row) 55 | 56 | j = {'success': True, 'data': data, 'l': l, 'r': r} 57 | response.headers['Content-Type'] = 'text/javascript' 58 | return simplejson.dumps(j) 59 | 60 | 61 | @route('/:organism/describe', method='GET') 62 | def describe(organism): 63 | bed = Bed(*PATHS[organism]) 64 | id = request.GET['id'] 65 | row = bed.accn(id) 66 | d = {'success': True, 'data': {"id": id, "assembly": row['seqid'], 67 | "start": int(row['start']), 'end': int(row['end']), 68 | "description": "blah"}} 69 | return simplejson.dumps(d) 70 | 71 | 72 | @route('/:organism') 73 | def genome(organism): 74 | bed = Bed(*PATHS[organism]) 75 | response.headers['Content-Type'] = 'text/javascript' 76 | chrs = [{"id": seqid, "size": len(bed.fasta[seqid])} for 
seqid in bed.fasta.keys() if len(bed.fasta[seqid]) > 20000] 77 | d = {'success': True, 'data': {'institution': {'name': "UCB", "url": 'http://arabidopsis.org'}, 78 | "service": {"title": "Brachypodium distachyon", "version": 1}, "genome": {"assemblies": chrs }}} 79 | return simplejson.dumps(d) 80 | 81 | application = default_app() 82 | -------------------------------------------------------------------------------- /annoj/wsgi/index.wsgi: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import os.path as op 4 | PATH = op.dirname(__file__) 5 | sys.path = [PATH] + sys.path 6 | from bottle import route, request, response, default_app 7 | from mako.template import Template 8 | from mako.lookup import TemplateLookup 9 | tmpl = op.join(op.dirname(PATH), "templates") 10 | 11 | @route("/:organism") 12 | def index(organism): 13 | t = Template(filename=op.join(tmpl, "index.mako"), lookup=TemplateLookup(directories=tmpl)) 14 | return t.render(organism=organism, seqid='Bd1', position=1234) 15 | 16 | 17 | application = default_app() 18 | 19 | -------------------------------------------------------------------------------- /fileindex/README.rst: -------------------------------------------------------------------------------- 1 | FileIndex 2 | ========= 3 | 4 | index flat files. see: `blogpost `_ 5 | example:: 6 | 7 | >>> FileIndex.create(f, lambda fh: SamLine(fh).name, allow_multiple=True) 8 | >>> fi = FileIndex(f, SamLine, allow_multiple=True) 9 | >>> [(s.name, s.ref_seqid, s.ref_loc) for s in fi['23351265']] 10 | [('23351265', '2', 8524), ('23351265', '3', 14202495)] 11 | 12 | 13 | Installation 14 | ------------ 15 | 16 | requires `py-tcdb` (and tokyo-cabinet headers):: 17 | 18 | easy_install -UZ py-tcdb 19 | -------------------------------------------------------------------------------- /fileindex/examples/bench.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, "/usr/local/src/screed") 4 | sys.path.insert(0, "/usr/local/src/bio-playground/fileindex") 5 | sys.path.insert(0, "/usr/local/src/biopython-sqlite/") 6 | import screed 7 | import fileindex 8 | #import bsddbfileindex 9 | import time 10 | import random 11 | 12 | from Bio import SeqIO 13 | 14 | 15 | def get_rand_headers(fq, N=100000): 16 | """ get N random headers from a fastq file without reading the 17 | whole thing into memory""" 18 | records = sum(1 for _ in open(fq)) / 4 19 | rand_records = sorted([random.randint(0, records) for _ in xrange(N)]) 20 | 21 | headers = [] 22 | fh = open(fq) 23 | rec_no = -1 24 | for rr in rand_records: 25 | while rec_no < rr: 26 | header = fh.readline().rstrip() 27 | for i in range(3): fh.readline() 28 | rec_no += 1 29 | headers.append(header) 30 | assert len(headers) == N, (len(headers),) 31 | random.shuffle(headers) 32 | return headers, records 33 | 34 | 35 | class FastQEntry(object): 36 | __slots__ = ('name', 'seq', 'l3', 'qual', 'fpos') 37 | def __init__(self, fh): 38 | self.name = fh.readline().rstrip('\r\n') 39 | self.seq = fh.readline().rstrip('\r\n') 40 | self.l3 = fh.readline().rstrip('\r\n') 41 | self.qual = fh.readline().rstrip('\r\n') 42 | 43 | def rm(f): 44 | try: os.unlink(f) 45 | except OSError: pass 46 | time.sleep(0.1) 47 | 48 | def show_name(name): 49 | print "\n%s\n" % name + "-" * len(name) 50 | 51 | def time_screed(f, random_seqs, name): 52 | show_name(name) 53 | rm("%s_%s" % (f, screed.DBConstants.fileExtension)) 54 | 55 | t = 
time.time() 56 | screed.read_fastq_sequences(f) 57 | print "create: %.3f" % (time.time() - t) 58 | 59 | faqdb = screed.ScreedDB(f) 60 | t = time.time() 61 | for r in random_seqs: 62 | faqdb[r[1:]].sequence 63 | print "search: %.3f" % (time.time() - t) 64 | del faqdb 65 | 66 | def time_fileindex(f, random_seqs, name, klass): 67 | show_name(name) 68 | rm("%s%s" % (f, klass.ext)) 69 | t = time.time() 70 | klass.create(f, lambda fh: FastQEntry(fh).name) 71 | print "create: %.3f" % (time.time() - t) 72 | 73 | fi = klass(f, FastQEntry) 74 | t = time.time() 75 | for r in random_seqs: 76 | fi[r].seq 77 | print "search: %.3f" % (time.time() - t) 78 | del fi 79 | 80 | def time_biopython_sqlite(f, random_seqs, name): 81 | show_name(name) 82 | idx = "%s.bidx" % f 83 | rm(idx) 84 | t = time.time() 85 | fi = SeqIO.index(f, "fastq", db=idx) 86 | print "create: %.3f" % (time.time() - t) 87 | 88 | t = time.time() 89 | for r in random_seqs: 90 | fi[r[1:]].seq 91 | print "search: %.3f" % (time.time() - t) 92 | del fi 93 | 94 | 95 | 96 | if __name__ == "__main__": 97 | 98 | f = "/home/brentp/ssd/s.fastq" 99 | f = "/opt/src/methylcode/data/s_1_sequence.txt" 100 | N = 500000 101 | 102 | rand_headers, nrecords = get_rand_headers(f, N) 103 | print f 104 | print "benchmarking fastq file with %i records (%i lines)" \ 105 | % (nrecords, nrecords * 4) 106 | print "performing %i random queries" % len(rand_headers) 107 | 108 | time_screed(f, rand_headers, "screed") 109 | 110 | time_biopython_sqlite(f, rand_headers, "biopython-sqlite") 111 | 112 | time_fileindex(f, rand_headers, "fileindex", fileindex.FileIndex) 113 | -------------------------------------------------------------------------------- /fileindex/examples/fastq_file.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path as op 3 | sys.path.insert(0, op.join(op.dirname(__file__), "..")) 4 | from fileindex import FileIndex 5 | 6 | class FastQEntry(object): 7 | __slots__ = ('name', 'seq', 'l3', 'qual', 'fpos') 8 | def __init__(self, fh): 9 | self.name = fh.readline().rstrip('\r\n') 10 | self.seq = fh.readline().rstrip('\r\n') 11 | self.l3 = fh.readline().rstrip('\r\n') 12 | self.qual = fh.readline().rstrip('\r\n') 13 | 14 | if __name__ == "__main__": 15 | f = '/usr/local/src/bowtie/bowtie-0.12.1/work/reads/s_1_sequence.txt' 16 | N = 100 17 | 18 | #if not op.exists(f + FileIndex.ext): 19 | FileIndex.create(f, lambda fh: FastQEntry(fh).name) 20 | 21 | fi = FileIndex(f, FastQEntry) 22 | print "getting %i keys..." % N 23 | 24 | for i, k in enumerate(fi.db.iterkeys(str)): 25 | print fi[k].seq 26 | if i == N: break 27 | 28 | 29 | -------------------------------------------------------------------------------- /fileindex/examples/sam_file.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path as op 3 | sys.path.insert(0, op.join(op.dirname(__file__), "..")) 4 | from fileindex import FileIndex 5 | 6 | class SamLine(object): 7 | __slots__ = ('name', 'ref_loc', 'ref_seqid') 8 | def __init__(self, fh): 9 | line = fh.readline().split("\t") or [None] 10 | self.name = line[0] 11 | self.ref_seqid = line[2] 12 | self.ref_loc = int(line[3]) 13 | # ... other sam format stuff omitted. 
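# (note) FileIndex.create() calls the supplied key function repeatedly on the
# open file handle; it must consume exactly one record per call and return that
# record's key (or a falsy value at EOF). Constructing a SamLine here reads one
# line, which does both.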
14 | 15 | if __name__ == "__main__": 16 | f = '/usr/local/src/methylcode/emen/en-data/out/methylcoded.sam' 17 | if not op.exists(f + FileIndex.ext): 18 | FileIndex.create(f, lambda fh: SamLine(fh).name, allow_multiple=True) 19 | 20 | fi = FileIndex(f, SamLine, allow_multiple=True) 21 | print [(s.name, s.ref_seqid, s.ref_loc) for s in fi['23351265']] 22 | 23 | -------------------------------------------------------------------------------- /fileindex/fileindex.py: -------------------------------------------------------------------------------- 1 | from tcdb.bdb import BDBSimple as BDB 2 | import tcdb.bdb as tc 3 | import sys 4 | import gzip 5 | import time 6 | 7 | 8 | def nopen(f, mode="r"): 9 | """ 10 | open a file that's gzipped or return stdin for '-' 11 | 12 | >>> nopen('-') == sys.stdin 13 | True 14 | >>> nopen(sys.argv[0]) 15 | 16 | """ 17 | if not isinstance(f, basestring): return f 18 | return sys.stdin if f == "-" \ 19 | else gzip.open(f, mode) if f.endswith(".gz") else open(f, mode) 20 | 21 | 22 | class FileIndex(object): 23 | ext = ".fidx" 24 | 25 | @classmethod 26 | def _get_iterable(self, f): 27 | if isinstance(f, basestring): 28 | fh = nopen(f) 29 | name = fh.name 30 | else: 31 | fh = f 32 | name = getattr(f, 'name', "fileindex") 33 | return fh, name 34 | 35 | @classmethod 36 | def create(cls, file_like, get_next, allow_multiple=False): 37 | 38 | fh, name = cls._get_iterable(file_like) 39 | 40 | lines = sum(1 for line in fh) 41 | bnum = lines if lines > 2**24 else lines * 2 42 | fh.seek(0) 43 | db = BDB() 44 | db.open(name + cls.ext, bnum=bnum, lcnum=2**19, 45 | omode=tc.OWRITER | tc.OTRUNC | tc.OCREAT, 46 | apow=6, opts=tc.TLARGE, xmsiz=2**26) 47 | pos = fh.tell() 48 | putter = db.putcat if allow_multiple else db.put 49 | while True: 50 | key = get_next(fh) 51 | if not key: break 52 | # always append the | but only used by multiple. 53 | putter(key , str(pos) + "|") 54 | # fh has been moved forward by get_next. 55 | pos = fh.tell() 56 | fh.close() 57 | db.close() 58 | 59 | def __init__(self, file_like, call_class, allow_multiple=False): 60 | fh, name = self._get_iterable(file_like) 61 | self.filename = name 62 | self.allow_multiple = allow_multiple 63 | self.fh = fh 64 | self.call_class = call_class 65 | self.db = BDB() 66 | self.db.open(name + self.ext, omode=tc.OREADER) 67 | 68 | def __getitem__(self, key): 69 | # every key has the | appended. 
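# (note) create() stores each offset as str(pos) + "|", and db.putcat
# concatenates repeats, so a key seen twice maps to e.g. "1234|5678|".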
70 |         pos = self.db.get(key).rstrip("|")
71 |         if self.allow_multiple:
72 |             results = []
73 |             for p in pos.split("|"):
74 |                 self.fh.seek(long(p))
75 |                 results.append(self.call_class(self.fh))
76 |             return results
77 | 
78 |         self.fh.seek(long(pos))
79 |         return self.call_class(self.fh)
80 | 
81 |     def __contains__(self, key):
82 |         return key in self.db
83 | 
84 | if __name__ == "__main__":
85 | 
86 | 
87 |     class FastQEntry(object):
88 |         #__slots__ = ('name', 'seq', 'l3', 'qual', 'fpos')
89 |         def __init__(self, fh):
90 |             self.name = fh.readline().rstrip('\r\n')
91 |             self.seq = fh.readline().rstrip('\r\n')
92 |             self.l3 = fh.readline().rstrip('\r\n')
93 |             self.qual = fh.readline().rstrip('\r\n')
94 |         def __str__(self):
95 |             return "\n".join((self.name, self.seq, self.l3, self.qual))
96 | 
97 | 
98 |     def get_next(fh):
99 |         name = fh.readline().strip()
100 |         fh.readline(); fh.readline(); fh.readline()
101 |         return name or None
102 | 
103 | 
104 |     f = 'test.fastq'
105 | 
106 |     t = time.time()
107 |     FileIndex.create(f, lambda fh: FastQEntry(fh).name)
108 |     print "create:", time.time() - t
109 | 
110 |     fi = FileIndex(f, FastQEntry)
111 |     entry = fi['@SRR040002.1_SL-XBC_0005_FC6124NAAXX:6:1:1091:4026/1']
112 |     print entry
113 | 
114 |     import os; os.unlink(f + FileIndex.ext)
115 |     del fi
116 | 
117 |     fh = open(f)
118 | 
119 |     FileIndex.create(fh, lambda fh: FastQEntry(fh).name)
120 |     fi = FileIndex(open(f), FastQEntry)
121 | 
122 |     entry = fi['@SRR040002.1_SL-XBC_0005_FC6124NAAXX:6:1:1091:4026/1']
123 |     print str(entry)
124 |     os.unlink(f + FileIndex.ext)
125 | 
126 |     # test with gzipped file.
127 |     gz = gzip.open('test.fastq.gz', 'w')
128 |     gz.writelines(open(f))
129 |     gz.close()
130 |     gz = gzip.open('test.fastq.gz', 'r')
131 | 
132 |     FileIndex.create(gz, lambda fh: FastQEntry(fh).name)
133 |     gz.close()
134 |     gz = gzip.open('test.fastq.gz', 'r')
135 |     fi = FileIndex(gz, FastQEntry)
136 |     entry = fi['@SRR040002.1_SL-XBC_0005_FC6124NAAXX:6:1:1091:4026/1']
137 |     print entry.name
138 | 
139 | 
140 |     os.unlink(f + '.gz' + FileIndex.ext)
141 |     os.unlink(f + '.gz')
142 | 
143 | 
144 | 
145 | 
146 | 
147 | 
-------------------------------------------------------------------------------- /hts-lua/LICENSE: --------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Brent Pedersen - Bioinformatics
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /hts-lua/README.md: --------------------------------------------------------------------------------
1 | stub of a luajit wrapper for htslib.
2 | requires libhts.so to be on the path.
-------------------------------------------------------------------------------- /hts-lua/hts.lua: --------------------------------------------------------------------------------
1 | local ffi = require("ffi")
2 | header = io.open("hts_concat.h"):read("*a")
3 | 
4 | local hts = ffi.load("hts")
5 | ffi.cdef(header)
6 | 
7 | local Variant = ffi.metatype("bcf1_t", {
8 |     __index = {
9 |         start = function(t) return t.pos end,
10 |         stop = function(t)
11 |             return t.pos + t.rlen end,
12 |     },
13 |     -- start()/stop() are 0-based, half-open coordinates straight from the
14 |     -- underlying bcf1_t pos/rlen fields.
15 | 
16 |     __tostring = function(t)
17 |         return string.format("%d-%d/%s/%s", t:start(), t:stop(),
18 |             ffi.string(t.d.allele[0]), ffi.string(t.d.allele[1]))
19 |     end,
20 | 
21 |     __gc = function(t)
22 |         hts.bcf_destroy(t)
23 |     end
24 | 
25 | })
26 | 
27 | local INFO = {}
28 | INFO.mt = {
29 |     __index = function(t, k)
30 |         return _getinfo(t, k)
31 |     end
32 | }
33 | 
34 | function _getinfo(info, key)
35 |     local info_t = hts.bcf_get_info(info.hdr, info.bcf, key)
36 |     if info_t == ffi.NULL then return nil end
37 |     if info_t.len == 1 then
38 |         if info_t.type <= 3 then -- INT
39 |             if (info_t.type == 1 and info_t.v1.i == -128) or
40 |                (info_t.type == 2 and info_t.v1.i == -32768) or
41 |                (info_t.type == 3 and info_t.v1.i == -2147483648) then
42 |                 return nil
43 |             end
44 |             return info_t.v1.i
45 |         elseif info_t.type == 5 then
46 |             --if hts.bcf_float_is_missing(info_t.v1.f) then return nil end
47 |             return info_t.v1.f
48 |         elseif info_t.type == 7 then
49 |             local v = ffi.string(info_t.vptr, info_t.vptr_len)
50 |             if v:byte(1) == 0x7 then return nil end -- 0x7 is the bcf missing-string marker
51 |             return v
52 |         end
53 | 
54 |     end
55 | end
56 | 
57 | function INFO.new(bcf, hdr)
58 |     local t = {bcf=bcf, hdr=hdr}
59 |     setmetatable(t, INFO.mt)
60 |     return t
61 | end
62 | 
63 | function bcf_init()
64 |     return ffi.gc(hts.bcf_init(), hts.bcf_destroy)
65 | end
66 | 
67 | 
68 | while true do
69 |     -- make a file metatype
70 |     htf = hts.hts_open("vt.norm.vcf.gz", "r")
71 |     hdr = hts.bcf_hdr_read(htf)
72 |     k = 0
73 | 
74 |     while true do
75 |         --io.stderr:write(k)
76 |         --io.stderr:write("\n")
77 |         k = k+1
78 |         local bcf = bcf_init()
79 |         ret = hts.bcf_read(htf, hdr, bcf)
80 |         if ret < 0 then break end
81 |         hts.bcf_unpack(bcf, 15)
82 | 
83 | 
84 |         info = INFO.new(bcf, hdr)
85 |         a, b, c = info["DP"], info["PQR"], info.DP
86 |         print(a, b, c)
87 | 
88 |         print(bcf.rid, bcf.pos, bcf:start(), bcf:stop())
89 |         print(bcf)
90 |     end
91 | 
92 |     print("closing")
93 |     hts.bcf_hdr_destroy(hdr)
94 |     hts.hts_close(htf)
95 | end
96 | 
-------------------------------------------------------------------------------- /hts-lua/vt.norm.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brentp/bio-playground/1982a222328bca7b675dedaa3887600a7d3dad74/hts-lua/vt.norm.vcf.gz -------------------------------------------------------------------------------- /igv/igv.py: --------------------------------------------------------------------------------
1 | import socket
2 | import os.path as op
3 | import os
4 | import sys
5 | 
6 | 
7 | class IGV(object):
8 |     r"""
9 |     Simple wrapper to the IGV (http://www.broadinstitute.org/software/igv/home)
10 |     socket interface (http://www.broadinstitute.org/software/igv/PortCommands)
11 | 
12 |     requires:
13 | 
14 |     1) you have IGV running on your machine (launch with webstart here:
15 |     http://www.broadinstitute.org/software/igv/download)
16 | 
17 |     2) you have enabled port communication in
18 |     View -> Preferences... -> Advanced
19 | 
20 |     Successful commands return 'OK'
21 | 
22 |     example usage:
23 | 
24 |     >>> igv = IGV()
25 |     >>> igv.genome('hg19')
26 |     'OK'
27 | 
28 |     #>>> igv.load('http://www.broadinstitute.org/igvdata/1KG/pilot2Bams/NA12878.SLX.bam')
29 |     'OK'
30 |     >>> igv.go('chr1:45,600-45,800')
31 |     'OK'
32 | 
33 |     #save as svg, png, or jpg
34 |     >>> igv.save('/tmp/r/region.svg')
35 |     'OK'
36 |     >>> igv.save('/tmp/r/region.png')
37 |     'OK'
38 | 
39 |     # go to a gene name.
40 |     >>> igv.go('muc5b')
41 |     'OK'
42 |     >>> igv.sort()
43 |     'OK'
44 |     >>> igv.save('muc5b.png')
45 |     'OK'
46 | 
47 |     # get a list of commands that will work as an IGV batch script.
48 |     >>> print "\n".join(igv.commands)
49 |     snapshotDirectory /tmp/igv
50 |     genome hg19
51 |     goto chr1:45,600-45,800
52 |     snapshotDirectory /tmp/r
53 |     snapshot region.svg
54 |     snapshot region.png
55 |     goto muc5b
56 |     sort base
57 |     snapshot muc5b.png
58 | 
59 |     Note, there will be some delay as the browser has to load the annotations
60 |     at each step.
61 | 
62 |     """
63 |     _socket = None
64 |     _path = None
65 | 
66 |     def __init__(self, host='127.0.0.1', port=60151, snapshot_dir='/tmp/igv'):
67 |         self.host = host
68 |         self.port = port
69 |         self.commands = []
70 |         self.connect()
71 |         self.set_path(snapshot_dir)
72 | 
73 |     @classmethod
74 |     def start(cls, jnlp="igv.jnlp", url="http://www.broadinstitute.org/igv/projects/current/"):
75 |         import subprocess
76 |         from threading import Thread
77 |         import time
78 | 
79 |         def readit(ffrom, fto, wait):
80 |             for line in iter(ffrom.readline, b''):
81 |                 if "Listening on port" in line:
82 |                     wait[0] = False
83 |                 fto.write(line + '\n')
84 |             ffrom.close()
85 | 
86 |         p = subprocess.Popen("/usr/bin/javaws -Xnosplash %s%s" % (url, jnlp),
87 |             shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
88 | 
89 |         wait = [True]
90 |         _tout = Thread(target=readit, args=(p.stdout, sys.stdout, wait))
91 |         _terr = Thread(target=readit, args=(p.stderr, sys.stderr, wait))
92 |         _tout.daemon = _terr.daemon = True
93 |         _tout.start()
94 |         _terr.start()
95 |         while p.poll() is None and wait[0]:
96 |             time.sleep(10)
97 |             print("waiting", wait)
98 | 
99 |     def connect(self):
100 |         if self._socket:
101 |             self._socket.close()
102 |         self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
103 |         self._socket.connect((self.host, self.port))
104 | 
105 |     def go(self, position):
106 |         return self.send('goto ' + position)
107 |     goto = go
108 | 
109 |     def genome(self, name):
110 |         return self.send('genome ' + name)
111 | 
112 |     def load(self, url):
113 |         return self.send('load ' + url)
114 | 
115 |     def region(self, contig, start, end):
116 |         return self.send(' '.join(map(str, ['region', contig, start, end])))
117 | 
118 |     def sort(self, option='base'):
119 |         """
120 |         options is one of: base, position, strand, quality, sample, and
121 |         readGroup.
122 |         """
123 |         assert option in ("base", "position", "strand", "quality", "sample",
124 |                           "readGroup")
125 |         return self.send('sort ' + option)
126 | 
127 |     def set_path(self, snapshot_dir):
128 |         if snapshot_dir == self._path:
129 |             return
130 |         if not op.exists(snapshot_dir):
131 |             os.makedirs(snapshot_dir)
132 | 
133 |         self.send('snapshotDirectory %s' % snapshot_dir)
134 |         self._path = snapshot_dir
135 | 
136 |     def expand(self, track=''):
137 |         self.send('expand %s' % track)
138 | 
139 |     def collapse(self, track=''):
140 |         self.send('collapse %s' % track)
141 | 
142 |     def clear(self):
143 |         self.send('clear')
144 | 
145 |     def send(self, cmd):
146 |         self.commands.append(cmd)  # record for batch-script output under both Python versions
147 |         # socket in Python2 operates with strings
148 |         if sys.version_info.major == 2:
149 |             self._socket.send(cmd + '\n')
150 |             return self._socket.recv(4096).rstrip('\n')
151 |         # while socket in Python3 requires bytes
152 |         else:
153 |             cmd = cmd + '\n'
154 |             self._socket.send(cmd.encode('utf-8'))
155 |             return self._socket.recv(4096).decode('utf-8').rstrip('\n')
156 | 
157 |     def save(self, path=None):
158 |         if path is not None:
159 |             # igv assumes the path is just a single filename, but
160 |             # we can set the snapshot dir. then just use the filename.
161 |             dirname = op.dirname(path)
162 |             if dirname:
163 |                 self.set_path(dirname)
164 |             return self.send('snapshot ' + op.basename(path))
165 |         else:
166 |             return self.send('snapshot')
167 |     snapshot = save
168 | 
169 | if __name__ == "__main__":
170 |     import doctest
171 |     doctest.testmod()
172 | 
-------------------------------------------------------------------------------- /kscalc/README.rst: --------------------------------------------------------------------------------
1 | This will run much faster after running::
2 | 
3 |     $ shedskin -e -r -b -w score_guess.py
4 |     $ make
5 | 
-------------------------------------------------------------------------------- /kscalc/kscalc.py: --------------------------------------------------------------------------------
1 | import score_guess
2 | print score_guess
3 | from score_guess import score_guess
4 | import scipy.optimize as so
5 | import random
6 | import numpy as np
7 | 
8 | 
9 | a = """ATGTCGGGGCGCGGCAAGGGCGGCAAGGGGCTCGGCAAGGGCGGCGCGAAGAGGCATCGC
10 | AAGGTGCTCCGCGACAACATCCAGGGCATCACCAAGCCGGCGATCCGGAGGCTGGCGAGG
11 | AGGGGCGGCGTGAAGCGCATCTCCGGGCTGATCTACGAGGAGACCCGCGGCGTGCTCAAG
12 | ATCTTCCTCGAGAACGTCATCCGCGACGCCGTCACCTACACGGAGCACGCCCGCCGCAAG
13 | ACCGTCACCGCCATGGACGTCGTCTACGCGCTCAAGCGCCAGGGCCGCACCCTCTACGGC
14 | TTCGGCGGCTGA"""
15 | b = """ATGTCAGGTCGTGGAAAAGGAGGCAAGGGGCTCGGTAAGGGAGGAGCGAAGCGTCATCGG
16 | AAAGTTCTCCGTGATAACATTCAGGGAATCACTAAGCCGGCTATCCGGCGTCTTGCGAGA
17 | AGAGGTGGAGTGAAGAGAATCAGCGGGTTGATCTACGAGGAGACCAGAGGCGTTTTGAAG
18 | ATCTTCTTGGAGAACGTTATTCGCGATGCTGTGACGTACACTGAGCACGCCAGGCGGAAG
19 | ACGGTGACCGCCATGGATGTTGTTTACGCCCTTAAGAGGCAGGGAAGGACTCTGTACGGG
20 | TTCGGTGGTTAA"""
21 | 
22 | aa = 
"""ATGGCGGCGGCGGCGGCGGCGGCGGGGTACAGGGCGGAGGAGGAGTACGACTACCTGTTCAAGGTGGTGCTGATCGGGGACAGCGGCGTGGGGAAGTCGAACCTGCTGTCGCGGTTCGCGCGGGACGAGTTCAGCCTGGAGACCAGGTCCACCATCGGCGTCGAGTTCGCCACCAAGACCGTCCGCGTCGACGACAGGCTCGTCAAGGCCCAGATCTGGGACACCGCCGGCCAAGAGAGGTACCGCGCCATCACGAGCGCCTACTACCGCGGCGCGGTGGGCGCGCTGGTGGTGTACGACGTGACGCGCCGCATCACGTTCGAGAACGCGGAGCGGTGGCTCAAGGAGCTCCGCGACCACACGGACGCCAACATCGTCGTCATGCTCGTGGGCAACAAGGCCGACCTGCGCCACCTCCGCGCCGTCCCCGCGGAGGACGCCAGGGCGTTCGCCGAGGCGCACGGGACCTTCTCCATGGAGACGTCGGCGCTGGAGGCCACCAACGTGGAGGGCGCCTTCACCGAGGTGCTCGCGCAGATCTACCGCGTCGTCAGCCGGAACGCGCTCGACATCGGCGACGACCCCGCCGCGCCGCCCCGGGGGCGGACCATCGACGTCAGCGCCAAGGATGACGCCGTCACCCCCGTGAACAGCTCAGGGTGCTGCTCGTCTTGA""" 23 | ab = """---------------ATGGCGTCGGGGTACCGCGCGGAGGAGGAGTACGACTACCTGTTCAAGGTGGTGCTGATCGGGGACAGCGGCGTGGGCAAGTCGAACCTGCTGTCGCGGTTCGCCAGGGACGAGTTCAGCCTCGAGACCAGGTCCACCATCGGCGTCGAGTTCGCCACCAAGACCGTCCAGGTCGACGACAAGCTCGTCAAGGCGCAGATCTGGGACACCGCCGGGCAGGAGAGGTACCGCGCCATCACGAGCGCATACTACCGCGGCGCGGTGGGCGCGCTGGTGGTGTACGACGTGACCCGCCGCATCACCTTCGACAACGCCGAGCGCTGGCTGCGGGAGCTGCGGGACCACACGGACGCCAACATCGTGGTCATGCTGGTGGGCAACAAGGCCGACCTGCGCCACCTCCGCGCCGTGACGCCCGAGGACGCCGCGGCCTTCGCGGAGCGGCACGGCACCTTCTCCATGGAGACGTCGGCGCTGGACGCCACCAACGTCGACCGCGCCTTCGCCGAGGTGCTCCGCCAGATCTACCACGTCGTCAGCCGGAACGCGCTCGACATCGGGGAGGACCCCGCCGCGCCGCCCAGGGGAAAGACCATCGACGTCGGCGCCGCCAAGGACGAGGTCTCCCCCGTGAATACGGGCGGCTGCTGCTCGGCTTAG""" 24 | 25 | def clean_seqs(a, b): 26 | # we only look at the 3rd basepair. 27 | seqa = a[2::3].upper() 28 | seqb = b[2::3].upper() 29 | 30 | ab = [sab for sab in zip(seqa, seqb) if not "-" in sab] 31 | seqa = "".join([s[0] for s in ab]) 32 | seqb = "".join([s[1] for s in ab]) 33 | return seqa, seqb 34 | 35 | def calc_difference(a, b): 36 | return sum(aa != bb for aa, bb in zip(a, b)) 37 | 38 | def calc_acgt(a, b): 39 | s = a + b 40 | return s.count('A'), s.count('C'), s.count('G'), s.count('T') 41 | 42 | 43 | def mleks(_seqa, _seqb): 44 | 45 | seqa, seqb = clean_seqs(_seqa, _seqb) 46 | D = calc_difference(seqa, seqb) 47 | 48 | seqab = seqa + seqb 49 | slen = len(seqab)/2 50 | 51 | scores = {} 52 | # give the optimizer the best guess from this range. 
53 | for guess in (0.3, 0.5, 0.75, 1.1, 1.5): 54 | scores[guess] = score_guess(guess, seqab, D, slen) 55 | 56 | best_guess = sorted(scores.items(), key=lambda a: (a[1], a[0]))[0][0] 57 | def fnopt(ks_guess): 58 | return score_guess(ks_guess[0], seqab, D, slen) 59 | r = so.fmin(fnopt, best_guess, args=(), disp=False, 60 | xtol=0.1, maxfun=20) 61 | return r[0] 62 | 63 | 64 | def gen_seqs(min_len=30, max_len=2000, min_gc=0.2, max_gc=0.8): 65 | print "l, a c g t, actual_difference, ks" 66 | for l in range(min_len, max_len, 50): 67 | for gc in np.arange(min_gc, max_gc, 0.1): 68 | L = 2 * l 69 | gs = int(L * gc) * "G" 70 | cs = int(L * gc) * "C" 71 | was =int(L * (1 - gc)) * "A" 72 | ts = int(L * (1 - gc)) * "T" 73 | choices = gs + cs + was + ts 74 | seq = "".join(random.choice(choices) for i in range(L)) 75 | for rep in range(3): 76 | aseq = seq[:l] 77 | bseq = list(aseq[:]) 78 | 79 | pos = random.randint(0, l - 1) 80 | bend = bseq[pos:] 81 | random.shuffle(bend) 82 | bseq = bseq[:pos] + bend 83 | bseq = "".join(bseq) 84 | 85 | aclean, bclean = clean_seqs(aseq, bseq) 86 | actual_acgt = calc_acgt(aclean, bclean) 87 | acgt = " ".join(map(str, actual_acgt)) 88 | actual_difference = calc_difference(aclean, bclean) 89 | print l, acgt, actual_difference, mleks(aseq, bseq) 90 | 91 | gen_seqs() 92 | 93 | 94 | 95 | 96 | 97 | #print mleks(a.replace("\n", ""), b.replace("\n", "")) 98 | -------------------------------------------------------------------------------- /kscalc/score_guess.py: -------------------------------------------------------------------------------- 1 | """ 2 | given a guess for ks, return a score (0 is perfect) for 3 | the given sequence 4 | """ 5 | import random 6 | import bisect 7 | 8 | def score_guess(ks_guess, seqab, D, slen): 9 | substitutions = int(ks_guess * slen + 0.5) 10 | repeats = 1000 11 | outer_reps = 10 12 | 13 | diffs = [] 14 | for rep in range(outer_reps): 15 | random.seed() 16 | ancestor = "".join(random.choice(seqab) for _ in range(slen)) 17 | for rep in xrange(repeats): 18 | amut = list(ancestor) 19 | bmut = list(ancestor) 20 | 21 | for i in xrange(substitutions): 22 | mut = random.choice((amut, bmut)) 23 | mut[random.randint(0, slen - 1)] = random.choice(seqab) 24 | 25 | diff = sum(int(aa != bb) for aa, bb in zip(amut, bmut)) 26 | diffs.append(diff) 27 | 28 | diffs.sort() 29 | 30 | idx0 = bisect.bisect_left(diffs, D - 10) 31 | idx1 = bisect.bisect_right(diffs, D + 10) 32 | n = (repeats * outer_reps) - abs(idx1 - idx0) 33 | return n 34 | 35 | 36 | if __name__ == "__main__": 37 | seq = "ACCACCAAAGCGCGCGCGGGG" 38 | print score_guess(0.2, seq, 12, len(seq)) 39 | -------------------------------------------------------------------------------- /libsvm-tools/README.rst: -------------------------------------------------------------------------------- 1 | libsvm-grid.py 2 | ============== 3 | 4 | `libsvm-grid.py` is a script to replace the easy.py and grid.py distributed 5 | with `libsvm`_. 
Its usage looks like::
6 | 
7 |     $ python libsvm-grid.py
8 |     Options:
9 |       -h, --help            show this help message and exit
10 |       --kernel=KERNEL       one of linear/polynomial/rbf/sigmoid
11 |       --c-range=C_RANGE     log2 range of values in format start:stop:step
12 |                             [-7:5:2]
13 |       --g-range=G_RANGE     log2 range of g values in format start:stop:step
14 |                             [-16:4:2]
15 |       --n-threads=N_THREADS
16 |                             number of threads to use [4]
17 |       --out-prefix=OUT_PREFIX
18 |                             where to send results
19 |       --x-fold=X_FOLD       number for cross-fold validation on training set [8]
20 |       --scale               if specified, perform scaling (svm-scale) on the
21 |                             dataset(s) before calling svm-train. [False]
22 |       --split=SPLIT         if specified split the training file into 2 files. one
23 |                             for testing and one for training. --split 0.8 would
24 |                             use 80% of the lines for training. the selection is
25 |                             random. this is used instead of specifying a training
26 |                             file.
27 |       -b, --probability     calculate and store prediction as a probability rather
28 |                             than a class. [False]
29 | 
30 | It expects `svm-train`, `svm-predict`, and `svm-scale` to be on the path
31 | so it may be called like::
32 | 
33 |     $ PATH=/usr/local/src/libsvm-3.0/:$PATH python libsvm-grid.py --scale --n-threads 8 some.train-data some.test-data
34 | 
35 | This will scale the train and test data, run `svm-train` in 8 parallel processes (not actually threads) on the scaled train data for a grid of parameter values. It will take the parameters with the highest cross-validation accuracy and run them on the scaled test data.
36 | 
37 | `libsvm-grid.py` will automatically guess the number of processors available on
38 | the calling machine and use that many if `--n-threads` is not specified.
39 | 
40 | The output will be something like::
41 | 
42 |     Best Cross Validation Accuracy: 66.67 with parameters c:0.125, g:4.0
43 | 
44 | This is in addition to the files some.train-data.scale and some.test-data.scale, plus some.test-data.model if a test set is specified.
45 | 
46 | 
47 | ROC
48 | ---
49 | If the `-b` parameter is specified, it will print an AUC value and write a file
50 | with x,y values which can be used to plot an ROC-curve.
51 | 
52 | .. _`libsvm`: http://www.csie.ntu.edu.tw/~cjlin/libsvm/
53 | 
54 | 
-------------------------------------------------------------------------------- /lowess/README.rst: --------------------------------------------------------------------------------
1 | Lowess
2 | ======
3 | 
4 | Lowess is locally weighted polynomial regression.
5 | This is a Cython wrapper to the implementation in `R `_
6 | That implementation is GPL v2, so this is GPL as well.
7 | 
8 | Usage
9 | =====
10 | 
11 | Usage is stolen from the `biopython `_ docs for their lowess implementation::
12 | 
13 |     >>> from lowess import lowess
14 |     >>> import numpy as np
15 |     >>> x = np.array([4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12,
16 |     ...               12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
17 |     ...               17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
18 |     ...               20, 22, 23, 24, 24, 24, 24, 25], np.float)
19 |     >>> y = np.array([2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
20 |     ...               2800, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
21 |     ...               32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
22 |     ...               64, 66, 54, 70, 92, 93, 120, 85], np.float)
23 | 
24 |     >>> result = lowess(x, y)
25 |     >>> print "%.3f ... %.3f" % (result[0], result[-1])
26 |     4.712 ... 85.470
27 | 
28 | On large datasets, this runs *much* faster and uses less memory than the
29 | biopython implementation.
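
The `f` argument (the fraction of points that influence each local fit; 2/3 by
default) and `iters` (the number of smoothing iterations; 3 by default) come
from the lowess() signature in lowess.pyx below; a quick sketch of tuning
them::

    >>> wiggly = lowess(x, y, f=0.25)  # smaller f follows local structure
    >>> smooth = lowess(x, y, f=0.9)   # larger f gives a smoother curve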
30 | -------------------------------------------------------------------------------- /lowess/lowess.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | import cython 4 | 5 | cdef extern from "_lowess.c": 6 | 7 | void clowess(double *x, double *y, int n, 8 | double f, int nsteps, double delta, 9 | double *ys, double *rw, double *res) 10 | 11 | 12 | #@cython.embedsignature(True) 13 | def lowess(np.ndarray[np.double_t, cast=True, ndim=1] x, 14 | np.ndarray[np.double_t, cast=True, ndim=1] y, 15 | double f=2/3., int iters=3): 16 | """ 17 | lowess(xs, ys, f=2/3., iters=3) 18 | perform lowess smoothing given numpy arrays for x and y 19 | of the same shape. 20 | 21 | `f` gives the proportion of points in the plot which influence each value. 22 | larger values give more smooth results. 23 | `iter` the number of smoothing iterations to perform. 24 | 25 | this function calls the C-code from the R stats lowess implementation. 26 | """ 27 | 28 | cdef np.ndarray[np.double_t, cast=True, ndim=1] ys = np.empty(x.shape[0]) 29 | cdef np.ndarray[np.double_t, cast=True, ndim=1] rw = np.empty(x.shape[0]) 30 | cdef np.ndarray[np.double_t, cast=True, ndim=1] res = np.empty(x.shape[0]) 31 | 32 | cdef double delta = 0.01 * (x.max() - x.min()) 33 | 34 | clowess(x.data, y.data, x.shape[0], f, iters, delta, 35 | ys.data, 36 | rw.data, 37 | res.data) 38 | return ys 39 | 40 | def test(): 41 | from Bio.Statistics.lowess import lowess as bio_lowess 42 | 43 | import numpy as np 44 | 45 | x = np.array([4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 46 | 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 47 | 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 48 | 20, 22, 23, 24, 24, 24, 24, 25], np.float) 49 | 50 | y = np.array([2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 51 | 2800, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 52 | 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56, 53 | 64, 66, 54, 70, 92, 93, 120, 85], np.float) 54 | 55 | x = x.repeat(40) 56 | y = y.repeat(40) 57 | import time 58 | t = time.time() 59 | bio_result = bio_lowess(x, y) 60 | print "Bio:%.2f" % (time.time() - t) 61 | t = time.time() 62 | r_result = lowess(x, y) 63 | print "RCy:%.2f" % (time.time() - t) 64 | 65 | for result in (bio_result, r_result): 66 | print "[%0.2f, ..., %0.2f]" % (result[0], result[-1]) 67 | 68 | diff = np.abs(bio_result - r_result) 69 | print diff.max() 70 | print diff.mean() 71 | -------------------------------------------------------------------------------- /lowess/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | import numpy as np 5 | 6 | ext_modules = [ 7 | Extension("lowess", ["lowess.pyx"], 8 | extra_objects=["_lowess.c"], libraries=['m'] , 9 | include_dirs=[np.get_include()])] 10 | 11 | setup( 12 | name = 'lowess', 13 | cmdclass = {'build_ext': build_ext}, 14 | ext_modules = ext_modules 15 | ) 16 | -------------------------------------------------------------------------------- /methstuffs/bed-merge.py: -------------------------------------------------------------------------------- 1 | """ 2 | convert a number of separate, sorted files into a single file with 1 column per 3 | file. merge only by the start value (assume length of intervals is 1). 4 | useful for merging files from, e.g. 
bismark 5 | 6 | python bed-merge.py --value-col 5 --name-re ".+/(\d+)." *.cov > out.matrix.txt 7 | """ 8 | 9 | import heapq 10 | from itertools import groupby 11 | import gzip 12 | import os.path as op 13 | 14 | xopen = lambda f: gzip.open(f) if f.endswith('.gz') else open(f) 15 | 16 | 17 | class Row(object): 18 | __slots__ = ('chrom', 'start', 'end', 'value', 'source') 19 | 20 | def __init__(self, toks, val_col=4, source=None): 21 | self.chrom = toks[0] 22 | self.start, self.end = int(toks[1]), int(toks[2]) 23 | self.value = toks[val_col - 1] 24 | self.source = source 25 | 26 | def __cmp__(self, other): 27 | return cmp(self.chrom, other.chrom) or cmp(self.start, other.start) 28 | 29 | 30 | def bed_merge(row_iterables, sources): 31 | assert len(sources) == len(row_iterables) 32 | 33 | for loc, cgs in groupby(heapq.merge(*row_iterables), 34 | lambda cg: (cg.chrom, cg.start)): 35 | 36 | cgs = list(cgs) 37 | cg = cgs[0] 38 | present = dict((c.source, c) for c in cgs) 39 | 40 | # if a file doesn't have a record for here, just append 0 41 | values = [(present[s].value if s in present else '0') for s in sources] 42 | yield cg.chrom, cg.start, cg.end, values 43 | 44 | 45 | def gen_iterable(fname, val_col): 46 | source = source_from_fname(fname) 47 | for toks in (x.rstrip("\r\n").split("\t") for x in xopen(fname)): 48 | yield Row(toks, val_col, source) 49 | 50 | 51 | if __name__ == "__main__": 52 | 53 | import argparse 54 | import re 55 | p = argparse.ArgumentParser(__doc__) 56 | p.add_argument("--value-col", type=int, default=4) 57 | p.add_argument("--name-re", default=r"/?(.+)$", 58 | help="regexp to convert file name to sample name") 59 | p.add_argument("bed_files", nargs="+", help="sorted bed files") 60 | 61 | a = p.parse_args() 62 | name_re = re.compile(r"%s" % a.name_re) 63 | 64 | def source_from_fname(fname): 65 | try: 66 | return name_re.match(fname).groups(0)[0] 67 | except: 68 | return op.basename(fname) 69 | 70 | iterables = [gen_iterable(f, a.value_col) for f in a.bed_files] 71 | sources = [source_from_fname(f) for f in a.bed_files] 72 | 73 | 74 | fmt = "{chrom}:{start}\t{vals}" 75 | print "probe\t%s" % "\t".join(sources) 76 | for chrom, start, end, values in bed_merge(iterables, sources): 77 | if sum(float(v) for v in values) < 24: continue 78 | if sum(float(v) > 0 for v in values) < 2: continue 79 | vals = "\t".join(values) 80 | print fmt.format(chrom=chrom, start=start, vals=vals) 81 | 82 | -------------------------------------------------------------------------------- /mosaic/README.md: -------------------------------------------------------------------------------- 1 | Find mosaic variants. 2 | 3 | Rules 4 | ===== 5 | 6 | + the parents have 0 alternate alleles 7 | + the kid has >= 2 alternate alleles 8 | 9 | These include sites where the kid is likely to be called as homozygous reference 10 | and would be missed with normal variant calling method. 11 | 12 | Usage 13 | ===== 14 | 15 | ``` 16 | python mosaic.py $region $ped $fasta $bams 17 | ``` 18 | 19 | e.g. 20 | 21 | ``` 22 | python mosaic.py 9:135766735-135820020 my.ped hs37d5.fa /path/to/*.bam 23 | ``` 24 | 25 | Requirements 26 | ============ 27 | 28 | + Samples names in the ped must match the read-groups in the bam. 29 | + Freebayes must be installed 30 | + peddy python module must be installed 31 | 32 | This will only run on **trios** specified in the ped file. It will 33 | 34 | It will output a VCF from freebayes with only candidate mosaic variants 35 | in any of the kids. 
It adds a `MOSAIC` field to the info that indicates 36 | which sample has evidence of mosaicism, and what are the ref and alt counts 37 | and what are the sum of alternate quality score, e.g.: 38 | 39 | ``` 40 | MOSAIC=sample_z42:114:3:33;... 41 | ``` 42 | 43 | where here, `sample_z42` has 114 reference alleles and 3 alternate alles with a total 44 | sum quality of 33. So this candidate will likely be filtered downstream. 45 | 46 | If there are multiple probands that are candidates for mosaicism at this site, they 47 | will be delimited by "|". 48 | -------------------------------------------------------------------------------- /mosaic/filter-functional.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from geneimpacts import VEP 4 | 5 | 6 | def isfunctional(csq): 7 | if csq['BIOTYPE'] != 'protein_coding': return False 8 | if csq['Feature'] == '' or csq['EXON'] == '': return False 9 | return ("splic" in csq['Consequence']) or any(c in ('stop_gained', 'stop_lost', 10 | 'start_lost', 'initiator_codon_variant', 'rare_amino_acid_variant', 11 | 'missense_variant', 'protein_altering_variant', 'frameshift_variant') 12 | for c in csq['Consequence'].split('&')) 13 | 14 | 15 | def get_csq_keys(line): 16 | keys = line.split("Format:")[1].strip().strip('>"').split("|") 17 | return keys 18 | 19 | 20 | for i, line in enumerate(sys.stdin): 21 | if line[0] == "#": 22 | print(line, end="") 23 | if """") 47 | 48 | print(line, end="") 49 | continue 50 | toks = line.rstrip().split("\t") 51 | format = toks[8].split(":") 52 | if i % 1000 == 0: 53 | print("mosaic: checked ...", i, file=sys.stderr) 54 | sys.stderr.flush() 55 | 56 | samples = {sample_names[k]: dict(zip(format, t.split(":"))) for k, t in enumerate(toks[9:])} 57 | 58 | candidates = [] 59 | for kid, mom, dad in trios: 60 | try: 61 | mom = samples[mom.sample_id]['AO'].split(",") 62 | if not any('0' == m for m in mom): continue 63 | 64 | dad = samples[dad.sample_id]['AO'].split(",") 65 | if not any('0' == d for d in dad): continue 66 | 67 | parents = [mom[k] + dad[k] for k in range(len(dad))] 68 | if not '00' in parents: continue 69 | 70 | skid = samples[kid.sample_id] 71 | kid_alts = map(int, skid['AO'].split(",")) 72 | except KeyError: # require all samples to be called. 73 | continue 74 | 75 | if not any(a >= MIN_REQ_ALTS and parents[k] == '00' for k, a in enumerate(kid_alts)): 76 | continue 77 | 78 | candidates.append("%s:%s:%s:%s" % (kid.sample_id, skid['RO'], skid['AO'], skid['QA'])) 79 | if not candidates: 80 | continue 81 | 82 | toks[7] = "MOSAIC=%s;%s" % ("|".join(candidates), toks[7]) 83 | print("\t".join(toks)) 84 | sys.stdout.flush() 85 | 86 | if __name__ == "__main__": 87 | main(sys.argv[1:]) 88 | -------------------------------------------------------------------------------- /motif-pattern/README.rst: -------------------------------------------------------------------------------- 1 | given: 2 | 3 | * a set of motifs and a count of their occurence in a genome or region (motif_counts.txt) 4 | * a set of patterns/groups of motifs and their occurrence in the genome 5 | 6 | return a p-value for each pattern. where a low p-value indicates that it is rare to see that pattern 7 | by chance given the frequency of occurence of its constituent motifs. 8 | 9 | the count of motif patterns is used only to estimate the lengths of observed patterns (presumably in a 10 | single genespace). 
11 | 
12 | usage::
13 | 
14 |     python motif_significance.py -m motif_counts.txt -p patterns.txt -n 2000000
15 | 
16 | where -n is the number of monte-carlo simulations to run in order to determine significance.
17 | 
18 | 
--------------------------------------------------------------------------------
/motif-pattern/motif_counts.txt:
--------------------------------------------------------------------------------
1 | A 500
2 | B 2
3 | C 2
4 | D 0
5 | 
--------------------------------------------------------------------------------
/motif-pattern/motif_significance.py:
--------------------------------------------------------------------------------
1 | """
2 | calculate the Expect value of each pattern given a
3 | file of occurrences in the format
4 | 
5 |     A B C [count]
6 | 
7 | where count is an integer number of occurrences
8 | and A B C is the motif pattern.
9 | """
10 | import collections
11 | import random
12 | import sys
13 | 
14 | def get_pattern_length_freqs(pattern_lengths):
15 |     """
16 |     >>> get_pattern_length_freqs({2: 12, 3: 0})
17 |     {2: 1.0, 3: 0.0}
18 | 
19 |     >>> get_pattern_length_freqs({2: 12, 3: 12})
20 |     {2: 1.0, 3: 0.5}
21 | 
22 |     >>> get_pattern_length_freqs({2: 12, 3: 12, 4: 24})
23 |     {2: 1.0, 3: 0.75, 4: 0.5}
24 | 
25 |     """
26 |     freqs = {}
27 |     tot = float(sum(pattern_lengths.itervalues()))
28 |     max_len = max(pattern_lengths.keys())
29 |     pl = pattern_lengths.copy()
30 |     pl[max_len + 1] = 0
31 |     for n in sorted(pl, reverse=True):
32 |         pl[n] += pl.get(n + 1, 0)
33 | 
34 |     freqs[max_len + 1] = 0
35 |     for n in sorted(pattern_lengths, reverse=True):
36 |         freqs[n] = pl[n] / tot
37 |     del freqs[max_len + 1]
38 |     #print freqs
39 |     return freqs
40 | 
41 | def run_sim(pattern, motif_pool, length_freq, nsims):
42 |     """
43 |     return the number of times the pattern is created randomly.
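    each trial first survives with probability `length_freq` (the observed
    chance that a pattern is at least this long), so the estimate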
44 |     accounts for the length and the order
45 |     """
46 |     rand = random.random
47 |     choice = random.choice
48 |     seen = 0
49 |     for _ in xrange(nsims):
50 |         if rand() > length_freq: continue
51 |         for l in pattern:
52 |             if choice(motif_pool) != l: break
53 |         else:
54 |             seen += 1
55 |     return seen
56 | 
57 | def read_motifs(fmotif):
58 |     """
59 |     create a random pool of motifs to choose from for the monte-carlo simulations
60 |     """
61 |     motif_pool = []
62 |     for line in open(fmotif):
63 |         if not line.strip(): continue
64 |         if line[0] == "#": continue
65 |         motif, count = line.rstrip().split()
66 |         motif_pool.extend(motif * int(count))
67 |     random.shuffle(motif_pool)
68 |     return motif_pool
69 | 
70 | def read_patterns(fpatterns):
71 |     patterns = []
72 |     pattern_lengths = collections.defaultdict(int)
73 |     for line in open(fpatterns):
74 |         if not line.strip(): continue
75 |         if line[0] == "#": continue
76 |         line = line.strip().split()
77 |         pattern, count = tuple(line[:-1]), int(line[-1])
78 |         pattern_lengths[len(pattern)] += count
79 |         patterns.append(pattern)
80 | 
81 |     pattern_length_freqs = get_pattern_length_freqs(pattern_lengths)
82 |     return patterns, pattern_length_freqs
83 | 
84 | 
85 | def main(fmotif, fpattern, nsims):
86 |     motif_pool = read_motifs(fmotif)
87 |     patterns, pattern_length_freqs = read_patterns(fpattern)
88 | 
89 |     for pattern in patterns:
90 |         lfreq = pattern_length_freqs[len(pattern)]
91 |         ngenerated = run_sim(pattern, motif_pool, lfreq, nsims)
92 |         print " ".join(pattern), ngenerated / float(nsims)
93 | 
94 | if __name__ == "__main__":
95 |     import doctest
96 |     doctest.testmod()
97 |     import optparse
98 |     p = optparse.OptionParser(__doc__)
99 |     p.add_option('-m', dest='motifs', help='path to file of motifs in format: "motif [count]" '
100 |                  ' a single line would look like: "A 123"')
101 |     p.add_option('-p', dest='patterns', help='path to file of patterns for which you want to get the expect count')
102 |     p.add_option('-n', dest='nsims', type='int', help='number of iterations for the simulation. higher is more accurate', default=10000)
103 | 
104 |     opts, _ = p.parse_args()
105 |     if not (opts.motifs and opts.patterns):
106 |         sys.exit(p.print_help())
107 |     main(opts.motifs, opts.patterns, opts.nsims)
108 | 
--------------------------------------------------------------------------------
/motif-pattern/patterns.txt:
--------------------------------------------------------------------------------
1 | A B C 70
2 | A D D 2
3 | A A A 1
4 | A 1
5 | A A 100
6 | 
--------------------------------------------------------------------------------
/motif-pattern/run.sh:
--------------------------------------------------------------------------------
1 | python motif_significance.py -m motif_counts.txt -p patterns.txt -n 2000000
2 | 
--------------------------------------------------------------------------------
/ngs-notes/README.rst:
--------------------------------------------------------------------------------
1 | see .rsts in this directory for NGS pipelines and notes.
2 | 
--------------------------------------------------------------------------------
/ngs-notes/bowtie-dnaa-dist.rst:
--------------------------------------------------------------------------------
1 | 
2 | `Bao et al`_ found in a recent paper that bowtie maps only 0.02% of reads in
3 | paired end mode. I wanted to see why that was the case.
As they did, I generated paired-end reads with `dnaatools`_ *dwgsim*.
5 | The command I used was::
6 | 
7 |     dwgsim -d 300 -s 50 -N 10000 -1 76 -2 76 chr22.fa gen gen
8 | 
9 | indicating that the distance between ends should be 300 bp on average with
10 | a standard deviation of 50. Since this is just a test, I use human chromosome
11 | 22 and generate only 10000 reads.
12 | 
13 | I then create the bowtie index as they do in the paper::
14 | 
15 | 
16 |     ./bowtie/bowtie-0.12.7/bowtie-build chr22.fa chr22
17 | 
18 | And then map with the same parameters they do (except I output SAM format and use 4 processors instead of 1)::
19 | 
20 |     ./bowtie/bowtie-0.12.7/bowtie -v2 --sam -p 4 chr22 -1 gen.bwa.read1.fastq -2 gen.bwa.read2.fastq out0.sam
21 | 
22 | 
23 | The output from this is::
24 | 
25 | 
26 |     # reads processed: 10000
27 |     # reads with at least one reported alignment: 10 (0.10%)
28 |     # reads that failed to align: 9990 (99.90%)
29 |     Reported 10 paired-end alignments to 1 output stream(s)
30 | 
31 | 
32 | 
33 | **Indeed**, as reported in the paper, there is a very low mapping rate.
34 | The reason this occurs is that they did not specify the maximum insert
35 | size...
36 | From the bowtie docs::
37 | 
38 | 
39 |     The maximum insert size for valid paired-end alignments.
40 |     E.g. if -X 100 is specified and a paired-end alignment consists
41 |     of two 20-bp alignments in the proper orientation with a 60-bp
42 |     gap between them, that alignment is considered valid (as long
43 |     as -I is also satisfied). A 61-bp gap would not be valid in that
44 |     case. If trimming options -3 or -5 are also used, the -X constraint
45 |     is applied with respect to the untrimmed mates, not the trimmed
46 |     mates. Default: 250.
47 | 
48 | So the reads are 76 bp, and dwgsim uses the distance specified (here 300)
49 | as the outer dist, so bowtie *should* map with the default maxins of
50 | 250. However, clearly it does not. If we assume it's actually measuring
51 | the inner distance, then we need 300 + 76 * 2 + 2 * standard-deviation
52 | = 300 + 152 + 100 = 552 -- so we'll just use 700 to be safe::
53 | 
54 | 
55 |     ./bowtie/bowtie-0.12.7/bowtie --maxins 700 -v2 --sam -p 4 chr22 -1 gen.bwa.read1.fastq -2 gen.bwa.read2.fastq out.sam
56 | 
57 | The output from this is::
58 | 
59 |     # reads processed: 10000
60 |     # reads with at least one reported alignment: 6405 (64.05%)
61 |     # reads that failed to align: 3595 (35.95%)
62 |     Reported 6405 paired-end alignments to 1 output stream(s)
63 | 
64 | So that maps 64% of reads, and that can be increased by allowing more
65 | mismatches. So there is either an error in the bowtie docs, or *dwgsim* is
66 | not generating the outer distance it advertises. If we create a .bam file::
67 | 
68 |     samtools view -h -bS out.sam | samtools sort - out
69 | 
70 | and check it with picard tools::
71 | 
72 |     java -jar src/picard/picard-tools-1.39/CollectInsertSizeMetrics.jar I=out.bam O=out.txt H=out.hist ASSUME_SORTED=true
73 | 
74 | we can see the insert size distribution:
75 | 
76 | 
77 | .. image:: https://github.com/brentp/bio-playground/raw/master/ngs-notes/images/insert-size.png
78 | 
79 | 
80 | So it is actually centered around 450. So dnaatools' *dwgsim* is generating pairs
81 | with an **inner** distance specified by `-d`, not the **outer** distance as
82 | advertised.
83 | 
84 | 
85 | .. _`Bao et al`: http://www.nature.com/jhg/journal/vaop/ncurrent/full/jhg201143a.html
86 | ..
_`dnaatools`: http://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Main_Page
87 | 
--------------------------------------------------------------------------------
/ngs-notes/bowtie-e.md:
--------------------------------------------------------------------------------
1 | Using bowtie to map un-ambiguous (colorspace) reads.
2 | The reason I want to do this is that bowtie is fast. So I want it to:
3 | 
4 | * Map reads that are easy--no indels, good quality, etc.
5 | * Discard reads that map to multiple locations.
6 | 
7 | This should speed up downstream alignment by a more *rigorous* but slower aligner.
8 | 
9 | I want to determine: what's a good cutoff for the sum of qualities of mismatches (`-e`)?
10 | 
11 | ``` sh
12 | 
13 | for e in 10 30 50 70 90 110 130; do
14 |     bowtie -f -C \
15 |         -Q $QUAL \
16 |         --chunkmbs 1025 --best --sam \
17 |         --max data/rm.e${e}.bowtie.max \
18 |         --un data/rm.e${e}.bowtie.un \
19 |         -n 1 -e ${e} --nomaqround --maxbts 50000 \
20 |         -m 1 -p 8 --seed 42 $REF_COLOR \
21 |         $CSFASTA \
22 |     | samtools view -bSF 4 - > t.e${e}.bam
23 | 
24 |     aligned=$(samtools view -c t.e${e}.bam)
25 |     max=$(grep -c "^>" data/rm.e${e}.bowtie.max)
26 |     echo $e $aligned $max
27 | done
28 | ```
29 | 
30 | Gives:
31 | 
| e   | mapped-reads | max-reads |
|----:|-------------:|----------:|
|  10 |       736383 |     16528 |
|  30 |      1238921 |     33281 |
|  50 |      1584765 |     52277 |
|  70 |      1824751 |     73071 |
|  90 |      1979517 |     89968 |
| 110 |      2076351 |    105399 |
| 130 |      2134483 |    118584 |
51 | 
52 | The max-reads column gives the number of reads that were excluded because they mapped to more than 1 location in the genome. I'm not sure why this varies so greatly with differing values of `e`.
53 | 
54 | So, even when allowing only a single mismatch in the seed (first 28 bp by default), the number of
55 | mapped reads varies about 3-fold (736K to 2.1M; 2134483/736383 ~= 2.9) depending on the allowed sum of qualities at the mismatches.
56 | 
57 | Even so, bowtie reports `48M` as the cigar string for **every** alignment in all the BAMs above (and a mapping quality of 255 for every alignment).
58 | 
59 | Since this is for a targeted re-sequencing project, I can check what percentage of the reads map to the target region (65KB) to see whether that varies with the `e` cutoff. But that is constant (at 96%) regardless of the cutoff.
60 | 
61 | The question then is: how are down-stream variant callers affected by these alignments from bowtie?
62 | Does the low quality at the mismatches prevent any miscalls? How does BAQ affect this?
63 | 
--------------------------------------------------------------------------------
/ngs-notes/images/insert-size.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/brentp/bio-playground/1982a222328bca7b675dedaa3887600a7d3dad74/ngs-notes/images/insert-size.png
--------------------------------------------------------------------------------
/ngs-notes/variant-calling.rst:
--------------------------------------------------------------------------------
1 | Variant Calling
2 | ===============
3 | 
4 | First perform `realignment`.
5 | 
6 | Calling on All Samples / Using Read Groups
7 | ------------------------------------------
8 | 
9 | 
10 | + add a read group to BAM/SAM: http://seqanswers.com/forums/showthread.php?t=4180
11 | 
12 |   * ID = id name for the readgroup
13 |   * SM = sample name
14 |   * LB = library
15 |   * PL = platform (Illumina/SOLiD/etc.)
16 | 
17 | + from free-bayes help::
18 | 
19 |     FreeBayes is designed to be run on many individuals from the same population
20 |     (e.g. many human samples) simultaneously. The algorithm exploits a neutral
21 |     model of evolution and allele diffusion to impute most-confident genotypings
22 |     across the entire population. In practice, the quality and confidence in the
23 |     callset will increase if you run multiple samples simultaneously. If your
24 |     study has multiple individuals, you should run freebayes against them at the
25 |     same time.
26 | 
27 |     To call variants in a population of samples, each alignment must have a read
28 |     group identifier attached to it (RG tag), and the header of the BAM file in
29 |     which it resides must map the RG tags to sample names (SM). Furthermore, read
30 |     group IDs must be unique across all the files used in the analysis.
31 | 
32 | + from samtools spec: @RG Read group. Unordered multiple lines are allowed.
33 | 
34 |   * **ID** Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in the header section. Read group IDs may be modified when merging SAM files in order to handle collisions.
35 | 
36 |   * **CN** Name of sequencing center producing the read.
37 | 
38 |   * **DS** Description.
39 | 
40 |   * **DT** Date the run was produced (ISO8601 date or date/time).
41 | 
42 |   * **LB** Library.
43 | 
44 |   * **PG** Programs used for processing the read group.
45 | 
46 |   * **PI** Predicted median insert size.
47 | 
48 |   * **PL** Platform/technology used to produce the read. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.
49 | 
50 |   * **PU** Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
51 | 
52 |   * **SM** Sample. Use pool name where a pool is being sequenced.
53 | 
54 | 
55 | + example
56 | 
57 |   * header: @RG\tID:some-unique-id\tSM:hs\tLB:ga\tPL:Illumina
58 | 
59 |   * in a single read: RG:Z:some-unique-id
60 | 
61 | + from samtools mpileup docs::
62 | 
63 |     "One alignment file can contain multiple samples; reads from one sample
64 |     can also be distributed in different alignment files. SAMtools will regroup
65 |     the reads anyway. In addition, if no @RG lines are present, each
66 |     alignment file is taken as one sample."
67 | 
68 | + also see: http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions#My_BAM_file_doesn.27t_have_read_group_and_sample_information.__Do_I_really_need_it.3F
69 | 
70 | Parameters
71 | ----------
72 | 
73 | U87MG Decoded paper: phred score >= 10, observed >= 4 and <= 60 times, and at least 1x per strand.
74 | 
75 | 
76 | GATK
77 | ----
78 | 
79 | http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit#Variant_Discovery_Tools
80 | 
81 | http://www.broadinstitute.org/gsa/wiki/index.php/Indel_Genotyper_V2.0
82 | 
83 | http://www.broadinstitute.org/gsa/wiki/index.php/Unified_genotyper#Indel_Calling_with_the_Unified_Genotyper
84 | 
85 | 
86 | FreeBayes
87 | ---------
88 | 
89 | https://github.com/ekg/freebayes
90 | 
91 | Samtools
92 | --------
93 | 
94 | http://lh3lh3.users.sourceforge.net/download/multigeno.pdf
95 | 
96 | http://samtools.sourceforge.net/samtools.shtml
97 | 
98 | ::
99 | 
100 |     # but use -B to avoid realignment if already called samtools calmd
101 |     $ samtools mpileup -m 4 -F 0.2 -C 50 -D -S -gf hg19.fa a.bam b.bam c.bam > ${out}.all.pileup
102 | 
103 |     $ bcftools view -bvcg ${out}.all.pileup > ${out}.all.vcf
104 |     $ bcftools view ${out}.all.vcf | vcfutils.pl varFilter -D100 -d 3 **[discuss other opts]** > ${out}.vcf
105 | 
106 | References
107 | ----------
108 | 
109 | + Homer N, Nelson SF. 2010. Improved variant discovery through local re-alignment of short-read next-generation sequencing data using SRMA. Genome Biology. 11:R99.
110 | 
--------------------------------------------------------------------------------
/ngs-notes/variant-filtering-and-annotation.rst:
--------------------------------------------------------------------------------
1 | Filtering
2 | =========
3 | 
4 | + Most tools create or expect Variant Call Format: `VCF`_
5 | 
6 | + From there, `VCFTools`_ can be used to:
7 | 
8 |   * filter on
9 |     - quality
10 |     - depth
11 |     - allele frequency
12 |     - etc.
13 | 
14 |   * summarize
15 |     - frequency
16 |     - depth
17 |     - quality
18 |     - etc.
19 | 
20 | 
21 | + other tools:
22 | 
23 |   * `vcflib`_
24 | 
25 |   * solid-gff3-to-vcf.py will convert solid output to `VCF`_
26 |     with filters for coverage and quality.
27 | 
28 | Annotation
29 | ==========
30 | 
31 | + `Annovar`_ (Command-Line):
32 | 
33 |   * Annotates based on:
34 |     - genes: refGene / Known Gene / EnsGene
35 |     - any track from UCSC
36 |     - dbsnp, phastCons, segdups, etc.
37 |   * Run from the command-line like:
38 | 
39 |     - annotate based on location in/near gene (exon/intron/up/down)::
40 | 
41 |         $ ./annotate_variation.pl --buildver hg19 --geneanno $INPUT humandb/
42 | 
43 |     - annotate based on presence in dbsnp::
44 | 
45 |         $ ./annotate_variation.pl --buildver hg19 --filter --dbtype 1000g2010nov_all ${IN} humandb
46 | 
47 | + `SeattleSeq`_ (Web-Based):
48 | 
49 |   * seems to have no limit on query size.
50 | 
51 |   * requires choosing a single individual from dbSNP
52 | 
53 |   * outputs columns of:
54 | 
55 |     - inDBSNPOrNot
56 |     - chromosome
57 |     - position
58 |     - referenceBase
59 |     - sampleGenotype
60 |     - sampleAlleles
61 |     - dbSNPGenotype
62 |     - allelesDBSNP
63 |     - accession
64 |     - functionGVS
65 |     - functionDBSNP
66 |     - rsID
67 |     - aminoAcids
68 |     - proteinPosition
69 |     - polyPhen
70 |     - scorePhastCons
71 |     - consScoreGERP
72 |     - chimpAllele
73 |     - CNV
74 |     - geneList
75 |     - AfricanHapMapFreq
76 |     - EuropeanHapMapFreq
77 |     - AsianHapMapFreq
78 |     - hasGenotypes
79 |     - dbSNPValidation
80 |     - repeatMasker
81 |     - tandemRepeat
82 |     - clinicalAssociation
83 |     - distanceToSplice
84 |     - microRNAs
85 |     - proteinSequence
86 | 
87 | + `snpEff`_ (Command-line)
88 | 
89 |   * Annotates based on Ensembl genes (up/downstream, intron, utr, exon)
90 | 
91 |   * For exon reports:
92 | 
93 |     - non/synonymous
94 |     - stop/start codon gain/lost
95 |     - splice/frame shift
96 | 
97 | + `bedtools`_ (Command-line)
98 | 
99 |   * download whatever data you want, get it in bed/gff format, and
100 |     use linux commands like cut to get desired columns.
101 | 
102 | Suggested Short-Term Pipeline
103 | =============================
104 | 
105 | + convert solid gff3 to vcf
106 | 
107 | + filter out snps in dbsnp
108 | 
109 |   * use VCF from dbsnp and vcftools
110 | 
111 |   * **discuss parameters** (e.g. frequency in dbsnp)
112 | 
113 | + **discuss** filter out those near centromere??
114 | 
115 | + Annotate remaining with `SeattleSeq`_
116 | 
117 |   * Most are non-coding
118 | 
119 |   * **discuss what to do with these**
120 | 
121 | + annotate based on various UCSC tracks with `Annovar`_
122 | 
123 | + view in UCSC with automatic links.
124 |   ( SchwartzHuman/brentp/annotate-variants/annovar-to-ucsc-bed.py )
125 | 
126 | Suggested Mid-Term Pipeline
127 | ===========================
128 | 
129 | 
130 | + add read-groups for individuals
131 | 
132 |   - combine all bams
133 | 
134 | + remove dups (dnaatools or picard markDuplicates)
135 | 
136 | + samtools calmd (parallelized by chromosome)
137 | 
138 | + free-bayes (parallelized by chromosome)
139 | 
140 | 
141 | Suggested Long-Term Pipeline
142 | ============================
143 | 
144 | + Use BFast Alignments
145 | 
146 | + remove dups (dnaatools or picard markDuplicates)
147 | 
148 | + SRMA to do local re-alignment
149 | 
150 | + samtools calmd
151 | 
152 | + Indel calling:
153 | 
154 |   + samtools mpileup
155 | 
156 |   + freebayes
157 | 
158 | 
159 | .. _`VCF`: http://vcftools.sourceforge.net/specs.html
160 | .. _`VCFTools`: http://vcftools.sourceforge.net/options.html
161 | .. _`vcflib`: https://github.com/ekg/vcflib
162 | .. _`Annovar`: http://www.openbioinformatics.org/annovar/
163 | .. _`SeattleSeq`: http://gvs-p.gs.washington.edu/SeattleSeqAnnotation131/index.jsp
164 | .. _`snpEff`: http://snpeff.sourceforge.net
165 | ..
_`bedtools`: http://github.com/arq5x/bedtools
166 | 
--------------------------------------------------------------------------------
/nim-stuffs/edalign/README.md:
--------------------------------------------------------------------------------
1 | nim wrapper for [edlib](https://github.com/Martinsos/edlib)
2 | 
3 | this works, but I abandoned it in favor of another aligner
4 | 
--------------------------------------------------------------------------------
/nim-stuffs/edalign/ed.nim:
--------------------------------------------------------------------------------
1 | import edlib_c
2 | import strutils
3 | type CArray{.unchecked.}[T] = array[0..0, T]
4 | 
5 | proc free(v: pointer) {.header: "<stdlib.h>", importc: "free".}
6 | 
7 | type
8 |   Alignment* = ref object of RootObj
9 |     ## Alignment wraps the result of align
10 |     c*: EdlibAlignResult
11 |   Config* = ref object of RootObj
12 |     ## Config determines the alignment parameters
13 |     c*: EdlibAlignConfig
14 | 
15 | proc k*(c: Config): int =
16 |   # return the k window of the alignment config
17 |   return c.c.k.int
18 | 
19 | proc mode*(c: var Config): EdlibAlignMode =
20 |   # return the mode of the config
21 |   return c.c.mode
22 | 
23 | proc new_config*(k:int=7, mode:EdlibAlignMode=EDLIB_MODE_HW, task:EdlibAlignTask=EDLIB_TASK_PATH): Config =
24 |   ## create a new config object (this cannot be modified as it's a copy)
25 |   var cfg = edlibNewAlignConfig(k.cint, mode, task, nil, 0)
26 |   return Config(c: cfg)
27 | 
28 | proc destroy_alignment(a: Alignment) =
29 |   edlibFreeAlignResult(a.c)
30 | 
31 | proc cigar*(a:Alignment, s:var string, extended:bool=false): string =
32 |   ## a string representation of the CIGAR
33 |   var x = EDLIB_CIGAR_STANDARD
34 |   if extended:
35 |     x = EDLIB_CIGAR_EXTENDED
36 |   var v = edlibAlignmentToCigar(a.c.alignment, a.c.alignmentLength, x)
37 |   s = $v
38 |   result = s
39 |   free(v)
40 | 
41 | proc edit_distance*(a: Alignment): int {.inline.} =
42 |   ## the edit distance between the target and the query.
43 |   return a.c.editDistance
44 | 
45 | proc ok*(a: Alignment): bool {.inline.} =
46 |   ## check that the alignment was actually performed.
47 |   return a.c.status == EDLIB_STATUS_OK and a.edit_distance != -1
48 | 
49 | proc start*(a: Alignment): int =
50 |   ## the first start of the alignment; negative for an invalid alignment
51 |   if a.c.startLocations == nil:
52 |     return -1
53 |   var arr = cast[ptr CArray[int]](a.c.startLocations.pointer)
54 |   return arr[0]
55 | 
56 | proc length*(a: Alignment): int =
57 |   return a.c.alignmentLength
58 | 
59 | proc alignTo*(query: string, target: string, config: Config): Alignment =
60 |   ## align the query to the target according to config.
61 |   var a: Alignment
62 |   new(a, destroy_alignment)
63 |   # NOTE: should let the user decide about upper-casing
64 |   a.c = edlibAlign(query.toUpperAscii.cstring, query.len.cint, target.toUpperAscii.cstring, target.len.cint, config.c)
65 |   return a
66 | 
67 | const lookupt = ['-', '-', ' ', '-']
68 | const lookupq = ['-', ' ', '-', '*']
69 | 
70 | 
71 | proc score*(a:Alignment, match:int=1, mismatch:int=(-1), gap_open:int=(-2), gap_extend:int=(-1)): int =
72 |   ## score an alignment. match should be positive and the others should be negative.
73 |   if a.c.alignment == nil:
74 |     return -1
75 |   var
76 |     arr = cast[ptr CArray[uint8]](a.c.alignment.pointer)
77 |     ingap = false
78 |   #if mismatch > 0: mismatch = -mismatch
79 |   #if gap_open > 0: gap_open = -gap_open
80 |   #if gap_extend > 0: gap_extend = -gap_extend
81 | 
82 |   for i in 0..
MT.fq
6 | # to compile your program:
7 | gcc -Wall -O2 prog.c -o prog -L/path/to/fermi-lite -lfml -lz -lm -lpthread
8 | ```
9 | 
10 | ## Introduction
11 | 
12 | Fermi-lite is a standalone C library as well as a command-line tool for
13 | assembling Illumina short reads in regions from 100bp to 10 million bp in size.
14 | It is largely a light-weight in-memory version of [fermikit][fk] without
15 | generating any intermediate files. It inherits the performance, the relatively
16 | small memory footprint and the features of fermikit. In particular, fermi-lite
17 | is able to retain heterozygous events and thus can be used to assemble diploid
18 | regions for the purpose of variant calling. It is one of the limited choices
19 | for local re-assembly and arguably the easiest to interface.
20 | 
21 | ## Usage
22 | 
23 | For now, see [example.c][example] for the basic use of the library. Here is a
24 | sketch of the example:
25 | ```cpp
26 | #include <stdio.h>                      // for printf()
27 | #include "fml.h"                        // only one header file required
28 | 
29 | int main(int argc, char *argv[])
30 | {
31 | 	int i, n_seqs, n_utgs;
32 | 	bseq1_t *seqs;                      // array of input sequences
33 | 	fml_utg_t *utgs;                    // array of output unitigs
34 | 	fml_opt_t opt;
35 | 	if (argc == 1) return 1;            // do nothing if there is no input file
36 | 	seqs = bseq_read(argv[1], &n_seqs); // or fill the array with callers' functions
37 | 	fml_opt_init(&opt);                 // initialize parameters
38 | 	utgs = fml_assemble(&opt, n_seqs, seqs, &n_utgs); // assemble!
39 | 	for (i = 0; i < n_utgs; ++i)        // output in fasta
40 | 		printf(">%d\n%s\n", i+1, utgs[i].seq);
41 | 	fml_utg_destroy(n_utgs, utgs);      // deallocate unitigs
42 | 	return 0;
43 | }
44 | ```
45 | The `fml_assemble()` output is in fact a graph. You may have a look at the
46 | `fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a
47 | [GFA][gfa] representation from an array of `fml_utg_t` objects.
48 | 
49 | ## Overview of the Assembly Algorithm
50 | 
51 | Fermi-lite is an overlap-based assembler. Given a set of input reads, it counts
52 | *k*-mers, estimates the *k*-mer coverage, sets a threshold on *k*-mer
53 | occurrences to determine solid *k*-mers and then uses them to correct sequencing
54 | errors ([Li, 2015][bfc-paper]). After error correction, fermi-lite trims a read
55 | at an *l*-mer unique to the read. It then constructs an FM-index for trimmed
56 | reads ([Li, 2014][rb2-paper]) and builds a transitively reduced overlap graph from the
57 | FM-index ([Simpson and Durbin, 2010][sga-paper]; [Li, 2012][fm1-paper]),
58 | requiring at least *l*-bp overlaps. In this graph, fermi-lite trims tips and
59 | pops bubbles caused by uncorrected errors. If a sequence in the graph has
60 | multiple overlaps, fermi-lite discards overlaps significantly shorter than the
61 | longest overlap -- this is a technique applied to the overlap graph only. The graph
62 | after these procedures is the final output. Sequences in this graph are unitigs.
63 | 
64 | ## Limitations
65 | 
66 | 1. Fermi-lite can efficiently assemble bacterial genomes. However, it has not
67 |    been carefully tuned for this type of assembly. While on a few GAGE-B data
68 |    sets fermi-lite appears to work well, it may not compete with recent
69 |    mainstream assemblers in general.
70 | 
71 | 2. Fermi-lite does not work with genomes more than tens of megabases as a
72 |    whole. It would take too much memory to stage all data in memory. For large
73 |    genomes, please use [fermikit][fk] instead.
74 | 
75 | 3. This is the first iteration of fermi-lite. It is still immature.
In 76 | particular, I hope fermi-lite can be smart enough to automatically figure 77 | out various parameters based on input, which is very challenging given the 78 | high variability of input data. 79 | 80 | [sga-paper]: http://www.ncbi.nlm.nih.gov/pubmed/20529929 81 | [bfc-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25953801 82 | [rb2-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25107872 83 | [fm1-paper]: http://www.ncbi.nlm.nih.gov/pubmed/22569178 84 | [bfc]: http://github.com/lh3/bfc 85 | [rb2]: http://github.com/lh3/ropebwt2 86 | [fm2]: http://github.com/lh3/fermi2 87 | [fk]: http://github.com/lh3/fermikit 88 | [example]: https://github.com/lh3/fermi-lite/blob/master/example.c 89 | [header]: https://github.com/lh3/fermi-lite/blob/master/fml.h 90 | [misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c 91 | [gfa]: https://github.com/pmelsted/GFA-spec 92 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/README.md: -------------------------------------------------------------------------------- 1 | [fermi-lite](https://github.com/lh3/fermi-lite/) wrapper for nim. 2 | 3 | Allows assembling short reads. 4 | 5 | ```Nim 6 | import fermil 7 | var dna = "AAAAACTCTACCTCTCTATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAACCAC" 8 | var qual = "2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222" 9 | 10 | var fml = new_fermi() 11 | var i = 0 12 | # example adding some sub strings of the above sequence... 13 | while i <= (dna.len - 80 + 3): 14 | var s = dna[i..<(i+80)] 15 | var q = qual[0.. 2 | #include 3 | #include 4 | #include 5 | #include "fml.h" 6 | #include "kseq.h" 7 | KSEQ_INIT(gzFile, gzread) 8 | 9 | bseq1_t *bseq_read(const char *fn, int *n_) 10 | { 11 | gzFile fp; 12 | bseq1_t *seqs; 13 | kseq_t *ks; 14 | int m, n; 15 | uint64_t size = 0; 16 | 17 | *n_ = 0; 18 | fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 19 | if (fp == 0) return 0; 20 | ks = kseq_init(fp); 21 | 22 | m = n = 0; seqs = 0; 23 | while (kseq_read(ks) >= 0) { 24 | bseq1_t *s; 25 | if (n >= m) { 26 | m = m? m<<1 : 256; 27 | seqs = realloc(seqs, m * sizeof(bseq1_t)); 28 | } 29 | s = &seqs[n]; 30 | s->seq = strdup(ks->seq.s); 31 | s->qual = ks->qual.l? strdup(ks->qual.s) : 0; 32 | s->l_seq = ks->seq.l; 33 | size += seqs[n++].l_seq; 34 | } 35 | *n_ = n; 36 | 37 | kseq_destroy(ks); 38 | gzclose(fp); 39 | return seqs; 40 | } 41 | 42 | void seq_reverse(int l, unsigned char *s) 43 | { 44 | int i; 45 | for (i = 0; i < l>>1; ++i) { 46 | int tmp = s[l-1-i]; 47 | s[l-1-i] = s[i]; s[i] = tmp; 48 | } 49 | } 50 | 51 | void seq_revcomp6(int l, unsigned char *s) 52 | { 53 | int i; 54 | for (i = 0; i < l>>1; ++i) { 55 | int tmp = s[l-1-i]; 56 | tmp = (tmp >= 1 && tmp <= 4)? 5 - tmp : tmp; 57 | s[l-1-i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i]; 58 | s[i] = tmp; 59 | } 60 | if (l&1) s[i] = (s[i] >= 1 && s[i] <= 4)? 
5 - s[i] : s[i]; 61 | } 62 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/example.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "fml.h" 5 | 6 | int main(int argc, char *argv[]) 7 | { 8 | fml_opt_t opt; 9 | int c, n_seqs, n_utg, gfa_out = 0; 10 | bseq1_t *seqs; 11 | fml_utg_t *utg; 12 | 13 | fml_opt_init(&opt); 14 | while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) { 15 | if (c == 'e') opt.ec_k = atoi(optarg); 16 | else if (c == 'l') opt.min_asm_ovlp = atoi(optarg); 17 | else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg); 18 | else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE; 19 | else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN; 20 | else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg); 21 | else if (c == 't') opt.n_threads = atoi(optarg); 22 | else if (c == 'g') gfa_out = 1; 23 | else if (c == 'v') fm_verbose = atoi(optarg); 24 | else if (c == 'c') { 25 | char *p; 26 | opt.min_cnt = strtol(optarg, &p, 10); 27 | if (*p == ',') opt.max_cnt = strtol(p + 1, &p, 10); 28 | } 29 | } 30 | if (argc == optind) { 31 | fprintf(stderr, "Usage: fml-asm [options] \n"); 32 | fprintf(stderr, "Options:\n"); 33 | fprintf(stderr, " -e INT k-mer length for error correction (0 for auto; -1 to disable) [%d]\n", opt.ec_k); 34 | fprintf(stderr, " -c INT1[,INT2] range of k-mer & read count thresholds for ec and graph cleaning [%d,%d]\n", opt.min_cnt, opt.max_cnt); 35 | fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp); 36 | fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1); 37 | fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads); 38 | fprintf(stderr, " -d INT retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff); 39 | fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n"); 40 | fprintf(stderr, " -O don't apply aggressive tip trimming\n"); 41 | fprintf(stderr, " -g output the assembly graph in the GFA format\n"); 42 | return 1; 43 | } 44 | seqs = bseq_read(argv[optind], &n_seqs); 45 | utg = fml_assemble(&opt, n_seqs, seqs, &n_utg); 46 | if (!gfa_out) fml_utg_print(n_utg, utg); 47 | else fml_utg_print_gfa(n_utg, utg); 48 | fml_utg_destroy(n_utg, utg); 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/fermil.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.2.0" 4 | author = "Brent Pedersen" 5 | description = "fermi-list wrapper to assemble short reads" 6 | license = "MIT" 7 | 8 | # Dependencies 9 | 10 | requires "nim >= 0.17.2" #, "nim-lang/c2nim>=0.9.13" 11 | 12 | task test, "run the tests": 13 | exec "make" 14 | exec "nim c -d:release --threads:on --passL:libfml.a --passL:-lz -r fermil.nim" 15 | 16 | task build, "build": 17 | exec "make" 18 | exec "nim c --threads:on --passL:libfml.a --passL:-lz -r fermil.nim" 19 | 20 | before install: 21 | exec "make" 22 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/fml.h: -------------------------------------------------------------------------------- 1 | #ifndef FML_H 2 | #define FML_H 3 | 4 | #define FML_VERSION "r53" 5 | 6 
| #include 7 | 8 | typedef struct { 9 | int32_t l_seq; 10 | char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq 11 | } bseq1_t; 12 | 13 | #define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default) 14 | #define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default) 15 | #define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default) 16 | 17 | typedef struct { 18 | int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth; 19 | float min_dratio1, max_bcov, max_bfrac; 20 | } magopt_t; 21 | 22 | typedef struct { 23 | int n_threads; // number of threads; don't use multi-threading for small data sets 24 | int ec_k; // k-mer length for error correction; 0 for auto estimate 25 | int min_cnt, max_cnt; // both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] 26 | int min_asm_ovlp; // min overlap length during assembly 27 | int min_merge_len; // during assembly, don't explicitly merge an overlap if shorter than this value 28 | magopt_t mag_opt; // graph cleaning options 29 | } fml_opt_t; 30 | 31 | struct rld_t; 32 | struct mag_t; 33 | 34 | typedef struct { 35 | uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end 36 | uint32_t id:31, to:1; // $id: unitig number 37 | } fml_ovlp_t; 38 | 39 | typedef struct { 40 | int32_t len; // length of sequence 41 | int32_t nsr; // number of supporting reads 42 | char *seq; // unitig sequence 43 | char *cov; // cov[i]-33 gives per-base coverage at i 44 | int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps 45 | fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1] 46 | } fml_utg_t; 47 | 48 | extern int fm_verbose; 49 | 50 | #ifdef __cplusplus 51 | extern "C" { 52 | #endif 53 | 54 | /************************ 55 | * High-level functions * 56 | ************************/ 57 | 58 | /** 59 | * Read all sequences from a FASTA/FASTQ file 60 | * 61 | * @param fn filename; NULL or "-" for stdin 62 | * @param n (out) number of sequences read into RAM 63 | * 64 | * @return array of sequences 65 | */ 66 | bseq1_t *bseq_read(const char *fn, int *n); 67 | 68 | /** 69 | * Initialize default parameters 70 | * 71 | * @param opt (out) pointer to parameters 72 | */ 73 | void fml_opt_init(fml_opt_t *opt); 74 | 75 | /** 76 | * Assemble a list of sequences 77 | * 78 | * @param opt parameters 79 | * @param n_seqs number of input sequences 80 | * @param seqs sequences to assemble; FREED on return 81 | * @param n_utg (out) number of unitigs in return 82 | * 83 | * @return array of unitigs 84 | */ 85 | fml_utg_t *fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg); 86 | 87 | /** 88 | * Free unitigs 89 | * 90 | * @param n_utg number of unitigs 91 | * @param utg array of unitigs 92 | */ 93 | void fml_utg_destroy(int n_utg, fml_utg_t *utg); 94 | 95 | /************************************************ 96 | * Mid-level functions called by fml_assemble() * 97 | ************************************************/ 98 | 99 | /** 100 | * Adjust parameters based on input sequences 101 | * 102 | * @param opt parameters to update IN PLACE 103 | * @param n_seqs number of sequences 104 | * @param seqs array of sequences 105 | */ 106 | void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs); 107 | 108 | /** 109 | * Error correction 110 | * 111 | * @param opt parameters 112 | * @param n number of sequences 113 | * @param seq array of sequences; corrected IN PLACE 114 | * 115 | * @return k-mer 
coverage 116 | */ 117 | float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq); 118 | float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq); 119 | 120 | /** 121 | * Construct FMD-index 122 | * 123 | * @param opt parameters 124 | * @param n number of sequences 125 | * @param seq array of sequences; FREED on return 126 | * 127 | * @return FMD-index 128 | */ 129 | struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq); 130 | 131 | /** 132 | * Generate initial overlap graph 133 | * 134 | * @param opt parameters 135 | * @param e FMD-index; FREED on return 136 | * 137 | * @return overlap graph in the "mag" structure 138 | */ 139 | struct mag_t *fml_fmi2mag(const fml_opt_t *opt, struct rld_t *e); 140 | 141 | /** 142 | * Clean a mag graph 143 | * 144 | * @param opt parameters 145 | * @param g overlap graph; modified IN PLACE 146 | */ 147 | void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g); 148 | 149 | /** 150 | * Convert a graph in mag to fml_utg_t 151 | * 152 | * @param g graph in the "mag" structure; FREED on return 153 | * @param n_utg (out) number of unitigs 154 | * 155 | * @return array of unitigs 156 | */ 157 | fml_utg_t *fml_mag2utg(struct mag_t *g, int *n_utg); 158 | 159 | /** 160 | * Output unitig graph in the mag format 161 | * 162 | * @param n_utg number of unitigs 163 | * @param utg array of unitigs 164 | */ 165 | void fml_utg_print(int n_utgs, const fml_utg_t *utg); 166 | 167 | /** 168 | * Output unitig graph in the GFA format 169 | * 170 | * @param n_utg number of unitigs 171 | * @param utg array of unitigs 172 | */ 173 | void fml_utg_print_gfa(int n, const fml_utg_t *utg); 174 | 175 | /** 176 | * Deallocate an FM-index 177 | * 178 | * @param e pointer to the FM-index 179 | */ 180 | void fml_fmi_destroy(struct rld_t *e); 181 | 182 | /** 183 | * Deallocate a mag graph 184 | * 185 | * @param g pointer to the mag graph 186 | */ 187 | void fml_mag_destroy(struct mag_t *g); 188 | 189 | #ifdef __cplusplus 190 | } 191 | #endif 192 | 193 | #endif 194 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/htab.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "htab.h" 5 | #include "khash.h" 6 | 7 | #define _cnt_eq(a, b) ((a)>>14 == (b)>>14) 8 | #define _cnt_hash(a) ((a)>>14) 9 | KHASH_INIT(cnt, uint64_t, char, 0, _cnt_hash, _cnt_eq) 10 | typedef khash_t(cnt) cnthash_t; 11 | 12 | struct bfc_ch_s { 13 | int k; 14 | cnthash_t **h; 15 | // private 16 | int l_pre; 17 | }; 18 | 19 | bfc_ch_t *bfc_ch_init(int k, int l_pre) 20 | { 21 | bfc_ch_t *ch; 22 | int i; 23 | assert(k <= 63); 24 | if (k * 2 - l_pre > BFC_CH_KEYBITS) 25 | l_pre = k * 2 - BFC_CH_KEYBITS; 26 | if (l_pre > BFC_CH_MAXPRE) l_pre = BFC_CH_MAXPRE; 27 | assert(k - l_pre < BFC_CH_KEYBITS); 28 | ch = calloc(1, sizeof(bfc_ch_t)); 29 | ch->k = k, ch->l_pre = l_pre; 30 | ch->h = calloc(1<l_pre, sizeof(void*)); 31 | for (i = 0; i < 1<l_pre; ++i) 32 | ch->h[i] = kh_init(cnt); 33 | return ch; 34 | } 35 | 36 | void bfc_ch_destroy(bfc_ch_t *ch) 37 | { 38 | int i; 39 | if (ch == 0) return; 40 | for (i = 0; i < 1<l_pre; ++i) 41 | kh_destroy(cnt, ch->h[i]); 42 | free(ch->h); free(ch); 43 | } 44 | 45 | static inline cnthash_t *get_subhash(const bfc_ch_t *ch, const uint64_t x[2], uint64_t *key) 46 | { 47 | if (ch->k <= 32) { 48 | int t = ch->k * 2 - ch->l_pre; 49 | uint64_t z = x[0] << ch->k | x[1]; 50 | *key = (z & ((1ULL<h[z>>t]; 52 | } else { 53 | int t = ch->k - ch->l_pre; 54 | 
int shift = t + ch->k < BFC_CH_KEYBITS? ch->k : BFC_CH_KEYBITS - t; 55 | *key = ((x[0] & ((1ULL<h[x[0]>>t]; 57 | } 58 | } 59 | 60 | int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced) 61 | { 62 | int absent; 63 | uint64_t key; 64 | cnthash_t *h; 65 | khint_t k; 66 | h = get_subhash(ch, x, &key); 67 | if (__sync_lock_test_and_set(&h->lock, 1)) { 68 | if (forced) // then wait until the hash table is unlocked by the thread using it 69 | while (__sync_lock_test_and_set(&h->lock, 1)) 70 | while (h->lock); // lock 71 | else return -1; 72 | } 73 | k = kh_put(cnt, h, key, &absent); 74 | if (absent) { 75 | if (is_high) kh_key(h, k) |= 1<<8; 76 | } else { 77 | if ((kh_key(h, k) & 0xff) != 0xff) ++kh_key(h, k); 78 | if (is_high && (kh_key(h, k) >> 8 & 0x3f) != 0x3f) kh_key(h, k) += 1<<8; 79 | } 80 | __sync_lock_release(&h->lock); // unlock 81 | return 0; 82 | } 83 | 84 | int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]) 85 | { 86 | uint64_t key; 87 | cnthash_t *h; 88 | khint_t itr; 89 | h = get_subhash(ch, x, &key); 90 | itr = kh_get(cnt, h, key); 91 | return itr == kh_end(h)? -1 : kh_key(h, itr) & 0x3fff; 92 | } 93 | 94 | int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z) 95 | { 96 | uint64_t x[2]; 97 | bfc_kmer_hash(ch->k, z->x, x); 98 | return bfc_ch_get(ch, x); 99 | } 100 | 101 | uint64_t bfc_ch_count(const bfc_ch_t *ch) 102 | { 103 | int i; 104 | uint64_t cnt = 0; 105 | for (i = 0; i < 1<l_pre; ++i) 106 | cnt += kh_size(ch->h[i]); 107 | return cnt; 108 | } 109 | 110 | int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]) 111 | { 112 | int i, max_i = -1; 113 | uint64_t max; 114 | memset(cnt, 0, 256 * 8); 115 | memset(high, 0, 64 * 8); 116 | for (i = 0; i < 1<l_pre; ++i) { 117 | khint_t k; 118 | cnthash_t *h = ch->h[i]; 119 | for (k = 0; k != kh_end(h); ++k) 120 | if (kh_exist(h, k)) 121 | ++cnt[kh_key(h, k) & 0xff], ++high[kh_key(h, k)>>8 & 0x3f]; 122 | } 123 | for (i = 3, max = 0; i < 256; ++i) 124 | if (cnt[i] > max) 125 | max = cnt[i], max_i = i; 126 | return max_i; 127 | } 128 | 129 | int bfc_ch_get_k(const bfc_ch_t *ch) 130 | { 131 | return ch->k; 132 | } 133 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/htab.h: -------------------------------------------------------------------------------- 1 | #ifndef BFC_HTAB_H 2 | #define BFC_HTAB_H 3 | 4 | #include 5 | #include "kmer.h" 6 | 7 | #define BFC_CH_KEYBITS 50 8 | #define BFC_CH_MAXPRE 20 9 | 10 | struct bfc_ch_s; 11 | typedef struct bfc_ch_s bfc_ch_t; 12 | 13 | bfc_ch_t *bfc_ch_init(int k, int l_pre); 14 | void bfc_ch_destroy(bfc_ch_t *ch); 15 | int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced); 16 | int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]); 17 | uint64_t bfc_ch_count(const bfc_ch_t *ch); 18 | int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]); 19 | int bfc_ch_get_k(const bfc_ch_t *ch); 20 | 21 | int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/internal.h: -------------------------------------------------------------------------------- 1 | #ifndef FML_INTERNAL_H 2 | #define FML_INTERNAL_H 3 | 4 | #include "fml.h" 5 | 6 | extern unsigned char seq_nt6_table[256]; 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); 13 | void 
seq_reverse(int l, unsigned char *s); 14 | void seq_revcomp6(int l, unsigned char *s); 15 | struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads); 16 | 17 | #ifdef __cplusplus 18 | } 19 | #endif 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/kmer.h: -------------------------------------------------------------------------------- 1 | #ifndef BFC_KMER_H 2 | #define BFC_KMER_H 3 | 4 | #include 5 | 6 | typedef struct { 7 | uint64_t x[4]; 8 | } bfc_kmer_t; 9 | 10 | static inline void bfc_kmer_append(int k, uint64_t x[4], int c) 11 | { // IMPORTANT: 0 <= c < 4 12 | uint64_t mask = (1ULL<>1)) & mask; 15 | x[2] = x[2]>>1 | (1ULL^(c&1))<<(k-1); 16 | x[3] = x[3]>>1 | (1ULL^c>>1) <<(k-1); 17 | } 18 | 19 | static inline void bfc_kmer_change(int k, uint64_t x[4], int d, int c) // d-bp from the 3'-end of k-mer; 0<=d>1)<>1)<<(k-1-d) | (x[3]&t); 27 | } 28 | 29 | // Thomas Wang's integer hash functions. See for a snapshot. 30 | static inline uint64_t bfc_hash_64(uint64_t key, uint64_t mask) 31 | { 32 | key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; 33 | key = key ^ key >> 24; 34 | key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 35 | key = key ^ key >> 14; 36 | key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21 37 | key = key ^ key >> 28; 38 | key = (key + (key << 31)) & mask; 39 | return key; 40 | } 41 | 42 | static inline uint64_t bfc_hash_64_inv(uint64_t key, uint64_t mask) 43 | { 44 | uint64_t tmp; 45 | 46 | // Invert key = key + (key << 31) 47 | tmp = (key - (key << 31)); 48 | key = (key - (tmp << 31)) & mask; 49 | 50 | // Invert key = key ^ (key >> 28) 51 | tmp = key ^ key >> 28; 52 | key = key ^ tmp >> 28; 53 | 54 | // Invert key *= 21 55 | key = (key * 14933078535860113213ull) & mask; 56 | 57 | // Invert key = key ^ (key >> 14) 58 | tmp = key ^ key >> 14; 59 | tmp = key ^ tmp >> 14; 60 | tmp = key ^ tmp >> 14; 61 | key = key ^ tmp >> 14; 62 | 63 | // Invert key *= 265 64 | key = (key * 15244667743933553977ull) & mask; 65 | 66 | // Invert key = key ^ (key >> 24) 67 | tmp = key ^ key >> 24; 68 | key = key ^ tmp >> 24; 69 | 70 | // Invert key = (~key) + (key << 21) 71 | tmp = ~key; 72 | tmp = ~(key - (tmp << 21)); 73 | tmp = ~(key - (tmp << 21)); 74 | key = ~(key - (tmp << 21)) & mask; 75 | 76 | return key; 77 | } 78 | 79 | static inline uint64_t bfc_kmer_hash(int k, const uint64_t x[4], uint64_t h[2]) 80 | { 81 | int t = k>>1, u = ((x[1]>>t&1) > (x[3]>>t&1)); // the middle base is always different 82 | uint64_t mask = (1ULL<>l&1)<<1 | (y[0]>>l&1)]; 102 | buf[k] = 0; 103 | return buf; 104 | } 105 | 106 | #endif 107 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/kstring.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial 
portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef KSTRING_H 27 | #define KSTRING_H 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | #ifndef kroundup32 34 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 35 | #endif 36 | 37 | #ifndef KSTRING_T 38 | #define KSTRING_T kstring_t 39 | typedef struct __kstring_t { 40 | size_t l, m; 41 | char *s; 42 | } kstring_t; 43 | #endif 44 | 45 | typedef struct { 46 | uint64_t tab[4]; 47 | int sep, finished; 48 | const char *p; // end of the current token 49 | } ks_tokaux_t; 50 | 51 | #ifdef __cplusplus 52 | extern "C" { 53 | #endif 54 | 55 | int ksprintf(kstring_t *s, const char *fmt, ...); 56 | int ksprintf_fast(kstring_t *s, const char *fmt, ...); 57 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); 58 | char *kstrstr(const char *str, const char *pat, int **_prep); 59 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep); 60 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); 61 | 62 | /* kstrtok() is similar to strtok_r() except that str is not 63 | * modified and both str and sep can be NULL. For efficiency, it is 64 | * actually recommended to set both to NULL in the subsequent calls 65 | * if sep is not changed. */ 66 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); 67 | 68 | #ifdef __cplusplus 69 | } 70 | #endif 71 | 72 | static inline void ks_resize(kstring_t *s, size_t size) 73 | { 74 | if (s->m < size) { 75 | s->m = size; 76 | kroundup32(s->m); 77 | s->s = (char*)realloc(s->s, s->m); 78 | } 79 | } 80 | 81 | static inline int kputsn(const char *p, int l, kstring_t *s) 82 | { 83 | if (s->l + l + 1 >= s->m) { 84 | s->m = s->l + l + 2; 85 | kroundup32(s->m); 86 | s->s = (char*)realloc(s->s, s->m); 87 | } 88 | memcpy(s->s + s->l, p, l); 89 | s->l += l; 90 | s->s[s->l] = 0; 91 | return l; 92 | } 93 | 94 | static inline int kputs(const char *p, kstring_t *s) 95 | { 96 | return kputsn(p, strlen(p), s); 97 | } 98 | 99 | static inline int kputc(int c, kstring_t *s) 100 | { 101 | if (s->l + 1 >= s->m) { 102 | s->m = s->l + 2; 103 | kroundup32(s->m); 104 | s->s = (char*)realloc(s->s, s->m); 105 | } 106 | s->s[s->l++] = c; 107 | s->s[s->l] = 0; 108 | return c; 109 | } 110 | 111 | static inline int kputw(int c, kstring_t *s) 112 | { 113 | char buf[16]; 114 | int l, x; 115 | if (c == 0) return kputc('0', s); 116 | for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 117 | if (c < 0) buf[l++] = '-'; 118 | if (s->l + l + 1 >= s->m) { 119 | s->m = s->l + l + 2; 120 | kroundup32(s->m); 121 | s->s = (char*)realloc(s->s, s->m); 122 | } 123 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; 124 | s->s[s->l] = 0; 125 | return 0; 126 | } 127 | 128 | static inline int kputuw(unsigned c, kstring_t *s) 129 | { 130 | char buf[16]; 131 | int l, i; 132 | unsigned x; 133 | if (c == 0) return kputc('0', s); 134 | for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 135 | if (s->l + l + 1 >= s->m) { 136 | s->m = s->l + l + 2; 137 | kroundup32(s->m); 138 | s->s = (char*)realloc(s->s, s->m); 139 | } 140 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 141 | s->s[s->l] = 0; 142 | return 0; 143 | } 144 | 145 | static inline int kputl(long c, kstring_t *s) 146 | { 147 | char buf[32]; 148 | long l, x; 149 | if (c == 0) return kputc('0', s); 150 | for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 151 | if (c < 0) buf[l++] = '-'; 152 | if (s->l + l + 1 >= s->m) { 153 | s->m = s->l + l + 2; 154 | kroundup32(s->m); 155 | s->s = (char*)realloc(s->s, s->m); 156 | } 157 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; 158 | s->s[s->l] = 0; 159 | return 0; 160 | } 161 | 162 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) 163 | { 164 | int max = 0, *offsets = 0; 165 | *n = ksplit_core(s->s, delimiter, &max, &offsets); 166 | return offsets; 167 | } 168 | 169 | #endif 170 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/ksw.h: -------------------------------------------------------------------------------- 1 | #ifndef __AC_KSW_H 2 | #define __AC_KSW_H 3 | 4 | #include 5 | 6 | #define KSW_XBYTE 0x10000 7 | #define KSW_XSTOP 0x20000 8 | #define KSW_XSUBO 0x40000 9 | #define KSW_XSTART 0x80000 10 | 11 | struct _kswq_t; 12 | typedef struct _kswq_t kswq_t; 13 | 14 | typedef struct { 15 | int score; // best score 16 | int te, qe; // target end and query end 17 | int score2, te2; // second best score and ending position on the target 18 | int tb, qb; // target start and query start 19 | } kswr_t; 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | /** 26 | * Aligning two sequences 27 | * 28 | * @param qlen length of the query sequence (typically 2 | #include 3 | #include 4 | 5 | /************ 6 | * kt_for() * 7 | ************/ 8 | 9 | struct kt_for_t; 10 | 11 | typedef struct { 12 | struct kt_for_t *t; 13 | long i; 14 | } ktf_worker_t; 15 | 16 | typedef struct kt_for_t { 17 | int n_threads; 18 | long n; 19 | ktf_worker_t *w; 20 | void (*func)(void*,long,int); 21 | void *data; 22 | } kt_for_t; 23 | 24 | static inline long steal_work(kt_for_t *t) 25 | { 26 | int i, min_i = -1; 27 | long k, min = LONG_MAX; 28 | for (i = 0; i < t->n_threads; ++i) 29 | if (min > t->w[i].i) min = t->w[i].i, min_i = i; 30 | k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); 31 | return k >= t->n? 
-1 : k; 32 | } 33 | 34 | static void *ktf_worker(void *data) 35 | { 36 | ktf_worker_t *w = (ktf_worker_t*)data; 37 | long i; 38 | for (;;) { 39 | i = __sync_fetch_and_add(&w->i, w->t->n_threads); 40 | if (i >= w->t->n) break; 41 | w->t->func(w->t->data, i, w - w->t->w); 42 | } 43 | while ((i = steal_work(w->t)) >= 0) 44 | w->t->func(w->t->data, i, w - w->t->w); 45 | pthread_exit(0); 46 | } 47 | 48 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) 49 | { 50 | if (n_threads > 1) { 51 | int i; 52 | kt_for_t t; 53 | pthread_t *tid; 54 | t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; 55 | t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); 56 | tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); 57 | for (i = 0; i < n_threads; ++i) 58 | t.w[i].t = &t, t.w[i].i = i; 59 | for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); 60 | for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); 61 | } else { 62 | long j; 63 | for (j = 0; j < n; ++j) func(data, j, 0); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 
45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | 53 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 54 | 55 | #define kvec_t(type) struct { size_t n, m; type *a; } 56 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 57 | #define kv_destroy(v) free((v).a) 58 | #define kv_A(v, i) ((v).a[(i)]) 59 | #define kv_pop(v) ((v).a[--(v).n]) 60 | #define kv_size(v) ((v).n) 61 | #define kv_max(v) ((v).m) 62 | 63 | #define kv_resize(type, v, s) do { \ 64 | if ((v).m < (s)) { \ 65 | (v).m = (s); \ 66 | kv_roundup32((v).m); \ 67 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 68 | } \ 69 | } while (0) 70 | 71 | #define kv_copy(type, v1, v0) do { \ 72 | if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ 73 | (v1).n = (v0).n; \ 74 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 75 | } while (0) \ 76 | 77 | #define kv_push(type, v, x) do { \ 78 | if ((v).n == (v).m) { \ 79 | (v).m = (v).m? (v).m<<1 : 2; \ 80 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 81 | } \ 82 | (v).a[(v).n++] = (x); \ 83 | } while (0) 84 | 85 | #define kv_pushp(type, v, p) do { \ 86 | if ((v).n == (v).m) { \ 87 | (v).m = (v).m? (v).m<<1 : 2; \ 88 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 89 | } \ 90 | *(p) = &(v).a[(v).n++]; \ 91 | } while (0) 92 | 93 | #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ 94 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ 95 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 96 | : (v).n <= (size_t)(i)? (v).n = (i) \ 97 | : 0), (v).a[(i)] 98 | 99 | #define kv_reverse(type, v, start) do { \ 100 | if ((v).m > 0 && (v).n > (start)) { \ 101 | size_t __i, __end = (v).n - (start); \ 102 | type *__a = (v).a + (start); \ 103 | for (__i = 0; __i < __end>>1; ++__i) { \ 104 | type __t = __a[__end - 1 - __i]; \ 105 | __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ 106 | } \ 107 | } \ 108 | } while (0) 109 | 110 | #endif 111 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/mag.h: -------------------------------------------------------------------------------- 1 | #ifndef FM_MOG_H 2 | #define FM_MOG_H 3 | 4 | #include 5 | #include 6 | #include "kstring.h" 7 | #include "fml.h" 8 | 9 | #ifndef KINT_DEF 10 | #define KINT_DEF 11 | typedef struct { uint64_t x, y; } ku128_t; 12 | typedef struct { size_t n, m; uint64_t *a; } ku64_v; 13 | typedef struct { size_t n, m; ku128_t *a; } ku128_v; 14 | #endif 15 | 16 | typedef struct { 17 | int len, nsr; // length; number supporting reads 18 | uint32_t max_len;// allocated seq/cov size 19 | uint64_t k[2]; // bi-interval 20 | ku128_v nei[2]; // neighbors 21 | char *seq, *cov; // sequence and coverage 22 | void *ptr; // additional information 23 | } magv_t; 24 | 25 | typedef struct { size_t n, m; magv_t *a; } magv_v; 26 | 27 | typedef struct mag_t { 28 | magv_v v; 29 | float rdist; // read distance 30 | int min_ovlp; // minimum overlap seen from the graph 31 | void *h; 32 | } mag_t; 33 | 34 | struct mogb_aux; 35 | typedef struct mogb_aux mogb_aux_t; 36 | 37 | #ifdef __cplusplus 38 | extern "C" { 39 | #endif 40 | 41 | void mag_init_opt(magopt_t *o); 42 | void mag_g_clean(mag_t *g, const magopt_t *opt); 43 | 44 | void mag_g_destroy(mag_t *g); 45 | void mag_g_amend(mag_t *g); 46 | void mag_g_build_hash(mag_t *g); 47 | void mag_g_print(const mag_t *g); 48 | int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr); 49 | void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int 
min_len, int min_nsr); 50 | void mag_g_merge(mag_t *g, int rmdup, int min_merge_len); 51 | void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist); 52 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive); 53 | void mag_g_pop_open(mag_t *g, int min_elen); 54 | void mag_g_trim_open(mag_t *g, const magopt_t *opt); 55 | 56 | void mag_v_copy_to_empty(magv_t *dst, const magv_t *src); // NB: memory leak if dst is allocated 57 | void mag_v_del(mag_t *g, magv_t *p); 58 | void mag_v_write(const magv_t *p, kstring_t *out); 59 | void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen); 60 | 61 | uint64_t mag_tid2idd(void *h, uint64_t tid); 62 | void mag_v128_clean(ku128_v *r); 63 | double mag_cal_rdist(mag_t *g); 64 | 65 | #ifdef __cplusplus 66 | } 67 | #endif 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/mrope.h: -------------------------------------------------------------------------------- 1 | #ifndef MROPE_H_ 2 | #define MROPE_H_ 3 | 4 | #include "rope.h" 5 | 6 | #define MR_SO_IO 0 7 | #define MR_SO_RLO 1 8 | #define MR_SO_RCLO 2 9 | 10 | typedef struct { 11 | uint8_t so; // sorting order 12 | int thr_min; // when there are fewer sequences than this, disable multi-threading 13 | rope_t *r[6]; 14 | } mrope_t; // multi-rope 15 | 16 | typedef struct { 17 | mrope_t *r; 18 | int a, to_free; 19 | rpitr_t i; 20 | } mritr_t; 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | /** 27 | * Initiate a multi-rope 28 | * 29 | * @param max_nodes maximum number of nodes in an internal node; use ROPE_DEF_MAX_NODES (64) if unsure 30 | * @param block_len maximum block length in an external node; use ROPE_DEF_BLOCK_LEN (512) if unsure 31 | * @param sorting_order the order in which sequences are added; possible values defined by the MR_SO_* macros 32 | */ 33 | mrope_t *mr_init(int max_nodes, int block_len, int sorting_order); 34 | 35 | void mr_destroy(mrope_t *r); 36 | 37 | int mr_thr_min(mrope_t *r, int thr_min); 38 | 39 | /** 40 | * Insert one string into the index 41 | * 42 | * @param r multi-rope 43 | * @param str the *reverse* of the input string (important: it is reversed!)
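 *
 * A minimal calling sketch (hypothetical, with symbol codes 0..5 standing
 * for $ACGTN as in rle_print() elsewhere in this package; the caller is
 * assumed to reverse the read and terminate it with the 0 sentinel):
 *
 *   mrope_t *mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
 *   uint8_t rev[] = { 4, 3, 2, 1, 0 };  // "ACGT" reversed -> "TGCA"
 *   mr_insert1(mr, rev);
 *   mr_destroy(mr);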
44 | */ 45 | int64_t mr_insert1(mrope_t *r, const uint8_t *str); 46 | 47 | /** 48 | * Insert multiple strings 49 | * 50 | * @param mr multi-rope 51 | * @param len total length of $s 52 | * @param s concatenated, NULL delimited, reversed input strings 53 | * @param is_thr true to use 5 threads 54 | */ 55 | void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr); 56 | 57 | void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy); 58 | #define mr_rank1a(mr, x, cx) mr_rank2a(mr, x, -1, cx, 0) 59 | 60 | /** 61 | * Put the iterator at the start of the index 62 | * 63 | * @param r multi-rope 64 | * @param i iterator to be initialized 65 | * @param to_free if true, free visited buckets 66 | */ 67 | void mr_itr_first(mrope_t *r, mritr_t *i, int to_free); 68 | 69 | /** 70 | * Iterate to the next block 71 | * 72 | * @param i iterator 73 | * 74 | * @return pointer to the start of a block; see rle.h for decoding the block 75 | */ 76 | const uint8_t *mr_itr_next_block(mritr_t *i); 77 | 78 | #ifdef __cplusplus 79 | } 80 | #endif 81 | 82 | static inline int64_t mr_get_c(const mrope_t *mr, int64_t c[6]) 83 | { 84 | int a, b; 85 | int64_t tot = 0; 86 | for (a = 0; a < 6; ++a) c[a] = 0; 87 | for (a = 0; a < 6; ++a) { 88 | for (b = 0; b < 6; ++b) 89 | c[b] += mr->r[a]->c[b]; 90 | for (b = 0; b < 6; ++b) tot += mr->r[a]->c[b]; 91 | } 92 | return tot; 93 | } 94 | 95 | static inline int64_t mr_get_ac(const mrope_t *mr, int64_t ac[7]) 96 | { 97 | int a; 98 | int64_t c[6], tot; 99 | tot = mr_get_c(mr, c); 100 | for (a = 1, ac[0] = 0; a <= 6; ++a) ac[a] = ac[a-1] + c[a-1]; 101 | return tot; 102 | } 103 | 104 | static inline int64_t mr_get_tot(const mrope_t *mr) 105 | { 106 | int a, b; 107 | int64_t tot = 0; 108 | for (a = 0; a < 6; ++a) 109 | for (b = 0; b < 6; ++b) 110 | tot += mr->r[a]->c[b]; 111 | return tot; 112 | } 113 | 114 | #endif 115 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/rld0.h: -------------------------------------------------------------------------------- 1 | #ifndef RLDELTA0_H 2 | #define RLDELTA0_H 3 | 4 | #define _DNA_ONLY 5 | 6 | #include <stdint.h> 7 | #include <stdlib.h> 8 | #include <string.h> 9 | #include <stdio.h> 10 | 11 | #define RLD_LBITS 23 12 | #define RLD_LSIZE (1<<RLD_LBITS) 13 | #define RLD_LMASK (RLD_LSIZE - 1) 73 | #define rld_last_blk(e) ((e)->n_bytes>>3>>(e)->sbits<<(e)->sbits) 74 | #define rld_seek_blk(e, k) ((e)->z[(k)>>RLD_LBITS] + ((k)&RLD_LMASK)) 75 | #define rld_get_stail(e, itr) ((itr)->shead + (e)->ssize - ((itr)->shead + (e)->ssize - *(itr)->i == RLD_LSIZE? 2 : 1)) 76 | 77 | #define rld_block_type(x) ((uint64_t)(x)>>62) 78 | 79 | static inline int64_t rld_dec0(const rld_t *e, rlditr_t *itr, int *c) 80 | { 81 | int w; 82 | uint64_t x; 83 | int64_t l, y = 0; 84 | x = itr->p[0] << (64 - itr->r) | (itr->p != itr->stail && itr->r != 64?
itr->p[1] >> itr->r : 0); 85 | if (x>>63 == 0) { 86 | if ((w = 0x333333335555779bll>>(x>>59<<2)&0xf) == 0xb && x>>58 == 0) return 0; 87 | l = (x >> (64 - w)) - 1; 88 | y = x << w >> (64 - l) | 1u << l; 89 | w += l; 90 | } else w = y = 1; 91 | *c = x << w >> (64 - e->abits); 92 | w += e->abits; 93 | if (itr->r > w) itr->r -= w; 94 | else ++itr->p, itr->r = 64 + itr->r - w; 95 | return y; 96 | } 97 | 98 | static inline int64_t rld_dec(const rld_t *e, rlditr_t *itr, int *_c, int is_free) 99 | { 100 | int64_t l = rld_dec0(e, itr, _c); 101 | if (l == 0 || *_c > e->asize) { 102 | uint64_t last = rld_last_blk(e); 103 | if (itr->p - *itr->i > RLD_LSIZE - e->ssize) { 104 | if (is_free) { 105 | free(*itr->i); *itr->i = 0; 106 | } 107 | itr->shead = *++itr->i; 108 | } else itr->shead += e->ssize; 109 | if (itr->shead == rld_seek_blk(e, last)) return -1; 110 | itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)]; 111 | itr->q = (uint8_t*)itr->p; 112 | itr->stail = rld_get_stail(e, itr); 113 | itr->r = 64; 114 | return rld_dec0(e, itr, _c); 115 | } else return l; 116 | } 117 | 118 | // take k symbols from e0 and write it to e 119 | static inline void rld_dec_enc(rld_t *e, rlditr_t *itr, const rld_t *e0, rlditr_t *itr0, int64_t k) 120 | { 121 | if (itr0->l >= k) { // there are more pending symbols 122 | rld_enc(e, itr, k, itr0->c); 123 | itr0->l -= k; // l - k symbols remain 124 | } else { // use up all pending symbols 125 | int c = -1; // to please gcc 126 | int64_t l; 127 | rld_enc(e, itr, itr0->l, itr0->c); // write all pending symbols 128 | k -= itr0->l; 129 | for (; k > 0; k -= l) { // we always go into this loop because l0<k 130 | l = rld_dec(e0, itr0, &c, 1); 131 | rld_enc(e, itr, k < l? k : l, c); 132 | } 133 | itr0->l = -k; itr0->c = c; 134 | } 135 | } 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/rle.c: -------------------------------------------------------------------------------- 1 | #include <string.h> 2 | #include <stdint.h> 3 | #include <stdlib.h> 4 | #include <stdio.h> 5 | #include "rle.h" 6 | 7 | const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 }; 8 | 9 | // insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase 10 | int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]) 11 | { 12 | uint16_t *nptr = (uint16_t*)block; 13 | int diff; 14 | 15 | block += 2; // skip the first 2 counting bytes 16 | if (*nptr == 0) { 17 | memset(cnt, 0, 48); 18 | diff = rle_enc1(block, a, rl); 19 | } else { 20 | uint8_t *p, *end = block + *nptr, *q; 21 | int64_t pre, z, l = 0, tot, beg_l; 22 | int c = -1, n_bytes = 0, n_bytes2, t = 0; 23 | uint8_t tmp[24]; 24 | beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5]; 25 | tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; 26 | if (x < beg_l) { 27 | beg_l = 0, *beg = 0; 28 | memset(bc, 0, 48); 29 | } 30 | if (x == beg_l) { 31 | p = q = block + (*beg); z = beg_l; 32 | memcpy(cnt, bc, 48); 33 | } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward 34 | z = beg_l; p = block + (*beg); 35 | memcpy(cnt, bc, 48); 36 | while (z < x) { 37 | rle_dec1(p, c, l); 38 | z += l; cnt[c] += l; 39 | } 40 | for (q = p - 1; *q>>6 == 2; --q); 41 | } else { // backward 42 | memcpy(cnt, ec, 48); 43 | z = tot; p = end; 44 | while (z >= x) { 45 | --p; 46 | if (*p>>6 != 2) { 47 | l |= *p>>7?
(int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; 48 | z -= l; cnt[*p&7] -= l; 49 | l = 0; t = 0; 50 | } else { 51 | l |= (*p&0x3fL) << t; 52 | t += 6; 53 | } 54 | } 55 | q = p; 56 | rle_dec1(p, c, l); 57 | z += l; cnt[c] += l; 58 | } 59 | *beg = q - block; 60 | memcpy(bc, cnt, 48); 61 | bc[c] -= l; 62 | n_bytes = p - q; 63 | if (x == z && a != c && p < end) { // then try the next run 64 | int tc; 65 | int64_t tl; 66 | q = p; 67 | rle_dec1(q, tc, tl); 68 | if (a == tc) 69 | c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl; 70 | } 71 | if (z != x) cnt[c] -= z - x; 72 | pre = x - (z - l); p -= n_bytes; 73 | if (a == c) { // insert to the same run 74 | n_bytes2 = rle_enc1(tmp, c, l + rl); 75 | } else if (x == z) { // at the end; append to the existing run 76 | p += n_bytes; n_bytes = 0; 77 | n_bytes2 = rle_enc1(tmp, a, rl); 78 | } else { // break the current run 79 | n_bytes2 = rle_enc1(tmp, c, pre); 80 | n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl); 81 | n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre); 82 | } 83 | if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed 84 | memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes); 85 | memcpy(p, tmp, n_bytes2); 86 | diff = n_bytes2 - n_bytes; 87 | } 88 | return (*nptr += diff); 89 | } 90 | 91 | int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6]) 92 | { 93 | int beg = 0; 94 | int64_t bc[6]; 95 | memset(bc, 0, 48); 96 | return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc); 97 | } 98 | 99 | void rle_split(uint8_t *block, uint8_t *new_block) 100 | { 101 | int n = *(uint16_t*)block; 102 | uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1); 103 | while (*q>>6 == 2) --q; 104 | memcpy(new_block + 2, q, end - q); 105 | *(uint16_t*)new_block = end - q; 106 | *(uint16_t*)block = q - block - 2; 107 | } 108 | 109 | void rle_count(const uint8_t *block, int64_t cnt[6]) 110 | { 111 | const uint8_t *q = block + 2, *end = q + *(uint16_t*)block; 112 | while (q < end) { 113 | int c; 114 | int64_t l; 115 | rle_dec1(q, c, l); 116 | cnt[c] += l; 117 | } 118 | } 119 | 120 | void rle_print(const uint8_t *block, int expand) 121 | { 122 | const uint16_t *p = (const uint16_t*)block; 123 | const uint8_t *q = block + 2, *end = block + 2 + *p; 124 | while (q < end) { 125 | int c; 126 | int64_t l, x; 127 | rle_dec1(q, c, l); 128 | if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]); 129 | else printf("%c%ld", "$ACGTN"[c], (long)l); 130 | } 131 | putchar('\n'); 132 | } 133 | 134 | void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]) 135 | { 136 | int a; 137 | int64_t tot, cnt[6]; 138 | const uint8_t *p; 139 | 140 | y = y >= x? y : x; 141 | tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; 142 | if (tot == 0) return; 143 | if (x <= (tot - y) + (tot>>3)) { 144 | int c = 0; 145 | int64_t l, z = 0; 146 | memset(cnt, 0, 48); 147 | p = block + 2; 148 | while (z < x) { 149 | rle_dec1(p, c, l); 150 | z += l; cnt[c] += l; 151 | } 152 | for (a = 0; a != 6; ++a) cx[a] += cnt[a]; 153 | cx[c] -= z - x; 154 | if (cy) { 155 | while (z < y) { 156 | rle_dec1(p, c, l); 157 | z += l; cnt[c] += l; 158 | } 159 | for (a = 0; a != 6; ++a) cy[a] += cnt[a]; 160 | cy[c] -= z - y; 161 | } 162 | } else { 163 | #define move_backward(_x) \ 164 | while (z >= (_x)) { \ 165 | --p; \ 166 | if (*p>>6 != 2) { \ 167 | l |= *p>>7? 
(int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \ 168 | z -= l; cnt[*p&7] -= l; \ 169 | l = 0; t = 0; \ 170 | } else { \ 171 | l |= (*p&0x3fL) << t; \ 172 | t += 6; \ 173 | } \ 174 | } \ 175 | 176 | int t = 0; 177 | int64_t l = 0, z = tot; 178 | memcpy(cnt, ec, 48); 179 | p = block + 2 + *(const uint16_t*)block; 180 | if (cy) { 181 | move_backward(y) 182 | for (a = 0; a != 6; ++a) cy[a] += cnt[a]; 183 | cy[*p&7] += y - z; 184 | } 185 | move_backward(x) 186 | for (a = 0; a != 6; ++a) cx[a] += cnt[a]; 187 | cx[*p&7] += x - z; 188 | 189 | #undef move_backward 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/rle.h: -------------------------------------------------------------------------------- 1 | #ifndef RLE6_H_ 2 | #define RLE6_H_ 3 | 4 | #include <stdint.h> 5 | 6 | #ifdef __GNUC__ 7 | #define LIKELY(x) __builtin_expect((x),1) 8 | #else 9 | #define LIKELY(x) (x) 10 | #endif 11 | #ifdef __cplusplus 12 | 13 | extern "C" { 14 | #endif 15 | 16 | int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]); 17 | int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]); 18 | void rle_split(uint8_t *block, uint8_t *new_block); 19 | void rle_count(const uint8_t *block, int64_t cnt[6]); 20 | void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]); 21 | #define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec) 22 | 23 | void rle_print(const uint8_t *block, int expand); 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | /****************** 30 | *** 43+3 codec *** 31 | ******************/ 32 | 33 | const uint8_t rle_auxtab[8]; 34 | 35 | #define RLE_MIN_SPACE 18 36 | #define rle_nptr(block) ((uint16_t*)(block)) 37 | 38 | // decode one run (c,l) and move the pointer p 39 | #define rle_dec1(p, c, l) do { \ 40 | (c) = *(p) & 7; \ 41 | if (LIKELY((*(p)&0x80) == 0)) { \ 42 | (l) = *(p)++ >> 3; \ 43 | } else if (LIKELY(*(p)>>5 == 6)) { \ 44 | (l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \ 45 | (p) += 2; \ 46 | } else { \ 47 | int n = ((*(p)&0x10) >> 2) + 4; \ 48 | (l) = *(p)++ >> 3 & 1; \ 49 | while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \ 50 | } \ 51 | } while (0) 52 | 53 | static inline int rle_enc1(uint8_t *p, int c, int64_t l) 54 | { 55 | if (l < 1LL<<4) { 56 | *p = l << 3 | c; 57 | return 1; 58 | } else if (l < 1LL<<8) { 59 | *p = 0xC0 | l >> 6 << 3 | c; 60 | p[1] = 0x80 | (l & 0x3f); 61 | return 2; 62 | } else if (l < 1LL<<19) { 63 | *p = 0xE0 | l >> 18 << 3 | c; 64 | p[1] = 0x80 | (l >> 12 & 0x3f); 65 | p[2] = 0x80 | (l >> 6 & 0x3f); 66 | p[3] = 0x80 | (l & 0x3f); 67 | return 4; 68 | } else { 69 | int i, shift = 36; 70 | *p = 0xF0 | l >> 42 << 3 | c; 71 | for (i = 1; i < 8; ++i, shift -= 6) 72 | p[i] = 0x80 | (l>>shift & 0x3f); 73 | return 8; 74 | } 75 | } 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/rope.h: -------------------------------------------------------------------------------- 1 | #ifndef ROPE_H_ 2 | #define ROPE_H_ 3 | 4 | #include <stdint.h> 5 | #include <stddef.h> 6 | 7 | #define ROPE_MAX_DEPTH 80 8 | #define ROPE_DEF_MAX_NODES 64 9 | #define ROPE_DEF_BLOCK_LEN 512 10 | 11 | typedef struct rpnode_s { 12 | struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs) 13 | uint64_t l:54, n:9, is_bottom:1; // $n and
$is_bottom are only set for the first node in a bucket 14 | int64_t c[6]; // marginal counts 15 | } rpnode_t; 16 | 17 | typedef struct { 18 | int32_t max_nodes, block_len; // both MUST BE even numbers 19 | int64_t c[6]; // marginal counts 20 | rpnode_t *root; 21 | void *node, *leaf; // memory pool 22 | } rope_t; 23 | 24 | typedef struct { 25 | const rope_t *rope; // the rope 26 | const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes 27 | int ia[ROPE_MAX_DEPTH]; // index in the parent nodes 28 | int d; // the current depth in the B+-tree 29 | } rpitr_t; 30 | 31 | typedef struct { 32 | int beg; 33 | int64_t bc[6]; 34 | uint8_t *p; 35 | } rpcache_t; 36 | 37 | #ifdef __cplusplus 38 | extern "C" { 39 | #endif 40 | 41 | rope_t *rope_init(int max_nodes, int block_len); 42 | void rope_destroy(rope_t *rope); 43 | int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache); 44 | void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy); 45 | #define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0) 46 | 47 | void rope_itr_first(const rope_t *rope, rpitr_t *i); 48 | const uint8_t *rope_itr_next_block(rpitr_t *i); 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /nim-stuffs/fermil-nim/test/MT-simu.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brentp/bio-playground/1982a222328bca7b675dedaa3887600a7d3dad74/nim-stuffs/fermil-nim/test/MT-simu.fq.gz -------------------------------------------------------------------------------- /plots/README.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | manhattan-plot.py 4 | ================= 5 | 6 | Generate a manhattan plot given a file with (at least) 7 | chromosome, x, p-value. 8 | Offers a number of command-line options and the code is simple 9 | enough that it can be further customized. 10 | 11 | The input file(s) (or stdin via -) can be in any format but 12 | the user must specify which columns to extract for chr, x, p 13 | using the --cols parameter. 14 | 15 | Usage:: 16 | 17 | manhattan-plot.py [options] files 18 | 19 | plot a manhattan plot of the input file(s). 20 | 21 | 22 | Options: 23 | -h, --help show this help message and exit 24 | --no-log don't do -log10(p) on the value 25 | --cols=COLS zero-based column indexes to get chr, position, p-value 26 | respectively e.g. 0,1,2 27 | --colors=COLORS cycle through these colors 28 | --image=IMAGE save the image to this file. e.g. manhattan.png 29 | --title=TITLE title for the image. 30 | --ymax=YMAX max (logged) y-value for plot 31 | --sep=SEP data separator, default is [tab] 32 | --lines plot the p-values as lines extending from the x-axis rather 33 | than points in space. plotting will take longer with this 34 | option. 35 | 36 | 37 | a command like:: 38 | 39 | $ python manhattan-plot.py --cols 0,1,6 input.bed 40 | 41 | generates 42 | 43 | .. image:: https://github.com/brentp/bio-playground/raw/master/plots/images/manhattan.png 44 | 45 | there are a number of options, including adding custom colors:: 46 | 47 | $ python manhattan-plot.py --colors rgbk --cols 0,1,6 input.bed --image manhattan.rgbk.png 48 | 49 | generates 50 | 51 | ..
image:: https://github.com/brentp/bio-playground/raw/master/plots/images/manhattan.rgbk.png 52 | 53 | -------------------------------------------------------------------------------- /plots/images/manhattan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brentp/bio-playground/1982a222328bca7b675dedaa3887600a7d3dad74/plots/images/manhattan.png -------------------------------------------------------------------------------- /plots/images/manhattan.rgbk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brentp/bio-playground/1982a222328bca7b675dedaa3887600a7d3dad74/plots/images/manhattan.rgbk.png -------------------------------------------------------------------------------- /plots/manhattan-plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | %prog [options] files 3 | 4 | plot a manhattan plot of the input file(s). 5 | """ 6 | 7 | import optparse 8 | import sys 9 | from itertools import groupby, cycle 10 | from operator import itemgetter 11 | from matplotlib import pyplot as plt 12 | import numpy as np 13 | 14 | def _gen_data(fhs, columns, sep): 15 | """ 16 | iterate over the files and yield chr, start, pvalue 17 | """ 18 | for fh in fhs: 19 | for line in fh: 20 | if line[0] == "#": continue 21 | toks = line.split(sep) 22 | yield toks[columns[0]], int(toks[columns[1]]), float(toks[columns[2]]) 23 | 24 | def chr_cmp(a, b): 25 | a = a.lower().replace("_", ""); b = b.lower().replace("_", "") 26 | achr = a[3:] if a.startswith("chr") else a 27 | bchr = b[3:] if b.startswith("chr") else b 28 | 29 | try: 30 | return cmp(int(achr), int(bchr)) 31 | except ValueError: 32 | if achr.isdigit() and not bchr.isdigit(): return -1 33 | if bchr.isdigit() and not achr.isdigit(): return 1 34 | # X Y 35 | return cmp(achr, bchr) 36 | 37 | 38 | def chr_loc_cmp(alocs, blocs): 39 | return chr_cmp(alocs[0], blocs[0]) or cmp(alocs[1], blocs[1]) 40 | 41 | 42 | 43 | def manhattan(fhs, columns, image_path, no_log, colors, sep, title, lines, ymax): 44 | 45 | xs = [] 46 | ys = [] 47 | cs = [] 48 | colors = cycle(colors) 49 | xs_by_chr = {} 50 | 51 | last_x = 0 52 | data = sorted(_gen_data(fhs, columns, sep), cmp=chr_loc_cmp) 53 | 54 | for seqid, rlist in groupby(data, key=itemgetter(0)): 55 | color = colors.next() 56 | rlist = list(rlist) 57 | region_xs = [last_x + r[1] for r in rlist] 58 | xs.extend(region_xs) 59 | ys.extend([r[2] for r in rlist]) 60 | cs.extend([color] * len(rlist)) 61 | 62 | xs_by_chr[seqid] = (region_xs[0] + region_xs[-1]) / 2 63 | 64 | # keep track so that chrs don't overlap. 65 | last_x = xs[-1] 66 | 67 | xs_by_chr = [(k, xs_by_chr[k]) for k in sorted(xs_by_chr.keys(), cmp=chr_cmp)] 68 | 69 | xs = np.array(xs) 70 | ys = np.array(ys) if no_log else -np.log10(ys) 71 | 72 | plt.close() 73 | f = plt.figure() 74 | ax = f.add_axes((0.1, 0.09, 0.88, 0.85)) 75 | 76 | if title is not None: 77 | plt.title(title) 78 | 79 | ax.set_ylabel('-log10(p-value)') 80 | if lines: 81 | ax.vlines(xs, 0, ys, colors=cs, alpha=0.5) 82 | else: 83 | ax.scatter(xs, ys, s=2, c=cs, alpha=0.8, edgecolors='none') 84 | 85 | # plot 0.05 line after multiple testing. 
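# i.e. a Bonferroni correction: with len(data) tests, a family-wise alpha of
# 0.05 gives a per-test cutoff of 0.05 / len(data), so the line is drawn at
# -log10(0.05 / len(data)) on the -log10 scale used for the y-axis.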
86 | ax.axhline(y=-np.log10(0.05 / len(data)), color='0.5', linewidth=2) 87 | plt.axis('tight') 88 | plt.xlim(0, xs[-1]) 89 | plt.ylim(ymin=0) 90 | if ymax is not None: plt.ylim(ymax=ymax) 91 | plt.xticks([c[1] for c in xs_by_chr], [c[0] for c in xs_by_chr], rotation=-90, size=8.5) 92 | print >>sys.stderr, "saving to: %s" % image_path 93 | plt.savefig(image_path) 94 | #plt.show() 95 | 96 | 97 | def get_filehandles(args): 98 | return (open(a) if a != "-" else sys.stdin for a in args) 99 | 100 | 101 | def main(): 102 | p = optparse.OptionParser(__doc__) 103 | p.add_option("--no-log", dest="no_log", help="don't do -log10(p) on the value", 104 | action='store_true', default=False) 105 | p.add_option("--cols", dest="cols", help="zero-based column indexes to get" 106 | " chr, position, p-value respectively e.g. %default", default="0,1,2") 107 | p.add_option("--colors", dest="colors", help="cycle through these colors", 108 | default="bk") 109 | p.add_option("--image", dest="image", help="save the image to this file. e.g. %default", 110 | default="manhattan.png") 111 | p.add_option("--title", help="title for the image.", default=None, dest="title") 112 | p.add_option("--ymax", help="max (logged) y-value for plot", dest="ymax", type='float') 113 | p.add_option("--sep", help="data separator, default is [tab]", 114 | default="\t", dest="sep") 115 | p.add_option("--lines", default=False, dest="lines", action="store_true", 116 | help="plot the p-values as lines extending from the x-axis rather than" 117 | " points in space. plotting will take longer with this option.") 118 | 119 | opts, args = p.parse_args() 120 | if (len(args) == 0): 121 | sys.exit(not p.print_help()) 122 | fhs = get_filehandles(args) 123 | columns = map(int, opts.cols.split(",")) 124 | manhattan(fhs, columns, opts.image, opts.no_log, opts.colors, opts.sep, 125 | opts.title, opts.lines, opts.ymax) 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /reads-utils/README.rst: -------------------------------------------------------------------------------- 1 | 2 | fastq 3 | ===== 4 | 5 | `summarize` and `filter` fastq files. 6 | 7 | 8 | build 9 | ----- 10 | :: 11 | 12 | g++ -O2 -o fastq fastq.cpp 13 | 14 | 15 | usage 16 | ----- 17 | 18 | summarize 19 | +++++++++ 20 | 21 | codon usage table: 22 | :: 23 | 24 | fastq summarize --adjust 64 --codon some.fastq 25 | 26 | creates a table (to STDOUT) and a link to a google api chart showing codon usage at 27 | each basepair in the read. 28 | 29 | .. 
image:: http://chart.apis.google.com/chart?chbh=a&chdl=A|C|G|T|N&chdlp=l&cht=bvs&chs=800x300&chco=FF0000,00FF00,0000FF,FFFF00,CCCCCC&chxt=x,x&chxr=0,1,76,75&chxl=1:|read%20position|&chxp=1,50&chds=0,1565&chd=t:523,548,490,474,473,461,509,462,453,509,460,455,505,507,460,455,451,452,453,503,456,501,453,451,452,453,496,498,452,498,451,451,450,448,448,448,448,494,449,448,496,447,449,494,493,447,444,443,446,446,491,447,492,446,443,445,488,446,446,446,447,447,448,488,450,444,444,444,444,444,437,443,442,444,442,439|77,20,21,27,27,75,29,28,77,28,28,32,29,30,30,29,78,76,29,31,29,29,30,76,31,31,30,31,32,31,31,78,30,32,41,32,77,31,30,78,32,31,32,31,32,32,33,81,80,34,35,35,35,82,82,41,40,39,83,44,85,47,45,45,46,50,95,98,58,57,102,63,62,106,69,70|390,496,369,316,319,325,337,375,333,337,384,381,339,340,334,387,342,340,391,344,389,346,343,343,391,392,348,346,396,348,396,351,396,396,350,349,351,352,397,349,353,400,398,355,352,354,401,354,357,403,358,402,361,356,362,406,363,364,365,366,368,409,371,371,372,414,376,377,420,383,375,381,383,385,387,425|568,499,682,745,745,701,687,697,700,688,690,694,690,686,739,692,691,695,689,685,689,687,736,693,688,685,687,686,682,683,682,679,684,683,715,729,684,682,683,683,678,682,681,679,682,725,682,680,676,676,673,673,669,674,670,662,668,709,665,703,659,655,693,653,691,652,643,640,636,674,626,671,671,624,660,624|3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,2,2,1,2,3,4,9,4,3,3,4,4,4,3,3,3,3,3,3,4,3,4,5,5,5,4,5,8,4,4,3,4,3,4,5,4,4,3,4,4,5,4,22,4,3,3,4,4 30 | 31 | quality usage table:: 32 | 33 | fastq summarize --adjust 64 --codon some.fastq 34 | 35 | creates a table with columns of quality values for percentiles: 36 | 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99 and for median. each 37 | row is a read position (useful for seeing how quality declines with 38 | increasing read position). 39 | 40 | filter 41 | ++++++ 42 | 43 | run ./fastq with no arguments to see help for filtering. 44 | -------------------------------------------------------------------------------- /reads-utils/color/convert-cs.py: -------------------------------------------------------------------------------- 1 | """ 2 | convert colorspace reads with(out) quality to fastq(a). usage: 3 | %s reads.csfasta [reads_qv.qual] > some.fastq(a) 4 | 5 | if .qual file is not specified, the output is fasta. otherwise, it is fastq. 6 | quals are encoded like illumina-1.3+ reads with offset 64. 7 | """ 8 | import sys 9 | from methylcoder import cs2seq 10 | import os.path as op 11 | from itertools import izip 12 | 13 | __doc__ %= sys.argv[0] 14 | 15 | 16 | def check_exists(path): 17 | if path is None: return 18 | if not op.exists(path): 19 | print >>sys.stderr, "%s does not exist" % path 20 | sys.exit(1) 21 | 22 | def exhaust_comments(fh, doprint=False): 23 | if fh is None: return 24 | pos = fh.tell() 25 | while True: 26 | line = fh.readline() 27 | if line[0] != "#": break 28 | if doprint: 29 | print line, 30 | pos = fh.tell() 31 | fh.seek(pos) 32 | 33 | def print_fastq(fc, fq): 34 | 35 | pairs = izip(fc, fq) 36 | while True: 37 | try: 38 | seq_header, qual_header = pairs.next() 39 | except StopIteration: 40 | break 41 | assert seq_header == qual_header, (seq_header, qual_header) 42 | assert seq_header[0] == ">" 43 | cs, qual = pairs.next() 44 | qual = qual.strip().split(" ") 45 | print "@%s" % seq_header[1:].strip() 46 | print cs2seq(cs.strip()) 47 | print "+" 48 | # since the qual is 1 short, just add an extra here to the end. 
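# (chr(q + 33) below writes Sanger/Phred+33 characters -- note the 33 here,
# even though the module docstring above mentions offset 64; the final value
# is duplicated because the .qual file holds one fewer value than the
# decoded sequence has bases.)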
49 | print "".join(chr(int(q) + 33) for q in qual) + chr(int(qual[-1]) + 33) 50 | 51 | def print_fasta(fc): 52 | header = fc.readline() 53 | while header: 54 | print header, 55 | print cs2seq(fc.readline().strip()) 56 | header = fc.readline() 57 | 58 | 59 | def main(csfasta, quals=None): 60 | check_exists(csfasta) 61 | check_exists(quals) 62 | fc = open(csfasta) 63 | fq = open(quals) if quals else None 64 | 65 | exhaust_comments(fc, True) 66 | exhaust_comments(fq) 67 | 68 | if quals is None: 69 | print_fasta(fc) 70 | else: 71 | print_fastq(fc, fq) 72 | 73 | 74 | 75 | if __name__ == "__main__": 76 | if len(sys.argv) < 2: 77 | print >>sys.stderr, __doc__ 78 | sys.exit() 79 | 80 | format = "fasta" if len(sys.argv) == 2 else "fastq" 81 | csfasta = sys.argv[1] 82 | quals = None if format == "fasta" else sys.argv[2] 83 | main(csfasta, quals) 84 | -------------------------------------------------------------------------------- /reads-utils/fastq_pair_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | %prog [options] pair_1.fastq pair_2.fastq 4 | 5 | filter reads from paired fastq so that no unmatching reads remain. 6 | output files are pair_1.fastq.trim and pair_2.fastq.trim 7 | see: http://hackmap.blogspot.com/2010/09/filtering-paired-end-reads-high.html 8 | """ 9 | __version__ = "0.1.0" 10 | 11 | from subprocess import Popen, PIPE 12 | import sys 13 | 14 | FASTX_CLIPPER="fastx_clipper" 15 | FASTQ_QUALITY_TRIMMER="fastq_quality_trimmer" 16 | def gen_pairs(fha, fhb, min_len, fastq): 17 | def gen_headers(fastq): 18 | fq = open(fastq) 19 | r = fq.readline().rstrip("\r\n") 20 | while r: 21 | fq.readline() 22 | fq.readline() 23 | fq.readline() 24 | yield r[:-2] 25 | r = fq.readline().rstrip("\r\n") 26 | aread, bread = fha.readline, fhb.readline 27 | get_a = lambda: [aread().rstrip("\r\n") for i in range(4)] 28 | get_b = lambda: [bread().rstrip("\r\n") for i in range(4)] 29 | 30 | ah, bh = None, None 31 | header_gen = gen_headers(fastq) 32 | for header in header_gen: 33 | a = get_a() 34 | ah = a[0][:-2] 35 | b = get_b() 36 | bh = b[0][:-2] 37 | 38 | while not header in (ah, bh): 39 | header = header_gen.next() 40 | 41 | if bh != header: 42 | while ah != bh and ah: 43 | a = get_a() 44 | ah = a[0][:-2] 45 | while header != bh: 46 | header = header_gen.next() 47 | if ah != header: 48 | while ah != bh and bh: 49 | b = get_b() 50 | bh = b[0][:-2] 51 | while header != bh: 52 | header = header_gen.next() 53 | if not ah and bh: 54 | raise StopIteration 55 | 56 | assert ah == bh 57 | if len(a[1]) < min_len or len(b[1]) < min_len: continue 58 | yield a, b 59 | 60 | def main(adaptors, M, t, min_len, fastqs, sanger=False): 61 | cmds = [] 62 | for fastq in fastqs: 63 | cmd = [] 64 | for i, a in enumerate(adaptors): 65 | if M == 0: 66 | matches = len(a) 67 | else: 68 | matches = min(M, len(a)) 69 | cmd.append("%s -a %s -M %i %s -l 0" \ 70 | % (FASTX_CLIPPER, a, matches, "-Q 33" if sanger else "")) #, min_len)) 71 | 72 | trim_cmd = "%s -t %i -l 0" % (FASTQ_QUALITY_TRIMMER, t) #, min_len) 73 | if sanger: trim_cmd += " -Q 33" 74 | cmd.append(trim_cmd) 75 | cmd[0] += " < %s" % fastq 76 | 77 | cmds.append(" | ".join(cmd)) 78 | print "[running]:", cmds[-1] 79 | procs = [Popen(cmd, stdout=PIPE, shell=True) for cmd in cmds] 80 | 81 | 82 | trima = open("%s.trim" % fastqs[0], 'w') 83 | trimb = open("%s.trim" % fastqs[1], 'w') 84 | print >>sys.stderr, "writing %s and %s" % (trima.name, trimb.name) 85 | 86 | # no temporary file, just read from stdouts. 
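# gen_pairs re-synchronizes the two clipped/trimmed streams: it walks the
# headers of the original fastq in order and advances whichever stream has
# fallen behind, so only pairs that survived filtering in both files are kept.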
87 | for ra, rb in gen_pairs(procs[0].stdout, procs[1].stdout, min_len, 88 | fastqs[0]): 89 | print >>trima, "\n".join(ra) 90 | print >>trimb, "\n".join(rb) 91 | 92 | returncode = 0 93 | for p in procs: 94 | p.wait() 95 | returncode |= p.returncode 96 | if returncode != 0: 97 | print >>sys.stderr, "ERROR: non-zero returncode from fastx toolkit" 98 | sys.exit(returncode) 99 | 100 | if __name__ == "__main__": 101 | import optparse 102 | p = optparse.OptionParser(__doc__) 103 | p.add_option("-a", dest="a", help="adaptor sequence to clip seperate multiples with ','", default="") 104 | 105 | p.add_option("-M", dest="M", help="require minimum adapter alignment length of N." 106 | " If less than N nucleotides aligned with the adapter - don't clip it." 107 | " default 0 means to require the full length of the adaptor to match. ", 108 | default=0, type='int') 109 | 110 | p.add_option("-t", dest="t", help="Quality threshold - nucleotides with lower" 111 | " quality will be trimmed (from the end of the sequence ", 112 | type='int', default=0) 113 | 114 | p.add_option("-l", dest="l", help="Minimum length - sequences shorter than this (after trimming)" 115 | "will be discarded. Default = 0 = no minimum length.", 116 | type="int", default=0) 117 | p.add_option("--sanger", dest="sanger", help="quality scores are ascii 33 sanger encoded (default is 64)", action="store_true") 118 | 119 | opts, fastqs = p.parse_args() 120 | fastqs[-1] = fastqs[-1].rstrip() 121 | if not (fastqs and len(fastqs)) == 2: 122 | sys.exit(p.print_help()) 123 | 124 | adaptors = [ad.strip() for ad in opts.a.split(",") if ad.strip()] 125 | main(adaptors, opts.M, opts.t, opts.l, fastqs, opts.sanger) 126 | -------------------------------------------------------------------------------- /reads-utils/guess-encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Guess the encoding of a stream of qual lines. 6 | 7 | Accepts only quality scores as input, either on STDIN or 8 | from a file provided as an argument. 9 | 10 | Use cases: `awk 'NR % 4 == 0' | %prog [options]`, 11 | `%prog [options] `, 12 | `samtools view | cut -f 11 | %prog [options]` 13 | """ 14 | 15 | from __future__ import with_statement, division, print_function 16 | 17 | import fileinput 18 | import operator 19 | import optparse 20 | import sys 21 | 22 | from collections import Counter 23 | 24 | # Note that the theoretical maximum for all encodings is 126. 25 | # The upper limits below are for "typical" data only. 26 | RANGES = { 27 | 'Sanger': (33, 73), 28 | 'Illumina-1.8': (33, 74), 29 | 'Solexa': (59, 104), 30 | 'Illumina-1.3': (64, 104), 31 | 'Illumina-1.5': (66, 105) 32 | } 33 | 34 | # The threshold to decide between Illumina-1.3 and Illumina-1.5 35 | # based upon how common "B" is. The threshold insists it is 36 | # within the Nth most common quality scores. 37 | # N.B. needs to be conservative, as this is applied per input line. 38 | N_MOST_COMMON_THRESH = 4 39 | 40 | 41 | def get_qual_range(qual_str): 42 | """ 43 | >>> get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV") 44 | (68, 89...) 
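>>> get_qual_range("!!")  # '!' is ASCII 33, the Sanger/Phred+33 minimum
(33, 33...)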
45 | """ 46 | 47 | qual_val_counts = Counter(ord(qual_char) for qual_char in qual_str) 48 | 49 | min_base_qual = min(qual_val_counts.keys()) 50 | max_base_qual = max(qual_val_counts.keys()) 51 | 52 | return (min_base_qual, max_base_qual, qual_val_counts) 53 | 54 | 55 | def get_encodings_in_range(rmin, rmax, ranges=RANGES): 56 | valid_encodings = [] 57 | for encoding, (emin, emax) in ranges.items(): 58 | if rmin >= emin and rmax <= emax: 59 | valid_encodings.append(encoding) 60 | return valid_encodings 61 | 62 | 63 | def heuristic_filter(valid, qual_val_counts): 64 | """Apply heuristics to particular ASCII value scores 65 | to try to narrow-down the encoding, beyond min/max. 66 | """ 67 | 68 | if 'Illumina-1.5' in valid: 69 | # 64–65: Phred+64 quality scores of 0–1 ('@'–'A') 70 | # unused in Illumina 1.5+ 71 | if qual_val_counts[64] > 0 or qual_val_counts[65] > 0: 72 | valid.remove('Illumina-1.5') 73 | 74 | # 66: Phred+64 quality score of 2 'B' 75 | # used by Illumina 1.5+ as QC indicator 76 | elif 66 in map(operator.itemgetter(0), 77 | qual_val_counts.most_common(N_MOST_COMMON_THRESH)): 78 | print("# A large number of 'B' quality scores (value 2, ASCII 66) " 79 | "were detected, which makes it likely that this encoding is " 80 | "Illumina-1.5, which has been returned as the only option.", 81 | file=sys.stderr) 82 | valid = ['Illumina-1.5'] 83 | 84 | return valid 85 | 86 | 87 | def main(): 88 | p = optparse.OptionParser(__doc__) 89 | p.add_option("-n", dest="n", help="number of qual lines to test default:-1" 90 | " means test until end of file or until it is possible to " 91 | " determine a single file-type", 92 | type='int', default=-1) 93 | 94 | opts, args = p.parse_args() 95 | 96 | if len(args) > 1: 97 | print("Only a single input file is supported.", file=sys.stderr) 98 | sys.exit(1) 99 | 100 | gmin = 99 101 | gmax = 0 102 | valid = [] 103 | 104 | err_exit = False 105 | 106 | input_file = fileinput.input(args, openhook=fileinput.hook_compressed) 107 | 108 | for i, line in enumerate(input_file): 109 | if i == 0: 110 | input_filename_for_disp = fileinput.filename() 111 | 112 | if fileinput.isstdin(): 113 | input_filename_for_disp = 'STDIN' 114 | 115 | print("# reading qualities from " 116 | "{}".format(input_filename_for_disp), file=sys.stderr) 117 | 118 | lmin, lmax, qual_val_counts = get_qual_range(line.rstrip()) 119 | 120 | if lmin < gmin or lmax > gmax: 121 | gmin, gmax = min(lmin, gmin), max(lmax, gmax) 122 | valid = get_encodings_in_range(gmin, gmax) 123 | 124 | valid = heuristic_filter(valid, qual_val_counts) 125 | 126 | if len(valid) == 0: 127 | print("no encodings for range: " 128 | "{}".format((gmin, gmax)), file=sys.stderr) 129 | err_exit = True 130 | break 131 | 132 | if len(valid) == 1 and opts.n == -1: 133 | # parsed entire file and found unique guess 134 | break 135 | 136 | if opts.n > 0 and i > opts.n: 137 | # parsed up to specified portion; return current guess(es) 138 | break 139 | 140 | input_file.close() 141 | 142 | if err_exit: 143 | sys.exit(1) 144 | else: 145 | print("{}\t{}\t{}".format(",".join(valid), gmin, gmax)) 146 | 147 | 148 | if __name__ == "__main__": 149 | import doctest 150 | if doctest.testmod(optionflags=doctest.ELLIPSIS | 151 | doctest.NORMALIZE_WHITESPACE).failed == 0: 152 | main() 153 | -------------------------------------------------------------------------------- /reads-utils/select-random-pairs.py: -------------------------------------------------------------------------------- 1 | """ 2 | take paired end files and generate a new 3 | set of
paired end files with only a random 4 | subset of reads. 5 | 6 | Usage: 7 | 8 | python %s reads_1.fastq reads_2.fastq 100 9 | 10 | will take 100 random reads (still paired) from each file 11 | and create the new files reads_1.fastq.subset and reads_2.fastq.subset 12 | """ 13 | 14 | import random 15 | import sys 16 | 17 | def write_random_records(fqa, fqb, N=100000): 18 | """ get N random headers from a fastq file without reading the 19 | whole thing into memory""" 20 | records = sum(1 for _ in open(fqa)) / 4 21 | rand_records = sorted([random.randint(0, records - 1) for _ in xrange(N)]) 22 | 23 | fha, fhb = open(fqa), open(fqb) 24 | suba, subb = open(fqa + ".subset", "w"), open(fqb + ".subset", "w") 25 | rec_no = -1 26 | written = 0 27 | for rr in rand_records: 28 | while rec_no < rr: 29 | for i in range(4): fha.readline() 30 | for i in range(4): fhb.readline() 31 | rec_no += 1 32 | for i in range(4): 33 | suba.write(fha.readline()) 34 | subb.write(fhb.readline()) 35 | rec_no += 1 36 | written += 1 37 | assert written == N 38 | 39 | print >>sys.stderr, "wrote to %s, %s" % (suba.name, subb.name) 40 | 41 | if __name__ == "__main__": 42 | if len(sys.argv) < 3: 43 | print __doc__ % sys.argv[0] 44 | sys.exit() 45 | 46 | N = 100 if len(sys.argv) < 4 else int(sys.argv[3]) 47 | write_random_records(sys.argv[1], sys.argv[2], N) 48 | -------------------------------------------------------------------------------- /solidstuff/bfastq-to-bwa.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import string 3 | 4 | encoder = string.maketrans('0123.', 'ACGTN') 5 | 6 | for i, line in enumerate(sys.stdin, start=0): 7 | if i % 4 == 1: 8 | # double encode sequence 9 | assert line[0] == "T" 10 | print line[2:-1].translate(encoder) 11 | elif i % 4 == 3: 12 | # drop first qual 13 | print line[1:], 14 | else: 15 | print line, 16 | -------------------------------------------------------------------------------- /solidstuff/color-qual-replace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Color aware aligners (BFAST, Shrimp, ...) output color base-qualities in the 4 | 'CQ:Z' tag. Variant callers will ignore this. 5 | This script replaces the existing base-quality (column 11) with the color-base 6 | quality in the CQ:Z tag (if it exists). 7 | It reads from stdin and writes to stdout.
usage like: 8 | 9 | $ samtools view -h input.bam | python color-qual-replace.py \ 10 | | samtools view -bS - > output.bam 11 | """ 12 | import sys 13 | 14 | for line in sys.stdin: 15 | if line[0] == "@" or not "CQ:Z:" in line: print line, 16 | else: 17 | toks = line.rstrip("\r\n").split("\t") 18 | for t in toks[11:]: 19 | if t.startswith("CQ:Z:"): 20 | toks[10] = t[5:] 21 | break 22 | else: 23 | raise Exception("BAD") 24 | 25 | print "\t".join(toks) 26 | -------------------------------------------------------------------------------- /solidstuff/test.csfasta: -------------------------------------------------------------------------------- 1 | >2_124_343_F3 2 | T220002332212021310112110211010110001200302211.2232 3 | -------------------------------------------------------------------------------- /solidstuff/test.qual: -------------------------------------------------------------------------------- 1 | >2_124_343_F3 2 | 33 33 16 24 27 33 32 17 29 33 27 31 5 30 28 25 18 4 4 6 24 31 12 10 24 26 28 4 4 12 17 16 9 4 11 17 5 8 6 4 17 4 6 10 9 -1 17 5 7 14 3 | -------------------------------------------------------------------------------- /subject_genes_from_query/subject_genes_from_query.py: -------------------------------------------------------------------------------- 1 | """ 2 | no annotations for the subject, so the blast(z) is just query features against 3 | subject genomic sequence. 4 | we use the subject hits as the new features by: 5 | + overlapping subject hits are merged into single HSPs 6 | + nearby subject hits to the same query are merged into single HSPs 7 | + no introns are recorded. 8 | 9 | bed and feature fasta files are created. 10 | """ 11 | import sys 12 | import collections 13 | from pyfasta import Fasta 14 | import numpy as np 15 | import os.path as op 16 | from itertools import tee, izip 17 | 18 | blast_file = sys.argv[1] 19 | subject_fasta_file = sys.argv[2] 20 | 21 | out_fasta = "%s.features%s" % op.splitext(subject_fasta_file) 22 | 23 | by_subject = collections.defaultdict(list) 24 | fa = Fasta(subject_fasta_file) 25 | 26 | def pairwise(iterable): 27 | "s -> (s0,s1), (s1,s2), (s2, s3), ..."
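# standard itertools recipe: pairwise([1, 2, 3]) -> (1, 2), (2, 3);
# used below to compare consecutive HSPs from the same query/subject pair.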
28 | a, b = tee(iterable) 29 | next(b, None) 30 | return izip(a, b) 31 | 32 | by_subject = {} 33 | seqids = [] 34 | for seqid in fa.iterkeys(): 35 | by_subject[seqid] = np.zeros((len(fa[seqid]) + 1,), dtype=np.uint8) 36 | seqids.append((len(fa[seqid]),seqid)) 37 | 38 | 39 | by_query_subject = collections.defaultdict(dict) 40 | for line in open(blast_file): 41 | args = line.split("\t") 42 | query, subject = args[0], args[1] 43 | sstart, sstop = sorted(map(int, args[8:10])) 44 | by_subject[subject][sstart: sstop + 1] |= 1 45 | if not subject in by_query_subject[query]: 46 | by_query_subject[query][subject] = [(sstart, sstop)] 47 | else: 48 | by_query_subject[query][subject].append((sstart, sstop)) 49 | 50 | # if 2 HSPs are nearby on the subject and from the same query, merge them into a single HSP 51 | NEAR_SAME = 1000 52 | for query in by_query_subject: 53 | for subject in by_query_subject[query]: 54 | li = sorted(by_query_subject[query][subject]) 55 | if len(li) < 2: continue 56 | for alocs, blocs in pairwise(li): 57 | 58 | if blocs[0] - alocs[1] < NEAR_SAME: 59 | by_subject[subject][alocs[1]: blocs[0] + 1] |= 1 60 | 61 | 62 | 63 | print >>sys.stderr, "writing features.fasta to: %s" % out_fasta 64 | out = open(out_fasta, "w") 65 | 66 | for seqlen, seqid in sorted(seqids, reverse=True): 67 | masks = by_subject[seqid] 68 | starts, = np.where((masks[:-1] == 0) & (masks[1:] == 1)) 69 | ends, = np.where((masks[:-1] == 1) & (masks[1:] == 0)) 70 | 71 | for s, e in zip(starts, ends): 72 | assert s < e, (s, e, seqid) 73 | # add 1 for 0-based and 1 for the [1:] 74 | start = s + 2 75 | end = e + 1 76 | name = "%s-%i-%i" % (seqid, start, end) 77 | print "%s\t%i\t%i\t%s" % (seqid, s + 1, e, name) 78 | print >>out, ">" + name 79 | print >>out, fa[seqid][start - 1: end] 80 | -------------------------------------------------------------------------------- /superbed/README.rst: -------------------------------------------------------------------------------- 1 | Annotate a Bed File 2 | =================== 3 | 4 | Given a file in some kind of bed format (at least the first 3 cols are chr start end), 5 | generate a new file with 2 extra columns: gene, distance. 6 | In cases where the distance is zero, the feature type(s) where the overlap occurred is 7 | reported. These could be introns/exons/utrs, etc. 8 | 9 | Example Workflow 10 | ================ 11 | Get the data from UCSC (or your local mirror). 12 | :: 13 | 14 | ORG=hg19 15 | mysql -D $ORG -e "select chrom,txStart,txEnd,cdsStart,cdsEnd,K.name,X.geneSymbol,proteinID,strand,exonStarts,exonEnds from knownGene as K,kgXref as X where X.kgId=K.name" > $ORG.notbed 16 | 17 | Check the actual command in UCSC if you do not have a local DB set up. 18 | 19 | 20 | create a bed6 file with a line for each column:: 21 | 22 | python superbed.py $ORG.notbed > $ORG.super.bed 23 | 24 | 25 | install `bedtools`_ and `pybedtools`_ 26 | 27 | annotate some data with `superanno.py`:: 28 | 29 | python superanno.py -a my.bed -b $ORG.super.bed --header > my.annotated.bed 30 | 31 | my.annotated.bed will now have 2 extra columns: gene(s), distance/feature_type.
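
As an illustration (made-up coordinates and gene), an input line like::

    chr1    11873   11935

might come back as::

    chr1    11873   11935   DDX11L1 exon

when the interval overlaps an exon of DDX11L1, or with a distance in
basepairs in the final column when it does not overlap the gene.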
32 | -------------------------------------------------------------------------------- /superbed/superbed.py: -------------------------------------------------------------------------------- 1 | """ 2 | Separate the data from UCSC into a bed6 file with columns of: 3 | 4 | chrom, start, end, name, type, strand 5 | 6 | where type is one of: 7 | 8 | intron, exon, utr5, utr3, utr5_intron, utr3_intron 9 | 10 | exon or intron can be prefixed with 'nc_' for non-coding 11 | 12 | Usage: 13 | 14 | %s [options] output.from.ucsc 15 | 16 | the file output.from.ucsc can be extracted via mysql:: 17 | 18 | mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D $ORG -P 3306 \ 19 | -e "SELECT chrom,txStart,txEnd,cdsStart,cdsEnd,K.name,X.geneSymbol, 20 | proteinID,strand,exonStarts,exonEnds 21 | FROM knownGene as K,kgXref as X WHERE X.kgId=K.name" \ 22 | > output.from.ucsc 23 | 24 | where $ORG is something like hg19 or mm8. The header must be present for this 25 | to work. 26 | 27 | for refGene, this can be something like:: 28 | SELECT chrom,txStart,txEnd,cdsStart,cdsEnd,name2 as 29 | name,strand,exonStarts,exonEnds FROM refGene 30 | """ 31 | from __future__ import print_function 32 | import sys 33 | 34 | def reader(fname, sep="\t"): 35 | r""" 36 | for each row in the file `fname` generate dicts based on the header 37 | in the first row. 38 | """ 39 | line_gen = (l.rstrip("\r\n").split(sep) for l in open(fname)) 40 | header = line_gen.next() 41 | header[0] = header[0].lstrip("#") 42 | for toks in line_gen: 43 | yield dict(zip(header, toks)) 44 | 45 | def print_line(start, end, ftype, d): 46 | 47 | if start == end: 48 | assert "intron" in ftype, ((start, end, ftype, d)) 49 | return 50 | else: 51 | assert start < end 52 | 53 | print("%s\t%i\t%i\t%s\t%s\t%s" % (d['chrom'], start, end, 54 | d['full_name'], ftype, d['strand'])) 55 | 56 | def print_introns(starts_ends, d, ftype="intron"): 57 | starts, ends = zip(*starts_ends) 58 | 59 | # first possible intron is between end of first and start of 2nd intron. 60 | for start, end in zip(ends[:-1], starts[1:]): 61 | print_line(start, end, ftype, d) 62 | 63 | def print_exons(starts_ends, d, ftype="exon"): 64 | 65 | if d['cdsStart'] != d['cdsEnd']: 66 | starts_ends[0][0] = d['cdsStart'] 67 | starts_ends[-1][1] = d['cdsEnd'] 68 | 69 | for start, end in starts_ends: 70 | print_line(start, end, ftype, d) 71 | 72 | 73 | def print_noncoding_utrs(starts_ends, d, ftype): 74 | 75 | for i, (start, end) in enumerate(starts_ends): 76 | print_line(start, end, ftype, d) 77 | if (i + 1) < len(starts_ends): 78 | intron_start = end 79 | intron_end = starts_ends[i + 1][0] 80 | print_line(intron_start, intron_end, ftype + "_intron", d) 81 | 82 | 83 | def print_features(d): 84 | starts_ends = zip(d['exonStarts'], d['exonEnds']) 85 | coding = d['cdsStart'] != d['cdsEnd'] 86 | if coding: 87 | cds_starts_ends = [[s, e] for (s, e) in starts_ends if s < d['cdsEnd'] \ 88 | and e > d['cdsStart']] 89 | 90 | utr_lefts = [[s, e] for (s, e) in starts_ends \ 91 | if s < d['cdsStart']] 92 | utr_rights = [[s, e] for (s, e) in starts_ends \ 93 | if e > d['cdsEnd']] 94 | # extend it to the cds start/end.
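# the flanking UTR exon is clipped so utr_lefts ends exactly at cdsStart
# and utr_rights starts at cdsEnd; print_exons() below clamps the exon list
# to the coding region, so the coding remainder of these exons still prints.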
95 | if utr_lefts: 96 | utr_lefts[-1][1] = d['cdsStart'] 97 | if utr_rights: 98 | utr_rights[0][0] = d['cdsEnd'] 99 | 100 | 101 | utr5s = utr_lefts if d['strand'] == '+' else utr_rights 102 | utr3s = utr_lefts if d['strand'] == '-' else utr_rights 103 | 104 | print_noncoding_utrs(utr5s, d, 'utr5') 105 | print_noncoding_utrs(utr3s, d, 'utr3') 106 | else: 107 | cds_starts_ends = starts_ends 108 | 109 | 110 | print_exons(cds_starts_ends, d, "exon" if coding else "nc_exon") 111 | print_introns(cds_starts_ends, d, "intron" if coding else "nc_intron") 112 | 113 | 114 | def superbed(ucsc_file): 115 | 116 | for d in reader(ucsc_file): 117 | for k in ('txStart', 'txEnd', 'cdsStart', 'cdsEnd'): 118 | d[k] = int(d[k]) 119 | assert d['exonStarts'][-1] == "," == d['exonEnds'][-1] 120 | d['exonStarts'] = map(int, d['exonStarts'][:-1].split(",")) 121 | d['exonEnds'] = map(int, d['exonEnds'][:-1].split(",")) 122 | assert len(d['exonStarts']) == len(d['exonEnds']) 123 | d['full_name'] = (",".join((d['name'], d.get('geneSymbol', '')))).rstrip(",") 124 | print_features(d) 125 | 126 | 127 | def main(args=sys.argv[1:]): 128 | if len(args) != 1: 129 | print(__doc__ % sys.argv[0]) 130 | sys.exit(1) 131 | 132 | superbed(args[0]) 133 | 134 | if __name__ == "__main__": 135 | import doctest 136 | if doctest.testmod(optionflags=doctest.ELLIPSIS |\ 137 | doctest.NORMALIZE_WHITESPACE).failed == 0: 138 | main() 139 | -------------------------------------------------------------------------------- /utils/README.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | join 4 | ==== 5 | 6 | Usage:: 7 | join.py [options] filea:col# fileb:col# 8 | 9 | join filea with fileb by looking for the same value in col# 10 | 11 | col numbers use 0-based indexing. 12 | 13 | can never get linux join to work as I want. also handles files with different 14 | separators. 15 | 16 | 17 | Options: 18 | -h, --help show this help message and exit 19 | --sepa=SEPA separator for 1st file 20 | --sepb=SEPB separator for 2nd file 21 | -x only print the shared column once. 22 | 23 | gene-list overlap 24 | ================= 25 | 26 | Uses the hypergeometric distribution to calculate the probability of gene-list 27 | overlap. 28 | Given 2 lists (or gene sets) drawn from a pool of `GG` genes, with `AA` genes in 29 | list-A and `BB` genes in list-B, what is the 30 | probability that they share `SS` genes? :: 31 | 32 | $ ./list_overlap_p.py SS GG AA BB 33 | 34 | e.g.:: 35 | 36 | $ ./list_overlap_p.py 10 30000 345 322 37 | 0.0043679470685 38 | 39 | gives the probability that 10 genes are shared in 2 random lists of length 345 and 40 | 322 given that those lists are drawn from a set of 30K genes. 41 | -------------------------------------------------------------------------------- /utils/find-peaks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find peak-regions given a file of values (likely -log10(p)). 3 | %prog [options] data.file 4 | 5 | e.g. 6 | 7 | %prog --skip 1000 --seed 5 --threshold 3 my.data.txt 8 | 9 | will seed on values in the 2nd column that are larger than `5` and 10 | extend as long as it continues to find values greater than `3` within `1000` 11 | basepairs in either direction--where the location is determined by the 1st 12 | column. 13 | 14 | if the -g option is used, the columns are: "chromosome" "position" "value" 15 | otherwise, they are: "position" "value".
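e.g. with -g, a tab-separated input line looks like: chr1  12345  5.2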
16 | The file must be sorted by columns 1, 2 with `-g` and column 1 without `-g` 17 | 18 | If --keep-cols is specified the final output column includes values from each 19 | specified column *and* the value column (3rd column) for any rows above 20 | the threshold. 21 | """ 22 | 23 | import argparse 24 | from itertools import groupby 25 | from operator import itemgetter 26 | from toolshed import reader 27 | 28 | def gen_regions(fh, skip, seed, threshold, group, keep_cols, report_cutoff): 29 | if group == False: 30 | def fhgen(): # insert None so that they all get grouped the same... 31 | for row in fh: 32 | row.insert(0, None) 33 | yield row 34 | fhiter = fhgen() 35 | else: 36 | fhiter = fh 37 | 38 | # turn the indexes into a function that returns their values. 39 | if keep_cols: 40 | # also keep the p-value... 41 | keep_cols.append(2) 42 | col_getter = itemgetter(*keep_cols) 43 | else: 44 | keep_cols, col_getter = None, None 45 | for key, grouped in groupby(fhiter, itemgetter(0)): 46 | for region in find_region(grouped, skip, seed, threshold, col_getter, 47 | report_cutoff): 48 | yield key, region 49 | 50 | def get_and_clear_region(region, col_getter, cutoff): 51 | start, end = region[0][0], region[-1][0] 52 | # r looks like: (67390903, ['chr10', '673903', '3.831', 'mm9-10-67390903']) 53 | rows = (r[1] for r in region if float(r[1][2]) > cutoff) 54 | 55 | extra = "|".join([",".join(col_getter(r)) for r in rows] if col_getter else []) 56 | l = len(region) 57 | region[:] = [] 58 | return start, end, l, extra 59 | 60 | 61 | def find_region(aiter, skip, seed, threshold, col_getter, report_cutoff): 62 | current_region = [] 63 | seeded = False 64 | for row in aiter: 65 | chrom, pos, val = row[:3] 66 | pos = int(pos) 67 | val = float(val) 68 | # first check if we are too far away to continue the region. 69 | if seeded and pos - current_region[-1][0] > skip: 70 | yield get_and_clear_region(current_region, col_getter, 71 | report_cutoff) 72 | assert current_region == [] 73 | seeded = False 74 | elif current_region != [] and pos - current_region[-1][0] > skip: 75 | current_region = [] 76 | assert seeded == False 77 | 78 | # if it's greater than the seed, everything's easy. 79 | if val >= seed: 80 | current_region.append((pos, row)) 81 | seeded = True 82 | elif val >= threshold: 83 | current_region.append((pos, row)) 84 | else: 85 | # nothing, it's not a large value 86 | pass 87 | 88 | if current_region and seeded: 89 | yield get_and_clear_region(current_region, col_getter, report_cutoff) 90 | 91 | 92 | 93 | def main(): 94 | p = argparse.ArgumentParser(description=__doc__) 95 | 96 | p.add_argument("-g", dest="group", help="group by the first column (usually" 97 | " chromosome or probe) [optional]", default=False, 98 | action="store_true") 99 | 100 | p.add_argument("--skip", dest="skip", help="Maximum number of intervening " 101 | "basepairs to skip before seeing a value. If this number is " 102 | "exceeded, the region is ended " 103 | "[default: %(default)s]", type=int, default=50000) 104 | p.add_argument("--min-region-size", dest="min-region", help="minimum " 105 | "length of the region. regions shorter than this are not printed " 106 | "[default: %(default)s] (no minimum)", type=int, default=0) 107 | p.add_argument("--seed", dest="seed", help="A value must be at least this" 108 | " large in order to seed a region.
[default: %default]", 109 | type=float, default=5.0) 110 | p.add_argument("--keep-cols", dest="keep", help="comma separated list of" 111 | "columns to add to the output data", default="") 112 | 113 | p.add_argument("--threshold", dest="threshold", help="After seeding, a value" 114 | "of at least this number can extend a region [default: " 115 | "%default]", type=float, default=3.0) 116 | p.add_argument("regions") 117 | 118 | args = p.parse_args() 119 | 120 | f = reader(args.regions, header=False, sep="\t") 121 | keep = [int(k) for k in args.keep.strip().split(",") if k] 122 | report_cutoff = args.seed 123 | for key, region in gen_regions(f, args.skip, args.seed, args.threshold, 124 | args.group, keep, report_cutoff): 125 | print key + "\t" + "\t".join(map(str, region)) 126 | 127 | 128 | if __name__ == "__main__": 129 | import doctest 130 | if doctest.testmod(optionflags=doctest.ELLIPSIS).failed == 0: 131 | main() 132 | -------------------------------------------------------------------------------- /utils/join.py: -------------------------------------------------------------------------------- 1 | """ 2 | %prog [options] filea:col# fileb:col# 3 | 4 | e.g. 5 | 6 | %prog --sepa , --sepb , f1.txt:2 f3.txt:5 7 | 8 | join filea with fileb by looking for the same value in col# 9 | 10 | col numbers are 0-based indexing. can key on multiple columns: 11 | 12 | %prob f1.txt:2:4 f3.txt:5:7 13 | 14 | will use columns 2 and 4 and check agains columns 5 and 7. 15 | """ 16 | import optparse 17 | import sys 18 | 19 | def join(fa, colsa, fb, colsb, sepa, sepb, remove): 20 | bgen = (line.rstrip("\n") for line in open(fb)) 21 | bdict = {} 22 | for line in bgen: 23 | # can have multiple keys and keep the header in case 24 | # file a has one also. 25 | if line[0] == "#": 26 | bdict['header'] = line[1:] 27 | continue 28 | toks = line.split(sepb) 29 | key = tuple(toks[ib] for ib in colsb) 30 | bdict[key] = line 31 | 32 | mismatches = 0 33 | for line in open(fa): 34 | if line[0] == "#": 35 | bstuff = bdict.get('header', '').split(sepb) 36 | if remove: 37 | for colb in sorted(colsb, reverse=True): 38 | del bstuff[colb] 39 | print line.rstrip("\r\n") + sepa + sepa.join(bstuff) 40 | continue 41 | 42 | toks = line.split(sepa) 43 | key = tuple(toks[cola] for cola in colsa) 44 | bstuff = bdict.get(key, "").split(sepb) 45 | mismatches += int(bstuff == ['']) 46 | if remove and bstuff and bstuff[0]: 47 | for colb in sorted(colsb, reverse=True): 48 | if bstuff != ['']: del bstuff[colb] 49 | 50 | print line.strip("\r\n") + sepa + sepa.join(bstuff) 51 | print >>sys.stderr, "%i lines did not match" % mismatches 52 | 53 | def main(): 54 | p = optparse.OptionParser(__doc__) 55 | p.add_option("--sepa", dest="sepa", default="\t", help="separator for 1st file") 56 | p.add_option("--sepb", dest="sepb", default="\t", help="separator for 2nd file") 57 | p.add_option("-x", dest="x", action="store_true", default=False, 58 | help="only print the shared column once.") 59 | opts, args = p.parse_args() 60 | if (len(args) != 2): 61 | sys.exit(not p.print_help()) 62 | a = args[0].split(":") 63 | fa, colsa = a[0], a[1:] 64 | b = args[1].split(":") 65 | fb, colsb = b[0], b[1:] 66 | 67 | join(fa, map(int, colsa), fb, map(int, colsb), opts.sepa, opts.sepb, opts.x) 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /utils/list_overlap_p.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | find 
-------------------------------------------------------------------------------- /utils/list_overlap_p.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | find the probability that an overlap as high as `shared_genes` is random,
4 | given the number of genes: `A_genes`, `B_genes` drawn from `total_genes`:
5 | 
6 | $ %prog shared_genes total_genes A_genes B_genes
7 | 
8 | e.g.:
9 | 
10 | $ %prog 10 30000 345 322
11 | 0.0043679470685
12 | 
13 | gives the probability that 2 random gene subsets (chosen from 30000 genes)
14 | of length 345 and 322 would share at least 10 genes by chance.
15 | 
16 | 
17 | This can also be called with 3 files of gene names:
18 | 
19 | $ %prog all_genes.txt A_genes.txt B_genes.txt
20 | 
21 | A_genes.txt and B_genes.txt are then intersected to get the numbers for
22 | the hypergeometric test. Each file must be a single column containing
23 | the gene name. The comparison *is* case sensitive.
24 | 
25 | 
26 | See: http://www.nslij-genetics.org/wli/pub/ieee-embs06.pdf
27 | 
28 | """
29 | import optparse
30 | import sys
31 | import os.path as op
32 | import scipy.stats as ss
33 | 
34 | def hypergeom(m, n, n1, n2):
35 |     """
36 |     >>> hypergeom(1, 1000, 1000, 1000) # has to be shared.
37 |     1.0
38 | 
39 |     >>> all(hypergeom(i, 1000, 1000, 1000) == 1.0 for i in range(100))
40 |     True
41 | 
42 |     >>> hypergeom(1, 30000, 20, 20)
43 |     0.013253396616299651
44 | 
45 |     >>> hypergeom(2, 30000, 20, 20)
46 |     7.9649366037104485e-05
47 | 
48 |     >>> hypergeom(11, 30000, 20, 20)
49 |     4.516176321800458e-11
50 | 
51 |     >>> hypergeom(10, 30000, 20, 20) # very low prob.
52 |     4.516176321800458e-11
53 | 
54 |     >>> hypergeom(20, 30000, 20, 20) # very low chance that all are shared.
55 |     4.516176321800458e-11
56 | 
57 |     """
58 |     if m <= 0: return 1.0
59 |     mmin = m - 1
60 |     mmax = min(n1, n2)
61 |     return ss.hypergeom.cdf(mmax, n, n1, n2) - ss.hypergeom.cdf(mmin, n, n1, n2)
62 | 
63 | def with_genes(fftot, ffa, ffb):
64 |     """
65 |     given 3 gene lists, calculate the p-value of the shared
66 |     genes between fa and fb that are drawn from ftot.
67 |     """
68 |     ftot = frozenset(f.strip() for f in open(fftot) if f.strip())
69 |     fa = frozenset(f.strip() for f in open(ffa) if f.strip())
70 |     fb = frozenset(f.strip() for f in open(ffb) if f.strip())
71 | 
72 |     n1, n2 = len(fa), len(fb)
73 |     m = len(fa.intersection(fb))
74 |     n = len(ftot)
75 | 
76 |     print "A : %-32s:%-5i" % (ffa, n1)
77 |     print "B : %-32s:%-5i" % (ffb, n2)
78 |     print "total : %-32s:%-5i" % (fftot, n)
79 |     print "shared: %-32s:%-5i" % (' ', m)
80 |     return hypergeom(m, n, n1, n2)
81 | 
82 | 
83 | def main():
84 |     p = optparse.OptionParser(__doc__)
85 |     opts, args = p.parse_args()
86 |     if len(args) not in (3, 4):
87 |         sys.exit(not p.print_help())
88 |     if len(args) == 4 and not all(a.isdigit() for a in args):
89 |         print >>sys.stderr, "the four arguments must be integers"
90 |         sys.exit(not p.print_help())
91 |     elif len(args) == 3 and not all(op.exists(f) for f in args):
92 |         sys.exit(not p.print_help())
93 | 
94 |     if len(args) == 4:
95 |         m, n, n1, n2 = map(long, args)
96 |         print hypergeom(m, n, n1, n2)
97 |     else:
98 |         tot_genes, a_genes, b_genes = map(str.strip, args)
99 |         print with_genes(tot_genes, a_genes, b_genes)
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     import doctest
104 |     if doctest.testmod(optionflags=doctest.ELLIPSIS).failed == 0:
105 |         main()
106 | 
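Note that the cdf subtraction in `hypergeom` above can lose precision for large `m` (likely why the doctest values plateau). The survival function computes the same upper tail directly; a cross-check sketch:

```python
import scipy.stats as ss

# P(shared >= m) for subsets of sizes n1 and n2 drawn from n genes.
# sf(k) = P(X > k), so sf(m - 1) = P(X >= m).
m, n, n1, n2 = 10, 30000, 345, 322
print ss.hypergeom.sf(m - 1, n, n1, n2)  # ~0.00437, matching the docstring example
```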
-------------------------------------------------------------------------------- /utils/partsort.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | partial sort of a file. Useful when some columns are known to be sorted, but,
4 | within a group defined by those columns, some other columns are out of order.
5 | e.g., if you have a bed file and you know it's already sorted by (or even
6 | just grouped by) chromosome, you can sort by start, stop within the
7 | chromosome without reading the entire bed file into memory. The syntax for
8 | that would be::
9 | 
10 |     %(prog)s -g 0 -s 1n,2n my.bed > my.sorted.bed
11 | 
12 | where the 'n' suffix indicates that it's a number. The default is to
13 | sort as a string.
14 | 
15 | """
16 | import argparse
17 | from toolshed import reader, header as get_header
18 | from itertools import groupby, chain
19 | from operator import itemgetter
20 | import sys
21 | 
22 | def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
23 |     """
24 |     the converted columns are appended to the end of the row.
25 |     then, after the sort, they are removed.
26 |     this avoids problems with floating point reprs.
27 |     """
28 |     the_first_line = get_header(afile)
29 |     row_len = len(the_first_line)
30 | 
31 |     # maintain the requested order of the sort cols, but use the appended
32 |     # (converted) columns for the numeric ones.
33 |     actual_sort_cols = []
34 |     n_extra = 0
35 |     for c in sort_cols:
36 |         if c not in sort_convertors:
37 |             actual_sort_cols.append(c)
38 |         else:
39 |             actual_sort_cols.append(row_len + n_extra)
40 |             n_extra += 1
41 |     # the columns to convert, in the same order as they are appended.
42 |     convert_cols = [c for c in sort_cols if c in sort_convertors]
43 | 
44 |     # if it was stdin, we already read one line to get the header length.
45 |     lines = reader(afile, header=header) if afile != "-" \
46 |             else chain([the_first_line], reader(afile, header))
47 |     # group by the (already sorted) group columns.
48 |     for keyed, group in groupby(lines, lambda toks:
49 |                                 [toks[i] for i in group_cols]):
50 | 
51 |         # then generate the rows with the converted columns appended.
52 |         def gen_converted_group():
53 |             for toks in group:
54 |                 # add the converted columns onto the end.
55 |                 yield toks + [sort_convertors[c](toks[c]) for c in convert_cols]
56 | 
57 |         # then iterate over the sorted rows.
58 |         for toks in sorted(gen_converted_group(), key=itemgetter(*actual_sort_cols)):
59 |             # strip the extra columns.
60 |             yield toks[:row_len]
61 | 
62 | def read_sort_spec(spec):
63 |     toks = [x.strip() for x in spec.split(",")]
64 |     col_idxs = map(int, (x.rstrip("n") for x in toks))
65 |     # key the convertors by column index so the lookups above work.
66 |     col_convertors = dict((idx, float) for idx, x in zip(col_idxs, toks)
67 |                           if x[-1] == "n")
68 |     return col_idxs, col_convertors
69 | 
70 | 
71 | def main():
72 |     p = argparse.ArgumentParser(description=__doc__,
73 |                    formatter_class=argparse.RawDescriptionHelpFormatter)
74 |     p.add_argument("-g", dest="g", help="these 0-based column numbers define a"
75 |         " group and must already be sorted. Once these change, the group"
76 |         " ends and is sorted by the columns defined in option `-s`")
77 | 
78 |     p.add_argument("-s", dest="s", help="these 0-based column numbers define "
79 |         "the columns to sort on. if the column to be sorted is numeric "
80 |         "(float or int) add an 'n'. e.g. -s 3n indicates that the 4th "
81 |         "column should be converted to a number before sorting")
82 | 
83 |     p.add_argument('file', help='file to process', default="-")
84 |     args = p.parse_args()
85 |     if args.g is None or args.s is None:
86 |         sys.exit(not p.print_help())
87 | 
88 |     group_cols, group_convertors = read_sort_spec(args.g)
89 |     sort_cols, sort_convertors = read_sort_spec(args.s)
90 |     # group_convertors not used.
91 | 
92 |     for toks in partsort(args.file, group_cols, sort_cols, sort_convertors, header=False):
93 |         print "\t".join(toks)
94 | 
95 | if __name__ == "__main__":
96 |     import doctest
97 |     if doctest.testmod(optionflags=doctest.ELLIPSIS |
98 |                        doctest.NORMALIZE_WHITESPACE).failed == 0:
99 |         main()
100 | 
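With the convertor dict keyed by column index (as above), a sort spec parses like this; a minimal sketch, assuming the script is importable as `partsort`:

```python
# a minimal sketch, assuming partsort.py is importable
from partsort import read_sort_spec

cols, convertors = read_sort_spec("1n,2n")
# cols == [1, 2]; convertors == {1: float, 2: float}
# i.e. both sort columns are converted with float() before comparing,
# so "10" sorts after "9" instead of before it.
```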
-------------------------------------------------------------------------------- /utils/pathwayapi-python/README.rst: --------------------------------------------------------------------------------
1 | pathwayapi
2 | ==========
3 | 
4 | A python interface to http://www.pathwayapi.com/
5 | 
6 | 
7 | In addition to the module, it can be called from the command line
8 | like::
9 | 
10 |     $ python pathwayapi.py some.input 7
11 | 
12 | where some.input has gene names in the 8th column (0-based index 7).
13 | The result, sent to stdout, is the same as the input but with an additional
14 | final column of pathway information. If that information is not available
15 | for the given gene name, the column is empty.
-------------------------------------------------------------------------------- /utils/pathwayapi-python/pathwayapi.py: --------------------------------------------------------------------------------
1 | import simplejson
2 | import urllib
3 | import anydbm
4 | import sys
5 | import gzip
6 | 
7 | def nopen(f, mode="r"):
8 |     return sys.stdin if f == "-" \
9 |         else gzip.open(f, mode) if f.endswith(".gz") else open(f, mode)
10 | 
11 | __all__ = ['api']
12 | 
13 | def get(url, cache=anydbm.open('t.cache', 'c')):
14 |     # the dbm cache is opened once, at definition time, and memoizes the
15 |     # raw JSON response for each url.
16 |     if url in cache:
17 |         json = cache[url]
18 |     else:
19 |         json = urllib.urlopen(url).read()
20 |         cache[url] = json
21 |         cache.sync()
22 |     return simplejson.loads(json)
23 | 
24 | class api(object):
25 |     methods = {
26 |         'GetGeneID': ('gene_name', ('gene_name', 'gene_id'))
27 |     }
28 |     url = 'http://www.pathwayapi.com/api/API_%s.php?%s='
29 | 
30 | 
31 |     def get_gene_ids(self, *gene_names):
32 |         """
33 |         >>> a = api()
34 |         >>> a.get_gene_ids('GATA3')
35 |         {'GATA3': '2625'}
36 |         """
37 |         url = self.url % ("GetGeneID", "SearchGene")
38 |         names = {}
39 |         for gene_name in gene_names:
40 |             arr = get(url + gene_name)
41 |             for row in arr:
42 |                 # TODO: handle multiples.
43 |                 names[row[0]] = row[1]
44 |         return names
45 | 
46 |     def get_pathways(self, *gene_names):
47 |         """
48 |         >>> a = api()
49 |         >>> a.get_pathways('GATA3')
50 |         {'GATA3': ['Adipogenesis']}
51 |         """
52 |         url = self.url % ("GetGenePathways", "SearchGene")
53 | 
54 |         if not isinstance(gene_names, dict):
55 |             gene_names = self.get_gene_ids(*gene_names)
56 |         pathways = {}
57 |         for gene_name, gene_id in gene_names.iteritems():
58 |             if gene_id == []:
59 |                 pathways[gene_name] = []
60 |                 continue
61 | 
62 |             pw = get(url + gene_id)
63 |             if pw != []: pw = pw.values() # ids are just numbers
64 |             pathways[gene_name] = pw
65 |         return pathways
66 | 
67 | 
68 | 
69 | def main(fname, col):
70 |     a = api()
71 | 
72 |     for line in nopen(fname):
73 |         line = line.rstrip("\r\n")
74 |         if line[0] == "#":
75 |             print line + "\tpathways"
76 |             continue
77 |         toks = line.split("\t")
78 |         # they might just have a single-column gene list.
79 |         name = toks[0] if len(toks) == 1 else toks[col]
80 |         p = a.get_pathways(name)
81 |         values = [x for x in p.values() if x != []]
82 |         try:
83 |             print line + "\t" + ",".join(values[0] if values and values[0] else [])
84 |         except:
85 |             print values
86 |             raise
87 | 
88 | if __name__ == "__main__":
89 |     import doctest
90 |     doctest.testmod()
91 |     if len(sys.argv) > 1:
92 |         # assume it's a bed file and take the name from the 4th column
93 |         # unless a column is specified.
94 |         main(sys.argv[1], int(sys.argv[2]) if len(sys.argv) > 2 else 3)
95 | 
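For module (rather than command-line) use, the two calls chain as below; a sketch only, since the results depend on the live pathwayapi.com service:

```python
from pathwayapi import api

a = api()
print a.get_gene_ids('GATA3')        # {'GATA3': '2625'}
# gene name -> list of pathway names; ids are resolved internally.
for gene, pathways in a.get_pathways('GATA3').iteritems():
    print gene, ",".join(pathways)
```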
-------------------------------------------------------------------------------- /utils/primers/methylation-primers.py: --------------------------------------------------------------------------------
1 | """
2 | batch Methylation primer design using the `MethPrimer`_ site
3 | 
4 | http://www.urogene.org/methprimer/index1.html
5 | """
6 | 
7 | post_data = """\
8 | SEQUENCE:%(seq)s
9 | PRIMER_TASK:PICK_PCR_PRIMERS
10 | CPG_ISLAND_SIZE:100
11 | CPG_SHIFT:1
12 | CPG_OE:0.6
13 | CPG_ISLAND_PERCENT:50.0
14 | Pick Primers:Submit
15 | PRIMER_SEQUENCE_ID:
16 | TARGET:
17 | EXCLUDED_REGION:
18 | PRIMER_NUM_RETURN:9
19 | PRODUCT_MIN_SIZE:%(prod_min)i
20 | PRIMER_PRODUCT_OPT_SIZE:%(prod_opt)i
21 | PRODUCT_MAX_SIZE:%(prod_max)i
22 | PRIMER_MIN_TM:%(tmin)i
23 | PRIMER_OPT_TM:%(topt)i
24 | PRIMER_MAX_TM:%(tmax)i
25 | PRIMER_MIN_SIZE:%(pmin)i
26 | PRIMER_OPT_SIZE:%(popt)i
27 | PRIMER_MAX_SIZE:%(pmax)i
28 | PROD_CG_MIN:4
29 | PRIMER_MAX_POLY_X:5
30 | NUM_CS:4
31 | PRIMER_MAX_POLY_T:8
32 | CG_3_POSITION:3
33 | NUM_CGS:1
34 | SET_TA_DIFF:5"""
35 | 
36 | import re
37 | from itertools import groupby
38 | from toolshed import nopen
39 | 
40 | SPACES = re.compile(r"\s{2,}")
41 | COLUMNS = "Primer Start Size Tm GC% 'C's Sequence".split()
42 | COLUMNS = ["sequence", COLUMNS[0]] + ["left-" + c for c in COLUMNS[1:]] \
43 |         + ["right-" + c for c in COLUMNS[1:]]
44 | 
45 | 
46 | def fasta_iter(fasta_name):
47 |     """
48 |     given a fasta file, yield tuples of (header, sequence)
49 |     """
50 |     fh = nopen(fasta_name)
51 |     # ditch the boolean (x[0]) and just keep the header or sequence since
52 |     # we know they alternate.
53 |     faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
54 |     for header in faiter:
55 |         # drop the ">"
56 |         header = header.next()[1:].strip()
57 |         # join all sequence lines into one.
58 |         seq = "".join(s.strip() for s in faiter.next())
59 |         yield header, seq
60 | 
61 | def main(post_data):
62 |     post_data = dict(x.split(":") for x in post_data.split("\n"))
63 |     URL = 'http://www.urogene.org/cgi-bin/methprimer/methprimer_results.cgi'
64 | 
65 |     import urllib
66 |     html = urllib.urlopen(URL, urllib.urlencode(post_data)).read()
67 |     if 'No primers found!' in html:
68 |         return
69 | 
70 |     lines = [x.strip() for x in html.split("Sequence Length:")[1].split("\n")]
71 |     # the "</a>" separator below is a best guess at the page markup, chosen
72 |     # to be consistent with the ">Left" filter.
73 |     lefts = ["Left " + l.split("</a>")[1].strip() for l in lines if ">Left" in l]
74 |     rights = [l for l in lines if "Right primer" in l and "<<<" not in l]
75 |     products = [l for l in lines if "Product size:" in l]
76 |     assert len(lefts) == len(rights) == len(products)
77 | 
78 |     lefts = [re.split(SPACES, l) for l in lefts]
79 |     rights = [re.split(SPACES, r) for r in rights]
80 |     products = [re.split(SPACES, p) for p in products]
81 | 
82 |     for lrp in zip(lefts, rights, products):
83 |         yield lrp
84 | 
85 | if __name__ == "__main__":
86 |     import argparse
87 |     p = argparse.ArgumentParser(description=__doc__,
88 |             formatter_class=argparse.RawDescriptionHelpFormatter)
89 |     p.add_argument("--primer-size", dest="primer_size", help="range of primer"
90 |             " sizes. format is min:optimal:max e.g. %(default)s",
91 |             default="18:25:30")
92 |     p.add_argument("--product-size", dest="product_size", help="range of "
93 |             "product sizes. format is min:optimal:max e.g. %(default)s",
94 |             default="180:220:300")
95 |     p.add_argument("--temp-range", dest="temp_range", help="range of "
96 |             "temperatures. format is min:optimal:max e.g. %(default)s",
97 |             default="52:60:74")
98 | 
99 |     p.add_argument("fasta", help="fasta containing regions")
100 | 
101 |     args = p.parse_args()
102 |     pmin, popt, pmax = map(int, args.primer_size.split(":"))
103 |     prod_min, prod_opt, prod_max = map(int, args.product_size.split(":"))
104 |     tmin, topt, _tmax = map(int, args.temp_range.split(":"))
105 | 
106 |     print "\t".join(COLUMNS)
107 |     for header, seq in fasta_iter(args.fasta):
108 |         if header.endswith("/2"): continue
109 |         seen = {}
110 |         tmax = min(_tmax, len(seq))
111 |         for lrp in main(post_data % locals()):
112 |             line = [header]
113 |             left, right, product = lrp
114 |             line.extend(product + left[1:] + right[1:])
115 |             s = "\t".join(line)
116 |             if s in seen: continue
117 |             seen[s] = True
118 |             print s
119 | 
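`fasta_iter` is the one self-contained piece here and is easy to sanity-check; a minimal sketch, assuming the script is saved as `methylation_primers.py` (hypothetical, importable name):

```python
import tempfile
# hypothetical module name; the hyphenated file name cannot be imported as-is
from methylation_primers import fasta_iter

tmp = tempfile.NamedTemporaryFile(suffix=".fa", delete=False)
tmp.write(">r1\nACGT\nacgt\n>r2\nGGGG\n")
tmp.close()

for name, seq in fasta_iter(tmp.name):
    print name, seq
# r1 ACGTacgt
# r2 GGGG
```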
58 | seq = "".join(s.strip() for s in faiter.next()) 59 | yield header, seq 60 | 61 | def main(post_data): 62 | post_data = dict(x.split(":") for x in post_data.split("\n")) 63 | URL = 'http://www.urogene.org/cgi-bin/methprimer/methprimer_results.cgi' 64 | 65 | import urllib 66 | html = urllib.urlopen(URL, urllib.urlencode(post_data)).read() 67 | if 'No primers found!' in html: 68 | raise StopIteration 69 | 70 | lines = [x.strip() for x in html.split("Sequence Length:")[1].split("\n")] 71 | lefts = ["Left " + l.split("")[1].strip() for l in lines if ">Left" in l] 72 | rights = [l for l in lines if "Right primer" in l and not "<<<" in l] 73 | products = [l for l in lines if "Product size:" in l] 74 | assert len(lefts) == len(rights) == len(products) 75 | 76 | lefts = [re.split(SPACES, l) for l in lefts] 77 | rights = [re.split(SPACES, r) for r in rights] 78 | products = [re.split(SPACES, p) for p in products] 79 | 80 | for lrp in zip(lefts, rights, products): 81 | yield lrp 82 | 83 | if __name__ == "__main__": 84 | import argparse 85 | p = argparse.ArgumentParser(description=__doc__, 86 | formatter_class=argparse.RawDescriptionHelpFormatter) 87 | p.add_argument("--primer-size", dest="primer_size", help="range of primer" 88 | " sizes. format is min:optimal:max e.g. %(default)s", 89 | default="18:25:30") 90 | p.add_argument("--product-size", dest="product_size", help="range of " 91 | "product sizes. format is min:optimal:max e.g. %(default)s", 92 | default="180:220:300") 93 | p.add_argument("--temp-range", dest="temp_range", help="range of " 94 | "temperatures sizes. format is min:optimal:max e.g. %(default)s", 95 | default="52:60:74") 96 | 97 | p.add_argument("fasta", help="fasta containing regions") 98 | 99 | args = p.parse_args() 100 | pmin, popt, pmax = map(int, args.primer_size.split(":")) 101 | prod_min, prod_opt, prod_max = map(int, args.product_size.split(":")) 102 | tmin, topt, _tmax = map(int, args.temp_range.split(":")) 103 | 104 | print "\t".join(COLUMNS) 105 | for header, seq in fasta_iter(args.fasta): 106 | if header.endswith("/2"): continue 107 | seen = {} 108 | tmax = min(_tmax, len(seq)) 109 | for lrp in main(post_data % locals()): 110 | line = [header] 111 | left, right, product = lrp 112 | line.extend(product + left[1:] + right[1:]) 113 | s = "\t".join(line) 114 | if s in seen: continue 115 | seen[s] = True 116 | print s 117 | -------------------------------------------------------------------------------- /vcf/vcf-to-matrix.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a multi-sample VCF, return a matrix of genotypes 3 | """ 4 | from __future__ import print_function, division 5 | import toolshed as ts 6 | import re 7 | from collections import Counter 8 | import sys 9 | 10 | 11 | def get_genotype(fmt, gts, gq_cutoff=0): 12 | splitter = re.compile("/|\|") 13 | fmt = fmt.split(":") 14 | ges = [] 15 | gqs = [] 16 | for gt in gts: 17 | if gt.startswith("./.") or gt == "." or all ("." == v for v in gt.split(":")): 18 | ges.append("nan") 19 | gqs.append('nan') 20 | else: 21 | d = dict(zip(fmt, gt.split(":"))) 22 | try: 23 | g, q = d['GT'], float(d['GQ']) 24 | except: 25 | print(fmt, d, file=sys.stderr) 26 | raise 27 | gqs.append("%.2f" % q) 28 | if q < gq_cutoff: 29 | ges.append("nan") 30 | else: 31 | # if multiple ALT's, just set to 1. 
-------------------------------------------------------------------------------- /werelate/README.md: --------------------------------------------------------------------------------
1 | werelate
2 | ========
3 | 
4 | This is a flexible version of chromsweep implemented in Python.
5 | 
6 | It has a nice API.
-------------------------------------------------------------------------------- /werelate/test.sh: --------------------------------------------------------------------------------
1 | 
2 | 
3 | bedtools intersect -header -c -a data/replication_timing.hg19.bed.gz -b data/cpgIsland.hg19.bed.gz | sort -k1,1 -k2,2n > bt
4 | python werelate.py data/replication_timing.hg19.bed.gz data/cpgIsland.hg19.bed.gz | sort -k1,1 -k2,2n > we
5 | 
6 | mbt=$(md5sum bt | awk '{print $1}')
7 | mwe=$(md5sum we | awk '{print $1}')
8 | 
9 | rm bt we
10 | 
11 | if [[ "$mbt" == "$mwe" ]]; then
12 |     echo "SUCCESS" $mbt $mwe
13 | else
14 |     echo "FAIL" $mbt $mwe
15 | fi
16 | 
17 | bedtools intersect -header -c -b data/replication_timing.hg19.bed.gz -a data/cpgIsland.hg19.bed.gz | grep -v ^# | sort -k1,1 -k2,2n > bt
18 | python werelate.py data/cpgIsland.hg19.bed.gz data/replication_timing.hg19.bed.gz | sort -k1,1 -k2,2n > we
19 | 
20 | mbt=$(md5sum bt | awk '{print $1}')
21 | mwe=$(md5sum we | awk '{print $1}')
22 | 
23 | rm bt we
24 | 
25 | if [[ "$mbt" == "$mwe" ]]; then
26 |     echo "SUCCESS" $mbt $mwe
27 | else
28 |     echo "FAIL" $mbt $mwe
29 | fi
30 | 
-------------------------------------------------------------------------------- /werelate/weconcur.py: --------------------------------------------------------------------------------
1 | """
2 | From werelate, we get an iterable of query intervals where each query
3 | interval has a list of related (overlapping) intervals from the other file(s).
4 | Given this list, e.g.:
5 | 
6 | query1: [dba, dbb, dbc]
7 | query2: [dbc, dbd, dbe, dbf]
8 | 
9 | we can expand to:
10 | 
11 | query1 dba
12 | query1 dbb
13 | query1 dbc
14 | query2 dbc
15 | query2 dbd
16 | query2 dbe
17 | query2 dbf
18 | 
19 | which are all the pairings. Then we can look at a particular column from query
20 | and subject and see how often they co-occur, e.g.
21 | 
22 | query1.sample_id, dba.sample_id ...
23 | query1.sample_id, dbb.sample_id ...
24 | 
25 | finally, we can repeatedly shuffle one of the columns and check the randomized
26 | co-occurrence without repeating the overlap test.
27 | """
28 | import numpy as np
29 | from werelate import relate, merge_files
30 | 
31 | def main():
32 |     import argparse
33 |     p = argparse.ArgumentParser(description=__doc__)
34 |     p.add_argument("--a-col", type=int, help="1-based column in `afile` that indicates a grouping to test against file b")
35 |     p.add_argument("--b-col", type=int, help="1-based column in `bfile` with grouping to test against file a")
36 |     p.add_argument("--n-sims", type=int, help="number of simulations to run", default=20000)
37 |     p.add_argument("afile", help="sorted bed file")
38 |     p.add_argument("bfile", help="sorted bed file")
39 |     a = p.parse_args()
40 |     run(a)
41 | 
42 | def run(args):
43 |     ai, bi = args.a_col - 1, args.b_col - 1
44 | 
45 |     avs, bvs = [], []
46 |     for a in relate(merge_files(args.afile, args.bfile)):
47 |         # TODO: add ops; below is the default of taking all values.
48 |         # could also have max/min which convert to float and take max/min
49 |         # and uniq which would use set(a.related).
50 |         avs.extend([a.line[ai].rstrip("\r\n")] * len(a.related))
51 |         # use the b-file column for the related intervals.
52 |         bvs.extend(b.line[bi].rstrip("\r\n") for b in a.related)
53 | 
54 |     # convert to integers for faster shuffling and equality testing.
55 |     # TODO: handle floats with np.abs(avs - bvs) < delta,
56 |     # or with np.abs(avs - bvs).sum() or np.corrcoef(avs, bvs).
57 |     ilookup = {p: i for i, p in enumerate(set(avs) | set(bvs))}
58 |     avs = np.array([ilookup[a] for a in avs], dtype=np.int)
59 |     bvs = np.array([ilookup[b] for b in bvs], dtype=np.int)
60 | 
61 |     obs = (avs == bvs).sum()
62 | 
63 |     exp = []
64 |     for i in range(args.n_sims):
65 |         np.random.shuffle(bvs)
66 |         exp.append((avs == bvs).sum())
67 | 
68 |     ngt = sum(e >= obs for e in exp)
69 |     print "p: %.3g (greater: %d)" % ((1.0 + ngt) / (1.0 + args.n_sims), ngt)
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     main()
74 | 
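The permutation at the end never redoes the interval sweep; only the label pairing is shuffled. The core idea in isolation:

```python
import numpy as np

avs = np.array([0, 1, 1, 2, 0])   # query labels, one per related pair
bvs = np.array([0, 1, 2, 2, 1])   # subject labels for the same pairs
obs = (avs == bvs).sum()          # observed co-occurrences

n_sims, ngt = 1000, 0
for _ in range(n_sims):
    np.random.shuffle(bvs)        # in-place shuffle; overlap structure unchanged
    ngt += (avs == bvs).sum() >= obs
print (1.0 + ngt) / (1.0 + n_sims)
```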
-------------------------------------------------------------------------------- /werelate/werelate.py: --------------------------------------------------------------------------------
1 | """Streaming relation (overlap, distance) testing of (any number of) sorted files of intervals."""
2 | from __future__ import print_function
3 | 
4 | import sys
5 | import gzip
6 | import heapq
7 | import itertools as it
8 | from operator import attrgetter
9 | from collections import namedtuple
10 | try:
11 |     filter = it.ifilter
12 | except AttributeError: # python3
13 |     pass
14 | 
15 | ########################################################################
16 | ########################################################################
17 | # this section is mostly uninteresting code to:
18 | # 1. "parse" the intervals
19 | # 2. sort them into a single iterable across files (using heapq.merge).
20 | # 3. group them by chrom (using itertools.groupby)
21 | # this is all done lazily, streaming over the intervals.
22 | ########################################################################
23 | ########################################################################
24 | 
25 | def xopen(f):
26 |     return gzip.open(f) if f.endswith('.gz') \
27 |         else sys.stdin if f == "-" \
28 |         else open(f)
29 | 
30 | # related will be a list of all things that are related to the given interval
31 | Interval = namedtuple('Interval', 'chrom start end fh line related i'.split())
32 | 
33 | class BedIter(object):
34 |     __slots__ = "fh chrom start end header i line_num".split()
35 |     def __init__(self, fname, i=None, chrom=0, start=1, end=2):
36 |         self.fh = xopen(fname)
37 |         self.chrom, self.start, self.end = chrom, start, end
38 |         self.header = None
39 |         self.i = i
40 |         self.line_num = 0
41 | 
42 |     def __iter__(self):
43 |         return self
44 | 
45 |     def next(self):
46 |         line = next(self.fh).split("\t")
47 |         self.line_num += 1
48 |         if self.line_num == 1:
49 |             # skip a header line if the first line doesn't parse.
50 |             try:
51 |                 chrom, start = line[self.chrom], int(line[self.start])
52 |             except (ValueError, IndexError):
53 |                 line = next(self.fh).split("\t")
54 |                 self.line_num += 1
55 |                 chrom, start = line[self.chrom], int(line[self.start])
56 |         else:
57 |             chrom, start = line[self.chrom], int(line[self.start])
58 | 
59 |         # chrom and start come first so that heapq.merge works.
60 |         return Interval(chrom, start, int(line[self.end]), self.fh, line, [], self.i)
61 |     __next__ = next
62 | 
63 | def merge_files(*beds):
64 |     beds = [BedIter(f, i=i) for i, f in enumerate(beds)]
65 |     return merge_beds(beds)
66 | 
67 | def merge_beds(beds):
68 |     for item in heapq.merge(*beds):
69 |         yield item
70 | 
71 | def relate(merged):
72 |     iterable = it.groupby(merged, attrgetter('chrom'))
73 | 
74 |     seenChroms, lastChrom = set(), None
75 | 
76 |     for chrom, li in iterable:
77 |         if chrom in seenChroms and chrom != lastChrom:
78 |             raise Exception("chromosomes out of order between files: %s, %s" %
79 |                             (chrom, lastChrom))
80 |         lastChrom = chrom
81 |         seenChroms.add(chrom)
82 |         # we know everything in li is from the same chrom
83 |         start = -1
84 |         for interval in werelate(li):
85 |             if interval.start < start:
86 |                 raise Exception("intervals out of order: %s after %d" % (interval, start))
87 |             start = interval.start
88 |             yield interval
89 | 
90 | ############################################################################
91 | ############################################################################
92 | # The section below is more interesting, with the `werelate` function taking
93 | # an iterable of intervals and simply sending the appropriate intervals to
94 | # `check_related`. Each time `check_related` returns True, info about the
95 | # related interval is added to the other, so the relations accumulate as
96 | # the sweep advances.
97 | 
98 | 
99 | # example function to check for overlap or check within distance
100 | def check_related_distance(a, b, distance=0):
101 |     # note: with distance == 0 this is just overlap.
102 |     assert a.start <= b.start and a.chrom == b.chrom
103 |     return b.start - distance < a.end
104 | 
105 | def werelate(interval_iter, check_related=check_related_distance):
106 |     """
107 |     Flexible overlap or proximity testing and reporting.
108 | 
109 |     Arguments
110 |     ---------
111 | 
112 |     interval_iter: iterable
113 |         a lazy iterable of Intervals sorted by start position and from the same
114 |         chromosome.
115 | 
116 |     check_related: function(a Interval, b Interval) -> bool
117 |         a function that accepts 2 intervals and tells if they are related.
118 |         Since the input is assumed to be sorted, it is assumed that if the
119 |         function returns False, the a Interval can not possibly be related to
120 |         any interval after the b Interval, and so it can be yielded (and not
121 |         tested against any remaining intervals in the stream).
122 |         See check_related_distance for an example.
123 |     """
124 |     cache = [next(interval_iter)]
125 |     for interval in interval_iter:
126 |         for i, c in enumerate(cache):
127 |             if check_related(c, interval):
128 |                 # they must overlap. add each to the other's related pile
129 |                 if c.fh != interval.fh:
130 |                     if c.i == 0:
131 |                         c.related.append(interval)
132 |                     elif interval.i == 0:
133 |                         interval.related.append(c)
134 |             else:
135 |                 # only report intervals from the query.
136 |                 if c.i == 0:
137 |                     yield c
138 |                 cache[i] = None
139 | 
140 |         cache = list(filter(None, cache)) + [interval]
141 | 
142 |     for c in filter(None, cache):
143 |         if c.i == 0:
144 |             yield c
145 | 
146 | if __name__ == "__main__":
147 |     for b in relate(merge_files(*sys.argv[1:])):
148 |         print("\t".join(map(str, (b.chrom, b.start, b.end,
149 |             "\t".join(b.line[3:]).rstrip("\n"), len(b.related)))))
--------------------------------------------------------------------------------
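Since `check_related` is a parameter of `werelate` (while `relate` hard-codes the default), a proximity test can be swapped in per chromosome group. A sketch, with placeholder file names:

```python
import itertools as it
from functools import partial
from operator import attrgetter

from werelate import werelate, merge_files, check_related_distance

# relate intervals within 1000 bp rather than requiring strict overlap.
within_1kb = partial(check_related_distance, distance=1000)

merged = merge_files("a.sorted.bed", "b.sorted.bed")  # placeholder paths
for chrom, group in it.groupby(merged, attrgetter("chrom")):
    for iv in werelate(group, check_related=within_1kb):
        print iv.chrom, iv.start, iv.end, len(iv.related)
```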