├── .gitignore ├── README.txt ├── genes.gff ├── lib ├── blastparser.py └── parse_blast.py ├── ngs-00-update-notebooks.ipynb ├── ngs-01-install-dropbox.ipynb ├── ngs-02-install-screed.ipynb ├── ngs-03-install-khmer.ipynb ├── ngs-10-blast.ipynb ├── ngs-11-python-and-graphing.ipynb ├── ngs-12-python-rundown.ipynb ├── ngs-31-python-and-short-reads.ipynb ├── ngs-41-velvet-genome-assembly.ipynb ├── ngs-42-oases-mRNAseq-assembly.ipynb ├── ngs-43-trinity-mRNAseq-assembly.ipynb ├── ngs-44-kmer-distributions.ipynb ├── ngs-51-gimme-tools.ipynb ├── ngs-5x-digital-normalization.ipynb ├── ngs-5x-kmer-abundance-distributions-2013.ipynb ├── ngs-61-intro-to-dicts.ipynb ├── ngs-62-screed-database-as-dict.ipynb ├── ngs-70-hmp-diginorm.ipynb ├── ngs-71-hmp-partition.ipynb ├── reads.bed └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | IPython notebooks for the MSU NGS summer course. 2 | 3 | See: 4 | 5 | http://bioinformatics.msu.edu/ngs-summer-course-2012 6 | 7 | for more information. 8 | 9 | C. Titus Brown, ctb@msu.edu 10 | -------------------------------------------------------------------------------- /lib/blastparser.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Yet Another BLAST parser for NCBI BLAST output. 
4 | 5 | Goals: 6 | 7 | - nice introspection 8 | - nice Pythonic accessibility 9 | - maintainability in the face of changing NCBI BLAST output formats 10 | 11 | Sample usage: :: 12 | 13 | for record in parse_file('blast_output.txt'): 14 | print '-', record.query_name, record.database.name 15 | for hit in record.hits: 16 | print '--', hit.subject_name, hit.subject_length 17 | print ' ', hit.total_score, hit.total_expect 18 | for submatch in hit: 19 | print submatch.expect, submatch.bits 20 | 21 | print submatch.query_sequence 22 | print submatch.alignment 23 | print submatch.subject_sequence 24 | 25 | Here, 'submatch' is a BlastSubjectSubmatch; 'hit' is a BlastSubjectHits; 26 | 'record' is a BlastQuery; and 'record.database' is a BlastDatabase. See 27 | the docstrings below for attributes available on these objects. 28 | 29 | Author: C. Titus Brown 30 | """ 31 | 32 | __version__ = 0.2 33 | 34 | __all__ = ['BlastParser', 'parse_fp', 'parse_file', 'parse_string', 35 | 'open_shelf'] 36 | __docformat__ = 'restructuredtext' 37 | 38 | import math 39 | from cStringIO import StringIO 40 | import parse_blast 41 | 42 | ### 43 | 44 | class BlastSubjectSubmatch(object): 45 | """ 46 | BlastSubjectSubmatch. 47 | 48 | A specific submatch (score/alignment) of a query sequence to a 49 | subject sequence. 50 | 51 | Attributes: 52 | 53 | - expect 54 | - frame1 55 | - frame2 56 | - score 57 | - query_start 58 | - query_end 59 | - subject_start 60 | - subject_end 61 | - query_sequence 62 | - subject_sequence 63 | 64 | Usage: :: 65 | 66 | print submatch_obj.expect 67 | 68 | (etc.) 
69 | 70 | """ 71 | # __slots__ = ['expect', 'frame1', 'frame2', 72 | # 'query_start', 'query_end', 'query_sequence', 73 | # 'subject_start', 'subject_end', 'subject_sequence', 'identity'] 74 | 75 | def __init__(self, expect, frame1, frame2, 76 | q_start, q_end, q_seq, s_start, s_end, s_seq, identity, score): 77 | self.expect = math.pow(10, -expect) 78 | self.frame1 = frame1 79 | self.frame2 = frame2 80 | self.query_start = q_start 81 | self.query_end = q_end 82 | self.query_sequence = q_seq 83 | 84 | self.subject_start = s_start 85 | self.subject_end = s_end 86 | self.subject_sequence = s_seq 87 | self.score = score 88 | 89 | def __repr__(self): 90 | return ""\ 91 | % (self.expect, self.query_start, self.query_end, 92 | self.subject_start, self.subject_end) 93 | 94 | class BlastSubjectHits(object): 95 | """ 96 | BlastSubjectHits. 97 | 98 | A list of all of the matches between a query sequence and a subject 99 | sequence. 100 | 101 | Attributes: 102 | * subject_name -- name of subject sequence. 103 | * matches -- list of BlastSubjectSubmatch objects. 104 | 105 | Usage: :: 106 | 107 | print hits_object.subject_name 108 | for match in hits_object: 109 | print match 110 | """ 111 | # __slots__ = ['subject_name', 'matches' ] 112 | def __init__(self, subject_name, matches): 113 | self.subject_name = str(subject_name) 114 | self.matches = matches 115 | 116 | def __getitem__(self, i): 117 | return self.matches[i] 118 | 119 | def __len__(self): 120 | return len(self.matches) 121 | 122 | def __repr__(self): 123 | seqname = build_short_sequence_name(self.subject_name) 124 | return "" % (seqname, len(self)) 125 | 126 | class BlastQuery(object): 127 | """ 128 | A BLAST query (single sequence against database) containing all results. 129 | 130 | Attributes: 131 | 132 | * query_name -- name of query sequence (following 'Query='). 133 | * hits -- a list of BlastSubjectHits, containing each match + alignment. 
134 | 135 | Usage: :: 136 | 137 | print query_object.query_name 138 | for hits_object in query_object: 139 | print hits_object.subject_name 140 | """ 141 | # __slots__ = ['query_name', 'hits' ] 142 | def __init__(self, query_name, hits): 143 | self.query_name = query_name 144 | self.hits = list(hits) 145 | 146 | def __repr__(self): 147 | query_short = build_short_sequence_name(self.query_name) 148 | return "" % (query_short, len(self.hits)) 149 | 150 | def __len__(self): 151 | return len(self.hits) 152 | 153 | def __getitem__(self, i): 154 | return self.hits[i] 155 | 156 | class _BlastShelf(object): 157 | def __init__(self, filename, mode='r'): 158 | from shelve import BsdDbShelf 159 | from bsddb import btopen 160 | 161 | _db = btopen(filename, 'r') 162 | self.db = BsdDbShelf(_db) 163 | 164 | def __iter__(self): 165 | db = self.db 166 | last_k, _ = db.last() 167 | k, v = db.first() 168 | while k != last_k: 169 | yield k, v 170 | k, v = db.next() 171 | yield k, v 172 | 173 | def open_shelf(filename, mode='r'): 174 | from shelve import BsdDbShelf 175 | from bsddb import btopen 176 | 177 | return _BlastShelf(filename, mode) 178 | 179 | def parse_file(filename): 180 | """ 181 | Parse records from a given file; 'filename' is the path to the file. 182 | """ 183 | b = BlastParser() 184 | for record in b.parse_file(filename): 185 | yield record 186 | 187 | def parse_fp(fp, **kw): 188 | """ 189 | Parse records out of the given file handle. 190 | """ 191 | b = BlastParser() 192 | 193 | for record in b.parse_fp(fp, **kw): 194 | yield record 195 | 196 | def parse_string(s): 197 | """ 198 | Parse records out of a string buffer. 
199 | """ 200 | fp = StringIO(s) 201 | b = BlastParser() 202 | 203 | for record in b.parse_fp(fp): 204 | yield record 205 | 206 | class _PygrBlastHitParser(parse_blast.BlastHitParser): 207 | def generate_intervals(self): 208 | yield self.query_id, self.subject_id, \ 209 | BlastSubjectSubmatch(self.e_value, 210 | None, 211 | None, 212 | self.query_start, 213 | self.query_end, 214 | self.query_seq, 215 | self.subject_start, 216 | self.subject_end, 217 | self.subject_seq, 218 | self.identity_percent, 219 | self.blast_score) 220 | 221 | class BlastParser(object): 222 | """ 223 | BlastParser objects coordinate the use of pyparsing parsers to 224 | parse complete BLAST records. 225 | 226 | Attributes: 227 | 228 | * blast_record -- an individual BLAST record; returns BlastQuery object. 229 | * blast_output -- list of records; returns list of BlastQuery objects. 230 | 231 | Methods: 232 | 233 | * reset() -- clear the blast parser of persistent information. 234 | * parse_string(s) 235 | * parse_file(filename) 236 | * parse_fp(fp) 237 | """ 238 | def __init__(self): 239 | self.p = _PygrBlastHitParser() 240 | 241 | def parse_file(self, filename): 242 | fp = open(filename) 243 | for record in self.parse_fp(fp): 244 | yield record 245 | 246 | def parse_fp(self, fp): 247 | 248 | subjects = [] 249 | matches = [] 250 | 251 | cur_query = None 252 | cur_subject = None 253 | 254 | for query_id, subject_id, submatch in self.p.parse_file(fp): 255 | if cur_subject != subject_id or cur_query != query_id: 256 | if matches: 257 | assert cur_subject 258 | subject_hits = BlastSubjectHits(cur_subject, matches) 259 | subjects.append(subject_hits) 260 | matches = [] 261 | 262 | cur_subject = subject_id 263 | 264 | if cur_query != query_id: 265 | if cur_query: 266 | assert subjects, cur_query 267 | yield BlastQuery(cur_query, subjects) 268 | subjects = [] 269 | 270 | cur_query = query_id 271 | 272 | matches.append(submatch) 273 | 274 | if matches: 275 | 
subjects.append(BlastSubjectHits(cur_subject, matches)) 276 | 277 | if subjects: 278 | yield BlastQuery(cur_query, subjects) 279 | 280 | 281 | def build_short_sequence_name(name, max_len=20): 282 | if len(name) < max_len: 283 | return name 284 | 285 | name_l = name.split() 286 | if len(name_l) > 1: 287 | return build_short_sequence_name(name_l[0], max_len) 288 | 289 | name = name_l[0] 290 | if len(name) > max_len: 291 | name = name[:max_len-3] + '...' 292 | return name 293 | 294 | ##### 295 | 296 | if __name__ == '__main__': 297 | import sys 298 | from shelve import BsdDbShelf 299 | from bsddb import btopen 300 | from optparse import OptionParser 301 | 302 | ### read command line parameters 303 | 304 | parser = OptionParser() 305 | parser.add_option('-z', '--zlib-compressed', action="store_true", 306 | dest="zlib_compressed", 307 | help="read gzipped BLAST output file") 308 | 309 | parser.add_option('-n', '--ignore-empty-hits', action="store_true", 310 | dest="ignore_empty_hits", 311 | help="ignore BLAST hits with no results") 312 | 313 | (options, args) = parser.parse_args() 314 | 315 | (blast_file, output_file) = args 316 | 317 | ### open blast file, open/create database r/w 318 | 319 | if options.zlib_compressed: 320 | import gzip 321 | blast_fp = gzip.open(blast_file) 322 | else: 323 | blast_fp = open(blast_file) 324 | 325 | _db = btopen(output_file, 'c') 326 | db = BsdDbShelf(_db) 327 | 328 | ### go! 
329 | 330 | for n, record in enumerate(parse_fp(blast_fp, 331 | ignore_no_hits=options.ignore_empty_hits)): 332 | if n % 100 == 0: 333 | print '...', n 334 | 335 | if options.ignore_empty_hits and not record: 336 | continue 337 | 338 | name = record.query_name 339 | db[name] = record 340 | 341 | print 'read %d records total' % (n + 1,) 342 | -------------------------------------------------------------------------------- /lib/parse_blast.py: -------------------------------------------------------------------------------- 1 | from __future__ import generators 2 | import math 3 | 4 | class CoordsGroupStart(object): 5 | pass 6 | 7 | class CoordsGroupEnd(object): 8 | pass 9 | 10 | # AUTHORS: zfierstadt, leec 11 | 12 | def is_line_start(token,line): 13 | "check whether line begins with token" 14 | return token==line[:len(token)] 15 | 16 | def get_ori_letterunit(start,end,seq,gapchar='-'): 17 | """try to determine orientation (1 or -1) based on whether start>end, 18 | and letterunit (1 or 3) depending on the ratio of end-start difference 19 | vs the actual non-gap letter count. Returns tuple (ori,letterunit)""" 20 | if end>start: 21 | ori=1 22 | else: 23 | ori= -1 24 | ngap=0 25 | for l in seq: 26 | if l==gapchar: 27 | ngap+=1 28 | seqlen=len(seq)-ngap 29 | if ori*float(end-start)/seqlen >2.0: 30 | letterunit=3 31 | else: 32 | letterunit=1 33 | return ori,letterunit 34 | 35 | class BlastIval(object): 36 | def __repr__(self): 37 | return '' 38 | 39 | class BlastHitParser(object): 40 | """reads alignment info from blastall standard output. 41 | Method parse_file(fo) reads file object fo, and generates tuples 42 | suitable for BlastIval. 
43 | 44 | Attributes: 45 | query_seq 46 | query_start 47 | query_end 48 | subject_seq 49 | subject_start 50 | subject_end 51 | query_id 52 | subject_id 53 | e_value 54 | blast_score 55 | identity_percent 56 | """ 57 | gapchar='-' 58 | def __init__(self): 59 | self.hit_id=0 60 | self.nline = 0 61 | self.reset() 62 | def reset(self): 63 | "flush any alignment info, so we can start reading new alignment" 64 | self.query_seq="" 65 | self.subject_seq="" 66 | self.hit_id+=1 67 | def save_query(self,line): 68 | self.query_id=line.split()[1] 69 | def save_subject(self,line): 70 | self.subject_id=line.split()[0][1:] 71 | def save_score(self,line): 72 | "save a Score: line" 73 | self.blast_score=float(line.split()[2]) 74 | s=line.split()[7] 75 | if s[0]=='e': 76 | s='1'+s 77 | if s.endswith(','): s = s.strip(',') 78 | try: 79 | self.e_value= -math.log(float(s))/math.log(10.0) 80 | except (ValueError,OverflowError), e: 81 | self.e_value=300. 82 | def save_identity(self,line): 83 | "save Identities line" 84 | s=line.split()[3][1:] 85 | self.identity_percent=int(s[:s.find('%')]) 86 | def save_query_line(self,line): 87 | "save a Query: line" 88 | c=line.split() 89 | self.query_end=int(c[3]) 90 | if not self.query_seq: 91 | self.query_start=int(c[1]) 92 | if self.query_start < self.query_end: # handles forward orientation 93 | self.query_start -= 1 94 | self.query_seq+=c[2] 95 | self.seq_start_char=line.find(c[2], 5) # IN CASE BLAST SCREWS UP Sbjct: 96 | def save_subject_line(self,line): 97 | "save a Sbjct: line, attempt to handle various BLAST insanities" 98 | c=line.split() 99 | if len(c)<4: # OOPS, BLAST FORGOT TO PUT SPACE BEFORE 1ST NUMBER 100 | # THIS HAPPENS IN TBLASTN... WHEN THE SUBJECT SEQUENCE 101 | # COVERS RANGE 1-1200, THE FOUR DIGIT NUMBER WILL RUN INTO 102 | # THE SEQUENCE, WITH NO SPACE!! 
103 | c=['Sbjct:',line[6:self.seq_start_char]] \ 104 | +line[self.seq_start_char:].split() # FIX BLAST SCREW-UP 105 | self.subject_end=int(c[3]) 106 | if not self.subject_seq: 107 | self.subject_start=int(c[1]) 108 | if self.subject_start < self.subject_end: # handles forward orientation 109 | self.subject_start -= 1 110 | self.subject_seq+=c[2] 111 | lendiff=len(self.query_seq)-len(self.subject_seq) 112 | if lendiff>0: # HANDLE TBLASTN SCREWINESS: Sbjct SEQ OFTEN TOO SHORT!! 113 | # THIS APPEARS TO BE ASSOCIATED ESPECIALLY WITH STOP CODONS * 114 | self.subject_seq+=lendiff*'A' # EXTEND TO SAME LENGTH AS QUERY... 115 | elif lendiff<0 and not hasattr(self,'ignore_query_truncation'): 116 | # WHAT THE HECK?!?! WARN THE USER: BLAST RESULTS ARE SCREWY... 117 | raise ValueError( 118 | """BLAST appears to have truncated the Query: sequence 119 | to be shorter than the Sbjct: sequence: 120 | Query: %s 121 | Sbjct: %s 122 | This should not happen! To ignore this error, please 123 | create an attribute ignore_query_truncation on the 124 | BlastHitParser object.""" % (self.query_seq,self.subject_seq)) 125 | def get_interval_obj(self, q_start, q_end, s_start, s_end, 126 | query_ori, query_factor, subject_ori, subject_factor): 127 | "return interval result as an object with attributes" 128 | o = BlastIval() 129 | o.hit_id = self.hit_id 130 | o.src_id = self.query_id 131 | o.dest_id = self.subject_id 132 | o.blast_score = self.blast_score 133 | o.e_value = self.e_value 134 | o.percent_id = self.identity_percent 135 | o.src_ori = query_ori 136 | o.dest_ori = subject_ori 137 | query_start = self.query_start+q_start*query_ori*query_factor 138 | query_end = self.query_start+q_end*query_ori*query_factor 139 | subject_start = self.subject_start+s_start*subject_ori*subject_factor 140 | subject_end = self.subject_start+s_end*subject_ori*subject_factor 141 | if query_start=0: # END OF AN UNGAPPED INTERVAL 171 | yield self.get_interval_obj(q_start, i_query, 172 | s_start, i_subject, 173 | 
query_ori, query_factor, 174 | subject_ori, subject_factor) 175 | q_start= -1 176 | elif q_start<0: # START OF AN UNGAPPED INTERVAL 177 | q_start=i_query 178 | s_start=i_subject 179 | if self.query_seq[i]!=self.gapchar: # COUNT QUERY LETTERS 180 | i_query+=1 181 | if self.subject_seq[i]!=self.gapchar: # COUNT SUBJECT LETTERS 182 | i_subject+=1 183 | if q_start>=0: # REPORT THE LAST INTERVAL 184 | yield self.get_interval_obj(q_start, i_query, 185 | s_start, i_subject, 186 | query_ori, query_factor, 187 | subject_ori, subject_factor) 188 | 189 | yield CoordsGroupEnd() 190 | 191 | def parse_file(self,myfile): 192 | "generate interval tuples by parsing BLAST output from myfile" 193 | for line in myfile: 194 | self.nline += 1 195 | if self.is_valid_hit() and \ 196 | (is_line_start('>',line) or is_line_start(' Score =',line) \ 197 | or is_line_start(' Database:',line) \ 198 | or is_line_start('Query=',line)): 199 | for t in self.generate_intervals(): # REPORT THIS ALIGNMENT 200 | yield t # GENERATE ALL ITS INTERVAL MATCHES 201 | self.reset() # RESET TO START A NEW ALIGNMENT 202 | if is_line_start('Query=',line): 203 | self.save_query(line) 204 | elif is_line_start('>',line): 205 | self.save_subject(line) 206 | elif is_line_start(' Score =',line): 207 | self.save_score(line) 208 | elif 'Identities =' in line: 209 | self.save_identity(line) 210 | elif is_line_start('Query:',line): 211 | self.save_query_line(line) 212 | elif is_line_start('Sbjct:',line): 213 | self.save_subject_line(line) 214 | if self.nline == 0: # no blast output?? 215 | raise IOError('no BLAST output. 
Check that blastall is in your PATH') 216 | 217 | if __name__=='__main__': 218 | import sys 219 | p=BlastHitParser() 220 | for t in p.parse_file(sys.stdin): 221 | print t 222 | -------------------------------------------------------------------------------- /ngs-00-update-notebooks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-00-update-notebooks" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "input": [ 12 | "cd /usr/local/notebooks" 13 | ], 14 | "language": "python", 15 | "outputs": [ 16 | { 17 | "output_type": "stream", 18 | "stream": "stdout", 19 | "text": [ 20 | "/usr/local/notebooks", 21 | "" 22 | ] 23 | } 24 | ], 25 | "prompt_number": 1 26 | }, 27 | { 28 | "cell_type": "code", 29 | "input": [ 30 | "ls" 31 | ], 32 | "language": "python", 33 | "outputs": [ 34 | { 35 | "output_type": "stream", 36 | "stream": "stdout", 37 | "text": [ 38 | "\u001b[0m\u001b[01;34mlib\u001b[0m/ ngs-00-update-notebooks.ipynb ngs-10-blast.ipynb README.txt", 39 | "" 40 | ] 41 | } 42 | ], 43 | "prompt_number": 2 44 | }, 45 | { 46 | "cell_type": "code", 47 | "input": [ 48 | "!git checkout -f master" 49 | ], 50 | "language": "python", 51 | "outputs": [ 52 | { 53 | "output_type": "stream", 54 | "stream": "stdout", 55 | "text": [ 56 | "Already on 'master'", 57 | "" 58 | ] 59 | } 60 | ], 61 | "prompt_number": 4 62 | }, 63 | { 64 | "cell_type": "code", 65 | "input": [ 66 | "!git pull origin master" 67 | ], 68 | "language": "python", 69 | "outputs": [ 70 | { 71 | "output_type": "stream", 72 | "stream": "stdout", 73 | "text": [ 74 | "From git://github.com/ctb/ngs-notebooks", 75 | " * branch master -> FETCH_HEAD", 76 | "" 77 | ] 78 | }, 79 | { 80 | "output_type": "stream", 81 | "stream": "stdout", 82 | "text": [ 83 | "Already up-to-date.", 84 | "" 85 | ] 86 | } 87 | ], 88 | "prompt_number": 5 89 | } 90 | ] 91 | } 92 | ] 93 | } 
-------------------------------------------------------------------------------- /ngs-01-install-dropbox.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-01-install-dropbox" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "input": [ 12 | "cd" 13 | ], 14 | "language": "python", 15 | "outputs": [] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "input": [ 20 | "!wget -O dropbox.tar.gz \"http://www.dropbox.com/download/?plat=lnx.x86_64\"" 21 | ], 22 | "language": "python", 23 | "outputs": [] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "input": [ 28 | "!tar -xzf dropbox.tar.gz" 29 | ], 30 | "language": "python", 31 | "outputs": [] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "input": [ 36 | "!~/.dropbox-dist/dropboxd" 37 | ], 38 | "language": "python", 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "source": [ 44 | "Now, once you see the message \"This client is not linked to any account...\", visit the link in another tab & log in to Dropbox.", 45 | "", 46 | "THEN, go up to 'Kernel', and select 'Interrupt'." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "input": [ 52 | "# finally: reboot!", 53 | "!/sbin/shutdown -r now" 54 | ], 55 | "language": "python", 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "source": [ 61 | "When the machine restarts, you will have a directory /root/Dropbox that is linked to your dropbox account." 
62 | ] 63 | } 64 | ] 65 | } 66 | ] 67 | } -------------------------------------------------------------------------------- /ngs-02-install-screed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-02-install-screed" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "input": [ 12 | "cd /usr/local/src" 13 | ], 14 | "language": "python", 15 | "outputs": [] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "input": [ 20 | "!git clone git://github.com/ged-lab/screed.git" 21 | ], 22 | "language": "python", 23 | "outputs": [] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "input": [ 28 | "!easy_install screed/" 29 | ], 30 | "language": "python", 31 | "outputs": [] 32 | } 33 | ] 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /ngs-03-install-khmer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-03-install-khmer" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "# Installing khmer", 13 | "", 14 | "khmer is a package that, like screed, was developed by the Brown Lab. 
It's for k-mer counting and other nefarious purposes.", 15 | "", 16 | "See: http://readthedocs.org/docs/khmer/en/latest/" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "input": [ 22 | "cd /usr/local/src" 23 | ], 24 | "language": "python", 25 | "outputs": [] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "input": [ 30 | "!git clone git://github.com/ged-lab/khmer.git" 31 | ], 32 | "language": "python", 33 | "outputs": [] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "input": [ 38 | "!cd khmer && make test" 39 | ], 40 | "language": "python", 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "input": [ 46 | "!easy_install khmer/python" 47 | ], 48 | "language": "python", 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "input": [ 54 | "" 55 | ], 56 | "language": "python", 57 | "outputs": [] 58 | } 59 | ] 60 | } 61 | ] 62 | } -------------------------------------------------------------------------------- /ngs-11-python-and-graphing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-11-python-and-graphing" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "## Using Python to load some data and do some plotting" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "source": [ 18 | "Let's get some workout data and plot it!" 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "input": [ 24 | "import urllib", 25 | "fp = urllib.urlopen('http://ged.msu.edu/angus/tutorials-2012/files/workout.csv')", 26 | "for line in fp:", 27 | " print line," 28 | ], 29 | "language": "python", 30 | "outputs": [] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "input": [ 35 | "workouts = []", 36 | "", 37 | "fp = urllib.urlopen('http://ged.msu.edu/angus/tutorials-2012/files/workout.csv')", 38 | "for line in fp:", 39 | " line = line.strip()", 40 | " if line:", 41 | " workouts.append(line)", 42 | "workouts" 43 | ], 44 | "language": "python", 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "input": [ 50 | "# let's use the CSV library: http://docs.python.org/library/csv.html", 51 | "import csv", 52 | "", 53 | "workouts = []", 54 | "", 55 | "fp = urllib.urlopen('http://lyorn.idyll.org/~t/transfer/workout.csv')", 56 | "reader = csv.reader(fp)", 57 | "for line in reader:", 58 | " if line:", 59 | " if not line[0].startswith('#'):", 60 | " workouts.append(line)", 61 | "", 62 | "workouts" 63 | ], 64 | "language": "python", 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "input": [ 70 | "# http://docs.python.org/library/datetime.html", 71 | "from datetime import datetime", 72 | "workouts2 = []", 73 | "for (date, exercise, miles, duration) in workouts:", 74 | " miles = int(miles)", 75 | " duration = int(duration)", 76 | " exercise = exercise.strip()", 77 | " date = datetime.strptime(date, \"%Y, %b-%d\")", 78 | " workouts2.append([date, exercise, miles, duration])", 79 | " ", 80 | "workouts = workouts2", 81 | "workouts" 82 | ], 83 | "language": "python", 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "input": [ 89 | "durations = []", 90 | "for row in workouts:", 91 | " durations.append(row[3])", 92 | "durations" 93 | ], 94 | "language": "python", 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "input": [ 100 | "days = []", 101 | "for row in workouts:", 
102 | " date = row[0]", 103 | " day = date.day", 104 | " days.append(day)", 105 | "days" 106 | ], 107 | "language": "python", 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "input": [ 113 | "plot(days, durations, 'b-', linewidth=3)", 114 | "xlabel('Day of month')", 115 | "ylabel('Duration of workout')", 116 | "title('Titus\\'s workout')", 117 | "grid()", 118 | "xticks(range(1, max(days) + 1))", 119 | "axis(ymax=70)", 120 | "savefig('/root/Dropbox/ngs-day2.pdf')" 121 | ], 122 | "language": "python", 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "source": [ 128 | "# Loading matplotlib gallery code", 129 | "", 130 | "Go to http://matplotlib.sourceforge.net/gallery.html, find the source code link, and", 131 | "then put it in an ipynb cell after '%load' and it will be loaded into the next cell." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "input": [ 137 | "%load http://matplotlib.sourceforge.net/mpl_examples/api/radar_chart.py" 138 | ], 139 | "language": "python", 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "input": [ 145 | "import numpy as np", 146 | "", 147 | "import matplotlib.pyplot as plt", 148 | "from matplotlib.projections.polar import PolarAxes ", 149 | "from matplotlib.projections import register_projection ", 150 | "", 151 | "def radar_factory(num_vars, frame='circle'): ", 152 | " \"\"\"Create a radar chart with `num_vars` axes.\"\"\" ", 153 | " # calculate evenly-spaced axis angles ", 154 | " theta = 2*np.pi * np.linspace(0, 1-1./num_vars, num_vars) ", 155 | " # rotate theta such that the first axis is at the top ", 156 | " theta += np.pi/2 ", 157 | "", 158 | " def draw_poly_frame(self, x0, y0, r): ", 159 | " # TODO: use transforms to convert (x, y) to (r, theta)", 160 | " verts = [(r*np.cos(t) + x0, r*np.sin(t) + y0) for t in theta] ", 161 | " return plt.Polygon(verts, closed=True, edgecolor='k') ", 162 | " ", 163 | " def draw_circle_frame(self, x0, y0, r): ", 164 | " 
return plt.Circle((x0, y0), r) ", 165 | "", 166 | " frame_dict = {'polygon': draw_poly_frame, 'circle': draw_circle_frame} ", 167 | " if frame not in frame_dict: ", 168 | " raise ValueError, 'unknown value for `frame`: %s' % frame ", 169 | "", 170 | " class RadarAxes(PolarAxes): ", 171 | " \"\"\"Class for creating a radar chart (a.k.a. a spider or star chart) ", 172 | " ", 173 | " http://en.wikipedia.org/wiki/Radar_chart ", 174 | " \"\"\" ", 175 | " name = 'radar' ", 176 | " # use 1 line segment to connect specified points ", 177 | " RESOLUTION = 1 ", 178 | " # define draw_frame method ", 179 | " draw_frame = frame_dict[frame] ", 180 | " ", 181 | " def fill(self, *args, **kwargs): ", 182 | " \"\"\"Override fill so that line is closed by default\"\"\" ", 183 | " closed = kwargs.pop('closed', True) ", 184 | " return super(RadarAxes, self).fill(closed=closed, *args, **kwargs) ", 185 | " ", 186 | " def plot(self, *args, **kwargs): ", 187 | " \"\"\"Override plot so that line is closed by default\"\"\" ", 188 | " lines = super(RadarAxes, self).plot(*args, **kwargs) ", 189 | " for line in lines: ", 190 | " self._close_line(line) ", 191 | " ", 192 | " def _close_line(self, line): ", 193 | " x, y = line.get_data() ", 194 | " # FIXME: markers at x[0], y[0] get doubled-up ", 195 | " if x[0] != x[-1]: ", 196 | " x = np.concatenate((x, [x[0]])) ", 197 | " y = np.concatenate((y, [y[0]])) ", 198 | " line.set_data(x, y) ", 199 | " ", 200 | " def set_varlabels(self, labels): ", 201 | " self.set_thetagrids(theta * 180/np.pi, labels) ", 202 | " ", 203 | " def _gen_axes_patch(self): ", 204 | " x0, y0 = (0.5, 0.5) ", 205 | " r = 0.5 ", 206 | " return self.draw_frame(x0, y0, r)", 207 | " ", 208 | " register_projection(RadarAxes) ", 209 | " return theta ", 210 | "", 211 | "", 212 | "if __name__ == '__main__': ", 213 | " #The following data is from the Denver Aerosol Sources and Health study. 
", 214 | " #See doi:10.1016/j.atmosenv.2008.12.017 ", 215 | " #", 216 | " #The data are pollution source profile estimates for five modeled pollution", 217 | " #sources (e.g., cars, wood-burning, etc) that emit 7-9 chemical species.", 218 | " #The radar charts are experimented with here to see if we can nicely ", 219 | " #visualize how the modeled source profiles change across four scenarios:", 220 | " # 1) No gas-phase species present, just seven particulate counts on", 221 | " # Sulfate", 222 | " # Nitrate", 223 | " # Elemental Carbon (EC)", 224 | " # Organic Carbon fraction 1 (OC)", 225 | " # Organic Carbon fraction 2 (OC2)", 226 | " # Organic Carbon fraction 3 (OC3)", 227 | " # Pyrolized Organic Carbon (OP)", 228 | " # 2)Inclusion of gas-phase specie carbon monoxide (CO) ", 229 | " # 3)Inclusion of gas-phase specie ozone (O3). ", 230 | " # 4)Inclusion of both gas-phase speciesis present...", 231 | " N = 9", 232 | " theta = radar_factory(N)", 233 | " spoke_labels = ['Sulfate', 'Nitrate', 'EC', 'OC1', 'OC2', 'OC3', 'OP', 'CO', ", 234 | " 'O3']", 235 | " f1_base = [0.88, 0.01, 0.03, 0.03, 0.00, 0.06, 0.01, 0.00, 0.00]", 236 | " f1_CO = [0.88, 0.02, 0.02, 0.02, 0.00, 0.05, 0.00, 0.05, 0.00] ", 237 | " f1_O3 = [0.89, 0.01, 0.07, 0.00, 0.00, 0.05, 0.00, 0.00, 0.03] ", 238 | " f1_both = [0.87, 0.01, 0.08, 0.00, 0.00, 0.04, 0.00, 0.00, 0.01] ", 239 | "", 240 | " f2_base = [0.07, 0.95, 0.04, 0.05, 0.00, 0.02, 0.01, 0.00, 0.00]", 241 | " f2_CO = [0.08, 0.94, 0.04, 0.02, 0.00, 0.01, 0.12, 0.04, 0.00] ", 242 | " f2_O3 = [0.07, 0.95, 0.05, 0.04, 0.00, 0.02, 0.12, 0.00, 0.00] ", 243 | " f2_both = [0.09, 0.95, 0.02, 0.03, 0.00, 0.01, 0.13, 0.06, 0.00] ", 244 | "", 245 | " f3_base = [0.01, 0.02, 0.85, 0.19, 0.05, 0.10, 0.00, 0.00, 0.00]", 246 | " f3_CO = [0.01, 0.01, 0.79, 0.10, 0.00, 0.05, 0.00, 0.31, 0.00] ", 247 | " f3_O3 = [0.01, 0.02, 0.86, 0.27, 0.16, 0.19, 0.00, 0.00, 0.00] ", 248 | " f3_both = [0.01, 0.02, 0.71, 0.24, 0.13, 0.16, 0.00, 0.50, 0.00] ", 249 | " ", 250 | " 
f4_base = [0.02, 0.01, 0.07, 0.01, 0.21, 0.12, 0.98, 0.00, 0.00]", 251 | " f4_CO = [0.00, 0.02, 0.03, 0.38, 0.31, 0.31, 0.00, 0.59, 0.00] ", 252 | " f4_O3 = [0.01, 0.03, 0.00, 0.32, 0.29, 0.27, 0.00, 0.00, 0.95] ", 253 | " f4_both = [0.01, 0.03, 0.00, 0.28, 0.24, 0.23, 0.00, 0.44, 0.88] ", 254 | "", 255 | " f5_base = [0.01, 0.01, 0.02, 0.71, 0.74, 0.70, 0.00, 0.00, 0.00]", 256 | " f5_CO = [0.02, 0.02, 0.11, 0.47, 0.69, 0.58, 0.88, 0.00, 0.00] ", 257 | " f5_O3 = [0.02, 0.00, 0.03, 0.37, 0.56, 0.47, 0.87, 0.00, 0.00] ", 258 | " f5_both = [0.02, 0.00, 0.18, 0.45, 0.64, 0.55, 0.86, 0.00, 0.16] ", 259 | "", 260 | " fig = plt.figure(figsize=(9,9))", 261 | " # adjust spacing around the subplots", 262 | " fig.subplots_adjust(wspace=0.25, hspace=0.20, top=0.85, bottom=0.05)", 263 | " title_list = ['Basecase', 'With CO', 'With O3', 'CO & O3']", 264 | " data = {'Basecase': [f1_base, f2_base, f3_base, f4_base, f5_base],", 265 | " 'With CO': [f1_CO, f2_CO, f3_CO, f4_CO, f5_CO],", 266 | " 'With O3': [f1_O3, f2_O3, f3_O3, f4_O3, f5_O3], ", 267 | " 'CO & O3': [f1_both, f2_both, f3_both, f4_both, f5_both]}", 268 | " colors = ['b', 'r', 'g', 'm', 'y']", 269 | " # chemicals range from 0 to 1", 270 | " radial_grid = [0.2, 0.4, 0.6, 0.8]", 271 | " # If you don't care about the order, you can loop over data_dict.items()", 272 | " for n, title in enumerate(title_list):", 273 | " ax = fig.add_subplot(2, 2, n+1, projection='radar')", 274 | " plt.rgrids(radial_grid)", 275 | " ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),", 276 | " horizontalalignment='center', verticalalignment='center')", 277 | " for d, color in zip(data[title], colors):", 278 | " ax.plot(theta, d, color=color) ", 279 | " ax.fill(theta, d, facecolor=color, alpha=0.25) ", 280 | " ax.set_varlabels(spoke_labels)", 281 | " # add legend relative to top-left plot", 282 | " plt.subplot(2,2,1)", 283 | " labels = ('Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5')", 284 | " legend = 
plt.legend(labels, loc=(0.9, .95), labelspacing=0.1)", 285 | " plt.setp(legend.get_texts(), fontsize='small')", 286 | " plt.figtext(0.5, 0.965, '5-Factor Solution Profiles Across Four Scenarios', ", 287 | " ha='center', color='black', weight='bold', size='large') ", 288 | " plt.show()", 289 | "" 290 | ], 291 | "language": "python", 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "input": [ 297 | "" 298 | ], 299 | "language": "python", 300 | "outputs": [] 301 | } 302 | ] 303 | } 304 | ] 305 | } -------------------------------------------------------------------------------- /ngs-12-python-rundown.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-12-python-rundown" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "# Python" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "input": [ 18 | "a = 5", 19 | "b = 'hello, world'", 20 | "c = '5'", 21 | "print a", 22 | "print b", 23 | "print c", 24 | "", 25 | "a = \"five\"", 26 | "d = \"someone else's problem\"", 27 | "", 28 | "e = 10.5", 29 | "print e", 30 | "", 31 | "f = \"\"\"this is a triple quote.", 32 | "", 33 | "It actually supports multiple lines", 34 | "\"\"\"" 35 | ], 36 | "language": "python", 37 | "outputs": [ 38 | { 39 | "output_type": "stream", 40 | "stream": "stdout", 41 | "text": [ 42 | "5", 43 | "hello, world", 44 | "5", 45 | "10.5", 46 | "" 47 | ] 48 | } 49 | ], 50 | "prompt_number": 5 51 | }, 52 | { 53 | "cell_type": "code", 54 | "input": [ 55 | "# list", 56 | "x = [a, b, c, d, 20, 30, 40]", 57 | "print x", 58 | "print x[0]", 59 | "print x[4]" 60 | ], 61 | "language": "python", 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "stream": "stdout", 66 | "text": [ 67 | "['five', 'hello, world', '5', \"someone else's problem\", 20, 30, 40]", 68 | "five", 69 | "20", 70 | "" 71 | ] 72 | } 73 | ], 74 | "prompt_number": 9 75 | 
}, 76 | { 77 | "cell_type": "code", 78 | "input": [ 79 | "y = [10, 20, 30]", 80 | "z = [15, 25, 35, 45]", 81 | "x = [y, z]", 82 | "print x", 83 | "print x[0]", 84 | "print x[0][2]" 85 | ], 86 | "language": "python", 87 | "outputs": [ 88 | { 89 | "output_type": "stream", 90 | "stream": "stdout", 91 | "text": [ 92 | "[[10, 20, 30], [15, 25, 35, 45]]", 93 | "[10, 20, 30]", 94 | "30", 95 | "" 96 | ] 97 | } 98 | ], 99 | "prompt_number": 17 100 | }, 101 | { 102 | "cell_type": "code", 103 | "input": [ 104 | "print 5*5", 105 | "print 30/2", 106 | "print 10+10", 107 | "print 52-5", 108 | "a = 10", 109 | "print a*40" 110 | ], 111 | "language": "python", 112 | "outputs": [ 113 | { 114 | "output_type": "stream", 115 | "stream": "stdout", 116 | "text": [ 117 | "25", 118 | "15", 119 | "20", 120 | "47", 121 | "400", 122 | "" 123 | ] 124 | } 125 | ], 126 | "prompt_number": 19 127 | }, 128 | { 129 | "cell_type": "code", 130 | "input": [ 131 | "a = True", 132 | "", 133 | "if a:", 134 | " print 'a is True'", 135 | "else:", 136 | " print 'a is False'", 137 | "", 138 | "b = False", 139 | "if not b:", 140 | " print 'b is False'", 141 | "else:", 142 | " print 'b is True'", 143 | "" 144 | ], 145 | "language": "python", 146 | "outputs": [ 147 | { 148 | "output_type": "stream", 149 | "stream": "stdout", 150 | "text": [ 151 | "a is True", 152 | "b is False", 153 | "" 154 | ] 155 | } 156 | ], 157 | "prompt_number": 24 158 | }, 159 | { 160 | "cell_type": "code", 161 | "input": [ 162 | "x = 0", 163 | "y = 1", 164 | "", 165 | "a = \"\"", 166 | "b = \"something\"", 167 | "", 168 | "if a:", 169 | " print 'a evaluates to True'", 170 | "if b:", 171 | " print 'b evaluates to True'", 172 | "if x:", 173 | " print 'x evaluates to True'", 174 | "if y:", 175 | " print 'y evaluates to True'", 176 | " ", 177 | "if x == 0:", 178 | " print 'x is zero'", 179 | "", 180 | "" 181 | ], 182 | "language": "python", 183 | "outputs": [ 184 | { 185 | "output_type": "stream", 186 | "stream": "stdout", 187 | "text": [ 
188 | "b evaluates to True", 189 | "y evaluates to True", 190 | "x is zero", 191 | "" 192 | ] 193 | } 194 | ], 195 | "prompt_number": 27 196 | }, 197 | { 198 | "cell_type": "code", 199 | "input": [ 200 | "x = 5", 201 | "if x > 5:", 202 | " print 'x is greater than 5'", 203 | "if x <= 5:", 204 | " print 'x is less than or equal to 5'" 205 | ], 206 | "language": "python", 207 | "outputs": [ 208 | { 209 | "output_type": "stream", 210 | "stream": "stdout", 211 | "text": [ 212 | "x is less than or equal to 5", 213 | "" 214 | ] 215 | } 216 | ], 217 | "prompt_number": 29 218 | }, 219 | { 220 | "cell_type": "code", 221 | "input": [ 222 | "x = []", 223 | "y = ['not empty', 1, 2, 3]", 224 | "z = [5, 6, 7]", 225 | "", 226 | "if x:", 227 | " print 'x is True'", 228 | "if y:", 229 | " print 'y is True'", 230 | "if z:", 231 | " print 'z is True'" 232 | ], 233 | "language": "python", 234 | "outputs": [ 235 | { 236 | "output_type": "stream", 237 | "stream": "stdout", 238 | "text": [ 239 | "y is True", 240 | "z is True", 241 | "" 242 | ] 243 | } 244 | ], 245 | "prompt_number": 28 246 | }, 247 | { 248 | "cell_type": "code", 249 | "input": [ 250 | "5 / 2.0" 251 | ], 252 | "language": "python", 253 | "outputs": [ 254 | { 255 | "output_type": "pyout", 256 | "prompt_number": 31, 257 | "text": [ 258 | "2.5" 259 | ] 260 | } 261 | ], 262 | "prompt_number": 31 263 | }, 264 | { 265 | "cell_type": "code", 266 | "input": [ 267 | "x = [ 'five', 10, 2.5 ]", 268 | "for a in x:", 269 | " print a", 270 | "", 271 | "for a in range(4, 10):", 272 | " print a*2" 273 | ], 274 | "language": "python", 275 | "outputs": [ 276 | { 277 | "output_type": "stream", 278 | "stream": "stdout", 279 | "text": [ 280 | "five", 281 | "10", 282 | "2.5", 283 | "8", 284 | "10", 285 | "12", 286 | "14", 287 | "16", 288 | "18", 289 | "" 290 | ] 291 | } 292 | ], 293 | "prompt_number": 34 294 | }, 295 | { 296 | "cell_type": "code", 297 | "input": [ 298 | "x = ['5', 'mark', 'neal']", 299 | "print x", 300 | "print len(x)", 301 | 
"x.append('jorge')", 302 | "print x", 303 | "print len(x)", 304 | "", 305 | "print x[2]", 306 | "", 307 | "x = ['5', 'mark', 'neal', 'jorge']", 308 | "y = []", 309 | "for a in x:", 310 | " if not a.startswith('j'):", 311 | " y.append(a)", 312 | "print y" 313 | ], 314 | "language": "python", 315 | "outputs": [ 316 | { 317 | "output_type": "stream", 318 | "stream": "stdout", 319 | "text": [ 320 | "['5', 'mark', 'neal']", 321 | "3", 322 | "['5', 'mark', 'neal', 'jorge']", 323 | "4", 324 | "neal", 325 | "['5', 'mark', 'neal']", 326 | "" 327 | ] 328 | } 329 | ], 330 | "prompt_number": 43 331 | }, 332 | { 333 | "cell_type": "code", 334 | "input": [ 335 | "x = ['5', 'mark', 'neal', 'jorge']", 336 | "y = []", 337 | "for a in x:", 338 | " print \"I am at\", a", 339 | " if not a.startswith('j'):", 340 | " print \"apparently here a does not start with j\", a", 341 | " y.append(a)", 342 | " print y", 343 | " else:", 344 | " print 'rejecting', a", 345 | "print y" 346 | ], 347 | "language": "python", 348 | "outputs": [ 349 | { 350 | "output_type": "stream", 351 | "stream": "stdout", 352 | "text": [ 353 | "I am at 5", 354 | "apparently here a does not start with j 5", 355 | "['5']", 356 | "I am at mark", 357 | "apparently here a does not start with j mark", 358 | "['5', 'mark']", 359 | "I am at neal", 360 | "apparently here a does not start with j neal", 361 | "['5', 'mark', 'neal']", 362 | "I am at jorge", 363 | "rejecting jorge", 364 | "['5', 'mark', 'neal']", 365 | "" 366 | ] 367 | } 368 | ], 369 | "prompt_number": 46 370 | }, 371 | { 372 | "cell_type": "code", 373 | "input": [ 374 | "this_a_really_complicated_variable_name = 5", 375 | "" 376 | ], 377 | "language": "python", 378 | "outputs": [], 379 | "prompt_number": 47 380 | }, 381 | { 382 | "cell_type": "code", 383 | "input": [ 384 | "a = \"some string\"", 385 | "if a.startswith('ome str', 1):", 386 | " print 'yes it does'" 387 | ], 388 | "language": "python", 389 | "outputs": [ 390 | { 391 | "output_type": "stream", 392 | 
"stream": "stdout", 393 | "text": [ 394 | "yes it does", 395 | "" 396 | ] 397 | } 398 | ], 399 | "prompt_number": 53 400 | }, 401 | { 402 | "cell_type": "code", 403 | "input": [ 404 | "# you can also ask Python what functions are available on a list", 405 | "dir(x)", 406 | "help(x)" 407 | ], 408 | "language": "python", 409 | "outputs": [ 410 | { 411 | "output_type": "stream", 412 | "stream": "stdout", 413 | "text": [ 414 | "Help on list object:", 415 | "", 416 | "class list(object)", 417 | " | list() -> new empty list", 418 | " | list(iterable) -> new list initialized from iterable's items", 419 | " | ", 420 | " | Methods defined here:", 421 | " | ", 422 | " | __add__(...)", 423 | " | x.__add__(y) <==> x+y", 424 | " | ", 425 | " | __contains__(...)", 426 | " | x.__contains__(y) <==> y in x", 427 | " | ", 428 | " | __delitem__(...)", 429 | " | x.__delitem__(y) <==> del x[y]", 430 | " | ", 431 | " | __delslice__(...)", 432 | " | x.__delslice__(i, j) <==> del x[i:j]", 433 | " | ", 434 | " | Use of negative indices is not supported.", 435 | " | ", 436 | " | __eq__(...)", 437 | " | x.__eq__(y) <==> x==y", 438 | " | ", 439 | " | __ge__(...)", 440 | " | x.__ge__(y) <==> x>=y", 441 | " | ", 442 | " | __getattribute__(...)", 443 | " | x.__getattribute__('name') <==> x.name", 444 | " | ", 445 | " | __getitem__(...)", 446 | " | x.__getitem__(y) <==> x[y]", 447 | " | ", 448 | " | __getslice__(...)", 449 | " | x.__getslice__(i, j) <==> x[i:j]", 450 | " | ", 451 | " | Use of negative indices is not supported.", 452 | " | ", 453 | " | __gt__(...)", 454 | " | x.__gt__(y) <==> x>y", 455 | " | ", 456 | " | __iadd__(...)", 457 | " | x.__iadd__(y) <==> x+=y", 458 | " | ", 459 | " | __imul__(...)", 460 | " | x.__imul__(y) <==> x*=y", 461 | " | ", 462 | " | __init__(...)", 463 | " | x.__init__(...) 
initializes x; see help(type(x)) for signature", 464 | " | ", 465 | " | __iter__(...)", 466 | " | x.__iter__() <==> iter(x)", 467 | " | ", 468 | " | __le__(...)", 469 | " | x.__le__(y) <==> x<=y", 470 | " | ", 471 | " | __len__(...)", 472 | " | x.__len__() <==> len(x)", 473 | " | ", 474 | " | __lt__(...)", 475 | " | x.__lt__(y) <==> x<y", 476 | " | ", 477 | " | __mul__(...)", 478 | " | x.__mul__(n) <==> x*n", 479 | " | ", 480 | " | __ne__(...)", 481 | " | x.__ne__(y) <==> x!=y", 482 | " | ", 483 | " | __repr__(...)", 484 | " | x.__repr__() <==> repr(x)", 485 | " | ", 486 | " | __reversed__(...)", 487 | " | L.__reversed__() -- return a reverse iterator over the list", 488 | " | ", 489 | " | __rmul__(...)", 490 | " | x.__rmul__(n) <==> n*x", 491 | " | ", 492 | " | __setitem__(...)", 493 | " | x.__setitem__(i, y) <==> x[i]=y", 494 | " | ", 495 | " | __setslice__(...)", 496 | " | x.__setslice__(i, j, y) <==> x[i:j]=y", 497 | " | ", 498 | " | Use of negative indices is not supported.", 499 | " | ", 500 | " | __sizeof__(...)", 501 | " | L.__sizeof__() -- size of L in memory, in bytes", 502 | " | ", 503 | " | append(...)", 504 | " | L.append(object) -- append object to end", 505 | " | ", 506 | " | count(...)", 507 | " | L.count(value) -> integer -- return number of occurrences of value", 508 | " | ", 509 | " | extend(...)", 510 | " | L.extend(iterable) -- extend list by appending elements from the iterable", 511 | " | ", 512 | " | index(...)", 513 | " | L.index(value, [start, [stop]]) -> integer -- return first index of value.", 514 | " | Raises ValueError if the value is not present.", 515 | " | ", 516 | " | insert(...)", 517 | " | L.insert(index, object) -- insert object before index", 518 | " | ", 519 | " | pop(...)", 520 | " | L.pop([index]) -> item -- remove and return item at index (default last).", 521 | " | Raises IndexError if list is empty or index is out of range.", 522 | " | ", 523 | " | remove(...)", 524 | " | L.remove(value) -- remove first occurrence of value.", 525 | " | Raises ValueError if the value is not present.", 526 | 
" | ", 527 | " | reverse(...)", 528 | " | L.reverse() -- reverse *IN PLACE*", 529 | " | ", 530 | " | sort(...)", 531 | " | L.sort(cmp=None, key=None, reverse=False) -- stable sort *IN PLACE*;", 532 | " | cmp(x, y) -> -1, 0, 1", 533 | " | ", 534 | " | ----------------------------------------------------------------------", 535 | " | Data and other attributes defined here:", 536 | " | ", 537 | " | __hash__ = None", 538 | " | ", 539 | " | __new__ = ", 540 | " | T.__new__(S, ...) -> a new object with type S, a subtype of T", 541 | "", 542 | "" 543 | ] 544 | } 545 | ], 546 | "prompt_number": 58 547 | }, 548 | { 549 | "cell_type": "code", 550 | "input": [ 551 | "import math", 552 | "print math.factorial(10)" 553 | ], 554 | "language": "python", 555 | "outputs": [ 556 | { 557 | "output_type": "stream", 558 | "stream": "stdout", 559 | "text": [ 560 | "3628800", 561 | "" 562 | ] 563 | } 564 | ], 565 | "prompt_number": 59 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "source": [ 570 | "# Other Python resources", 571 | "", 572 | "Introductory Python tutorial:", 573 | "http://docs.python.org/tutorial/index.html", 574 | "", 575 | "Head First Python", 576 | "http://www.headfirstlabs.com/books/hfpython/", 577 | "", 578 | "Software Carpentry:", 579 | "http://software-carpentry.org/", 580 | "", 581 | "Python Standard Library:", 582 | "http://docs.python.org/library/index.html", 583 | "", 584 | "Python Cookbook:", 585 | "http://code.activestate.com/recipes/langs/python/", 586 | "", 587 | "Dive Into Python:", 588 | "http://www.diveintopython.net/", 589 | "", 590 | "Intermediate and Advanced Software Carpentry:", 591 | "http://ivory.idyll.org/articles/advanced-swc/" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "source": [ 597 | "# Bioinformatics resources", 598 | "", 599 | "Q&A site: http://www.biostars.org/", 600 | "", 601 | "seqanswers.com" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "input": [ 607 | "cd /mnt", 608 | "" 609 | ], 610 | 
"language": "python", 611 | "outputs": [ 612 | { 613 | "output_type": "stream", 614 | "stream": "stdout", 615 | "text": [ 616 | "/mnt" 617 | ] 618 | }, 619 | { 620 | "output_type": "stream", 621 | "stream": "stdout", 622 | "text": [ 623 | "", 624 | "" 625 | ] 626 | } 627 | ], 628 | "prompt_number": 61 629 | }, 630 | { 631 | "cell_type": "code", 632 | "input": [ 633 | "ls" 634 | ], 635 | "language": "python", 636 | "outputs": [ 637 | { 638 | "output_type": "stream", 639 | "stream": "stdout", 640 | "text": [ 641 | "\u001b[0m\u001b[01;34mblast-2.2.24\u001b[0m/ mouse.protein.faa.psi", 642 | "blast-2.2.24-ia32-linux.tar.gz mouse.protein.faa.psq", 643 | "formatdb.log out.txt", 644 | "\u001b[01;34mlost+found\u001b[0m/ zebrafish.protein.faa", 645 | "mm-first.fa zebrafish.protein.faa.phr", 646 | "mouse.protein.faa zebrafish.protein.faa.pin", 647 | "mouse.protein.faa.phr zebrafish.protein.faa.pnd", 648 | "mouse.protein.faa.pin zebrafish.protein.faa.pni", 649 | "mouse.protein.faa.pnd zebrafish.protein.faa.psd", 650 | "mouse.protein.faa.pni zebrafish.protein.faa.psi", 651 | "mouse.protein.faa.psd zebrafish.protein.faa.psq", 652 | "" 653 | ] 654 | } 655 | ], 656 | "prompt_number": 62 657 | }, 658 | { 659 | "cell_type": "code", 660 | "input": [ 661 | "!head -11 mouse.protein.faa >mm-first.fa", 662 | "!cat mm-first.fa" 663 | ], 664 | "language": "python", 665 | "outputs": [ 666 | { 667 | "output_type": "stream", 668 | "stream": "stdout", 669 | "text": [ 670 | ">gi|83029092|ref|XP_357633.3| PREDICTED: similar to Ig kappa chain V-V region L7 precursor [Mus musculus]", 671 | "MRSLPASELSLPYHQDFRPDSSCPGLTDNHLLCEPTQKPRYPDLGRKRSGENRHRYRDVCISFIRRSFSALCHNPQKLRE", 672 | "NLKMVSTSQLLGLLLFWTSASRCDIVMTQSPATLSVTPGDRVSLSCRASQSISDYLHWYQQKSHESPRLLIKYASQSISG", 673 | "IPSRFSGSGSGSDFTLSINSVEPEDVGVYYCQNGHSFPPTMMQTIAKTSRKAEVRG", 674 | ">gi|70778726|ref|NP_067474.2| WD repeat domain 8 [Mus musculus]", 675 | "MNFSESFKLSGLLCRFSPDGKYLASCVQYRLVIRDVTTLQILQLYTCLDQIQHIEWSADSLFILCAMYRRGLVQVWSLEQ", 676 | 
"PEWHCKIDEGSAGLVASCWSPDGRHILNTTEFHLRITVWSLCTKSVSYIKYPKACQQGLTFTRDGRYLALAERRDCRDYV", 677 | "SIFVCSDWQLLRHFDTDTQDLTGIEWAPNGCVLAAWDTCLEYKVLLYSLDGRLLSAYCAYEWSLGIKSVAWSPSSQFLAI", 678 | "GSYDGKVRLLNHVTWKMITEFGHPATINNPKTVVYKEAEKSPLLGLGHLSFPPPRAMAGALSTSESKYEIASGPVSLQTL", 679 | "KPVADRANPRMGVGMLAFSSDSYFLASRNDNVPNAVWIWDIQKLKLFVVLEHMSPVRSFQWDPQQPRLAICTGGSKVYLW", 680 | "SPAGCVSVQVPGEGDFPVLGLCWHLSGDSLALLSKDHFCLCFLETKERVGTAYEQRDGMPRT", 681 | "" 682 | ] 683 | } 684 | ], 685 | "prompt_number": 64 686 | }, 687 | { 688 | "cell_type": "code", 689 | "input": [ 690 | "!head -9 zebrafish.protein.faa > zfin-first.fa" 691 | ], 692 | "language": "python", 693 | "outputs": [], 694 | "prompt_number": 67 695 | }, 696 | { 697 | "cell_type": "code", 698 | "input": [ 699 | "ls" 700 | ], 701 | "language": "python", 702 | "outputs": [ 703 | { 704 | "output_type": "stream", 705 | "stream": "stdout", 706 | "text": [ 707 | "\u001b[0m\u001b[01;34mblast-2.2.24\u001b[0m/ mouse.protein.faa.psq", 708 | "blast-2.2.24-ia32-linux.tar.gz out.txt", 709 | "formatdb.log zebrafish.protein.faa", 710 | "\u001b[01;34mlost+found\u001b[0m/ zebrafish.protein.faa.phr", 711 | "mm-first.fa zebrafish.protein.faa.pin", 712 | "mouse.protein.faa zebrafish.protein.faa.pnd", 713 | "mouse.protein.faa.phr zebrafish.protein.faa.pni", 714 | "mouse.protein.faa.pin zebrafish.protein.faa.psd", 715 | "mouse.protein.faa.pnd zebrafish.protein.faa.psi", 716 | "mouse.protein.faa.pni zebrafish.protein.faa.psq", 717 | "mouse.protein.faa.psd zfin-first.fa", 718 | "mouse.protein.faa.psi", 719 | "" 720 | ] 721 | } 722 | ], 723 | "prompt_number": 68 724 | }, 725 | { 726 | "cell_type": "code", 727 | "input": [ 728 | "import os", 729 | "for filename in ['mm-first.fa', 'zfin-first.fa']:", 730 | " !echo {filename}.out", 731 | " !blastall -i {filename} -o {filename}.out -d zebrafish.protein.faa -e 1e-10 -p blastp", 732 | " #os.system('blastall -i %s -o %s.out -d zebrafish.protein.faa -p blastp' % (filename, filename))", 733 | "" 734 | ], 735 | 
"language": "python", 736 | "outputs": [ 737 | { 738 | "output_type": "stream", 739 | "stream": "stdout", 740 | "text": [ 741 | "mm-first.fa.out", 742 | "" 743 | ] 744 | }, 745 | { 746 | "output_type": "stream", 747 | "stream": "stdout", 748 | "text": [ 749 | "zfin-first.fa.out", 750 | "" 751 | ] 752 | } 753 | ], 754 | "prompt_number": 83 755 | }, 756 | { 757 | "cell_type": "code", 758 | "input": [ 759 | "" 760 | ], 761 | "language": "python", 762 | "outputs": [] 763 | } 764 | ] 765 | } 766 | ] 767 | } -------------------------------------------------------------------------------- /ngs-31-python-and-short-reads.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-31-python-and-short-reads" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "input": [ 12 | "!df", 13 | "!mount /dev/xvdf /data" 14 | ], 15 | "language": "python", 16 | "outputs": [] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "input": [ 21 | "cd /mnt" 22 | ], 23 | "language": "python", 24 | "outputs": [] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "input": [ 29 | "!head /data/drosophila/RAL357_1.fastq" 30 | ], 31 | "language": "python", 32 | "outputs": [] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "source": [ 37 | "# Introducing screed", 38 | "", 39 | "screed is a Python library built in the Brown Lab to work with sequences. 
It includes both functionality for reading in sequences in FASTA and FASTQ, and creating a random-access sequence database for efficiently retrieving specific named sequences.", 40 | "", 41 | "One of the most convenient things about screed is that it will automatically figure out whether or not it's a FASTA or FASTQ file, and automatically read gz and bz2 files.", 42 | "", 43 | "Read more about it here:", 44 | "", 45 | "http://readthedocs.org/docs/screed/en/latest/" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "input": [ 51 | "import screed", 52 | "", 53 | "for read in screed.open('/data/drosophila/RAL357_1.fastq'):", 54 | " print read", 55 | " break", 56 | "" 57 | ], 58 | "language": "python", 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "input": [ 64 | "for read in screed.open('/data/drosophila/RAL357_1.fastq'):", 65 | " print read.name, read.sequence", 66 | " break" 67 | ], 68 | "language": "python", 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "input": [ 74 | "for n, read in enumerate(screed.open('/data/drosophila/RAL357_1.fastq')):", 75 | " if n > 5:", 76 | " break", 77 | " print read.name, read.sequence" 78 | ], 79 | "language": "python", 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "input": [ 85 | "lengths = []", 86 | "for n, read in enumerate(screed.open('/data/drosophila/RAL357_1.fastq')):", 87 | " if n > 10000: break", 88 | " ", 89 | " seq = read.sequence", 90 | " first_n = seq.find('N') # or you could do seq = seq.rstrip('N')", 91 | " if first_n >= 0:", 92 | " seq = seq[0:first_n]", 93 | " ", 94 | " read_len = len(seq)", 95 | " lengths.append(read_len)", 96 | "" 97 | ], 98 | "language": "python", 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "input": [ 104 | "print seq", 105 | "print seq[:5]" 106 | ], 107 | "language": "python", 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "input": [ 113 | "hist(lengths, cumulative=True)", 114 | 
"axis(ymax=100, xmin=0)" 115 | ], 116 | "language": "python", 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "input": [ 122 | "nlocs = []", 123 | "for n, read in enumerate(screed.open('/data/drosophila/RAL357_1.fastq')):", 124 | " if n > 10000: break", 125 | " ", 126 | " seq = read.sequence", 127 | " first_n = seq.find('N') # or you could do seq = seq.rstrip('N')", 128 | " if first_n >= 0:", 129 | " nlocs.append(first_n)", 130 | "" 131 | ], 132 | "language": "python", 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "input": [ 138 | "hist(nlocs, cumulative=True, bins=100)", 139 | "axis(ymax=100, xmin=0)" 140 | ], 141 | "language": "python", 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "input": [ 147 | "# pick off the first 100,000 sequences from the FASTQ file and align them.", 148 | "!curl -O ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r5.37_FB2011_05/fasta/dmel-all-chromosome-r5.37.fasta.gz", 149 | "!gunzip -f dmel-all-chromosome-r5.37.fasta.gz", 150 | "!bowtie-index dmel-all-chromosome-r5.37.fasta drosophila_bowtie", 151 | "", 152 | "!head -400000 /data/drosophila/RAL357_1.fastq > RAL357_1.100k.fastq", 153 | "!bowtie -p 2 -q drosophila_bowtie RAL357_1.100k.fastq > ral357-random.map" 154 | ], 155 | "language": "python", 156 | "outputs": [] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "input": [ 161 | "!head -1 ral357-random.map" 162 | ], 163 | "language": "python", 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "input": [ 169 | "# define a function that we can use to read in the mapping output file", 170 | "def read_mapping_file(filename):", 171 | " for line in open(filename):", 172 | " _, orient, refname, pos, read, _, _, mismatches = line.split('\\t')[:8]", 173 | " pos = int(pos)", 174 | " mismatches = mismatches.split(',')", 175 | " x = []", 176 | " for posn in mismatches:", 177 | " posn = posn.split(':')[0]", 178 | " posn = int(posn)", 179 | " 
x.append(posn)", 180 | " yield orient, refname, pos, read, x" 181 | ], 182 | "language": "python", 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "input": [ 188 | "d = {}", 189 | "", 190 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('ral357-random.map'):", 191 | " for posn in mismatches:", 192 | " d[posn] = d.get(posn, 0) + 1" 193 | ], 194 | "language": "python", 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "input": [ 200 | "items = d.items()", 201 | "items.sort()", 202 | "", 203 | "x = []", 204 | "y = []", 205 | "for (a, b) in items:", 206 | " x.append(a)", 207 | " y.append(b)", 208 | "" 209 | ], 210 | "language": "python", 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "input": [ 216 | "plot(x, y)", 217 | "axis(ymax=4000)" 218 | ], 219 | "language": "python", 220 | "outputs": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "input": [ 225 | "" 226 | ], 227 | "language": "python", 228 | "outputs": [] 229 | } 230 | ] 231 | } 232 | ] 233 | } -------------------------------------------------------------------------------- /ngs-41-velvet-genome-assembly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-41-velvet-genome-assembly" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "# Using Velvet for genome assembly.", 13 | "", 14 | "Velvet is a commonly-used genome assembler; below, we'll use it to assemble some E. 
coli sequences.", 15 | "", 16 | "See: http://genome.cshlp.org/content/18/5/821.short" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "source": [ 22 | "## Requirements", 23 | "", 24 | "You'll need to have screed and khmer installed (see the ngs-02-install-screed and ngs-03-install-khmer notebooks).", 25 | "", 26 | "Also, mount a copy of the snapshot snap-f69fda89 as /assembly-data: briefly, make a volume in the same zone as your instance; attach it to your instance as (for example) /dev/sdf; then do" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "input": [ 32 | "!mkdir /assembly-data", 33 | "!mount /dev/xvdf /assembly-data" 34 | ], 35 | "language": "python", 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "source": [ 41 | "First, download and install Velvet." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "input": [ 47 | "cd /mnt" 48 | ], 49 | "language": "python", 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "input": [ 55 | "!curl -O http://www.ebi.ac.uk/~zerbino/velvet/velvet_1.2.06.tgz", 56 | "!tar xzf velvet_1.2.06.tgz", 57 | "!cd velvet_1.2.06 && make MAXKMERLENGTH=51 OPENMP=2", 58 | "!cp velvet_1.2.06/velvet? /usr/local/bin" 59 | ], 60 | "language": "python", 61 | "outputs": [] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "Now, copy the raw reads into the ecoli directory." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "input": [ 72 | "!mkdir ecoli", 73 | "!cp /assembly-data/ecoli*.fq.gz ecoli", 74 | "" 75 | ], 76 | "language": "python", 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "input": [ 82 | "cd /mnt/ecoli" 83 | ], 84 | "language": "python", 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "input": [ 90 | "ls" 91 | ], 92 | "language": "python", 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "source": [ 98 | "The first command you run, velveth, creates the de Bruijn graph from the reads. 
Here is where you have to specify the kinds of reads you're loading in, from where, and the k size for the graph." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "input": [ 104 | "!velveth ecoli.31 31 -fastq.gz -shortPaired ecoli_ref-1m.fq.gz" 105 | ], 106 | "language": "python", 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "The second command, velvetg, explores the graph and builds contigs and scaffolds from it." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "input": [ 118 | "!velvetg ecoli.31 -exp_cov auto -cov_cutoff auto" 119 | ], 120 | "language": "python", 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "source": [ 126 | "And voila, you're done! The final output file containing your contigs or scaffolds is in the ecoli.31 directory as 'contigs.fa'. You can use a program included with khmer, 'assemstats3.py', to look at the number of contigs, the total sum of assembly, and the max sequence length in your assembly:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "input": [ 132 | "!python /usr/local/src/khmer/sandbox/assemstats3.py 1000 ecoli.31/contigs.fa" 133 | ], 134 | "language": "python", 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "source": [ 140 | "Let's quickly calculate the contig length distribution, too." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "input": [ 146 | "import screed", 147 | "", 148 | "lengths = []", 149 | "for read in screed.open('ecoli.31/contigs.fa'):", 150 | " lengths.append(len(read.sequence))", 151 | "" 152 | ], 153 | "language": "python", 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "input": [ 159 | "hist(lengths, bins=20)" 160 | ], 161 | "language": "python", 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "source": [ 167 | "The main thing to note here is that there's a strong tendency towards really short contigs. 
This is true of ALL short-read assemblers, and it's one reason why picking a lower cutoff is important...", 168 | "", 169 | "One neat trick -- use khmer's \"extract-long-sequences\" to pick out only those sequences you think are interesting:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "input": [ 175 | "!python /usr/local/src/khmer/sandbox/extract-long-sequences.py 1000 ecoli.31/contigs.fa > ecoli-31-1k.fa" 176 | ], 177 | "language": "python", 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "source": [ 183 | "These can now be BLASTed or used for further analysis of some sort." 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "source": [ 189 | "Another thing that is commonly done is to explore what different parameters -- in particular, what k values -- do to the assembly. For example, you can run multiple assemblies with multiple different k values:" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "input": [ 195 | "for k in [33, 35]:", 196 | " !velveth ecoli.$k $k -fastq.gz -shortPaired ecoli_ref-1m.fq.gz", 197 | " !velvetg ecoli.$k -exp_cov auto -cov_cutoff auto" 198 | ], 199 | "language": "python", 200 | "outputs": [] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "input": [ 205 | "!python /usr/local/src/khmer/sandbox/assemstats3.py 1000 ecoli.*/contigs.fa" 206 | ], 207 | "language": "python", 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "source": [ 213 | "So which one of these is \"best\"? Well, I would argue that the 35 is looking pretty good, based on small number of contigs, biggest sum of bases, and longest contig. But what happens if you move up to, say, 37?" 
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "input": [ 219 | "" 220 | ], 221 | "language": "python", 222 | "outputs": [] 223 | } 224 | ] 225 | } 226 | ] 227 | } -------------------------------------------------------------------------------- /ngs-42-oases-mRNAseq-assembly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-42-oases-mRNAseq-assembly" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "# Using Oases for transcriptome assembly.", 13 | "", 14 | "Oases is a commonly-used transcriptome assembler; below, we'll use it to assemble some yeast mRNAseq data.", 15 | "", 16 | "See: http://www.ebi.ac.uk/~zerbino/oases/ and http://bioinformatics.oxfordjournals.org/content/early/2012/02/24/bioinformatics.bts094.short" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "source": [ 22 | "## Requirements", 23 | "", 24 | "If you haven't already, run through the first part of ngs-41-velvet-genome-assembly to download, build, and install Velvet.", 25 | "", 26 | "(Oases depends on Velvet.)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "Now, download and compile Oases:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "input": [ 38 | "cd /mnt" 39 | ], 40 | "language": "python", 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "input": [ 46 | "!curl -O http://www.ebi.ac.uk/~zerbino/oases/oases_0.2.07.tgz", 47 | "!tar xzf oases_0.2.07.tgz", 48 | "!cd oases_0.2.07 && make VELVET_DIR=../velvet_1.2.06 MAXKMERLENGTH=51", 49 | "!cp oases_0.2.07/oases /usr/local/bin" 50 | ], 51 | "language": "python", 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "input": [ 57 | "!mkdir yeast-oases", 58 | "!cp /assembly-data/yeast-1m.fq.gz yeast-oases" 59 | ], 60 | "language": "python", 61 | "outputs": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "input": [ 
66 | "cd yeast-oases" 67 | ], 68 | "language": "python", 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "input": [ 74 | "ls" 75 | ], 76 | "language": "python", 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "input": [ 82 | "!velveth yeast.31 31 -shortPaired -fastq.gz yeast-1m.fq.gz" 83 | ], 84 | "language": "python", 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "input": [ 90 | "!velvetg yeast.31 -read_trkg yes" 91 | ], 92 | "language": "python", 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "input": [ 98 | "!oases yeast.31" 99 | ], 100 | "language": "python", 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "source": [ 106 | "And voila, your assembled sequences are in /mnt/yeast-oases/yeast.31/transcripts.fa!" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "input": [ 112 | "!head /mnt/yeast-oases/yeast.31/transcripts.fa" 113 | ], 114 | "language": "python", 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "input": [ 120 | "!python /usr/local/src/khmer/sandbox/assemstats3.py 1000 yeast.31/transcripts.fa" 121 | ], 122 | "language": "python", 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "source": [ 128 | "As with Velvet, you can compute the contig length distribution:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "input": [ 134 | "cd /mnt/yeast-oases" 135 | ], 136 | "language": "python", 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "input": [ 142 | "import screed", 143 | "", 144 | "lengths = []", 145 | "for read in screed.open('yeast.31/transcripts.fa'):", 146 | " lengths.append(len(read.sequence))", 147 | "" 148 | ], 149 | "language": "python", 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "input": [ 155 | "hist(lengths, bins=20)" 156 | ], 157 | "language": "python", 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | 
"source": [ 163 | "At this point, it's probably worth asking -- how *do* you evaluate an mRNAseq assembly, anyway??" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "input": [ 169 | "" 170 | ], 171 | "language": "python", 172 | "outputs": [] 173 | } 174 | ] 175 | } 176 | ] 177 | } -------------------------------------------------------------------------------- /ngs-43-trinity-mRNAseq-assembly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-43-trinity-mRNAseq-assembly" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "# Using Trinity for transcriptome assembly", 13 | "", 14 | "Trinity is another commonly-used transcriptome assembler; below, we'll use it to assemble some yeast mRNAseq data.", 15 | "", 16 | "See: http://trinityrnaseq.sourceforge.net", 17 | "", 18 | "You'll need to have the assembly data mounted on /assembly-data, as with the Velvet tutorial (ngs-41-velvet-genome-assembly)." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "source": [ 24 | "First, install some other stuff --" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "input": [ 30 | "!PATH=/sbin:/usr/sbin:$$PATH apt-get -y --force-yes install libbz2-dev" 31 | ], 32 | "language": "python", 33 | "outputs": [] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "input": [ 38 | "cd /mnt" 39 | ], 40 | "language": "python", 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "input": [ 46 | "!curl -O -L http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-linux-x86_64.zip", 47 | "!unzip bowtie-0.12.7-linux-x86_64.zip", 48 | "!cp bowtie-0.12.7/bowtie* /usr/local/bin" 49 | ], 50 | "language": "python", 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "source": [ 56 | "Copy the data over." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "input": [ 62 | "!mkdir yeast-trinity", 63 | "!cp /assembly-data/yeast-1m.fq.gz yeast-trinity" 64 | ], 65 | "language": "python", 66 | "outputs": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "input": [ 71 | "cd /mnt" 72 | ], 73 | "language": "python", 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "input": [ 79 | "!cp /assembly-data/trinityrnaseq_r2012-05-18.tar.gz ./" 80 | ], 81 | "language": "python", 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "input": [ 87 | "!tar xzf trinity*.tar.gz" 88 | ], 89 | "language": "python", 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "input": [ 95 | "cd trinityrnaseq_r2012-05-18" 96 | ], 97 | "language": "python", 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "input": [ 103 | "ls" 104 | ], 105 | "language": "python", 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "source": [ 111 | "Build Trinity." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "input": [ 117 | "!make" 118 | ], 119 | "language": "python", 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "input": [ 125 | "cd /mnt/yeast-trinity/" 126 | ], 127 | "language": "python", 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "source": [ 133 | "Split the Yeast data up into left and right ends." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "input": [ 139 | "!python /usr/local/src/khmer/sandbox/split-pe.py yeast-1m.fq.gz" 140 | ], 141 | "language": "python", 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "source": [ 147 | "And now... run Trinity! This will take a while." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "input": [ 153 | "!rm -fr trinity_out_dir", 154 | "!ulimit -s unlimited && ../trinityrnaseq_r2012-05-18/Trinity.pl --kmer_method meryl --SS_lib_type RF --seqType fa --left yeast-1m.fq.gz.1 --right yeast-1m.fq.gz.2" 155 | ], 156 | "language": "python", 157 | "outputs": [] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "input": [ 162 | "!python /usr/local/src/khmer/sandbox/assemstats3.py 300 /mnt/yeast-trinity/trinity_out_dir/Trinity.fasta" 163 | ], 164 | "language": "python", 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "source": [ 170 | "Note, if you've already run the Oases assembly, you can compare with that:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "input": [ 176 | "!python /usr/local/src/khmer/sandbox/assemstats3.py 300 /mnt/yeast-oases/yeast.31/transcripts.fa" 177 | ], 178 | "language": "python", 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "source": [ 184 | "The Trinity one *looks* better... but how would you tell, anyway? More on this later." 
185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "source": [ 190 | "And, of course, we can look at the transcript length distribution here, too:" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "input": [ 196 | "import screed", 197 | "", 198 | "lengths = []", 199 | "for read in screed.open('trinity_out_dir/Trinity.fasta'):", 200 | " lengths.append(len(read.sequence))", 201 | "" 202 | ], 203 | "language": "python", 204 | "outputs": [] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "input": [ 209 | "hist(lengths, bins=20)" 210 | ], 211 | "language": "python", 212 | "outputs": [] 213 | } 214 | ] 215 | } 216 | ] 217 | } -------------------------------------------------------------------------------- /ngs-44-kmer-distributions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-44-kmer-distributions" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "input": [ 12 | "cd /mnt" 13 | ], 14 | "language": "python", 15 | "outputs": [] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## First, grab some code from the digital normalization paper for making simulated genomes & reads." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "input": [ 26 | "!git clone git://github.com/ged-lab/2012-paper-diginorm.git" 27 | ], 28 | "language": "python", 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "input": [ 34 | "mkdir kmer" 35 | ], 36 | "language": "python", 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "input": [ 42 | "cd kmer" 43 | ], 44 | "language": "python", 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "source": [ 50 | "## Now, make a random genome of length 8kb (4 x 2000)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "input": [ 56 | "import random", 57 | "random.seed(1)", 58 | "", 59 | "x = [\"A\"] + [\"G\"] + [\"C\"] + [\"T\"]", 60 | "x = x*2000", 61 | "random.shuffle(x)", 62 | "x = \"\".join(x)", 63 | "x" 64 | ], 65 | "language": "python", 66 | "outputs": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "input": [ 71 | "fp = open('genome.fa', 'w')", 72 | "fp.write('>genome\\n')", 73 | "fp.write(x)", 74 | "fp.close()" 75 | ], 76 | "language": "python", 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "source": [ 82 | "## Make reads with a 1% error rate to a coverage of about 200" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "input": [ 88 | "!python /mnt/2012-paper-diginorm/pipeline/make-reads.py genome.fa > reads.fa" 89 | ], 90 | "language": "python", 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "input": [ 96 | "!head reads.fa" 97 | ], 98 | "language": "python", 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "source": [ 104 | "## Now, use khmer to load all the 20-mers in the reads into a counting data structure.", 105 | "", 106 | "The counting data structure is saved as counts.kh. 
Other parameters are explained in:", 107 | "", 108 | "http://readthedocs.org/docs/khmer/en/latest/scripts.html" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "input": [ 114 | "!python /usr/local/src/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts.kh reads.fa" 115 | ], 116 | "language": "python", 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "source": [ 122 | "## Get the abundance distribution of the k-mers in the reads" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "input": [ 128 | "!python /usr/local/src/khmer/scripts/abundance-dist.py -s counts.kh reads.fa reads.dist" 129 | ], 130 | "language": "python", 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "source": [ 136 | "## The output is in 'reads.dist'", 137 | "", 138 | "Column 0 is the k-mer count; column 1 is the number of k-mers with that count; column 2 is the running total of column 1; and column 3 is the fraction of total in column 2." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "input": [ 144 | "!head reads.dist" 145 | ], 146 | "language": "python", 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "source": [ 152 | "## Now, plot the abundance distribution" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "input": [ 158 | "x = []", 159 | "y = []", 160 | "for line in open('reads.dist'):", 161 | " count, num, total, frac = line.split()", 162 | " x.append(int(count))", 163 | " y.append(int(num))" 164 | ], 165 | "language": "python", 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "source": [ 171 | "## Reads with errors have many k-mers with abundance=1."
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "input": [ 177 | "plot(x, y)" 178 | ], 179 | "language": "python", 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "source": [ 185 | "## There will also be a bunch of high-count k-mers correlated with coverage" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "input": [ 191 | "plot(x, y)", 192 | "axis(ymax=1000)" 193 | ], 194 | "language": "python", 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "source": [ 200 | "## Trim reads at k-mers with an abundance of 1, using filter-abund." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "input": [ 206 | "!python /usr/local/src/khmer/scripts/filter-abund.py -C 2 counts.kh reads.fa" 207 | ], 208 | "language": "python", 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "source": [ 214 | "## Count the k-mers in the newly trimmed reads" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "input": [ 220 | "!python /usr/local/src/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts-filt.kh reads.fa.abundfilt" 221 | ], 222 | "language": "python", 223 | "outputs": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "input": [ 228 | "!python /usr/local/src/khmer/scripts/abundance-dist.py -s counts-filt.kh reads.fa.abundfilt reads.abundfilt.dist" 229 | ], 230 | "language": "python", 231 | "outputs": [] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "source": [ 236 | "## Compare the raw and abundance-filtered reads" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "input": [ 242 | "x2 = []", 243 | "y2 = []", 244 | "for line in open('reads.abundfilt.dist'):", 245 | " count, num, total, frac = line.split()", 246 | " x2.append(int(count))", 247 | " y2.append(int(num))" 248 | ], 249 | "language": "python", 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "input": [ 255 | "plot(x, y)", 256 | "plot(x2, y2)", 257 | "axis(xmax=20)" 258 | ], 
259 | "language": "python", 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "source": [ 265 | "## Show the abundance distributions of raw vs filtered reads, at high abundance" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "input": [ 271 | "plot(x, y)", 272 | "plot(x2, y2)", 273 | "axis(ymax=500)", 274 | "legend(['unfiltered reads', 'filter @ 1 reads'])", 275 | "title('k-mer abundance distribution')", 276 | "xlabel('k-mer count')", 277 | "ylabel('number of k-mers with that count')" 278 | ], 279 | "language": "python", 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "source": [ 285 | "## Now let's look at mapping-based measures of coverage", 286 | "", 287 | "### First, install bowtie" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "input": [ 293 | "!curl -O -L http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-linux-x86_64.zip", 294 | "!unzip bowtie-0.12.7-linux-x86_64.zip", 295 | "!cp bowtie-0.12.7/bowtie* /usr/local/bin" 296 | ], 297 | "language": "python", 298 | "outputs": [] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "source": [ 303 | "### Index the reference genome" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "input": [ 309 | "!bowtie-build genome.fa genome" 310 | ], 311 | "language": "python", 312 | "outputs": [] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "source": [ 317 | "## Map the raw and abundance filtered reads to the reference genome" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "input": [ 323 | "!bowtie -f -a --strata --best genome reads.fa > reads.map", 324 | "!bowtie -f -a --strata --best genome reads.fa.abundfilt > reads.map.abundfilt" 325 | ], 326 | "language": "python", 327 | "outputs": [] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "input": [ 332 | "!head -3 reads.map" 333 | ], 334 | "language": "python", 335 | "outputs": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "source": [ 340 
| "## Build a function to read in the mapping output file" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "input": [ 346 | "# define a function that we can use to read in the mapping output file", 347 | "def read_mapping_file(filename):", 348 | " for line in open(filename):", 349 | " _, orient, refname, pos, read, _, _, mismatches = line.split('\\t')[:8]", 350 | " pos = int(pos)", 351 | " mismatches = mismatches.split(',')", 352 | " x = []", 353 | " for posn in mismatches:", 354 | " posn = posn.strip()", 355 | " if posn:", 356 | " posn = posn.split(':')[0]", 357 | " posn = int(posn)", 358 | " x.append(posn)", 359 | " yield orient, refname, pos, read, x" 360 | ], 361 | "language": "python", 362 | "outputs": [] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "source": [ 367 | "## Count mismatches between reads and genome, by position within read" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "input": [ 373 | "d = {}", 374 | "", 375 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('reads.map'):", 376 | " for posn in mismatches:", 377 | " d[posn] = d.get(posn, 0) + 1", 378 | " ", 379 | "e = {}", 380 | "", 381 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('reads.map.abundfilt'):", 382 | " for posn in mismatches:", 383 | " e[posn] = e.get(posn, 0) + 1" 384 | ], 385 | "language": "python", 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "input": [ 391 | "items = d.items()", 392 | "items.sort()", 393 | "", 394 | "x = []", 395 | "y = []", 396 | "for (a, b) in items:", 397 | " x.append(a)", 398 | " y.append(b)", 399 | "", 400 | " items = d.items()", 401 | "items.sort()", 402 | "", 403 | "xf = []", 404 | "yf = []", 405 | "for (a, b) in items:", 406 | " xf.append(a)", 407 | " yf.append(b)", 408 | "" 409 | ], 410 | "language": "python", 411 | "outputs": [] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "source": [ 416 | "## Graph reference-read mismatches by position within 
read" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "input": [ 422 | "plot(x, y, 'r--', linewidth=5, )", 423 | "plot(xf, yf)", 424 | "#axis(ymax=4000)", 425 | "axis(xmin=0, ymin=0)" 426 | ], 427 | "language": "python", 428 | "outputs": [] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "source": [ 433 | "## Now, calculate coverage by position within genome" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "input": [ 439 | "genome_pos = {}", 440 | "", 441 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('reads.map'):", 442 | " for readi in range(len(read)):", 443 | " genome_pos[pos + readi] = genome_pos.get(pos + readi, 0) + 1", 444 | "", 445 | "genome_pos_filt = {}", 446 | "", 447 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('reads.map.abundfilt'):", 448 | " for readi in range(len(read)):", 449 | " genome_pos_filt[pos + readi] = genome_pos_filt.get(pos + readi, 0) + 1", 450 | " " 451 | ], 452 | "language": "python", 453 | "outputs": [] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "input": [ 458 | "landscape = genome_pos.items()", 459 | "landscape.sort()", 460 | "landscape[:10]", 461 | "", 462 | "x = []", 463 | "y = []", 464 | "for (a, b) in landscape:", 465 | " x.append(a)", 466 | " y.append(b)" 467 | ], 468 | "language": "python", 469 | "outputs": [] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "input": [ 474 | "plot(x, y)", 475 | "xlabel('posn in genome')", 476 | "ylabel('coverage of that position')" 477 | ], 478 | "language": "python", 479 | "outputs": [] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "source": [ 484 | "## Plot *coverage* distribution (using 'hist')" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "input": [ 490 | "hist(genome_pos.values(), bins=100)", 491 | "xlabel('coverage')", 492 | "ylabel('number of bases with that coverage')" 493 | ], 494 | "language": "python", 495 | "outputs": [] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 |
"source": [ 500 | "## Coverage distribution of k-mers in filtered reads" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "input": [ 506 | "hist(genome_pos_filt.values(), bins=100)", 507 | "xlabel('coverage')" 508 | ], 509 | "language": "python", 510 | "outputs": [] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "input": [ 515 | "" 516 | ], 517 | "language": "python", 518 | "outputs": [] 519 | } 520 | ] 521 | } 522 | ] 523 | } -------------------------------------------------------------------------------- /ngs-5x-digital-normalization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-5x-digital-normalization" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "You'll need to install khmer and screed; see ngs-02 and ngs-03 notebooks." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "input": [ 18 | "cd /mnt" 19 | ], 20 | "language": "python", 21 | "outputs": [] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "source": [ 26 | "## First, grab some code from the digital normalization paper for making simulated genomes & reads." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "input": [ 32 | "!git clone git://github.com/ged-lab/2012-paper-diginorm.git" 33 | ], 34 | "language": "python", 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "input": [ 40 | "mkdir kmer" 41 | ], 42 | "language": "python", 43 | "outputs": [] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "input": [ 48 | "cd kmer" 49 | ], 50 | "language": "python", 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "source": [ 56 | "## Now, make a random genome of length 8kb (4 x 2000)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "input": [ 62 | "import random", 63 | "random.seed(1)", 64 | "", 65 | "x = [\"A\"] + [\"G\"] + [\"C\"] + [\"T\"]", 66 | "x = x*2000", 67 | "random.shuffle(x)", 68 | "x = \"\".join(x)", 69 | "x" 70 | ], 71 | "language": "python", 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "input": [ 77 | "fp = open('genome.fa', 'w')", 78 | "fp.write('>genome\\n')", 79 | "fp.write(x)", 80 | "fp.close()" 81 | ], 82 | "language": "python", 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## Make reads with a 1% error rate to a coverage of about 200" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "input": [ 94 | "!python /mnt/2012-paper-diginorm/pipeline/make-reads.py genome.fa > reads.fa" 95 | ], 96 | "language": "python", 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "input": [ 102 | "!head reads.fa" 103 | ], 104 | "language": "python", 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "source": [ 110 | "## Now, use khmer to load all the 20-mers in the reads into a counting data structure.", 111 | "", 112 | "The counting data structure is saved as counts.kh. 
Other parameters are explained in:", 113 | "", 114 | "http://readthedocs.org/docs/khmer/en/latest/scripts.html" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "input": [ 120 | "!python /usr/local/src/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts.kh reads.fa" 121 | ], 122 | "language": "python", 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "source": [ 128 | "## Get the abundance distribution of the k-mers in the reads" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "input": [ 134 | "!python /usr/local/src/khmer/scripts/abundance-dist.py -s counts.kh reads.fa reads.dist" 135 | ], 136 | "language": "python", 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "source": [ 142 | "## The output is in 'reads.dist'", 143 | "", 144 | "Column 0 is the k-mer count; column 1 is the number of k-mers with that count; column 2 is the running total of column 1; and column 3 is the fraction of total in column 2." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "input": [ 150 | "!head reads.dist" 151 | ], 152 | "language": "python", 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "source": [ 158 | "## Now, plot the abundance distribution" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "input": [ 164 | "x = []", 165 | "y = []", 166 | "for line in open('reads.dist'):", 167 | " count, num, total, frac = line.split()", 168 | " x.append(int(count))", 169 | " y.append(int(num))" 170 | ], 171 | "language": "python", 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "source": [ 177 | "## Reads with errors have many k-mers with abundance=1."
178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "input": [ 183 | "plot(x, y)" 184 | ], 185 | "language": "python", 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "source": [ 191 | "## There will also be a bunch of high-count k-mers correlated with coverage" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "input": [ 197 | "plot(x, y)", 198 | "axis(ymax=1000)" 199 | ], 200 | "language": "python", 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "source": [ 206 | "## Apply digital normalization to downsample to a coverage of 20 @ k=20" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "input": [ 212 | "!python /usr/local/src/khmer/scripts/normalize-by-median.py -x 2e8 -N 4 -k 20 -C 20 reads.fa" 213 | ], 214 | "language": "python", 215 | "outputs": [] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "source": [ 220 | "## Count the k-mers in the digitally normalized reads" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "input": [ 226 | "!python /usr/local/src/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts-dn.kh reads.fa.keep" 227 | ], 228 | "language": "python", 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "input": [ 234 | "!python /usr/local/src/khmer/scripts/abundance-dist.py -s counts-dn.kh reads.fa.keep reads.dn.dist" 235 | ], 236 | "language": "python", 237 | "outputs": [] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "input": [ 242 | "!head reads.dn.dist" 243 | ], 244 | "language": "python", 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "source": [ 250 | "## Compare the raw and diginormed reads" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "input": [ 256 | "x2 = []", 257 | "y2 = []", 258 | "for line in open('reads.dn.dist'):", 259 | " count, num, total, frac = line.split()", 260 | " x2.append(int(count))", 261 | " y2.append(int(num))" 262 | ], 263 | "language": "python", 264 | "outputs": 
[] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "input": [ 269 | "plot(x2, y2)", 270 | "axis(ymax=2000)" 271 | ], 272 | "language": "python", 273 | "outputs": [] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "source": [ 278 | "## Now let's look at mapping-based measures of coverage", 279 | "", 280 | "### First, install bowtie" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "input": [ 286 | "!curl -O -L http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-linux-x86_64.zip", 287 | "!unzip bowtie-0.12.7-linux-x86_64.zip", 288 | "!cp bowtie-0.12.7/bowtie* /usr/local/bin" 289 | ], 290 | "language": "python", 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "source": [ 296 | "### Index the reference genome" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "input": [ 302 | "!bowtie-build genome.fa genome" 303 | ], 304 | "language": "python", 305 | "outputs": [] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "source": [ 310 | "## Map the raw and digitally normalized reads to the reference genome" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "input": [ 316 | "!bowtie -f -a --strata --best genome reads.fa > reads.map", 317 | "!bowtie -f -a --strata --best genome reads.fa.keep > reads.map.keep" 318 | ], 319 | "language": "python", 320 | "outputs": [] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "input": [ 325 | "!head -3 reads.map" 326 | ], 327 | "language": "python", 328 | "outputs": [] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "source": [ 333 | "## Build a function to read in the mapping output file" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "input": [ 339 | "# define a function that we can use to read in the mapping output file", 340 | "def read_mapping_file(filename):", 341 | " for line in open(filename):", 342 | " _, orient, refname, pos, read, _, _, mismatches = line.split('\\t')[:8]", 343 | " pos = int(pos)", 344 | " mismatches =
mismatches.split(',')", 345 | " x = []", 346 | " for posn in mismatches:", 347 | " posn = posn.strip()", 348 | " if posn:", 349 | " posn = posn.split(':')[0]", 350 | " posn = int(posn)", 351 | " x.append(posn)", 352 | " yield orient, refname, pos, read, x" 353 | ], 354 | "language": "python", 355 | "outputs": [] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "source": [ 360 | "## Plot *coverage* distribution (using 'hist')" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "input": [ 366 | "genome_pos = {}", 367 | "", 368 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('reads.map'):", 369 | " for readi in range(len(read)):", 370 | " genome_pos[pos + readi] = genome_pos.get(pos + readi, 0) + 1", 371 | "", 372 | "genome_pos_keep = {}", 373 | "", 374 | "for (orient, refname, pos, read, mismatches) in read_mapping_file('reads.map.keep'):", 375 | " for readi in range(len(read)):", 376 | " genome_pos_keep[pos + readi] = genome_pos_keep.get(pos + readi, 0) + 1", 377 | " " 378 | ], 379 | "language": "python", 380 | "outputs": [] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "source": [ 385 | "## Coverage distribution of original reads" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "input": [ 391 | "hist(genome_pos.values(), bins=100)", 392 | "xlabel('coverage')", 393 | "ylabel('number of bases with that coverage')" 394 | ], 395 | "language": "python", 396 | "outputs": [] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "source": [ 401 | "## Coverage distribution of normalized reads" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "input": [ 407 | "hist(genome_pos_keep.values(), bins=100)", 408 | "xlabel('coverage')" 409 | ], 410 | "language": "python", 411 | "outputs": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "input": [ 416 | "" 417 | ], 418 | "language": "python", 419 | "outputs": [] 420 | } 421 | ] 422 | } 423 | ] 424 | } 
-------------------------------------------------------------------------------- /ngs-61-intro-to-dicts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-61-intro-to-dicts" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "Goal: take some data from counting birds, and ask questions of it.", 13 | "", 14 | "In particular, how many birds did we see of each type?" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "input": [ 20 | "s = \"\"\"", 21 | "bluejay", 22 | "goldfinch", 23 | "goldfinch", 24 | "bald eagle", 25 | "ostrich", 26 | "bluejay", 27 | "goose", 28 | "goose", 29 | "goose", 30 | "goose", 31 | "goose", 32 | "goose", 33 | "goose", 34 | "duck", 35 | "\"\"\"" 36 | ], 37 | "language": "python", 38 | "outputs": [], 39 | "prompt_number": 1 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "source": [ 44 | "First, let's transform this text list into a \"real\" python list." 
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "input": [ 50 | "birdlist = s.split('\\n')", 51 | "birdlist = [ x for x in birdlist if x ]", 52 | "birdlist" 53 | ], 54 | "language": "python", 55 | "outputs": [ 56 | { 57 | "output_type": "pyout", 58 | "prompt_number": 6, 59 | "text": [ 60 | "['bluejay',", 61 | " 'goldfinch',", 62 | " 'goldfinch',", 63 | " 'bald eagle',", 64 | " 'ostrich',", 65 | " 'bluejay',", 66 | " 'goose',", 67 | " 'goose',", 68 | " 'goose',", 69 | " 'goose',", 70 | " 'goose',", 71 | " 'goose',", 72 | " 'goose',", 73 | " 'duck']" 74 | ] 75 | } 76 | ], 77 | "prompt_number": 6 78 | }, 79 | { 80 | "cell_type": "code", 81 | "input": [ 82 | "countlist = []", 83 | "for bird in birdlist:", 84 | " newcountlist = []", 85 | " found = False", 86 | " for (j, count) in countlist:", 87 | " if j == bird:", 88 | " found = True", 89 | " count += 1", 90 | " newcountlist.append((j, count))", 91 | " if not found:", 92 | " newcountlist.append((bird, 1))", 93 | " countlist = newcountlist", 94 | " ", 95 | "countlist" 96 | ], 97 | "language": "python", 98 | "outputs": [ 99 | { 100 | "output_type": "pyout", 101 | "prompt_number": 9, 102 | "text": [ 103 | "[('bluejay', 2),", 104 | " ('goldfinch', 2),", 105 | " ('bald eagle', 1),", 106 | " ('ostrich', 1),", 107 | " ('goose', 7),", 108 | " ('duck', 1)]" 109 | ] 110 | } 111 | ], 112 | "prompt_number": 9 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "source": [ 117 | "Let's use a dictionary instead" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "input": [ 123 | "birddict = {}", 124 | "# set the value associated with the key 'goose' to the number 7", 125 | "birddict['goose'] = 7", 126 | "# get the value associated with the key 'goose'", 127 | "print birddict['goose']" 128 | ], 129 | "language": "python", 130 | "outputs": [ 131 | { 132 | "output_type": "stream", 133 | "stream": "stdout", 134 | "text": [ 135 | "7", 136 | "" 137 | ] 138 | } 139 | ], 140 | "prompt_number": 10 141 | }, 142 | { 143 | 
"cell_type": "code", 144 | "input": [ 145 | "birddict = {}", 146 | "for bird in birdlist:", 147 | " birddict[bird] = 0", 148 | " ", 149 | "for bird in birdlist:", 150 | " birddict[bird] = birddict[bird] + 1", 151 | "", 152 | "birddict" 153 | ], 154 | "language": "python", 155 | "outputs": [ 156 | { 157 | "output_type": "pyout", 158 | "prompt_number": 14, 159 | "text": [ 160 | "{'bald eagle': 1,", 161 | " 'bluejay': 2,", 162 | " 'duck': 1,", 163 | " 'goldfinch': 2,", 164 | " 'goose': 7,", 165 | " 'ostrich': 1}" 166 | ] 167 | } 168 | ], 169 | "prompt_number": 14 170 | }, 171 | { 172 | "cell_type": "code", 173 | "input": [ 174 | "birddict = {}", 175 | "", 176 | "for bird in birdlist:", 177 | " if bird not in birddict:", 178 | " birddict[bird] = 1", 179 | " else:", 180 | " birddict[bird] = birddict[bird] + 1", 181 | "", 182 | "birddict" 183 | ], 184 | "language": "python", 185 | "outputs": [ 186 | { 187 | "output_type": "pyout", 188 | "prompt_number": 17, 189 | "text": [ 190 | "{'bald eagle': 1,", 191 | " 'bluejay': 2,", 192 | " 'duck': 1,", 193 | " 'goldfinch': 2,", 194 | " 'goose': 7,", 195 | " 'ostrich': 1}" 196 | ] 197 | } 198 | ], 199 | "prompt_number": 17 200 | }, 201 | { 202 | "cell_type": "code", 203 | "input": [ 204 | "birddict = {}", 205 | "", 206 | "for bird in birdlist:", 207 | " birddict[bird] = birddict.get(bird, 0) + 1", 208 | "", 209 | "birddict" 210 | ], 211 | "language": "python", 212 | "outputs": [ 213 | { 214 | "output_type": "pyout", 215 | "prompt_number": 18, 216 | "text": [ 217 | "{'bald eagle': 1,", 218 | " 'bluejay': 2,", 219 | " 'duck': 1,", 220 | " 'goldfinch': 2,", 221 | " 'goose': 7,", 222 | " 'ostrich': 1}" 223 | ] 224 | } 225 | ], 226 | "prompt_number": 18 227 | }, 228 | { 229 | "cell_type": "code", 230 | "input": [ 231 | "birddict.keys()" 232 | ], 233 | "language": "python", 234 | "outputs": [ 235 | { 236 | "output_type": "pyout", 237 | "prompt_number": 19, 238 | "text": [ 239 | "['bald eagle', 'bluejay', 'ostrich', 'goose', 'goldfinch', 
'duck']" 240 | ] 241 | } 242 | ], 243 | "prompt_number": 19 244 | }, 245 | { 246 | "cell_type": "code", 247 | "input": [ 248 | "birddict.values()" 249 | ], 250 | "language": "python", 251 | "outputs": [ 252 | { 253 | "output_type": "pyout", 254 | "prompt_number": 20, 255 | "text": [ 256 | "[1, 2, 1, 7, 2, 1]" 257 | ] 258 | } 259 | ], 260 | "prompt_number": 20 261 | }, 262 | { 263 | "cell_type": "code", 264 | "input": [ 265 | "birddict.items()" 266 | ], 267 | "language": "python", 268 | "outputs": [ 269 | { 270 | "output_type": "pyout", 271 | "prompt_number": 21, 272 | "text": [ 273 | "[('bald eagle', 1),", 274 | " ('bluejay', 2),", 275 | " ('ostrich', 1),", 276 | " ('goose', 7),", 277 | " ('goldfinch', 2),", 278 | " ('duck', 1)]" 279 | ] 280 | } 281 | ], 282 | "prompt_number": 21 283 | }, 284 | { 285 | "cell_type": "code", 286 | "input": [ 287 | "x = sorted(birddict.items())", 288 | "x" 289 | ], 290 | "language": "python", 291 | "outputs": [ 292 | { 293 | "output_type": "pyout", 294 | "prompt_number": 23, 295 | "text": [ 296 | "[('bald eagle', 1),", 297 | " ('bluejay', 2),", 298 | " ('duck', 1),", 299 | " ('goldfinch', 2),", 300 | " ('goose', 7),", 301 | " ('ostrich', 1)]" 302 | ] 303 | } 304 | ], 305 | "prompt_number": 23 306 | }, 307 | { 308 | "cell_type": "code", 309 | "input": [ 310 | "def get_count(a):", 311 | " return a[1]", 312 | "", 313 | "# Sort by count, not by bird name; use get_count to get the count.", 314 | "x = sorted(birddict.items(), key=get_count)", 315 | "# by default sorted goes from least to most; reverse that.", 316 | "x.reverse()", 317 | "x" 318 | ], 319 | "language": "python", 320 | "outputs": [ 321 | { 322 | "output_type": "pyout", 323 | "prompt_number": 33, 324 | "text": [ 325 | "[('goose', 7),", 326 | " ('goldfinch', 2),", 327 | " ('bluejay', 2),", 328 | " ('duck', 1),", 329 | " ('ostrich', 1),", 330 | " ('bald eagle', 1)]" 331 | ] 332 | } 333 | ], 334 | "prompt_number": 33 335 | }, 336 | { 337 | "cell_type": "code", 338 | "input": [ 339 
| "" 340 | ], 341 | "language": "python", 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "input": [ 347 | "" 348 | ], 349 | "language": "python", 350 | "outputs": [] 351 | } 352 | ] 353 | } 354 | ] 355 | } -------------------------------------------------------------------------------- /ngs-62-screed-database-as-dict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-62-screed-database-as-dict" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "input": [ 12 | "cd /usr/local/src/screed" 13 | ], 14 | "language": "python", 15 | "outputs": [ 16 | { 17 | "output_type": "stream", 18 | "stream": "stdout", 19 | "text": [ 20 | "/usr/local/src/screed", 21 | "" 22 | ] 23 | } 24 | ], 25 | "prompt_number": 1 26 | }, 27 | { 28 | "cell_type": "code", 29 | "input": [ 30 | "cp screed/tests/test.fa.gz /mnt", 31 | "" 32 | ], 33 | "language": "python", 34 | "outputs": [], 35 | "prompt_number": 5 36 | }, 37 | { 38 | "cell_type": "code", 39 | "input": [ 40 | "cd /mnt" 41 | ], 42 | "language": "python", 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "stream": "stdout", 47 | "text": [ 48 | "/mnt" 49 | ] 50 | }, 51 | { 52 | "output_type": "stream", 53 | "stream": "stdout", 54 | "text": [ 55 | "", 56 | "" 57 | ] 58 | } 59 | ], 60 | "prompt_number": 6 61 | }, 62 | { 63 | "cell_type": "code", 64 | "input": [ 65 | "ls", 66 | "" 67 | ], 68 | "language": "python", 69 | "outputs": [ 70 | { 71 | "output_type": "stream", 72 | "stream": "stdout", 73 | "text": [ 74 | "\u001b[0m\u001b[01;34mlost+found\u001b[0m/ \u001b[01;34msamtools-0.1.18\u001b[0m/ samtools-0.1.18.tar.bz2 test.fa.gz", 75 | "" 76 | ] 77 | } 78 | ], 79 | "prompt_number": 7 80 | }, 81 | { 82 | "cell_type": "code", 83 | "input": [ 84 | "!gunzip test.fa.gz" 85 | ], 86 | "language": "python", 87 | "outputs": [], 88 | "prompt_number": 12 89 | }, 90 | { 91 | "cell_type": 
"code", 92 | "input": [ 93 | "import screed" 94 | ], 95 | "language": "python", 96 | "outputs": [], 97 | "prompt_number": 10 98 | }, 99 | { 100 | "cell_type": "code", 101 | "input": [ 102 | "db = screed.read_fasta_sequences('test.fa')" 103 | ], 104 | "language": "python", 105 | "outputs": [], 106 | "prompt_number": 14 107 | }, 108 | { 109 | "cell_type": "code", 110 | "input": [ 111 | "seq = db['ENSMICT00000012722']", 112 | "print seq.name", 113 | "print seq.sequence" 114 | ], 115 | "language": "python", 116 | "outputs": [ 117 | { 118 | "output_type": "stream", 119 | "stream": "stdout", 120 | "text": [ 121 | "ENSMICT00000012722" 122 | ] 123 | }, 124 | { 125 | "output_type": "stream", 126 | "stream": "stdout", 127 | "text": [ 128 | "", 129 | "TGCAGAAAATATCAAGAGTCAGCAGAAAAACTATACAAGGGCTGGTATTTTGATTATTCTATAAAAATTCACTTTTTGCTCAGTGTCTTTCATCTGGGCCTGGCCTCCTCTCTTGCAAGCCCTGGATTCATAACATCTATAATAATTTTTATATGTGGTAGAGTAATATTAGCTGATTCCTTTGCCTCCTGTTCCTTCCCCTCATTCAGGCAGCTGGCCAGGTTTGTGCTCCTTATCTCGCAGAAGAGATGTGATAGCAGGCAGAGAATTAAAGTCTTCCTGGCTTTTGGTTTCAGAAGCTGCCTTGGGAAGGAAGCAAACAAACATGCCACAGATAAAATATTTGAAAGAAAAGATAATGAAAGTAGAAAAGGGTTCCCTGTTCTTGTGGGGAGGAAGTGA", 130 | "" 131 | ] 132 | } 133 | ], 134 | "prompt_number": 17 135 | }, 136 | { 137 | "cell_type": "code", 138 | "input": [ 139 | "!ls -l" 140 | ], 141 | "language": "python", 142 | "outputs": [ 143 | { 144 | "output_type": "stream", 145 | "stream": "stdout", 146 | "text": [ 147 | "total 444", 148 | "drwx------ 2 root root 16384 2011-04-24 08:02 lost+found", 149 | "drwxr-xr-x 6 501 staff 4096 2012-06-14 23:15 samtools-0.1.18", 150 | "-rw-r--r-- 1 root root 379306 2012-06-14 23:14 samtools-0.1.18.tar.bz2", 151 | "-rw-r--r-- 1 root root 12717 2012-06-15 00:18 test.fa", 152 | "-rw-r--r-- 1 root root 3072 2012-06-15 00:21 test.fa.gz_screed", 153 | "-rw-r--r-- 1 root root 1544 2012-06-15 00:21 test.fa.gz_screed-journal", 154 | "-rw-r--r-- 1 root root 22528 2012-06-15 00:22 test.fa_screed", 155 | "" 156 | ] 157 | } 158 | ], 159 | 
"prompt_number": 18 160 | }, 161 | { 162 | "cell_type": "code", 163 | "input": [ 164 | "" 165 | ], 166 | "language": "python", 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "input": [ 172 | "" 173 | ], 174 | "language": "python", 175 | "outputs": [] 176 | } 177 | ] 178 | } 179 | ] 180 | } -------------------------------------------------------------------------------- /ngs-70-hmp-diginorm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ngs-70-hmp-diginorm" 4 | }, 5 | "nbformat": 3, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "markdown", 11 | "source": [ 12 | "## Running digital normalization and artifact removal on an HMP mock data set", 13 | "", 14 | "This notebook runs 'digital normalization' (see http://ged.msu.edu/papers/2012-diginorm/) and Illumina artifact removal on an HMP Illumina mock data set. The dataset and description originated from http://www.ncbi.nlm.nih.gov/bioproject/48475. ", 15 | "", 16 | "Prerequisites for this tutorial", 17 | "", 18 | "1. Start a new NGS 2012 EC2 instance", 19 | "http://ged.msu.edu/angus/tutorials-2012/start-up-an-ec2-instance.html", 20 | "", 21 | "2. Install khmer and screed", 22 | "In your ipython notebook homepage: run ngs-00-update-notebooks and ngs-03-install-khmer", 23 | "", 24 | "3. Make a volume of a snapshot (snap-08efea77) containing the HMP data and mount it on your instance as /hmp-mock-tutorial", 25 | "", 26 | "", 27 | "We are going to start with the HMP gzipped fastq Illumina sequencing reads and:", 28 | "", 29 | "1. Normalize to coverage = 10", 30 | "2. 
Trim high-abundance (likely Illumina artifacts)", 31 | "", 32 | "This produces files that can be used as input into the partitioning algorithm (see the next notebook!)", 33 | "", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "input": [ 40 | "cd /hmp-mock-tutorial/raw-data" 41 | ], 42 | "language": "python", 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "stream": "stdout", 47 | "text": [ 48 | "/hmp-mock-tutorial/raw-data", 49 | "" 50 | ] 51 | } 52 | ], 53 | "prompt_number": 1 54 | }, 55 | { 56 | "cell_type": "code", 57 | "input": [ 58 | "!ls" 59 | ], 60 | "language": "python", 61 | "outputs": [ 62 | { 63 | "output_type": "stream", 64 | "stream": "stdout", 65 | "text": [ 66 | "SRR172902.fastq.gz SRR172903.fastq.gz", 67 | "" 68 | ] 69 | } 70 | ], 71 | "prompt_number": 2 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "source": [ 76 | "We are going to be working with the SRR172903 dataset (staggered mixture). You can play with a combination of the two or the other dataset (even mixture) later if you like." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "source": [ 82 | "### Pass 1: normalize to C=10.", 83 | "", 84 | "The only parameter to change here is the memory, which is fixed at 4gb (multiply the -N and -x parameters).", 85 | "", 86 | "This should take 10-15 minutes." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "input": [ 92 | "cd /hmp-mock-tutorial" 93 | ], 94 | "language": "python", 95 | "outputs": [ 96 | { 97 | "output_type": "stream", 98 | "stream": "stdout", 99 | "text": [ 100 | "/hmp-mock-tutorial", 101 | "" 102 | ] 103 | } 104 | ], 105 | "prompt_number": 3 106 | }, 107 | { 108 | "cell_type": "code", 109 | "input": [ 110 | "mkdir tutorial-files" 111 | ], 112 | "language": "python", 113 | "outputs": [], 114 | "prompt_number": 4 115 | }, 116 | { 117 | "cell_type": "code", 118 | "input": [ 119 | "cd /hmp-mock-tutorial/tutorial-files" 120 | ], 121 | "language": "python", 122 | "outputs": [ 123 | { 124 | "output_type": "stream", 125 | "stream": "stdout", 126 | "text": [ 127 | "/hmp-mock-tutorial/tutorial-files" 128 | ] 129 | }, 130 | { 131 | "output_type": "stream", 132 | "stream": "stdout", 133 | "text": [ 134 | "", 135 | "" 136 | ] 137 | } 138 | ], 139 | "prompt_number": 5 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "source": [ 144 | "Now, let's run the digital normalization for a word length of -k 20; we are going to remove redundant reads which contribute to a coverage greater than -C 10. We'll be using a hashtable size of -x 1e9 and four of them -N 4. We'll save this hashtable as mock-pass1.kh. The actual size of your hashtable is dependent on the characteristics of your dataset - the important thing is that the false positive rate in the hashtable (calculated at the end) is less than 15%. The output will be saved in pass1.report. The input of reads is the last parameter of the command (there can be multiple read files here). The output shows you the number of reads kept over the total of reads processed. 
" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "input": [ 150 | "!python /usr/local/src/khmer/scripts/normalize-by-median.py -k 20 -C 10 -N 4 -x 1e9 -s /hmp-mock-tutorial/tutorial-files/mock-pass1.kh -R /hmp-mock-tutorial/tutorial-files/pass1.report /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz" 151 | ], 152 | "language": "python", 153 | "outputs": [ 154 | { 155 | "output_type": "stream", 156 | "stream": "stdout", 157 | "text": [ 158 | "", 159 | "PARAMETERS:", 160 | " - kmer size = 20 \t\t(-k)", 161 | " - n hashes = 4 \t\t(-N)", 162 | " - min hashsize = 1e+09 \t(-x)", 163 | "", 164 | "Estimated memory usage is 4e+09 bytes (n_hashes x min_hashsize)", 165 | "--------", 166 | "making hashtable", 167 | "" 168 | ] 169 | }, 170 | { 171 | "output_type": "stream", 172 | "stream": "stdout", 173 | "text": [ 174 | "... kept 98471 of 100000 , or 98 %", 175 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 176 | "" 177 | ] 178 | }, 179 | { 180 | "output_type": "stream", 181 | "stream": "stdout", 182 | "text": [ 183 | "... kept 194971 of 200000 , or 97 %", 184 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 185 | "" 186 | ] 187 | }, 188 | { 189 | "output_type": "stream", 190 | "stream": "stdout", 191 | "text": [ 192 | "... kept 289555 of 300000 , or 96 %", 193 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 194 | "" 195 | ] 196 | }, 197 | { 198 | "output_type": "stream", 199 | "stream": "stdout", 200 | "text": [ 201 | "... kept 383133 of 400000 , or 95 %", 202 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 203 | "" 204 | ] 205 | }, 206 | { 207 | "output_type": "stream", 208 | "stream": "stdout", 209 | "text": [ 210 | "... kept 476015 of 500000 , or 95 %", 211 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 212 | "" 213 | ] 214 | }, 215 | { 216 | "output_type": "stream", 217 | "stream": "stdout", 218 | "text": [ 219 | "... kept 568701 of 600000 , or 94 %", 220 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 221 | "" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | "... kept 661095 of 700000 , or 94 %", 229 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 230 | "" 231 | ] 232 | }, 233 | { 234 | "output_type": "stream", 235 | "stream": "stdout", 236 | "text": [ 237 | "... kept 753224 of 800000 , or 94 %", 238 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 239 | "" 240 | ] 241 | }, 242 | { 243 | "output_type": "stream", 244 | "stream": "stdout", 245 | "text": [ 246 | "... kept 845404 of 900000 , or 93 %", 247 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 248 | "" 249 | ] 250 | }, 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | "... kept 937527 of 1000000 , or 93 %", 256 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 257 | "" 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | "... kept 1030022 of 1100000 , or 93 %", 265 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 266 | "" 267 | ] 268 | }, 269 | { 270 | "output_type": "stream", 271 | "stream": "stdout", 272 | "text": [ 273 | "... kept 1121084 of 1200000 , or 93 %", 274 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 275 | "" 276 | ] 277 | }, 278 | { 279 | "output_type": "stream", 280 | "stream": "stdout", 281 | "text": [ 282 | "... kept 1211308 of 1300000 , or 93 %", 283 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 284 | "" 285 | ] 286 | }, 287 | { 288 | "output_type": "stream", 289 | "stream": "stdout", 290 | "text": [ 291 | "... kept 1301535 of 1400000 , or 92 %", 292 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 293 | "" 294 | ] 295 | }, 296 | { 297 | "output_type": "stream", 298 | "stream": "stdout", 299 | "text": [ 300 | "... kept 1391544 of 1500000 , or 92 %", 301 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 302 | "" 303 | ] 304 | }, 305 | { 306 | "output_type": "stream", 307 | "stream": "stdout", 308 | "text": [ 309 | "... kept 1480364 of 1600000 , or 92 %", 310 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 311 | "" 312 | ] 313 | }, 314 | { 315 | "output_type": "stream", 316 | "stream": "stdout", 317 | "text": [ 318 | "... kept 1568191 of 1700000 , or 92 %", 319 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 320 | "" 321 | ] 322 | }, 323 | { 324 | "output_type": "stream", 325 | "stream": "stdout", 326 | "text": [ 327 | "... kept 1655168 of 1800000 , or 91 %", 328 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 329 | "" 330 | ] 331 | }, 332 | { 333 | "output_type": "stream", 334 | "stream": "stdout", 335 | "text": [ 336 | "... kept 1743478 of 1900000 , or 91 %", 337 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 338 | "" 339 | ] 340 | }, 341 | { 342 | "output_type": "stream", 343 | "stream": "stdout", 344 | "text": [ 345 | "... kept 1828559 of 2000000 , or 91 %", 346 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 347 | "" 348 | ] 349 | }, 350 | { 351 | "output_type": "stream", 352 | "stream": "stdout", 353 | "text": [ 354 | "... kept 1912000 of 2100000 , or 91 %", 355 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 356 | "" 357 | ] 358 | }, 359 | { 360 | "output_type": "stream", 361 | "stream": "stdout", 362 | "text": [ 363 | "... kept 1997606 of 2200000 , or 90 %", 364 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 365 | "" 366 | ] 367 | }, 368 | { 369 | "output_type": "stream", 370 | "stream": "stdout", 371 | "text": [ 372 | "... kept 2080707 of 2300000 , or 90 %", 373 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 374 | "" 375 | ] 376 | }, 377 | { 378 | "output_type": "stream", 379 | "stream": "stdout", 380 | "text": [ 381 | "... kept 2164877 of 2400000 , or 90 %", 382 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 383 | "" 384 | ] 385 | }, 386 | { 387 | "output_type": "stream", 388 | "stream": "stdout", 389 | "text": [ 390 | "... kept 2247921 of 2500000 , or 89 %", 391 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 392 | "" 393 | ] 394 | }, 395 | { 396 | "output_type": "stream", 397 | "stream": "stdout", 398 | "text": [ 399 | "... kept 2324387 of 2600000 , or 89 %", 400 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 401 | "" 402 | ] 403 | }, 404 | { 405 | "output_type": "stream", 406 | "stream": "stdout", 407 | "text": [ 408 | "... kept 2398885 of 2700000 , or 88 %", 409 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 410 | "" 411 | ] 412 | }, 413 | { 414 | "output_type": "stream", 415 | "stream": "stdout", 416 | "text": [ 417 | "... kept 2471525 of 2800000 , or 88 %", 418 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 419 | "" 420 | ] 421 | }, 422 | { 423 | "output_type": "stream", 424 | "stream": "stdout", 425 | "text": [ 426 | "... kept 2543473 of 2900000 , or 87 %", 427 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 428 | "" 429 | ] 430 | }, 431 | { 432 | "output_type": "stream", 433 | "stream": "stdout", 434 | "text": [ 435 | "... kept 2621185 of 3000000 , or 87 %", 436 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 437 | "" 438 | ] 439 | }, 440 | { 441 | "output_type": "stream", 442 | "stream": "stdout", 443 | "text": [ 444 | "... kept 2691512 of 3100000 , or 86 %", 445 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 446 | "" 447 | ] 448 | }, 449 | { 450 | "output_type": "stream", 451 | "stream": "stdout", 452 | "text": [ 453 | "... kept 2771750 of 3200000 , or 86 %", 454 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 455 | "" 456 | ] 457 | }, 458 | { 459 | "output_type": "stream", 460 | "stream": "stdout", 461 | "text": [ 462 | "... kept 2841597 of 3300000 , or 86 %", 463 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 464 | "" 465 | ] 466 | }, 467 | { 468 | "output_type": "stream", 469 | "stream": "stdout", 470 | "text": [ 471 | "... kept 2912094 of 3400000 , or 85 %", 472 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 473 | "" 474 | ] 475 | }, 476 | { 477 | "output_type": "stream", 478 | "stream": "stdout", 479 | "text": [ 480 | "... kept 2976001 of 3500000 , or 85 %", 481 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 482 | "" 483 | ] 484 | }, 485 | { 486 | "output_type": "stream", 487 | "stream": "stdout", 488 | "text": [ 489 | "... kept 3038330 of 3600000 , or 84 %", 490 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 491 | "" 492 | ] 493 | }, 494 | { 495 | "output_type": "stream", 496 | "stream": "stdout", 497 | "text": [ 498 | "... kept 3099250 of 3700000 , or 83 %", 499 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 500 | "" 501 | ] 502 | }, 503 | { 504 | "output_type": "stream", 505 | "stream": "stdout", 506 | "text": [ 507 | "... kept 3156056 of 3800000 , or 83 %", 508 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 509 | "" 510 | ] 511 | }, 512 | { 513 | "output_type": "stream", 514 | "stream": "stdout", 515 | "text": [ 516 | "... kept 3212197 of 3900000 , or 82 %", 517 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 518 | "" 519 | ] 520 | }, 521 | { 522 | "output_type": "stream", 523 | "stream": "stdout", 524 | "text": [ 525 | "... kept 3267587 of 4000000 , or 81 %", 526 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 527 | "" 528 | ] 529 | }, 530 | { 531 | "output_type": "stream", 532 | "stream": "stdout", 533 | "text": [ 534 | "... kept 3320406 of 4100000 , or 80 %", 535 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 536 | "" 537 | ] 538 | }, 539 | { 540 | "output_type": "stream", 541 | "stream": "stdout", 542 | "text": [ 543 | "... kept 3371773 of 4200000 , or 80 %", 544 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 545 | "" 546 | ] 547 | }, 548 | { 549 | "output_type": "stream", 550 | "stream": "stdout", 551 | "text": [ 552 | "... kept 3422626 of 4300000 , or 79 %", 553 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 554 | "" 555 | ] 556 | }, 557 | { 558 | "output_type": "stream", 559 | "stream": "stdout", 560 | "text": [ 561 | "... kept 3471702 of 4400000 , or 78 %", 562 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 563 | "" 564 | ] 565 | }, 566 | { 567 | "output_type": "stream", 568 | "stream": "stdout", 569 | "text": [ 570 | "... kept 3520891 of 4500000 , or 78 %", 571 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 572 | "" 573 | ] 574 | }, 575 | { 576 | "output_type": "stream", 577 | "stream": "stdout", 578 | "text": [ 579 | "... kept 3580270 of 4600000 , or 77 %", 580 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 581 | "" 582 | ] 583 | }, 584 | { 585 | "output_type": "stream", 586 | "stream": "stdout", 587 | "text": [ 588 | "... kept 3632578 of 4700000 , or 77 %", 589 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 590 | "" 591 | ] 592 | }, 593 | { 594 | "output_type": "stream", 595 | "stream": "stdout", 596 | "text": [ 597 | "... kept 3681283 of 4800000 , or 76 %", 598 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 599 | "" 600 | ] 601 | }, 602 | { 603 | "output_type": "stream", 604 | "stream": "stdout", 605 | "text": [ 606 | "... kept 3737769 of 4900000 , or 76 %", 607 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 608 | "" 609 | ] 610 | }, 611 | { 612 | "output_type": "stream", 613 | "stream": "stdout", 614 | "text": [ 615 | "... kept 3796621 of 5000000 , or 75 %", 616 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 617 | "" 618 | ] 619 | }, 620 | { 621 | "output_type": "stream", 622 | "stream": "stdout", 623 | "text": [ 624 | "... kept 3841452 of 5100000 , or 75 %", 625 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 626 | "" 627 | ] 628 | }, 629 | { 630 | "output_type": "stream", 631 | "stream": "stdout", 632 | "text": [ 633 | "... kept 3883343 of 5200000 , or 74 %", 634 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 635 | "" 636 | ] 637 | }, 638 | { 639 | "output_type": "stream", 640 | "stream": "stdout", 641 | "text": [ 642 | "... kept 3926327 of 5300000 , or 74 %", 643 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 644 | "" 645 | ] 646 | }, 647 | { 648 | "output_type": "stream", 649 | "stream": "stdout", 650 | "text": [ 651 | "... kept 3967557 of 5400000 , or 73 %", 652 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 653 | "" 654 | ] 655 | }, 656 | { 657 | "output_type": "stream", 658 | "stream": "stdout", 659 | "text": [ 660 | "... kept 4007282 of 5500000 , or 72 %", 661 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 662 | "" 663 | ] 664 | }, 665 | { 666 | "output_type": "stream", 667 | "stream": "stdout", 668 | "text": [ 669 | "... kept 4046391 of 5600000 , or 72 %", 670 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 671 | "" 672 | ] 673 | }, 674 | { 675 | "output_type": "stream", 676 | "stream": "stdout", 677 | "text": [ 678 | "... kept 4084792 of 5700000 , or 71 %", 679 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 680 | "" 681 | ] 682 | }, 683 | { 684 | "output_type": "stream", 685 | "stream": "stdout", 686 | "text": [ 687 | "... kept 4122792 of 5800000 , or 71 %", 688 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 689 | "" 690 | ] 691 | }, 692 | { 693 | "output_type": "stream", 694 | "stream": "stdout", 695 | "text": [ 696 | "... kept 4159952 of 5900000 , or 70 %", 697 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 698 | "" 699 | ] 700 | }, 701 | { 702 | "output_type": "stream", 703 | "stream": "stdout", 704 | "text": [ 705 | "... kept 4196225 of 6000000 , or 69 %", 706 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 707 | "" 708 | ] 709 | }, 710 | { 711 | "output_type": "stream", 712 | "stream": "stdout", 713 | "text": [ 714 | "... kept 4231920 of 6100000 , or 69 %", 715 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 716 | "" 717 | ] 718 | }, 719 | { 720 | "output_type": "stream", 721 | "stream": "stdout", 722 | "text": [ 723 | "... kept 4267195 of 6200000 , or 68 %", 724 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 725 | "" 726 | ] 727 | }, 728 | { 729 | "output_type": "stream", 730 | "stream": "stdout", 731 | "text": [ 732 | "... kept 4301540 of 6300000 , or 68 %", 733 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 734 | "" 735 | ] 736 | }, 737 | { 738 | "output_type": "stream", 739 | "stream": "stdout", 740 | "text": [ 741 | "... kept 4335437 of 6400000 , or 67 %", 742 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 743 | "" 744 | ] 745 | }, 746 | { 747 | "output_type": "stream", 748 | "stream": "stdout", 749 | "text": [ 750 | "... kept 4369518 of 6500000 , or 67 %", 751 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 752 | "" 753 | ] 754 | }, 755 | { 756 | "output_type": "stream", 757 | "stream": "stdout", 758 | "text": [ 759 | "... kept 4402650 of 6600000 , or 66 %", 760 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 761 | "" 762 | ] 763 | }, 764 | { 765 | "output_type": "stream", 766 | "stream": "stdout", 767 | "text": [ 768 | "... kept 4434925 of 6700000 , or 66 %", 769 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 770 | "" 771 | ] 772 | }, 773 | { 774 | "output_type": "stream", 775 | "stream": "stdout", 776 | "text": [ 777 | "... kept 4467070 of 6800000 , or 65 %", 778 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 779 | "" 780 | ] 781 | }, 782 | { 783 | "output_type": "stream", 784 | "stream": "stdout", 785 | "text": [ 786 | "... kept 4499248 of 6900000 , or 65 %", 787 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 788 | "" 789 | ] 790 | }, 791 | { 792 | "output_type": "stream", 793 | "stream": "stdout", 794 | "text": [ 795 | "... kept 4531198 of 7000000 , or 64 %", 796 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 797 | "" 798 | ] 799 | }, 800 | { 801 | "output_type": "stream", 802 | "stream": "stdout", 803 | "text": [ 804 | "... kept 4562298 of 7100000 , or 64 %", 805 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 806 | "" 807 | ] 808 | }, 809 | { 810 | "output_type": "stream", 811 | "stream": "stdout", 812 | "text": [ 813 | "... kept 4593518 of 7200000 , or 63 %", 814 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 815 | "" 816 | ] 817 | }, 818 | { 819 | "output_type": "stream", 820 | "stream": "stdout", 821 | "text": [ 822 | "... kept 4624607 of 7300000 , or 63 %", 823 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 824 | "" 825 | ] 826 | }, 827 | { 828 | "output_type": "stream", 829 | "stream": "stdout", 830 | "text": [ 831 | "... kept 4655495 of 7400000 , or 62 %", 832 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 833 | "" 834 | ] 835 | }, 836 | { 837 | "output_type": "stream", 838 | "stream": "stdout", 839 | "text": [ 840 | "... kept 4686324 of 7500000 , or 62 %", 841 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 842 | "" 843 | ] 844 | }, 845 | { 846 | "output_type": "stream", 847 | "stream": "stdout", 848 | "text": [ 849 | "... kept 4717418 of 7600000 , or 62 %", 850 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 851 | "" 852 | ] 853 | }, 854 | { 855 | "output_type": "stream", 856 | "stream": "stdout", 857 | "text": [ 858 | "... kept 4747864 of 7700000 , or 61 %", 859 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 860 | "" 861 | ] 862 | }, 863 | { 864 | "output_type": "stream", 865 | "stream": "stdout", 866 | "text": [ 867 | "... kept 4780874 of 7800000 , or 61 %", 868 | "... 
in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 869 | "" 870 | ] 871 | }, 872 | { 873 | "output_type": "stream", 874 | "stream": "stdout", 875 | "text": [ 876 | "... kept 4813815 of 7900000 , or 60 %", 877 | "... in file /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 878 | "" 879 | ] 880 | }, 881 | { 882 | "output_type": "stream", 883 | "stream": "stdout", 884 | "text": [ 885 | "DONE with /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz ; kept 4823693 of 7932819 or 60 %", 886 | "output in SRR172903.fastq.gz.keep", 887 | "Saving hashfile through /hmp-mock-tutorial/raw-data/SRR172903.fastq.gz", 888 | "...saving to /hmp-mock-tutorial/tutorial-files/mock-pass1.kh", 889 | "" 890 | ] 891 | }, 892 | { 893 | "output_type": "stream", 894 | "stream": "stdout", 895 | "text": [ 896 | "fp rate estimated to be 0.000", 897 | "" 898 | ] 899 | } 900 | ], 901 | "prompt_number": 6 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "source": [ 906 | "Check out the fp rate. We're good to go... What output files were produced?" 
907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "input": [ 912 | "!ls" 913 | ], 914 | "language": "python", 915 | "outputs": [ 916 | { 917 | "output_type": "stream", 918 | "stream": "stdout", 919 | "text": [ 920 | "mock-pass1.kh pass1.report SRR172903.fastq.gz.keep", 921 | "" 922 | ] 923 | } 924 | ], 925 | "prompt_number": 7 926 | }, 927 | { 928 | "cell_type": "code", 929 | "input": [ 930 | "!head pass1.report" 931 | ], 932 | "language": "python", 933 | "outputs": [ 934 | { 935 | "output_type": "stream", 936 | "stream": "stdout", 937 | "text": [ 938 | "100000 98471 0.98471", 939 | "200000 194971 0.974855", 940 | "300000 289555 0.965183333333", 941 | "400000 383133 0.9578325", 942 | "500000 476015 0.95203", 943 | "600000 568701 0.947835", 944 | "700000 661095 0.944421428571", 945 | "800000 753224 0.94153", 946 | "900000 845404 0.939337777778", 947 | "1000000 937527 0.937527", 948 | "" 949 | ] 950 | } 951 | ], 952 | "prompt_number": 8 953 | }, 954 | { 955 | "cell_type": "markdown", 956 | "source": [ 957 | "You can make a graph of the rate at which diginorm eliminates reads. What does this tell you about the diversity (or redundancy) of your dataset?" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "source": [ 963 | "### Pass 2: Trim off high-abundance k-mers", 964 | "", 965 | "These are likely to be Illumina sequencing artifacts. Partitioning (what we do next) is much harder if you don't do this, and the assemblies may be worse. We'll be trimming sequences where we find k-mers which are present in our dataset at a coverage greater than 50. Note that we should not do this without doing digital normalization first. Why?", 966 | "", 967 | "This should take ~10-15 minutes." 
968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "input": [ 973 | "!cd /hmp-mock-tutorial/tutorial-files/", 974 | "" 975 | ], 976 | "language": "python", 977 | "outputs": [], 978 | "prompt_number": 9 979 | }, 980 | { 981 | "cell_type": "code", 982 | "input": [ 983 | "!python /usr/local/src/khmer/sandbox/filter-below-abund.py /hmp-mock-tutorial/tutorial-files/mock-pass1.kh /hmp-mock-tutorial/tutorial-files/*.keep" 984 | ], 985 | "language": "python", 986 | "outputs": [ 987 | { 988 | "output_type": "stream", 989 | "stream": "stdout", 990 | "text": [ 991 | "file with ht: /hmp-mock-tutorial/tutorial-files/mock-pass1.kh", 992 | "-- settings:", 993 | "N THREADS 8", 994 | "--", 995 | "making hashtable", 996 | "" 997 | ] 998 | }, 999 | { 1000 | "output_type": "stream", 1001 | "stream": "stdout", 1002 | "text": [ 1003 | "filtering /hmp-mock-tutorial/tutorial-files/SRR172903.fastq.gz.keep", 1004 | "starting threads", 1005 | "starting writer", 1006 | "loading...", 1007 | "... filtering 0", 1008 | "" 1009 | ] 1010 | }, 1011 | { 1012 | "output_type": "stream", 1013 | "stream": "stdout", 1014 | "text": [ 1015 | "... filtering 100000", 1016 | "" 1017 | ] 1018 | }, 1019 | { 1020 | "output_type": "stream", 1021 | "stream": "stdout", 1022 | "text": [ 1023 | "... filtering 200000", 1024 | "" 1025 | ] 1026 | }, 1027 | { 1028 | "output_type": "stream", 1029 | "stream": "stdout", 1030 | "text": [ 1031 | "... filtering 300000", 1032 | "" 1033 | ] 1034 | }, 1035 | { 1036 | "output_type": "stream", 1037 | "stream": "stdout", 1038 | "text": [ 1039 | "... filtering 400000", 1040 | "" 1041 | ] 1042 | }, 1043 | { 1044 | "output_type": "stream", 1045 | "stream": "stdout", 1046 | "text": [ 1047 | "... 
filtering 500000", 1048 | "" 1049 | ] 1050 | }, 1051 | { 1052 | "output_type": "stream", 1053 | "stream": "stdout", 1054 | "text": [ 1055 | "processed 500000 / wrote 497528 / removed 2472", 1056 | "processed 37530000 bp / wrote 36322889 bp / removed 1207111 bp", 1057 | "discarded 3.2%", 1058 | "" 1059 | ] 1060 | }, 1061 | { 1062 | "output_type": "stream", 1063 | "stream": "stdout", 1064 | "text": [ 1065 | "... filtering 600000", 1066 | "" 1067 | ] 1068 | }, 1069 | { 1070 | "output_type": "stream", 1071 | "stream": "stdout", 1072 | "text": [ 1073 | "... filtering 700000", 1074 | "" 1075 | ] 1076 | }, 1077 | { 1078 | "output_type": "stream", 1079 | "stream": "stdout", 1080 | "text": [ 1081 | "... filtering 800000", 1082 | "" 1083 | ] 1084 | }, 1085 | { 1086 | "output_type": "stream", 1087 | "stream": "stdout", 1088 | "text": [ 1089 | "... filtering 900000", 1090 | "" 1091 | ] 1092 | }, 1093 | { 1094 | "output_type": "stream", 1095 | "stream": "stdout", 1096 | "text": [ 1097 | "... filtering 1000000", 1098 | "" 1099 | ] 1100 | }, 1101 | { 1102 | "output_type": "stream", 1103 | "stream": "stdout", 1104 | "text": [ 1105 | "processed 1000000 / wrote 995045 / removed 4955", 1106 | "processed 75007500 bp / wrote 72670517 bp / removed 2336983 bp", 1107 | "discarded 3.1%", 1108 | "" 1109 | ] 1110 | }, 1111 | { 1112 | "output_type": "stream", 1113 | "stream": "stdout", 1114 | "text": [ 1115 | "... filtering 1100000", 1116 | "" 1117 | ] 1118 | }, 1119 | { 1120 | "output_type": "stream", 1121 | "stream": "stdout", 1122 | "text": [ 1123 | "... filtering 1200000", 1124 | "" 1125 | ] 1126 | }, 1127 | { 1128 | "output_type": "stream", 1129 | "stream": "stdout", 1130 | "text": [ 1131 | "... filtering 1300000", 1132 | "" 1133 | ] 1134 | }, 1135 | { 1136 | "output_type": "stream", 1137 | "stream": "stdout", 1138 | "text": [ 1139 | "... filtering 1400000", 1140 | "" 1141 | ] 1142 | }, 1143 | { 1144 | "output_type": "stream", 1145 | "stream": "stdout", 1146 | "text": [ 1147 | "... 
filtering 1500000", 1148 | "" 1149 | ] 1150 | }, 1151 | { 1152 | "output_type": "stream", 1153 | "stream": "stdout", 1154 | "text": [ 1155 | "processed 1500000 / wrote 1467162 / removed 32838", 1156 | "processed 112552500 bp / wrote 107221701 bp / removed 5330799 bp", 1157 | "discarded 4.7%", 1158 | "" 1159 | ] 1160 | }, 1161 | { 1162 | "output_type": "stream", 1163 | "stream": "stdout", 1164 | "text": [ 1165 | "... filtering 1600000", 1166 | "" 1167 | ] 1168 | }, 1169 | { 1170 | "output_type": "stream", 1171 | "stream": "stdout", 1172 | "text": [ 1173 | "... filtering 1700000", 1174 | "" 1175 | ] 1176 | }, 1177 | { 1178 | "output_type": "stream", 1179 | "stream": "stdout", 1180 | "text": [ 1181 | "... filtering 1800000", 1182 | "" 1183 | ] 1184 | }, 1185 | { 1186 | "output_type": "stream", 1187 | "stream": "stdout", 1188 | "text": [ 1189 | "... filtering 1900000", 1190 | "" 1191 | ] 1192 | }, 1193 | { 1194 | "output_type": "stream", 1195 | "stream": "stdout", 1196 | "text": [ 1197 | "... filtering 2000000", 1198 | "" 1199 | ] 1200 | }, 1201 | { 1202 | "output_type": "stream", 1203 | "stream": "stdout", 1204 | "text": [ 1205 | "processed 2000000 / wrote 1929021 / removed 70979", 1206 | "processed 150045000 bp / wrote 141062463 bp / removed 8982537 bp", 1207 | "discarded 6.0%", 1208 | "" 1209 | ] 1210 | }, 1211 | { 1212 | "output_type": "stream", 1213 | "stream": "stdout", 1214 | "text": [ 1215 | "... filtering 2100000", 1216 | "" 1217 | ] 1218 | }, 1219 | { 1220 | "output_type": "stream", 1221 | "stream": "stdout", 1222 | "text": [ 1223 | "... filtering 2200000", 1224 | "" 1225 | ] 1226 | }, 1227 | { 1228 | "output_type": "stream", 1229 | "stream": "stdout", 1230 | "text": [ 1231 | "... filtering 2300000", 1232 | "" 1233 | ] 1234 | }, 1235 | { 1236 | "output_type": "stream", 1237 | "stream": "stdout", 1238 | "text": [ 1239 | "... 
filtering 2400000", 1240 | "" 1241 | ] 1242 | }, 1243 | { 1244 | "output_type": "stream", 1245 | "stream": "stdout", 1246 | "text": [ 1247 | "... filtering 2500000", 1248 | "" 1249 | ] 1250 | }, 1251 | { 1252 | "output_type": "stream", 1253 | "stream": "stdout", 1254 | "text": [ 1255 | "processed 2500000 / wrote 2403245 / removed 96755", 1256 | "processed 187530000 bp / wrote 175725490 bp / removed 11804510 bp", 1257 | "discarded 6.3%", 1258 | "" 1259 | ] 1260 | }, 1261 | { 1262 | "output_type": "stream", 1263 | "stream": "stdout", 1264 | "text": [ 1265 | "... filtering" 1266 | ] 1267 | }, 1268 | { 1269 | "output_type": "stream", 1270 | "stream": "stdout", 1271 | "text": [ 1272 | " 2600000", 1273 | "" 1274 | ] 1275 | }, 1276 | { 1277 | "output_type": "stream", 1278 | "stream": "stdout", 1279 | "text": [ 1280 | "... filtering 2700000", 1281 | "" 1282 | ] 1283 | }, 1284 | { 1285 | "output_type": "stream", 1286 | "stream": "stdout", 1287 | "text": [ 1288 | "... filtering 2800000", 1289 | "" 1290 | ] 1291 | }, 1292 | { 1293 | "output_type": "stream", 1294 | "stream": "stdout", 1295 | "text": [ 1296 | "... filtering 2900000", 1297 | "" 1298 | ] 1299 | }, 1300 | { 1301 | "output_type": "stream", 1302 | "stream": "stdout", 1303 | "text": [ 1304 | "... filtering 3000000", 1305 | "" 1306 | ] 1307 | }, 1308 | { 1309 | "output_type": "stream", 1310 | "stream": "stdout", 1311 | "text": [ 1312 | "processed 3000000 / wrote 2845946 / removed 154054", 1313 | "processed 225052500 bp / wrote 208177480 bp / removed 16875020 bp", 1314 | "discarded 7.5%", 1315 | "" 1316 | ] 1317 | }, 1318 | { 1319 | "output_type": "stream", 1320 | "stream": "stdout", 1321 | "text": [ 1322 | "... filtering 3100000", 1323 | "" 1324 | ] 1325 | }, 1326 | { 1327 | "output_type": "stream", 1328 | "stream": "stdout", 1329 | "text": [ 1330 | "... filtering 3200000", 1331 | "" 1332 | ] 1333 | }, 1334 | { 1335 | "output_type": "stream", 1336 | "stream": "stdout", 1337 | "text": [ 1338 | "... 
filtering 3300000", 1339 | "" 1340 | ] 1341 | }, 1342 | { 1343 | "output_type": "stream", 1344 | "stream": "stdout", 1345 | "text": [ 1346 | "... filtering 3400000", 1347 | "" 1348 | ] 1349 | }, 1350 | { 1351 | "output_type": "stream", 1352 | "stream": "stdout", 1353 | "text": [ 1354 | "... filtering 3500000", 1355 | "" 1356 | ] 1357 | }, 1358 | { 1359 | "output_type": "stream", 1360 | "stream": "stdout", 1361 | "text": [ 1362 | "processed 3500000 / wrote 3343199 / removed 156801", 1363 | "processed 262545000 bp / wrote 244592062 bp / removed 17952938 bp", 1364 | "discarded 6.8%", 1365 | "" 1366 | ] 1367 | }, 1368 | { 1369 | "output_type": "stream", 1370 | "stream": "stdout", 1371 | "text": [ 1372 | "... filtering 3600000", 1373 | "" 1374 | ] 1375 | }, 1376 | { 1377 | "output_type": "stream", 1378 | "stream": "stdout", 1379 | "text": [ 1380 | "... filtering 3700000", 1381 | "" 1382 | ] 1383 | }, 1384 | { 1385 | "output_type": "stream", 1386 | "stream": "stdout", 1387 | "text": [ 1388 | "... filtering 3800000", 1389 | "" 1390 | ] 1391 | }, 1392 | { 1393 | "output_type": "stream", 1394 | "stream": "stdout", 1395 | "text": [ 1396 | "... filtering 3900000", 1397 | "" 1398 | ] 1399 | }, 1400 | { 1401 | "output_type": "stream", 1402 | "stream": "stdout", 1403 | "text": [ 1404 | "... filtering 4000000", 1405 | "" 1406 | ] 1407 | }, 1408 | { 1409 | "output_type": "stream", 1410 | "stream": "stdout", 1411 | "text": [ 1412 | "processed 4000000 / wrote 3802478 / removed 197522", 1413 | "processed 300090000 bp / wrote 278301488 bp / removed 21788512 bp", 1414 | "discarded 7.3%", 1415 | "" 1416 | ] 1417 | }, 1418 | { 1419 | "output_type": "stream", 1420 | "stream": "stdout", 1421 | "text": [ 1422 | "... filtering 4100000", 1423 | "" 1424 | ] 1425 | }, 1426 | { 1427 | "output_type": "stream", 1428 | "stream": "stdout", 1429 | "text": [ 1430 | "... 
filtering 4200000", 1431 | "" 1432 | ] 1433 | }, 1434 | { 1435 | "output_type": "stream", 1436 | "stream": "stdout", 1437 | "text": [ 1438 | "... filtering 4300000", 1439 | "" 1440 | ] 1441 | }, 1442 | { 1443 | "output_type": "stream", 1444 | "stream": "stdout", 1445 | "text": [ 1446 | "... filtering 4400000", 1447 | "" 1448 | ] 1449 | }, 1450 | { 1451 | "output_type": "stream", 1452 | "stream": "stdout", 1453 | "text": [ 1454 | "... filtering 4500000", 1455 | "" 1456 | ] 1457 | }, 1458 | { 1459 | "output_type": "stream", 1460 | "stream": "stdout", 1461 | "text": [ 1462 | "processed 4500000 / wrote 4298953 / removed 201047", 1463 | "processed 337680000 bp / wrote 314656018 bp / removed 23023982 bp", 1464 | "discarded 6.8%", 1465 | "" 1466 | ] 1467 | }, 1468 | { 1469 | "output_type": "stream", 1470 | "stream": "stdout", 1471 | "text": [ 1472 | "... filtering 4600000", 1473 | "" 1474 | ] 1475 | }, 1476 | { 1477 | "output_type": "stream", 1478 | "stream": "stdout", 1479 | "text": [ 1480 | "... filtering 4700000", 1481 | "" 1482 | ] 1483 | }, 1484 | { 1485 | "output_type": "stream", 1486 | "stream": "stdout", 1487 | "text": [ 1488 | "... filtering 4800000", 1489 | "" 1490 | ] 1491 | }, 1492 | { 1493 | "output_type": "stream", 1494 | "stream": "stdout", 1495 | "text": [ 1496 | "done loading in sequences", 1497 | "" 1498 | ] 1499 | }, 1500 | { 1501 | "output_type": "stream", 1502 | "stream": "stdout", 1503 | "text": [ 1504 | "DONE writing.", 1505 | "processed 4815993 / wrote 4613117 / removed 202876", 1506 | "processed 361424475 bp / wrote 337650426 bp / removed 23774049 bp", 1507 | "discarded 6.6%", 1508 | "" 1509 | ] 1510 | } 1511 | ], 1512 | "prompt_number": 10 1513 | }, 1514 | { 1515 | "cell_type": "markdown", 1516 | "source": [ 1517 | "This needs to be fixed later but for some reason our output filtered file is being saved somewhere else...we'll fix this later but for now let's just go move it to where I want it." 
1518 | ] 1519 | }, 1520 | { 1521 | "cell_type": "code", 1522 | "input": [ 1523 | "cd /hmp-mock-tutorial/tutorial-files/" 1524 | ], 1525 | "language": "python", 1526 | "outputs": [ 1527 | { 1528 | "output_type": "stream", 1529 | "stream": "stdout", 1530 | "text": [ 1531 | "/hmp-mock-tutorial/tutorial-files", 1532 | "" 1533 | ] 1534 | } 1535 | ], 1536 | "prompt_number": 14 1537 | }, 1538 | { 1539 | "cell_type": "code", 1540 | "input": [ 1541 | "ls", 1542 | "" 1543 | ], 1544 | "language": "python", 1545 | "outputs": [ 1546 | { 1547 | "output_type": "stream", 1548 | "stream": "stdout", 1549 | "text": [ 1550 | "mock-pass1.kh SRR172903.fastq.gz.keep", 1551 | "pass1.report SRR172903.fastq.gz.keep.below", 1552 | "" 1553 | ] 1554 | } 1555 | ], 1556 | "prompt_number": 17 1557 | }, 1558 | { 1559 | "cell_type": "markdown", 1560 | "source": [ 1561 | "Okay, now we have a set of reads which are can be assembled -- we've normalized the coverage and removed artifacts. Sometimes, you may have a dataset that is too large to assemble on your computational resources. If this is the case, you can partition your reads by connectivity. That is in the next notebook, hmp-partitioning." 1562 | ] 1563 | }, 1564 | { 1565 | "cell_type": "code", 1566 | "input": [ 1567 | "" 1568 | ], 1569 | "language": "python", 1570 | "outputs": [] 1571 | } 1572 | ] 1573 | } 1574 | ] 1575 | } -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | exec > /tmp/ipynb.out 2> /tmp/ipynb.errout 3 | 4 | cd /usr/local/notebooks 5 | 6 | #git checkout -f master 7 | #git pull origin master 8 | 9 | export PATH=/usr/local/bin:$PATH 10 | /usr/local/bin/ipython notebook --pylab inline 11 | # --certfile=~/.ipython/mycert.pem --pylab inline --no-browser --ip=* --port=443 12 | --------------------------------------------------------------------------------