├── .gitignore ├── 1.01_StringBasics.ipynb ├── 1.02_ManipulatingDNAStrings.ipynb ├── 1.03_ParsingRefGenome.ipynb ├── 1.04_WorkingWithSequencingReads.ipynb ├── 1.05_AnalyzingReadsByPosition.ipynb ├── 1.06_NaiveExactMatching-MatchingArtificialReads.ipynb ├── 1.07_NaiveExactMatching-MatchingRealReads.ipynb ├── 2.01_BoyerMoore.ipynb ├── 2.02_SubstringIndex.ipynb ├── 2.03_ApproximateMatching.ipynb ├── 3.01_EditDistanceDP.ipynb ├── 3.02_GlobalAlignment.ipynb ├── 3.03_FindingOverlaps.ipynb ├── 3.04_FindingAllOverlaps.ipynb ├── 4.01_ShortestCommonSuperstring.ipynb ├── 4.02_GreedySCS.ipynb ├── 4.03_DeBruijn.ipynb ├── ERR266411_1.for_asm.fastq ├── README.md ├── SRR835775_1.first1000.fastq ├── ads1_week4_reads.fq ├── bm_preproc.py ├── bm_preproc.pyc ├── chr1.GRCh38.excerpt.fasta ├── homework-1.ipynb ├── homework-2.ipynb ├── homework-3.ipynb ├── homework-4.ipynb ├── lambda_virus.fa └── phix.fa /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /1.01_StringBasics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# define a new sequence\n", 12 | "seq = 'ACGT'" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# define a new sequence\n", 24 | "seq = \"ACGT\"" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "'C'" 38 | ] 39 | }, 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "# Get a character from a 
string\n", 47 | "seq[1]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "4" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "# get the length of a sequence\n", 70 | "len(seq)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# empty string (epsilon)\n", 82 | "e = ''" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "0" 96 | ] 97 | }, 98 | "execution_count": 6, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "len(e)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 7, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "AACCGGTT\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# concatenation\n", 124 | "seq1 = 'AACC'\n", 125 | "seq2 = 'GGTT'\n", 126 | "print(seq1 + seq2)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "ACGT\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "seqs = ['A', 'C', 'G', 'T']\n", 146 | "print(''.join(seqs))" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 17, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "'T'" 160 | ] 161 | }, 162 | "execution_count": 17, 163 | 
"metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# generate a random nucleotide\n", 169 | "import random\n", 170 | "random.seed(0)\n", 171 | "random.choice('ACGT')" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 20, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "GTGCACGTTC\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "# generate a random sequence\n", 191 | "seq = ''\n", 192 | "for _ in range(10): #no need to specify index so can use _\n", 193 | " seq += random.choice('ACGT')\n", 194 | "print(seq)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 21, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "TCTGAGCTGA\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "# another way to generate a random sequence\n", 214 | "seq = ''.join([random.choice('ACGT') for _ in range(10)])\n", 215 | "print(seq)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 22, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "'CT'" 229 | ] 230 | }, 231 | "execution_count": 22, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "# get a substring\n", 238 | "seq[1:3]" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 23, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "'TCT'" 252 | ] 253 | }, 254 | "execution_count": 23, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "# get a prefix\n", 261 | "seq[:3]" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | 
"execution_count": 24, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "'TGA'" 275 | ] 276 | }, 277 | "execution_count": 24, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "# get a suffix\n", 284 | "seq[7:]" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 25, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "'TGA'" 298 | ] 299 | }, 300 | "execution_count": 25, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "# another way to get a suffix\n", 307 | "seq[-3:]" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 2", 323 | "language": "python", 324 | "name": "python2" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 2 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython2", 336 | "version": "2.7.11" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 0 341 | } 342 | -------------------------------------------------------------------------------- /1.02_ManipulatingDNAStrings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "data": { 12 | "text/plain": [ 13 | "'ACCA'" 14 | ] 15 | }, 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "output_type": "execute_result" 19 | } 20 | ], 21 | "source": [ 22 | "def 
longestCommonPrefix(s1, s2):\n", 23 | " i = 0\n", 24 | " while i < len(s1) and i < len(s2) and s1[i] == s2[i]:\n", 25 | " i += 1\n", 26 | " return s1[:i]\n", 27 | "longestCommonPrefix('ACCATTG', 'ACCAAGTC')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "True" 41 | ] 42 | }, 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "def match(s1, s2):\n", 50 | " if not len(s1) == len(s2):\n", 51 | " return False\n", 52 | " for i in range(0, len(s1)):\n", 53 | " if not s1[i] == s2[i]:\n", 54 | " return False\n", 55 | " return True\n", 56 | "match('ACCATTG', 'ACCATTG')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "True" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "'ACCATTG' == 'ACCATTG'" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "'T'" 103 | ] 104 | }, 105 | "execution_count": 7, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "complement['A']" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 8, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "'G'" 125 | ] 126 | }, 127 | "execution_count": 8, 128 | 
"metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "complement['C']" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "'CAATGGT'" 147 | ] 148 | }, 149 | "execution_count": 9, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "def reverseComplement(s):\n", 156 | " complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}\n", 157 | " t = ''\n", 158 | " for base in s:\n", 159 | " t = complement[base] + t\n", 160 | " return t\n", 161 | "reverseComplement('ACCATTG')" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.11" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | -------------------------------------------------------------------------------- /1.03_ParsingRefGenome.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "'wget' is not recognized as an internal or external command,\n", 15 | "operable program or batch file.\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# Following 
line downloads FASTA file containing the lambda phage reference genome\n", 21 | "!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 6, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC'" 35 | ] 36 | }, 37 | "execution_count": 6, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "def readGenome(filename):\n", 44 | " genome = ''\n", 45 | " with open(filename, 'r') as f:\n", 46 | " for line in f:\n", 47 | " # ignore header line with genome information\n", 48 | " if not line[0] == '>':\n", 49 | " genome += line.rstrip()\n", 50 | " return genome\n", 51 | "genome = readGenome('lambda_virus.fa')\n", 52 | "genome[:100]\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "{'A': 12334, 'C': 11362, 'T': 11986, 'G': 12820}\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "# Count the number of occurrences of each base\n", 72 | "counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}\n", 73 | "for base in genome:\n", 74 | " counts[base] += 1\n", 75 | "print(counts)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "Counter({'G': 12820, 'A': 12334, 'T': 11986, 'C': 11362})" 89 | ] 90 | }, 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "import collections\n", 98 | "collections.Counter(genome)" 99 | ] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 2", 105 | "language": 
"python", 106 | "name": "python2" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 2 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython2", 118 | "version": "2.7.11" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 0 123 | } 124 | -------------------------------------------------------------------------------- /1.04_WorkingWithSequencingReads.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "scrolled": false 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "--2015-07-14 11:22:35-- http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq\n", 16 | "Resolving d28rh4a8wq0iu5.cloudfront.net... 54.192.54.164, 54.192.54.222, 54.192.55.189, ...\n", 17 | "Connecting to d28rh4a8wq0iu5.cloudfront.net|54.192.54.164|:80... connected.\n", 18 | "HTTP request sent, awaiting response... 
200 OK\n", 19 | "Length: 224786 (220K) [application/octet-stream]\n", 20 | "Saving to: 'SRR835775_1.first1000.fastq'\n", 21 | "\n", 22 | "SRR835775_1.first10 100%[=====================>] 219.52K 1.03MB/s in 0.2s \n", 23 | "\n", 24 | "2015-07-14 11:22:36 (1.03 MB/s) - 'SRR835775_1.first1000.fastq' saved [224786/224786]\n", 25 | "\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "def readFastq(filename):\n", 42 | " sequences = []\n", 43 | " qualities = []\n", 44 | " with open(filename) as fh:\n", 45 | " while True:\n", 46 | " fh.readline() # skip name line\n", 47 | " seq = fh.readline().rstrip() # read base sequence\n", 48 | " fh.readline() # skip placeholder line\n", 49 | " qual = fh.readline().rstrip() #base quality line\n", 50 | " if len(seq) == 0:\n", 51 | " break\n", 52 | " sequences.append(seq)\n", 53 | " qualities.append(qual)\n", 54 | " return sequences, qualities\n", 55 | "seqs, quals = readFastq('SRR835775_1.first1000.fastq')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def phred33ToQ(qual):\n", 67 | " return ord(qual) - 33" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "[0, 0, 6178, 0, 0, 54, 108, 574, 345, 83, 193, 124, 79, 165, 49, 236, 184, 327, 514, 238, 531, 254, 313, 798, 992, 888, 1396, 1488, 993, 1752, 3387, 4487, 3248, 5476, 8375, 11814, 4243, 7827, 6579, 8179, 9349, 8180, 0, 0, 0, 0, 0, 0, 0, 0]\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "def createHist(qualityStrings):\n", 87 | " # Create a histogram 
of quality scores\n", 88 | " hist = [0]*50\n", 89 | " for read in qualityStrings:\n", 90 | " for phred in read:\n", 91 | " q = phred33ToQ(phred)\n", 92 | " hist[q] += 1\n", 93 | " return hist\n", 94 | "h = createHist(quals)\n", 95 | "print(h)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": false, 103 | "scrolled": true 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEACAYAAACznAEdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X3YXHV95/H3h4TwIAGMQAghcmNNWmKxPGnA2hJQaIoa\naOUS3JbClnXXTbW223pJbK25dlusfVBwd6W7FRSoUKlUCBgoEQ11VQgiaCRAQjFKAgmISIJAScx3\n//j9hpyM83hm5p6nz+u65rpnzpwz85uTO/dnfo9HEYGZmdke/S6AmZkNBgeCmZkBDgQzM8scCGZm\nBjgQzMwscyCYmRnQJBAkXSFpi6Q1hW1/LekBSd+W9M+SDig8t1TSekkPSjq9sP14SWvyc5cWtu8l\n6XN5+52Sjuj2BzQzs9Y0qyF8GlhUte024DUR8UvAOmApgKT5wDnA/HzMJyUpH3MZcGFEzAXmSqq8\n5oXAU3n7x4GPdvh5zMyspIaBEBFfBZ6u2rYyInbmh3cBh+f7ZwLXRsT2iNgAPAwskDQLmB4Rq/N+\nVwFn5fuLgSvz/euBN3XwWczMrAOd9iH8LrAi3z8M2Fh4biMwu8b2TXk7+eejABGxA3hG0owOy2Rm\nZiWUDgRJfwK8GBHXdLE8ZmbWJ1PLHCTpAuAMdm/i2QTMKTw+nFQz2MSuZqXi9soxrwQekzQVOCAi\nflTj/bzgkplZCRGh5nslbQdC7hB+P3ByRLxQeGo5cI2kj5GaguYCqyMiJG2VtABYDZwHfKJwzPnA\nncDZwO313redDzXKJC2LiGX9Lscg8LnYxediF5+LXdr9Mt0wECRdC5wMHCTpUeDDpFFF04CVeRDR\nNyJiSUSslXQdsBbYASyJXUupLgE+A+wDrIiIW/P2y4GrJa0HngLObafwZjY4JH4NuC0C1+iHVMNA\niIh31th8RYP9LwYurrH9HuDoGtv/HXhH82Ka2SCTmAJ8EZgFPNnn4lhJnqk8fFb1uwADZFW/CzBA\nVvX5/Q8BpgAv63M5oP/nYmhpGC6QIynch2A2uCROAO4GXhvBmmb72+Ro92+nawhm1g2H5Z/79bUU\n1hEHgpl1Q2WyqQNhiDkQzKwbHAgjwIFgZt0wGwgcCEPNgWBm3TAb+D4OhKHmQDCzbjgMeAiY3u+C\nWHkOBDPrhtmkQHANYYg5EMysIxL7AnuTlrJ3IAwxB4KZdWo28BiwDQfCUHMgmFmnZpOWsncgDDkH\ngpl16jBSIDyLA2GoORDMrFOVJqNn8SijoeZAMLNOVZqMXEMYcg4EM+uUA2FEOBDMrFPuVB4RDgQz\n65Q7lUeEL5BjZqVJ7AE8DxwA7ASeA/b0dZUHgy+QY2aT6SBgWwQvRPAi8FNgrz6XyUpyIJhZJyr9\nBxVuNhpiDgQz64QDYYQ4EMysE5UO5QqPNBpiDgQz60RllnKFawhDzIFgZp2o1WTk5SuGlAPBzDrh\nPoQR4kAws044EEaIA8HMOuFO5RHSMBAkXSFpi6Q1hW0zJK2UtE7SbZIOL
Dy3VNJ6SQ9KOr2w/XhJ\na/Jzlxa27yXpc3n7nZKO6PYHNLPekNib1F/wVGGzawhDrFkN4dPAoqptFwErI2IecHt+jKT5wDnA\n/HzMJyVVpkxfBlwYEXOBuZIqr3kh8FTe/nHgox1+HjObPIcBj0ews7DNgTDEGgZCRHwVeLpq82Lg\nynz/SuCsfP9M4NqI2B4RG4CHgQWSZgHTI2J13u+qwjHF17oeeFPJz2Fmk6+6/wA8ymiolelDmBkR\nW/L9LcDMfP8wYGNhv42kX5jq7ZvydvLPRwEiYgfwjKQZJcpkZpOvXiC4hjCkOupUjrRUqlc1NBtP\n1R3K4EAYalNLHLNF0qERsTk3Bz2Rt28C5hT2O5xUM9iU71dvrxzzSuAxSVOBAyLiR7XeVNKywsNV\nEbGqRNnNrHuqZymDRxn1laSFwMKyx5cJhOXA+aQO4POBGwrbr5H0MdIvylxgdUSEpK2SFgCrgfOA\nT1S91p3A2aRO6poiYlmJsppZ78wG7qna5hpCH+UvyqsqjyV9uJ3jGwaCpGuBk4GDJD0K/Bnwl8B1\nki4ENgDvyAVZK+k6YC2wA1gSu66+swT4DLAPsCIibs3bLweulrSeNHTt3HYKb2Z95T6EEeMrpplZ\nKRKPAKdH8HBh23zg+giO6l/JrMJXTDOznpMQqVO5ug/BNYQh5kAwszJmAM9H8FzVdgfCEHMgmFkZ\ntfoPIAdCrkHYkHEgmFkZNQMhgheBncC0SS+RdcyBYGZl1KshgJevGFoOBDMro9Ys5Qr3IwwpB4KZ\nlVFrlnKFA2FIORDMrIxGTUZ9W75C4hiJX+7He4+CMktXmJk160PoVw3hT4CDgFP69P5DzYFgZmUM\nXCBI7AWcBkyRmBFBzYUyrT43GZlZWySmAQeya6Xjav0aZXQq8F3gy8Bb+vD+Q8+BYGbtmgVsqbp0\nZlG/mowWAzfm25l9eP+h5yYjM2tXo+Yi6EMgSOxBCoRTgB8DH5fYO4IXJrMcw841BDNrV7NA6Mco\no+OBbRGsi+AJYA3uWG6bA8HM2jVwNQRSE9GNhcduNirBgWBm7Wo0SxkGJxAW56Yka5FPlpm1q9Es\nZZjkUUYSrwIOAe6qbItgHbAVOGGyyjEKHAhm1q5BazJaDNwcwU+rtrvZqE0OBDNr16B1Klc3F1Xc\nSAoLa5EDwcxali98MzA1BIkZpBFGX6rx9F3AIblJyVrgQDCzdhwAbI/g2Qb7TGaT0RnAl2tcypPc\nhHQTbjZqmQPBzNrRrEMZJjcQzgSWN3je/QhtcCCYWTuaNRfBJI0yKixmd3OD3W4HjpN4Ra/LMwoc\nCGbWjlYDYb/c39BLpwLfzTOTa8pNSV7srkUOBDNrR9NAiOBFYCcwrcdlqTe6qJpHG7XIgWBm7Wg2\nS7mip/0IeQby22jcf1BxM3CaxN69Ks+ocCCYWTvmABtb2K/XHcuVxewearZjBE8C3yE1MVkDDgQz\na8cEsKGF/XodCK02F1V4tFELHAhm1pLcSTwBfL+F3Xs90qjdQFiOF7trqvTJkbRU0v2S1ki6RtJe\nkmZIWilpnaTbJB1Ytf96SQ9KOr2w/fj8GuslXdrpBzKznpkB7Ijgxy3s27MagsQEMJPCYnbNFBa7\ne20vyjQqSgWCpAngXcBxEXE0MAU4F7gIWBkR80jjfy/K+88HzgHmA4uAT0qqDEm7DLgwIuYCcyUt\nKv1pzKyXJmituQh6u57RG4E7aixm18wPSKuiWh1lawhbge3AvpKmAvuSZi8uBq7M+1wJnJXvnwlc\nGxHbI2ID8DCwQNIsYHpErM77XVU4xswGyxG0Hgi97EM4CfhGieN+Arysy2UZKaUCISJ+BPwtKXEf\nA34cESuBmRGxJe+2hVStgzRUrTgyYSNpPHP19k15u5kNngla6z+A3gbCG4CvlzjOgdDE1DIHSfo5\n4A9IvyDPAP8k6beL+0RESIqOS7jrP
ZcVHq6KiFXdem0za8kE8EiL+/akU1liOjAPuLfE4c8y4oEg\naSGwsOzxpQKBdBWir0fEU7kQ/0yqxm2WdGhEbM7NQZUp5ZtI45crDifVDDbl+8XtNSe9RMSykmU1\ns+6YIC0D0Ype1RBeB9wXwb+XOPYnTP6lPSdV/qK8qvJY0ofbOb5sH8KDwImS9smdw28G1pKWmj0/\n73M+cEO+vxw4V9I0SUcCc4HVEbEZ2CppQX6d8wrHmNlgmaDLncoSh0gc2Gy/grLNReAmo6ZK1RAi\n4tuSrgK+SVqz5FvA/yVVEa+TdCHpF+cdef+1kq4jhcYOYElEVJqTlgCfAfYBVkTEraU/jZn1RJ6D\ncATd70P4EGmAyn9r8XVPAi5vcd9qPwFeXvLYsaBdf5cHl6SIiF6vnGhmdUi8nPQl78AImv7RkPgt\n4IwIfqvJfl8AjorgF1p4zT2AHwKvieDxlgq++/Hvye/1e+0eO6za/dvpWXtm1ooJYEMrYZC1WkM4\nDPj5Fi9zOQ94pkwYZG4yasKBYGatmKD1/gNofZTRLFIn6K+3sG8n/QfgQGjKgWBmrZig/UBoWEPI\nTUCHAleQro3cjAOhxxwIZtaKdjqUobVRRgeRVj24GfhViX2a7F92hnLFZF7reSg5EMysFRN0uYZA\nai56PIKnSRPNFtbbMXdqv5J0XYOyXENowoFgZq2YoPuBcBhp6RuAFTRuNloAfDOCHW2UoZoDoQkH\ngpm1YoJJCIQ836GWTvsPKmVyIDTgQDCzhvJM4inA060eE8GL+di9Gux2GLw0hHQNsBdpFYNaTqLz\nQBj5pSs65UAws2aOoL05CBXNOpZnkWsI+bVvoUazkcQUUpPRnW2+fzU3GTXhQDCzZiZor7moolmz\nUbHJCOr3I/wi8FgET5UoQ9GLwB4S0zp8nZHlQDCzZiaYnEC4HThJ+pljOh1uCrxUC3E/QgMOBDNr\nZoLeBMIsdvUhEMFWYDVwatV+3ehQrnCzUQMOBDNrZoL2JqVV1A2EPEt5JrC56qlazUbd6FCucCA0\n4EAws2bauZZyUaP1jA4mLVRXfaGb3YafShyS932gxPvX4kBowIFgZs1MUC4QGo0yKg45LXqQdI2V\n+fnxScCdEews8f61ePmKBhwIZlaXxAHANCg1wqfRH9+XhpwW5Y7fYrNRVzqUC1xDaMCBYGaNlJ2D\nAI0DoXqEUVExELrZoQwOhIYcCGbWyATlOpShfCB8BThB4iDgOOCuku9fiwOhAQeCmTVStkMZGncq\n7zbktCiCnwBfA/4YeCQPR+0W9yE04EAws0Ym6CwQytQQIDUbvZfuNheBawgNORDMrJEJygdCs1FG\njQLhFmBfutuhDA6EhhwIZtbIBL3rQ6jZZAQQwXrgS6TrLXeTl65oYGq/C2BmA22CLjcZ5dVLD+Fn\nZynvJoLTSr5vI14CuwHXEMysJonpwD7AkyVfol4N4WDg6co1EyaZm4wacCCYWT2dzEGA+qOMmvUf\n9JIDoQEHgpnVM0H55iKo36lcd8jpJPCw0wYcCGZWzwTlO5Sh/h9f1xAGlAPBzOqZoLMaggNhyJQO\nBEkHSvq8pAckrZW0QNIMSSslrZN0m6QDC/svlbRe0oOSTi9sP17SmvzcpZ1+oF2vy3t9qTyzjnQy\nS5lKp3GN/4cNh5z2mAOhgU5qCJcCKyLiKOC1pGVrLwJWRsQ80uXwLgKQNB84h7Sk7SLgk5KUX+cy\n4MKImAvMlbSogzIVXQzM7tJrmY2jCTqrIUDtWkLNlU4nifsQGigVCJIOAH4lIq4AiIgdEfEMsBi4\nMu92JXBWvn8mcG1EbI+IDcDDwAJJs4DpEbE673dV4ZjS8jjn/YD9O30tszE2QWd9CFB7pJGbjAZU\n2RrCkcCTkj4t6VuS/l7Sy4CZEbEl77OFdIk8SL8AGwvHbyR9e6/evonufKuv/AIe0IXXMhs7+UL3\n+
5H+H3ei1kijvgdC5YpstruyM5WnkpalfU9E3C3pEnLzUEVEhKSy45d/hqRlhYerImJVg90PqPpp\nZu05Avh+B3MQKnZrosm194PpPGhKiWCHxHZgL+CFfpShlyQtBBaWPb5sIGwENkbE3fnx54GlwGZJ\nh0bE5twc9ER+fhMwp3D84fk1NuX7xe2bar1hRCxro3wOBLPOdNShXFDdZn8I8KMItnfhtcuqLF8x\ncoGQvyivqjyW9OF2ji/VZBQRm4FHJc3Lm94M3A/cBJyft50P3JDvLwfOlTRN0pHAXGB1fp2teYSS\ngPMKx3TCgWDWmQl6Ewj9bC6qcD9CHZ0sbvde4LOSpgH/BvxHYApwnaQLSb9M7wCIiLWSrgPWAjuA\nJRFRqYouAT5DWjNlRUTc2kGZKhwIZp2ZoPMOZXAgDJXSgRAR3wZeV+OpN9fZ/2LSUNDq7fcAR5ct\nRx0OBLPOTAD3duF1trH7KKN+LltR4SWw6xjVmcoHADtxIJiV1as+hEGpIXguQg2jHAibcCCYlTXB\naAeCawg1jHIg/AAHglnbJPYl/d/pxtDQ6kAYhCYjB0IdDgQzq3YE8IMIdnbhtQaxhuDlK+pwIJhZ\ntQm601wEgxkIriHUMaqBsD8pELyWkVn75pHWG+uGl0YZSUwFXkGfZikXOBDqGNVAcA3BrLyjSHOG\nuqFYQ6jMUt7Rpdcuy8NO6xjlQNhEWsRqVD+jWa8cBTzQpdcqBsIgNBeBh53WNap/LA8Anib9w9e6\nyLeZ1Tef0Q8E1xBqGOVAeCbf3Gxk1iKJg4A96d7Q0GIgDMKQU3Ag1DVygZCbiKYDW/PNgWDWuqOA\nB7qw7HVF8QI5g1JDcB9CHSMXCKRvI89H8FNcQzBrVzebi2D3C+QMSiC4D6GOUQyESnMROBDM2tXN\nEUZE8CKAxDT6ey3lIjcZ1eFAMLOibtcQYFc/wmG4D2GgORDMrKirNYSsGAiDUENwH0IdDgQzA0Bi\nf2AG3bkwTtGzwIGkWcpPNNl3MrgPoY5RDIT92T0QvHyFWWt+AVjXpUXtirYBrwZ+OACzlMFNRnWN\nYiC4hmBWTi+aiyDVEOYxGM1FAM8B+3gVg581iifEgWBWTjeXrCgaqEDINaDngX37XZZB40Aws4pe\njDCCFAhzGZBAyNxsVIMDwcwqet1kNAhDTiscCDWMeiB46QqzFkjsDcyhe9dBKHqWtPT1INUQPPS0\nhlEPBNcQbKxJ7NPirvOARyLY3oNibMs/BykQPPS0BgeC2WhbJfH2FvbrVYcypG/j4CajgTeqgbA1\n398KTJdQH8tj1hcSrwJeD5zZwu6TEQiDVkNwIFQZ1UB4BiBPgnkBVw1tPL0duBVY1MKY+/n0pkMZ\nUiDsZDBmKVe4D6GGkQ6EzM1GNq7OBv4WeBJ4XZN9e11D2JyXpB8U7kOoYaQCITcNFZeuAAeCjSGJ\nI4BXAXcAXwTOaLDvVNLSEg/1qDjbGKz+A3CTUU0dBYKkKZLulXRTfjxD0kpJ6yTdJunAwr5LJa2X\n9KCk0wvbj5e0Jj93aSflIc08fLFqpITXM7Jx9JvAjfn/wgrgLQ32fRXweATP9agsXwPe1aPXLsuB\nUEOnNYT3kdodK5fbuwhYGRHzgNvzYyTNB84htVMuAj4pqdLRexlwYUTMBeZKWtRBeaqbi8A1BBtP\nZwPX5/tfA14tcWidfXvZXEQEL0Rwb69evyT3IdRQOhAkHU6qhn4KXhrFsxi4Mt+/Ejgr3z8TuDYi\ntkfEBtLklwWSZgHTI2J13u+qwjFlOBBs7EnMJv2Rvx0g1xJWkr6M1dLLDuVB5T6EGjqpIXwceD/s\ntlTuzIjYku9vAWbm+4cBGwv7bQRm19i+KW8vy4FgBr8B3FS5fGXWqNmopzWEAeUmoxqmljlI0luB\nJyLiXkkLa+0TESEpaj1X8j2XFR6uiohVNXZzIJjtGl1UdAvwMYk
9a8xGPorUdDtORrLJKP89Xlj2\n+FKBALwBWCzpDGBvYH9JVwNbJB0aEZtzc1Bl3PEm0jopFYeTagab8v3i9k213jAilrVQrlqB4PWM\nbGxIzASOITURvSSCzRL/Rvq/e0dh/z0Y3xrCyDUZ5S/KqyqPJX24neNLNRlFxAcjYk5EHAmcC3w5\nIs4DlgPn593OB27I95cD50qaJulI0lK4qyNiM7BV0oLcyXxe4ZgyXEOwcXcWsCKCF2o8V6vZ6HBg\nawQ/7nnJBoubjGro1jyEStPQXwKnSVoHnJofExFrgetIHVe3AEsionLMElLH9Hrg4Yi4tYNyOBBs\n3J0NfL7Oc7XmI4xj7QAcCDWVbTJ6SUTcQa6CRsSPgDfX2e9i4OIa2+8Bju60HJkDwcaWxEE0Xrvo\nm8AhEkdE8P28bRxHGMGI9iF0aqRmKrP7wnYVDgQbF2cCt9WbYJaXjriV3WsJ41xDGLk+hE6NYiC4\nhmDjqlFzUcUKdg+Eca0huMmoBgeC2QiQeDlpBNGKJrveBpwssXde+2ucawgOhCqjFgjVC9uB1zKy\n8fA24MsRL12drKYIfgR8mzRW/WDSKgODtCz1ZHkB2DMv7GfZqJ2MujUECUXQtYlyZgPmbOBzLe5b\naTZ6AVg7jv8vIgjppVpC9d+MsTVqNYSfCYQ8K3M7tHxtWbOhIrE/6Rv/zS0e8kXSfIRxbS6qcLNR\nlZEPhMz9CDbKzgC+GtHyN901wF6kSWzjHAgeelplZAIhd5DVCwQvX2Gj7GSqlqpoJDcRrQBOZzxH\nGFV46GmVkQkE0ppKOyP49xrPuYZgo+xE4BttHvPF/HOcawhuMqoySoFQr3YADgQbURL7AfOA+9o8\n9PZ8e7TrhRoeDoQqozTKyIFg4+gE4Dt1asZ1RfAsdZaZGSPuQ6jiGoLZcDuJ9puLLHEfQhUHgtlw\nOxG4s9+FGFJuMqoyaoFQvbBdhQPBRk4eWedAKM+BUGXUAsE1BBsnRwI7GO+O4U64D6HKKAVCrXWM\nKryekY2iE4E7x3HpiS5xH0KVUQoE1xBs3JSZf2C7uMmoigPBbHidhPsPOuFAqOJAMBtCEvuQLm5z\nT7/LMsTch1BlXALBaxnZqDmOtHT18/0uyBBzH0KVcQkE1xBs1Li5qHNuMqriQDAbTp5/0Dk3GVUZ\ni0CI4AUAib0ntURmPZAnpHnJis65yajKWARC5lqCjYrDgT2B7/W7IEPOTUZVHAhmw+dE4BuekNYx\nB0KVkQgEib1In+WFBrs5EGxUuEO5O34CvCw3wRkjEgjkhe2afGNyINiocIdyF0TwIhDAtH6XZVCM\nSiA0WseowusZ2dDLteFfAu7ud1lGhJuNCkoFgqQ5kr4i6X5J35X0+3n7DEkrJa2TdJukAwvHLJW0\nXtKDkk4vbD9e0pr83KUlP0ez/gNwDcFGwzHA+nzFM+ucA6GgbA1hO/CHEfEaUvX19yQdBVwErIyI\neaTrtV4EIGk+cA5pqv0i4JOSKu12lwEXRsRcYK6kRSXK40CwceHmou7yXISCUoEQEZsj4r58/1ng\nAWA2sBi4Mu92JXBWvn8mcG1EbI+IDcDDwAJJs4DpEbE673dV4Zh2OBBsXHiF0+7yXISCjvsQJE0A\nxwJ3ATMjYkt+agswM98/DNhYOGwjKUCqt2/K29vVSiB4PSMbBR5h1F1uMiroKBAk7QdcD7wvIrYV\nn4uIgEkbJ+0ago08iVnAdGBdv8syQtxkVDC17IGS9iSFwdURcUPevEXSoRGxOTcHPZG3bwLmFA4/\nnFQz2JTvF7dvqvN+ywoPV0XEqsJjB4KNgxOBuzwhratGqoYgaSGwsOzxpQIhdwhfDqyNiEsKTy0H\nzgc+mn/eUNh+jaSPkZqE5gKrIyIkbZW0AFgNnAd8otZ7RsSyBkU6AHi8SbEdCDbs3KHcfSPVh5C/\nKK+qPJb04XaOL9tk9MvAbwO
nSLo33xYBfwmcJmkdcGp+TESsBa4D1gK3AEtykxLAEuBTwHrg4Yi4\ntUR5XEOwceAO5e4bqRpCp0rVECLi/1E/TN5c55iLgYtrbL8HOLpMOQocCDbSJPYEjifVpK173IdQ\nMCozlR0INureAtwf0fT33NozUk1GnXIgmA24vPjaUuCv+l2WEeQmo4JRCYT9SfMMGnkemCp5ISsb\nOqeQvsx8od8FGUEOhIJRCYSmNYQ8VM8L3NkwWgp8NIKd/S7ICHIfQsHYBELmZiPrK4l9Jd4k7Tb/\nptH+rwN+Hvhsb0s2ttyHUFB6YtqgyKMv9iL9wzbj5StsUuXlqk8kDcM+BTgOeIh0YZbjI5r+3i4F\n/iav3W/d5yajglGoIexP84vjVLiGYJNC4lUSK4EfkjqD9wL+HJgZwfGkCWYNl3uXmE+a8/OpHhd3\nnDkQCoa+hkDrzUXgQLBJIDGV1MTzL8DZdYaKvhe4R+KcCD5X56U+AHwigud6VFRzH8JuHAhm3fen\npObJ/16vIziCbRLvBG6RWB3B94rPS0wAbwV+rteFHXPuQygYhSYjB4INDIk3AO8GLmg2KiiCe0jL\nu1yT+8KK/hj4+wh+3JuSWuYmowIHglmXSOwP/APw7oimiy1WXAI8DSwrvM5M4D/k56y33GRU4EAw\n655PAF+KeGmV36ZyLeIC4AKJU/PmPwCujWBz94toVZ4D9s2zwcfeOPYhzOthWWxMSbwDeAPp6oFt\nieAJiQuAqyROBv4zaSE767EIfirxIrAPuPN+HAPBNQRri8R+wM56o30k5gD/C3hLC/MKaopgpcRn\nSauZfjGCDWXLa22r9COMfSCMQpPR/rQXCF66wloisY/EB4DvAVskbpF4r7Rr5I/EFOAq4JII7u7w\nLT8E3Ar8RYevY+1xP0I2KjWER1rc1zUEayrPIzif1NF7F/BGYAvpWh9nAB+U2AqsAKbk20c7fd88\nG/m3On0da5uHnmajEghuMrKO5Y7FxcBHSNcDPzuCuwq7fB74vMQewDGkcDgO+J0IfjrZ5bWu8dDT\nbNwCwWsZWU0SR5KGjE4H/gi4td5yKHlk0LfyzYafAyEbhT4E1xCsI3kBun8iXe/72AhuaXFtLBsN\n7kPIxi0QngX2zm3EZhUfBR4F/sJNP2PJfQjZKPxhbDkQIgiJbaSRRj/qaalsKEicBZxFqhm4VjCe\n3GSUjVsNASax2UhitsQvTcZ7WfskjgD+D3BuBE/3uzzWN24yyoY6EPIY8H1J/6CtmpRAkNiX1CZ9\nh8QleXKT1ZCvHdDWta4l3ihxQdklB/JicteSLj5zZ5nXsJHhGkI21IFAavrZ1ua1ZnseCPmP1GXA\nt0nLFx8IfFfijF6+77CReK3EjcB3SME5p8Xj3g1cT1oR9LMlw/Z/AD8G/rbEsTZa3IeQDXsgtNtc\nBJNTQ3gXaS2ad0fwVAQXAP8J+J8S/5hXsxxbEvMkrgVuA74CHAx8Abhb4tcaHDdN4u+A3yddSex1\nwAvAXRJHtfH+i0gTwM73hesN1xBe4kDoMokTSEsPvL24rk0EXwKOBjYAayR+d9xWWJQ4QuJy4GvA\nGuDVEVwSwfMR/BVwDnCFxLLcHFg89hDgduAw4MQIHs7H/S7wMeBfJc5poQyzgc8Avx3Bk139gDas\n3IeQDXsgtLOOUUXP1jOSeAVpNuu7I3io+vkInovgIuB0YAlwk8TBvSjLoJCQxIIcBN8CHgfmRnBx\nxO59PxELJK/VAAAGfUlEQVTcQapZnQysqJwbiWOBu4FVwFkRbK067nLSOb1Y4hPV/RES+0mcIPE7\npKam/53fywzcZPSSYQ+Egakh5OUMrgauj+D6RvtGcB9pqeTvAvcW1sEfChL75s/baJ8DJd4D3Ee6\nvvA64Bci+NNGVwHL1wA4jRQe90h8kNS09McRfKjBJSnvBU4AjiDVFv5GYoXEBuBJ0oXqF5Emo
F3c\n3ie2Eecmo2wg5iFIWkS6OtQU4FMR0epCYQfA7t8WW7AVeHmbx7TiT0jLHlzUys55IbOLJG4Hrpb4\nDLAsgu31jpF4JTAH+E4E2zovcnty38cHgQuBKRKPAOurbkG64MtZpJU7/xBY1U5bfQQ7gKUSXydd\naP60HKLNjnta4jdIfTgvB/4OWAt8zxPOrAEHQtb3QJA0hbSW/JuBTcDdkpZHxAMtHF62hjDR5jEN\nSZwO/FfghEZ/0GvJ6+AfC1xJ+mb7zspa+LmP4TjSgmuLgcPh5qfgrXMkHgPuJX2Tvhe4N4InWiir\nSJ//V/LtEOBm4MZ6x0vMAN4P/BdSLejVwLb8c26+nQicR/qPdQ3w/k7b6CO4Cbip/mfRwohYVXXM\nTtLcgrFS61yMqxLnwn0IWd8DAXg98HBEbACQ9I/AmUAvA6F0k1Hu7JwgXXltHvDzwNuBcyJ4rMxr\n5itmvYV06cTVEhfn115MumjHjaSRNV+Ht30I4s/z+x6bbx8AjpUIUqd19e3JvF8lBKYA/wp8lTRj\n+0zgryXuI7WxfyGCjRLTc5neRxoFdEwEPygU/dv51i8LSf0K5nNRtJD2zoX7ELJBCITZpHVkKjYC\nC1o8tieBkL9FH8yuP/jFn0eS1sZfBzxECq63RbC6zXLsJn+z/ZjEHcBS0pWz3lTdOS291KRyf779\nQ6HMryCFVeU2j9TZeihprP9tpIuw/FvVMg3XSOxDarv/TWCZxMP5Nb4EnBTB+k4+n9kAc5NRNgiB\n0NL6MVLNpoNfBP6qzfd7GlhQ5/WKQRCkP/gPkf74/0O+/3AEz7f5ni2L4B7g7BLHBfDDfPtmieOf\nB5YDy/Ms3l8FNkdwf7uvZTZktgGz6vxNGAaPRPC+bryQIvq7npekE4FlEbEoP14K7Cx2LEvyomNm\nZiVERMvznQYhEKaSvnm/CXiM1FTyzhY7lc3MrEv63mQUETskvQf4F1Jn5+UOAzOzydf3GoKZmQ2G\ngZ+pLGmRpAclrZf0gX6XZzJJukLSFklrCttmSFopaZ2k2yQd2M8yTgZJcyR9RdL9kr4r6ffz9nE8\nF3tLukvSfZLWSvpI3j5256JC0hRJ90q6KT8ey3MhaYOk7+RzsTpva+tcDHQgFCatLQLmA++U1PKq\nliPg06TPXnQRsDIi5pEWe2tpZvSQ2w78YUS8hjQB7vfy78HYnYuIeAE4JSKOAV4LnCLpjYzhuSh4\nH2lGeqW5Y1zPRQALI+LYiHh93tbWuRjoQKAwaS0itgOVSWtjISK+Cj9zJa/FpFnN5J9nTWqh+iAi\nNkfEffn+s6S5H7MZw3MBEBHP5bvTSP1uTzOm50LS4cAZpLWqKqNpxvJcZNUjito6F4MeCLUmrc3u\nU1kGxcyI2JLvb4HxuraCpAnSrOu7GNNzIWkPSfeRPvNXIuJ+xvRcAB8nLatSXCtrXM9FAF+S9E1J\n78rb2joXfR9l1IR7vBuIiBinORqS9iMtrfG+iNgm7foyNE7nIiJ2AsdIOgD4F0mnVD0/FudC0luB\nJyLiXkkLa+0zLuci++WIeFzSwcBKSQ8Wn2zlXAx6DWET7HZZxTmkWsI42yLpUABJs6D5gnajQNKe\npDC4OiJuyJvH8lxURMQzwBdJ15AYx3PxBmCxpO+Rro99qqSrGc9zQUQ8nn8+SVp77PW0eS4GPRC+\nCcyVNCFpGumKWsv7XKZ+Ww6cn++fD9zQYN+RoFQVuBxYGxGXFJ4ax3NxUGWkiKTK+lP3MobnIiI+\nGBFzIuJI4FzgyxFxHmN4LiTtK2l6vv8y0hpma2jzXAz8PARJv86uayVcHhEf6XORJo2ka0lXDzuI\n1P73Z6SVT68DXklayfQdEVH3gjOjII+i+VfSAn2VX9jKAoDjdi6OJnUO7pFvV0fEX0uawZidiyJJ\nJwN/FBGLx/FcSDqSVCuA1BXw2Yj4SLvnYuADwczMJsegN
xmZmdkkcSCYmRngQDAzs8yBYGZmgAPB\nzMwyB4KZmQEOBDMzyxwIZmYGwP8HaRAVJVJ//3sAAAAASUVORK5CYII=\n", 109 | "text/plain": [ 110 | "" 111 | ] 112 | }, 113 | "metadata": {}, 114 | "output_type": "display_data" 115 | } 116 | ], 117 | "source": [ 118 | "# Plot the histogram\n", 119 | "%matplotlib inline\n", 120 | "import matplotlib.pyplot as plt\n", 121 | "plt.plot(range(len(h)), h)\n", 122 | "plt.show()" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 2", 129 | "language": "python", 130 | "name": "python2" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 2 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython2", 142 | "version": "2.7.11" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 0 147 | } 148 | -------------------------------------------------------------------------------- /1.06_NaiveExactMatching-MatchingArtificialReads.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "--2015-07-14 11:54:42-- http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa\n", 15 | "Resolving d28rh4a8wq0iu5.cloudfront.net... 54.192.55.189, 54.192.54.164, 54.192.54.222, ...\n", 16 | "Connecting to d28rh4a8wq0iu5.cloudfront.net|54.192.55.189|:80... connected.\n", 17 | "HTTP request sent, awaiting response... 
200 OK\n", 18 | "Length: 5528 (5.4K) [application/octet-stream]\n", 19 | "Saving to: 'phix.fa'\n", 20 | "\n", 21 | "phix.fa 100%[=====================>] 5.40K --.-KB/s in 0.002s \n", 22 | "\n", 23 | "2015-07-14 11:54:42 (3.30 MB/s) - 'phix.fa' saved [5528/5528]\n", 24 | "\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "def readGenome(filename):\n", 41 | " genome = ''\n", 42 | " with open(filename, 'r') as f:\n", 43 | " for line in f:\n", 44 | " # ignore header line with genome information\n", 45 | " if not line[0] == '>':\n", 46 | " genome += line.rstrip()\n", 47 | " return genome" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "genome = readGenome('phix.fa')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "def naive(p, t):\n", 70 | " occurrences = []\n", 71 | " for i in range(len(t) - len(p) + 1):\n", 72 | " match = True\n", 73 | " for j in range(len(p)):\n", 74 | " if t[i+j] != p[j]:\n", 75 | " match = False\n", 76 | " break\n", 77 | " if match:\n", 78 | " occurrences.append(i)\n", 79 | " return occurrences" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "[0, 5, 9]" 93 | ] 94 | }, 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "t = 'AGCTTAGATAGC'\n", 102 | "p = 'AG'\n", 103 | "naive(p, t)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | 
"metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "import random\n", 115 | "def generateReads(genome, numReads, readLen):\n", 116 | " ''' Generate reads from random positions in the given genome. '''\n", 117 | " reads = []\n", 118 | " for _ in range(numReads):\n", 119 | " start = random.randint(0, len(genome)-readLen) - 1\n", 120 | " reads.append(genome[start : start+readLen])\n", 121 | " return reads" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "100 / 100 reads matched the genome exactly!\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "# Generate 100 reads of length 100\n", 141 | "reads = generateReads(genome, 100, 100)\n", 142 | "\n", 143 | "# Count how many reads match the genome exactly\n", 144 | "numMatched = 0\n", 145 | "for r in reads:\n", 146 | " matches = naive(r, genome)\n", 147 | " if len(matches) > 0:\n", 148 | " numMatched += 1\n", 149 | "print('%d / %d reads matched the genome exactly!' 
% (numMatched, len(reads)))" 150 | ] 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "Python 2", 156 | "language": "python", 157 | "name": "python2" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 2 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython2", 169 | "version": "2.7.11" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 0 174 | } 175 | -------------------------------------------------------------------------------- /1.07_NaiveExactMatching-MatchingRealReads.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "'wget' is not recognized as an internal or external command,\n", 15 | "operable program or batch file.\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "def readGenome(filename):\n", 32 | " genome = ''\n", 33 | " with open(filename, 'r') as f:\n", 34 | " for line in f:\n", 35 | " # ignore header line with genome information\n", 36 | " if not line[0] == '>':\n", 37 | " genome += line.rstrip()\n", 38 | " return genome" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "genome = readGenome('phix.fa')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 
| "source": [ 60 | "def naive(p, t):\n", 61 | " occurrences = []\n", 62 | " for i in range(len(t) - len(p) + 1):\n", 63 | " match = True\n", 64 | " for j in range(len(p)):\n", 65 | " if t[i+j] != p[j]:\n", 66 | " match = False\n", 67 | " break\n", 68 | " if match:\n", 69 | " occurrences.append(i)\n", 70 | " return occurrences" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[0, 5, 9]" 84 | ] 85 | }, 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "t = 'AGCTTAGATAGC'\n", 93 | "p = 'AG'\n", 94 | "naive(p, t)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import random\n", 106 | "def generateReads(genome, numReads, readLen):\n", 107 | " ''' Generate reads from random positions in the given genome. '''\n", 108 | " reads = []\n", 109 | " for _ in range(numReads):\n", 110 | " start = random.randint(0, len(genome)-readLen) - 1\n", 111 | " reads.append(genome[start : start+readLen])\n", 112 | " return reads" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "100 / 100 reads matched the genome exactly!\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Generate 100 reads of length 100\n", 132 | "reads = generateReads(genome, 100, 100)\n", 133 | "\n", 134 | "# Count how many reads match the genome exactly\n", 135 | "numMatched = 0\n", 136 | "for r in reads:\n", 137 | " matches = naive(r, genome)\n", 138 | " if len(matches) > 0:\n", 139 | " numMatched += 1\n", 140 | "print('%d / %d reads matched the genome exactly!' 
% (numMatched, len(reads)))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stderr", 152 | "output_type": "stream", 153 | "text": [ 154 | "'wget' is not recognized as an internal or external command,\n", 155 | "operable program or batch file.\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR266411_1.first1000.fastq" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 9, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "def readFastq(filename):\n", 172 | " sequences = []\n", 173 | " with open(filename) as fh:\n", 174 | " while True:\n", 175 | " fh.readline() # skip name line\n", 176 | " seq = fh.readline().rstrip() # read base sequence\n", 177 | " fh.readline() # skip placeholder line\n", 178 | " fh.readline() # skip base quality line\n", 179 | " if len(seq) == 0:\n", 180 | " break\n", 181 | " sequences.append(seq)\n", 182 | " return sequences" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 10, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [ 192 | { 193 | "ename": "IOError", 194 | "evalue": "[Errno 2] No such file or directory: 'ERR266411_1.first1000.fastq'", 195 | "output_type": "error", 196 | "traceback": [ 197 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 198 | "\u001b[1;31mIOError\u001b[0m Traceback (most recent call last)", 199 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mcollections\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mphix_reads\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mreadFastq\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'ERR266411_1.first1000.fastq'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mcount\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcollections\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCounter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mread\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mphix_reads\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mcount\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 200 | "\u001b[1;32m\u001b[0m in \u001b[0;36mreadFastq\u001b[1;34m(filename)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mreadFastq\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0msequences\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfh\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mfh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# skip name line\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 201 | "\u001b[1;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'ERR266411_1.first1000.fastq'" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "import collections\n", 207 | "phix_reads = 
readFastq('ERR266411_1.first1000.fastq')\n", 208 | "count = collections.Counter()\n", 209 | "for read in phix_reads:\n", 210 | " count.update(read)\n", 211 | "count" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 11, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "7 / 1000 reads matched the genome exactly!\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "numMatched = 0\n", 231 | "n = 0\n", 232 | "for r in phix_reads:\n", 233 | " matches = naive(r, genome)\n", 234 | " n += 1\n", 235 | " if len(matches) > 0:\n", 236 | " numMatched += 1\n", 237 | "print('%d / %d reads matched the genome exactly!' % (numMatched, n))" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 12, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "459 / 1000 reads matched the genome exactly!\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "# Now let's try matching just the first 30 bases of each read\n", 257 | "numMatched = 0\n", 258 | "n = 0\n", 259 | "for r in phix_reads:\n", 260 | " r = r[:30] # just taking the first 30 bases\n", 261 | " matches = naive(r, genome)\n", 262 | " n += 1\n", 263 | " if len(matches) > 0:\n", 264 | " numMatched += 1\n", 265 | "print('%d / %d reads matched the genome exactly!' 
% (numMatched, n))" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 13, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "def reverseComplement(s):\n", 277 | " complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}\n", 278 | " t = ''\n", 279 | " for base in s:\n", 280 | " t = complement[base] + t\n", 281 | " return t" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 14, 287 | "metadata": { 288 | "collapsed": false 289 | }, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "932 / 1000 reads matched the genome exactly!\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "numMatched = 0\n", 301 | "n = 0\n", 302 | "for r in phix_reads:\n", 303 | " r = r[:30] # just taking the first 30 bases\n", 304 | " matches = naive(r, genome)\n", 305 | " matches.extend(naive(reverseComplement(r), genome))\n", 306 | " n += 1\n", 307 | " if len(matches) > 0:\n", 308 | " numMatched += 1\n", 309 | "print('%d / %d reads matched the genome exactly!' 
% (numMatched, n))" 310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python 2", 316 | "language": "python", 317 | "name": "python2" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.11" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 0 334 | } 335 | -------------------------------------------------------------------------------- /2.01_BoyerMoore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The following gray box contains several functions that preprocess the pattern *P* and create the tables needed to apply the bad character and good suffix rules. We won't discuss these, but if you are interested, see Chapter 2 of:\n", 8 | "\n", 9 | "Gusfield, Dan. Algorithms on strings, trees and sequences: computer science and computational biology. Cambridge university press, 1997." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import string\n", 21 | "\n", 22 | "def z_array(s):\n", 23 | " \"\"\" Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s \"\"\"\n", 24 | " assert len(s) > 1\n", 25 | " z = [len(s)] + [0] * (len(s)-1)\n", 26 | " # Initial comparison of s[1:] with prefix\n", 27 | " for i in range(1, len(s)):\n", 28 | " if s[i] == s[i-1]:\n", 29 | " z[1] += 1\n", 30 | " else:\n", 31 | " break\n", 32 | " r, l = 0, 0\n", 33 | " if z[1] > 0:\n", 34 | " r, l = z[1], 1\n", 35 | " for k in range(2, len(s)):\n", 36 | " assert z[k] == 0\n", 37 | " if k > r:\n", 38 | " # Case 1\n", 39 | " for i in range(k, len(s)):\n", 40 | " if s[i] == s[i-k]:\n", 41 | " z[k] += 1\n", 42 | " else:\n", 43 | " break\n", 44 | " r, l = k + z[k] - 1, k\n", 45 | " else:\n", 46 | " # Case 2\n", 47 | " # Calculate length of beta\n", 48 | " nbeta = r - k + 1\n", 49 | " zkp = z[k - l]\n", 50 | " if nbeta > zkp:\n", 51 | " # Case 2a: Zkp wins\n", 52 | " z[k] = zkp\n", 53 | " else:\n", 54 | " # Case 2b: Compare characters just past r\n", 55 | " nmatch = 0\n", 56 | " for i in range(r+1, len(s)):\n", 57 | " if s[i] == s[i - k]:\n", 58 | " nmatch += 1\n", 59 | " else:\n", 60 | " break\n", 61 | " l, r = k, r + nmatch\n", 62 | " z[k] = r - k + 1\n", 63 | " return z\n", 64 | "\n", 65 | "\n", 66 | "def n_array(s):\n", 67 | " \"\"\" Compile the N array (Gusfield theorem 2.2.2) from the Z array \"\"\"\n", 68 | " return z_array(s[::-1])[::-1]\n", 69 | "\n", 70 | "\n", 71 | "def big_l_prime_array(p, n):\n", 72 | " \"\"\" Compile L' array (Gusfield theorem 2.2.2) using p and N array.\n", 73 | " L'[i] = largest index j less than n such that N[j] = |P[i:]| \"\"\"\n", 74 | " lp = [0] * len(p)\n", 75 | " for j in range(len(p)-1):\n", 76 | " i = len(p) - n[j]\n", 77 | " if i < len(p):\n", 78 | " lp[i] = j + 1\n", 79 | " return lp\n", 80 | "\n", 81 | "\n", 82 | "def 
big_l_array(p, lp):\n", 83 | " \"\"\" Compile L array (Gusfield theorem 2.2.2) using p and L' array.\n", 84 | " L[i] = largest index j less than n such that N[j] >= |P[i:]| \"\"\"\n", 85 | " l = [0] * len(p)\n", 86 | " l[1] = lp[1]\n", 87 | " for i in range(2, len(p)):\n", 88 | " l[i] = max(l[i-1], lp[i])\n", 89 | " return l\n", 90 | "\n", 91 | "\n", 92 | "def small_l_prime_array(n):\n", 93 | " \"\"\" Compile lp' array (Gusfield theorem 2.2.4) using N array. \"\"\"\n", 94 | " small_lp = [0] * len(n)\n", 95 | " for i in range(len(n)):\n", 96 | " if n[i] == i+1: # prefix matching a suffix\n", 97 | " small_lp[len(n)-i-1] = i+1\n", 98 | " for i in range(len(n)-2, -1, -1): # \"smear\" them out to the left\n", 99 | " if small_lp[i] == 0:\n", 100 | " small_lp[i] = small_lp[i+1]\n", 101 | " return small_lp\n", 102 | "\n", 103 | "\n", 104 | "def good_suffix_table(p):\n", 105 | " \"\"\" Return tables needed to apply good suffix rule. \"\"\"\n", 106 | " n = n_array(p)\n", 107 | " lp = big_l_prime_array(p, n)\n", 108 | " return lp, big_l_array(p, lp), small_l_prime_array(n)\n", 109 | "\n", 110 | "\n", 111 | "def good_suffix_mismatch(i, big_l_prime, small_l_prime):\n", 112 | " \"\"\" Given a mismatch at offset i, and given L/L' and l' arrays,\n", 113 | " return amount to shift as determined by good suffix rule. \"\"\"\n", 114 | " length = len(big_l_prime)\n", 115 | " assert i < length\n", 116 | " if i == length - 1:\n", 117 | " return 0\n", 118 | " i += 1 # i points to leftmost matching position of P\n", 119 | " if big_l_prime[i] > 0:\n", 120 | " return length - big_l_prime[i]\n", 121 | " return length - small_l_prime[i]\n", 122 | "\n", 123 | "\n", 124 | "def good_suffix_match(small_l_prime):\n", 125 | " \"\"\" Given a full match of P to T, return amount to shift as\n", 126 | " determined by good suffix rule. 
\"\"\"\n", 127 | " return len(small_l_prime) - small_l_prime[1]\n", 128 | "\n", 129 | "\n", 130 | "def dense_bad_char_tab(p, amap):\n", 131 | " \"\"\" Given pattern string and list with ordered alphabet characters, create\n", 132 | " and return a dense bad character table. Table is indexed by offset\n", 133 | " then by character. \"\"\"\n", 134 | " tab = []\n", 135 | " nxt = [0] * len(amap)\n", 136 | " for i in range(0, len(p)):\n", 137 | " c = p[i]\n", 138 | " assert c in amap\n", 139 | " tab.append(nxt[:])\n", 140 | " nxt[amap[c]] = i+1\n", 141 | " return tab\n", 142 | "\n", 143 | "\n", 144 | "class BoyerMoore(object):\n", 145 | " \"\"\" Encapsulates pattern and associated Boyer-Moore preprocessing. \"\"\"\n", 146 | " \n", 147 | " def __init__(self, p, alphabet='ACGT'):\n", 148 | " self.p = p\n", 149 | " self.alphabet = alphabet\n", 150 | " # Create map from alphabet characters to integers\n", 151 | " self.amap = {}\n", 152 | " for i in range(len(self.alphabet)):\n", 153 | " self.amap[self.alphabet[i]] = i\n", 154 | " # Make bad character rule table\n", 155 | " self.bad_char = dense_bad_char_tab(p, self.amap)\n", 156 | " # Create good suffix rule table\n", 157 | " _, self.big_l, self.small_l_prime = good_suffix_table(p)\n", 158 | " \n", 159 | " def bad_character_rule(self, i, c):\n", 160 | " \"\"\" Return # skips given by bad character rule at offset i \"\"\"\n", 161 | " assert c in self.amap\n", 162 | " ci = self.amap[c]\n", 163 | " assert i > (self.bad_char[i][ci]-1)\n", 164 | " return i - (self.bad_char[i][ci]-1)\n", 165 | " \n", 166 | " def good_suffix_rule(self, i):\n", 167 | " \"\"\" Given a mismatch at offset i, return amount to shift\n", 168 | " as determined by (weak) good suffix rule. 
\"\"\"\n", 169 | " length = len(self.big_l)\n", 170 | " assert i < length\n", 171 | " if i == length - 1:\n", 172 | " return 0\n", 173 | " i += 1 # i points to leftmost matching position of P\n", 174 | " if self.big_l[i] > 0:\n", 175 | " return length - self.big_l[i]\n", 176 | " return length - self.small_l_prime[i]\n", 177 | " \n", 178 | " def match_skip(self):\n", 179 | " \"\"\" Return amount to shift in case where P matches T \"\"\"\n", 180 | " return len(self.small_l_prime) - self.small_l_prime[1]" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Let's make sure our rules give the expected results." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 2, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "2" 201 | ] 202 | }, 203 | "execution_count": 2, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "# GCTAGCTCTACGAGTCTA\n", 210 | "p = 'TCAA'\n", 211 | "p_bm = BoyerMoore(p, alphabet='ACGT')\n", 212 | "p_bm.bad_character_rule(2, 'T')" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 3, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "3" 226 | ] 227 | }, 228 | "execution_count": 3, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "# GCTAGCTCTACGAGTCTA\n", 235 | "# ACTA\n", 236 | "p = 'ACTA'\n", 237 | "p_bm = BoyerMoore(p, alphabet='ACGT')\n", 238 | "p_bm.good_suffix_rule(0)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "2" 252 | ] 253 | }, 254 | "execution_count": 4, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | 
"source": [ 260 | "# ACACGCTCTACGAGTCTA\n", 261 | "# ACAC\n", 262 | "p = 'ACAC'\n", 263 | "p_bm = BoyerMoore(p, alphabet='ACGT')\n", 264 | "p_bm.match_skip()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 5, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "def boyer_moore(p, p_bm, t):\n", 276 | " \"\"\" Do Boyer-Moore matching \"\"\"\n", 277 | " i = 0\n", 278 | " occurrences = []\n", 279 | " while i < len(t) - len(p) + 1:\n", 280 | " shift = 1\n", 281 | " mismatched = False\n", 282 | " for j in range(len(p)-1, -1, -1):\n", 283 | " if p[j] != t[i+j]:\n", 284 | " skip_bc = p_bm.bad_character_rule(j, t[i+j])\n", 285 | " skip_gs = p_bm.good_suffix_rule(j)\n", 286 | " shift = max(shift, skip_bc, skip_gs)\n", 287 | " mismatched = True\n", 288 | " break\n", 289 | " if not mismatched:\n", 290 | " occurrences.append(i)\n", 291 | " skip_gs = p_bm.match_skip()\n", 292 | " shift = max(shift, skip_gs)\n", 293 | " i += shift\n", 294 | " return occurrences" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 6, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "t = 'GCTAGCTCTACGAGTCTA'\n", 306 | "p = 'TCTA'\n", 307 | "p_bm = BoyerMoore(p, alphabet='ACGT')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 7, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "[6, 14]" 321 | ] 322 | }, 323 | "execution_count": 7, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "boyer_moore(p, p_bm, t)" 330 | ] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 2", 336 | "language": "python", 337 | "name": "python2" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 2 343 | }, 344 | "file_extension": ".py", 
345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython2", 349 | "version": "2.7.11" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 0 354 | } 355 | -------------------------------------------------------------------------------- /2.02_SubstringIndex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import bisect\n", 12 | "import sys" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "class Index(object):\n", 24 | " def __init__(self, t, k):\n", 25 | " ''' Create index from all substrings of size 'length' '''\n", 26 | " self.k = k # k-mer length (k)\n", 27 | " self.index = []\n", 28 | " for i in range(len(t) - k + 1): # for each k-mer\n", 29 | " self.index.append((t[i:i+k], i)) # add (k-mer, offset) pair\n", 30 | " self.index.sort() # alphabetize by k-mer\n", 31 | " \n", 32 | " def query(self, p):\n", 33 | " ''' Return index hits for first k-mer of P '''\n", 34 | " kmer = p[:self.k] # query with first k-mer\n", 35 | " i = bisect.bisect_left(self.index, (kmer, -1)) # binary search\n", 36 | " hits = []\n", 37 | " while i < len(self.index): # collect matching index entries\n", 38 | " if self.index[i][0] != kmer:\n", 39 | " break\n", 40 | " hits.append(self.index[i][1])\n", 41 | " i += 1\n", 42 | " return hits" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "def queryIndex(p, t, index):\n", 54 | " k = index.k\n", 55 | " offsets = []\n", 56 | " for i in index.query(p):\n", 57 | " if p[k:] == t[i+k:i+len(p)]: # verify that rest of P matches\n", 58 
| " offsets.append(i)\n", 59 | " return offsets" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'\n", 71 | "p = 'GGTATTCGGGA'" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "[21, 68]\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "index = Index(t, 4)\n", 91 | "print(queryIndex(p, t, index))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 2", 107 | "language": "python", 108 | "name": "python2" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 2 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython2", 120 | "version": "2.7.11" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 0 125 | } 126 | -------------------------------------------------------------------------------- /2.03_ApproximateMatching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import string\n", 12 | "\n", 13 | "def z_array(s):\n", 14 | " \"\"\" Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s \"\"\"\n", 15 | " assert len(s) > 1\n", 16 | " z = [len(s)] + [0] * (len(s)-1)\n", 17 | " # Initial comparison of s[1:] with prefix\n", 18 
| " for i in range(1, len(s)):\n", 19 | " if s[i] == s[i-1]:\n", 20 | " z[1] += 1\n", 21 | " else:\n", 22 | " break\n", 23 | " r, l = 0, 0\n", 24 | " if z[1] > 0:\n", 25 | " r, l = z[1], 1\n", 26 | " for k in range(2, len(s)):\n", 27 | " assert z[k] == 0\n", 28 | " if k > r:\n", 29 | " # Case 1\n", 30 | " for i in range(k, len(s)):\n", 31 | " if s[i] == s[i-k]:\n", 32 | " z[k] += 1\n", 33 | " else:\n", 34 | " break\n", 35 | " r, l = k + z[k] - 1, k\n", 36 | " else:\n", 37 | " # Case 2\n", 38 | " # Calculate length of beta\n", 39 | " nbeta = r - k + 1\n", 40 | " zkp = z[k - l]\n", 41 | " if nbeta > zkp:\n", 42 | " # Case 2a: Zkp wins\n", 43 | " z[k] = zkp\n", 44 | " else:\n", 45 | " # Case 2b: Compare characters just past r\n", 46 | " nmatch = 0\n", 47 | " for i in range(r+1, len(s)):\n", 48 | " if s[i] == s[i - k]:\n", 49 | " nmatch += 1\n", 50 | " else:\n", 51 | " break\n", 52 | " l, r = k, r + nmatch\n", 53 | " z[k] = r - k + 1\n", 54 | " return z\n", 55 | "\n", 56 | "\n", 57 | "def n_array(s):\n", 58 | " \"\"\" Compile the N array (Gusfield theorem 2.2.2) from the Z array \"\"\"\n", 59 | " return z_array(s[::-1])[::-1]\n", 60 | "\n", 61 | "\n", 62 | "def big_l_prime_array(p, n):\n", 63 | " \"\"\" Compile L' array (Gusfield theorem 2.2.2) using p and N array.\n", 64 | " L'[i] = largest index j less than n such that N[j] = |P[i:]| \"\"\"\n", 65 | " lp = [0] * len(p)\n", 66 | " for j in range(len(p)-1):\n", 67 | " i = len(p) - n[j]\n", 68 | " if i < len(p):\n", 69 | " lp[i] = j + 1\n", 70 | " return lp\n", 71 | "\n", 72 | "\n", 73 | "def big_l_array(p, lp):\n", 74 | " \"\"\" Compile L array (Gusfield theorem 2.2.2) using p and L' array.\n", 75 | " L[i] = largest index j less than n such that N[j] >= |P[i:]| \"\"\"\n", 76 | " l = [0] * len(p)\n", 77 | " l[1] = lp[1]\n", 78 | " for i in range(2, len(p)):\n", 79 | " l[i] = max(l[i-1], lp[i])\n", 80 | " return l\n", 81 | "\n", 82 | "\n", 83 | "def small_l_prime_array(n):\n", 84 | " \"\"\" Compile lp' array (Gusfield 
theorem 2.2.4) using N array. \"\"\"\n", 85 | " small_lp = [0] * len(n)\n", 86 | " for i in range(len(n)):\n", 87 | " if n[i] == i+1: # prefix matching a suffix\n", 88 | " small_lp[len(n)-i-1] = i+1\n", 89 | " for i in range(len(n)-2, -1, -1): # \"smear\" them out to the left\n", 90 | " if small_lp[i] == 0:\n", 91 | " small_lp[i] = small_lp[i+1]\n", 92 | " return small_lp\n", 93 | "\n", 94 | "\n", 95 | "def good_suffix_table(p):\n", 96 | " \"\"\" Return tables needed to apply good suffix rule. \"\"\"\n", 97 | " n = n_array(p)\n", 98 | " lp = big_l_prime_array(p, n)\n", 99 | " return lp, big_l_array(p, lp), small_l_prime_array(n)\n", 100 | "\n", 101 | "\n", 102 | "def good_suffix_mismatch(i, big_l_prime, small_l_prime):\n", 103 | " \"\"\" Given a mismatch at offset i, and given L/L' and l' arrays,\n", 104 | " return amount to shift as determined by good suffix rule. \"\"\"\n", 105 | " length = len(big_l_prime)\n", 106 | " assert i < length\n", 107 | " if i == length - 1:\n", 108 | " return 0\n", 109 | " i += 1 # i points to leftmost matching position of P\n", 110 | " if big_l_prime[i] > 0:\n", 111 | " return length - big_l_prime[i]\n", 112 | " return length - small_l_prime[i]\n", 113 | "\n", 114 | "\n", 115 | "def good_suffix_match(small_l_prime):\n", 116 | " \"\"\" Given a full match of P to T, return amount to shift as\n", 117 | " determined by good suffix rule. \"\"\"\n", 118 | " return len(small_l_prime) - small_l_prime[1]\n", 119 | "\n", 120 | "\n", 121 | "def dense_bad_char_tab(p, amap):\n", 122 | " \"\"\" Given pattern string and list with ordered alphabet characters, create\n", 123 | " and return a dense bad character table. Table is indexed by offset\n", 124 | " then by character. 
\"\"\"\n", 125 | " tab = []\n", 126 | " nxt = [0] * len(amap)\n", 127 | " for i in range(0, len(p)):\n", 128 | " c = p[i]\n", 129 | " assert c in amap\n", 130 | " tab.append(nxt[:])\n", 131 | " nxt[amap[c]] = i+1\n", 132 | " return tab\n", 133 | "\n", 134 | "\n", 135 | "class BoyerMoore(object):\n", 136 | " \"\"\" Encapsulates pattern and associated Boyer-Moore preprocessing. \"\"\"\n", 137 | " \n", 138 | " def __init__(self, p, alphabet='ACGT'):\n", 139 | " self.p = p\n", 140 | " self.alphabet = alphabet\n", 141 | " # Create map from alphabet characters to integers\n", 142 | " self.amap = {}\n", 143 | " for i in range(len(self.alphabet)):\n", 144 | " self.amap[self.alphabet[i]] = i\n", 145 | " # Make bad character rule table\n", 146 | " self.bad_char = dense_bad_char_tab(p, self.amap)\n", 147 | " # Create good suffix rule table\n", 148 | " _, self.big_l, self.small_l_prime = good_suffix_table(p)\n", 149 | " \n", 150 | " def bad_character_rule(self, i, c):\n", 151 | " \"\"\" Return # skips given by bad character rule at offset i \"\"\"\n", 152 | " assert c in self.amap\n", 153 | " ci = self.amap[c]\n", 154 | " assert i > (self.bad_char[i][ci]-1)\n", 155 | " return i - (self.bad_char[i][ci]-1)\n", 156 | " \n", 157 | " def good_suffix_rule(self, i):\n", 158 | " \"\"\" Given a mismatch at offset i, return amount to shift\n", 159 | " as determined by (weak) good suffix rule. 
\"\"\"\n", 160 | " length = len(self.big_l)\n", 161 | " assert i < length\n", 162 | " if i == length - 1:\n", 163 | " return 0\n", 164 | " i += 1 # i points to leftmost matching position of P\n", 165 | " if self.big_l[i] > 0:\n", 166 | " return length - self.big_l[i]\n", 167 | " return length - self.small_l_prime[i]\n", 168 | " \n", 169 | " def match_skip(self):\n", 170 | " \"\"\" Return amount to shift in case where P matches T \"\"\"\n", 171 | " return len(self.small_l_prime) - self.small_l_prime[1]" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 2, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "def boyer_moore(p, p_bm, t):\n", 183 | " \"\"\" Do Boyer-Moore matching \"\"\"\n", 184 | " i = 0\n", 185 | " occurrences = []\n", 186 | " while i < len(t) - len(p) + 1:\n", 187 | " shift = 1\n", 188 | " mismatched = False\n", 189 | " for j in range(len(p)-1, -1, -1):\n", 190 | " if p[j] != t[i+j]:\n", 191 | " skip_bc = p_bm.bad_character_rule(j, t[i+j])\n", 192 | " skip_gs = p_bm.good_suffix_rule(j)\n", 193 | " shift = max(shift, skip_bc, skip_gs)\n", 194 | " mismatched = True\n", 195 | " break\n", 196 | " if not mismatched:\n", 197 | " occurrences.append(i)\n", 198 | " skip_gs = p_bm.match_skip()\n", 199 | " shift = max(shift, skip_gs)\n", 200 | " i += shift\n", 201 | " return occurrences" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 3, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "def approximate_match(p, t, n):\n", 213 | " segment_length = int(round(len(p) / (n+1)))\n", 214 | " all_matches = set()\n", 215 | " for i in range(n+1):\n", 216 | " start = i*segment_length\n", 217 | " end = min((i+1)*segment_length, len(p))\n", 218 | " p_bm = BoyerMoore(p[start:end], alphabet='ACGT')\n", 219 | " matches = boyer_moore(p[start:end], p_bm, t)\n", 220 | " # Extend matching segments to see if whole p matches\n", 221 
| " for m in matches:\n", 222 | " if m < start or m-start+len(p) > len(t):\n", 223 | " continue\n", 224 | " mismatches = 0\n", 225 | " for j in range(0, start):\n", 226 | " if not p[j] == t[m-start+j]:\n", 227 | " mismatches += 1\n", 228 | " if mismatches > n:\n", 229 | " break\n", 230 | " for j in range(end, len(p)):\n", 231 | " if not p[j] == t[m-start+j]:\n", 232 | " mismatches += 1\n", 233 | " if mismatches > n:\n", 234 | " break\n", 235 | " if mismatches <= n:\n", 236 | " all_matches.add(m - start)\n", 237 | " return list(all_matches)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 4, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "[0, 5]\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "p = 'AACTTG'\n", 257 | "t = 'CACTTAATTTG'\n", 258 | "print(approximate_match(p, t, 2))" 259 | ] 260 | } 261 | ], 262 | "metadata": { 263 | "kernelspec": { 264 | "display_name": "Python 2", 265 | "language": "python", 266 | "name": "python2" 267 | }, 268 | "language_info": { 269 | "codemirror_mode": { 270 | "name": "ipython", 271 | "version": 2 272 | }, 273 | "file_extension": ".py", 274 | "mimetype": "text/x-python", 275 | "name": "python", 276 | "nbconvert_exporter": "python", 277 | "pygments_lexer": "ipython2", 278 | "version": "2.7.11" 279 | } 280 | }, 281 | "nbformat": 4, 282 | "nbformat_minor": 0 283 | } 284 | -------------------------------------------------------------------------------- /3.01_EditDistanceDP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def editDistRecursive(x, y):\n", 12 | " # This implementation is very slow\n", 13 | " if len(x) == 0:\n", 14 | " return len(y)\n", 15 | " elif len(y) == 0:\n", 16 | " return 
len(x)\n", 17 | " else:\n", 18 | " distHor = editDistRecursive(x[:-1], y) + 1\n", 19 | " distVer = editDistRecursive(x, y[:-1]) + 1\n", 20 | " if x[-1] == y[-1]:\n", 21 | " distDiag = editDistRecursive(x[:-1], y[:-1])\n", 22 | " else:\n", 23 | " distDiag = editDistRecursive(x[:-1], y[:-1]) + 1\n", 24 | " return min(distHor, distVer, distDiag)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "def editDistance(x, y):\n", 36 | " # Create distance matrix\n", 37 | " D = []\n", 38 | " for i in range(len(x)+1):\n", 39 | " D.append([0]*(len(y)+1))\n", 40 | " # Initialize first row and column of matrix\n", 41 | " for i in range(len(x)+1):\n", 42 | " D[i][0] = i\n", 43 | " for i in range(len(y)+1):\n", 44 | " D[0][i] = i\n", 45 | " # Fill in the rest of the matrix\n", 46 | " for i in range(1, len(x)+1):\n", 47 | " for j in range(1, len(y)+1):\n", 48 | " distHor = D[i][j-1] + 1\n", 49 | " distVer = D[i-1][j] + 1\n", 50 | " if x[i-1] == y[j-1]:\n", 51 | " distDiag = D[i-1][j-1]\n", 52 | " else:\n", 53 | " distDiag = D[i-1][j-1] + 1\n", 54 | " D[i][j] = min(distHor, distVer, distDiag)\n", 55 | " # Edit distance is the value in the bottom right corner of the matrix\n", 56 | " return D[-1][-1]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "Wall time: 20.3 s\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "%%time\n", 76 | "x = 'shake spea'\n", 77 | "y = 'Shakespear'\n", 78 | "editDistRecursive(x, y)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Wall time: 1e+03 µs\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | 
"%%time\n", 98 | "x = 'shake spea'\n", 99 | "y = 'Shakespear'\n", 100 | "editDistance(x, y)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.11" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /3.02_GlobalAlignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "alphabet = ['A', 'C', 'G', 'T']\n", 12 | "score = [[0, 4, 2, 4, 8],\n", 13 | " [4, 0, 4, 2, 8],\n", 14 | " [2, 4, 0, 4, 8],\n", 15 | " [4, 2, 4, 0, 8],\n", 16 | " [8, 8, 8, 8, 8]]" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "0" 30 | ] 31 | }, 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "# converts from character to its offset in list alphabet\n", 39 | "alphabet.index('A')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "2" 53 | ] 54 | }, 55 | "execution_count": 3, 56 
| "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "alphabet.index('G')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "4" 75 | ] 76 | }, 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "# penalty associated with A (from X) mismatching with T (from Y)\n", 84 | "score[alphabet.index('A')][alphabet.index('T')]" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "8" 98 | ] 99 | }, 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# penalty associated with C (from X) being deleted in Y\n", 107 | "score[alphabet.index('C')][-1]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 12, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "def globalAlignment(x, y):\n", 119 | " # Create distance matrix\n", 120 | " D = []\n", 121 | " for i in range(len(x)+1):\n", 122 | " D.append([0] * (len(y)+1))\n", 123 | " \n", 124 | " # Initialize first column\n", 125 | " for i in range(1, len(x)+1):\n", 126 | " D[i][0] = D[i-1][0] + score[alphabet.index(x[i-1])][-1]\n", 127 | "\n", 128 | " # Initialize first row\n", 129 | " for j in range(1,len(y)+1):\n", 130 | " D[0][j] = D[0][j-1] + score[-1][alphabet.index(y[j-1])]\n", 131 | " \n", 132 | " # Fill rest of the matrix\n", 133 | " for i in range(1, len(x)+1):\n", 134 | " for j in range(1, len(y)+1):\n", 135 | " distHor = D[i][j-1] + score[-1][alphabet.index(y[j-1])]\n", 136 | " distVer = D[i-1][j] + score[alphabet.index(x[i-1])][-1]\n", 137 | " distDiag = D[i-1][j-1] + 
score[alphabet.index(x[i-1])][alphabet.index(y[j-1])]\n", 138 | " D[i][j] = min(distHor, distVer, distDiag)\n", 139 | " \n", 140 | " return D[-1][-1] # return value in bottom right corner" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 13, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "12\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "x = 'TATGTCATGC'\n", 160 | "y = 'TATGGCAGC'\n", 161 | "print(globalAlignment(x,y))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 2", 186 | "language": "python", 187 | "name": "python2" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 2 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython2", 199 | "version": "2.7.11" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 0 204 | } 205 | -------------------------------------------------------------------------------- /3.03_FindingOverlaps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def overlap(a, b, min_length=3):\n", 12 | " \"\"\" Return length of longest suffix of 'a' matching\n", 13 | " a prefix of 'b' that is at least 'min_length'\n", 14 | " characters long. 
If no such overlap exists,\n", 15 | " return 0. \"\"\"\n", 16 | " start = 0 # start all the way at the left\n", 17 | " while True:\n", 18 | " start = a.find(b[:min_length], start) # look for b's prefix in a\n", 19 | " if start == -1: # no more occurrences to right\n", 20 | " return 0\n", 21 | " # found occurrence; check for full suffix/prefix match\n", 22 | " if b.startswith(a[start:]):\n", 23 | " return len(a)-start\n", 24 | " start += 1 # move just past previous match" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "3" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "overlap('TTACGT', 'CGTGTGC')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "0" 60 | ] 61 | }, 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "overlap('TTACGT', 'GTGTGC')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 2", 84 | "language": "python", 85 | "name": "python2" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 2 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython2", 97 | "version": "2.7.11" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 0 102 | } 103 | -------------------------------------------------------------------------------- /3.04_FindingAllOverlaps.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def overlap(a, b, min_length=3):\n", 12 | " \"\"\" Return length of longest suffix of 'a' matching\n", 13 | " a prefix of 'b' that is at least 'min_length'\n", 14 | " characters long. If no such overlap exists,\n", 15 | " return 0. \"\"\"\n", 16 | " start = 0 # start all the way at the left\n", 17 | " while True:\n", 18 | " start = a.find(b[:min_length], start) # look for b's prefix in a\n", 19 | " if start == -1: # no more occurrences to right\n", 20 | " return 0\n", 21 | " # found occurrence; check for full suffix/prefix match\n", 22 | " if b.startswith(a[start:]):\n", 23 | " return len(a)-start\n", 24 | " start += 1 # move just past previous match" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "from itertools import permutations\n", 47 | "\n", 48 | "list(permutations([1,2,3], 2))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "def naive_overlap_map(reads, k):\n", 60 | " olaps = {}\n", 61 | " for a, b in permutations(reads, 2):\n", 62 | " olen = overlap(a, b, min_length=k)\n", 63 | " if olen > 0:\n", 64 | " olaps[(a, b)] = olen\n", 65 | " return olaps" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | 
"{('TTCACGGA', 'ACGGATC'): 5, ('ACGGATC', 'GATCAAGT'): 4}\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "reads = ['ACGGATC', 'GATCAAGT', 'TTCACGGA']\n", 85 | "print(naive_overlap_map(reads, 3))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | } 97 | ], 98 | "metadata": { 99 | "kernelspec": { 100 | "display_name": "Python 2", 101 | "language": "python", 102 | "name": "python2" 103 | }, 104 | "language_info": { 105 | "codemirror_mode": { 106 | "name": "ipython", 107 | "version": 2 108 | }, 109 | "file_extension": ".py", 110 | "mimetype": "text/x-python", 111 | "name": "python", 112 | "nbconvert_exporter": "python", 113 | "pygments_lexer": "ipython2", 114 | "version": "2.7.11" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 0 119 | } 120 | -------------------------------------------------------------------------------- /4.01_ShortestCommonSuperstring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def overlap(a, b, min_length=3):\n", 12 | " \"\"\" Return length of longest suffix of 'a' matching\n", 13 | " a prefix of 'b' that is at least 'min_length'\n", 14 | " characters long. If no such overlap exists,\n", 15 | " return 0. 
\"\"\"\n", 16 | " start = 0 # start all the way at the left\n", 17 | " while True:\n", 18 | " start = a.find(b[:min_length], start) # look for b's prefix in a\n", 19 | " if start == -1: # no more occurrences to right\n", 20 | " return 0\n", 21 | " # found occurrence; check for full suffix/prefix match\n", 22 | " if b.startswith(a[start:]):\n", 23 | " return len(a)-start\n", 24 | " start += 1 # move just past previous match" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import itertools\n", 36 | "\n", 37 | "def scs(ss):\n", 38 | " \"\"\" Returns shortest common superstring of given strings,\n", 39 | " assuming no string is a strict substring of another \"\"\"\n", 40 | " shortest_sup = None\n", 41 | " for ssperm in itertools.permutations(ss):\n", 42 | " sup = ssperm[0]\n", 43 | " for i in range(len(ss)-1):\n", 44 | " olen = overlap(ssperm[i], ssperm[i+1], min_length=1)\n", 45 | " sup += ssperm[i+1][olen:]\n", 46 | " if shortest_sup is None or len(sup) < len(shortest_sup):\n", 47 | " shortest_sup = sup\n", 48 | " return shortest_sup" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "'ACGGATGAGCGAGCGGA'" 62 | ] 63 | }, 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "scs(['ACGGATGAGC', 'GAGCGGA', 'GAGCGAG'])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 2", 86 | "language": "python", 87 | "name": "python2" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 2 93 | }, 94 | "file_extension": 
".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython2", 99 | "version": "2.7.11" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 0 104 | } 105 | -------------------------------------------------------------------------------- /4.02_GreedySCS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def overlap(a, b, min_length=3):\n", 12 | " \"\"\" Return length of longest suffix of 'a' matching\n", 13 | " a prefix of 'b' that is at least 'min_length'\n", 14 | " characters long. If no such overlap exists,\n", 15 | " return 0. \"\"\"\n", 16 | " start = 0 # start all the way at the left\n", 17 | " while True:\n", 18 | " start = a.find(b[:min_length], start) # look for b's prefix in a\n", 19 | " if start == -1: # no more occurrences to right\n", 20 | " return 0\n", 21 | " # found occurrence; check for full suffix/prefix match\n", 22 | " if b.startswith(a[start:]):\n", 23 | " return len(a)-start\n", 24 | " start += 1 # move just past previous match" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 8, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import itertools\n", 36 | "\n", 37 | "def scs(ss):\n", 38 | " \"\"\" Returns shortest common superstring of given strings,\n", 39 | " assuming no string is a strict substring of another \"\"\"\n", 40 | " shortest_sup = None\n", 41 | " for ssperm in itertools.permutations(ss):\n", 42 | " sup = ssperm[0] # superstring starts as first string\n", 43 | " for i in range(len(ss)-1):\n", 44 | " # overlap adjacent strings A and B in the permutation\n", 45 | " olen = overlap(ssperm[i], ssperm[i+1], min_length=1)\n", 46 | " # add non-overlapping portion of B to superstring\n", 47 | " 
#sup += ssperm[i+1][-(len(ssperm[i+1])-olen):]\n", 48 | " sup += ssperm[i+1][olen:]\n", 49 | " if shortest_sup is None or len(sup) < len(shortest_sup):\n", 50 | " shortest_sup = sup # found shorter superstring\n", 51 | " return shortest_sup # return shortest" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 9, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def pick_maximal_overlap(reads, k):\n", 63 | " \"\"\" Return a pair of reads from the list with a\n", 64 | " maximal suffix/prefix overlap >= k. Returns\n", 65 | " overlap length 0 if there are no such overlaps.\"\"\"\n", 66 | " reada, readb = None, None\n", 67 | " best_olen = 0\n", 68 | " for a, b in itertools.permutations(reads, 2):\n", 69 | " olen = overlap(a, b, min_length=k)\n", 70 | " if olen > best_olen:\n", 71 | " reada, readb = a, b\n", 72 | " best_olen = olen\n", 73 | " return reada, readb, best_olen" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 10, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "def greedy_scs(reads, k):\n", 85 | " \"\"\" Greedy shortest-common-superstring merge.\n", 86 | " Repeat until no edges (overlaps of length >= k)\n", 87 | " remain. 
\"\"\"\n", 88 | " read_a, read_b, olen = pick_maximal_overlap(reads, k)\n", 89 | " while olen > 0:\n", 90 | " reads.remove(read_a)\n", 91 | " reads.remove(read_b)\n", 92 | " reads.append(read_a + read_b[olen:])\n", 93 | " read_a, read_b, olen = pick_maximal_overlap(reads, k)\n", 94 | " return ''.join(reads)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 11, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "'CABCA'" 108 | ] 109 | }, 110 | "execution_count": 11, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "greedy_scs(['ABC', 'BCA', 'CAB'], 2)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 12, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "'CDBCABCDA'" 130 | ] 131 | }, 132 | "execution_count": 12, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 13, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "'ABCDBCDA'" 152 | ] 153 | }, 154 | "execution_count": 13, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "scs(['ABCD', 'CDBC', 'BCDA'])" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 2", 176 | "language": "python", 177 | "name": "python2" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 2 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": 
"text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython2", 189 | "version": "2.7.11" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 0 194 | } 195 | -------------------------------------------------------------------------------- /4.03_DeBruijn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def de_bruijn_ize(st, k):\n", 12 | " \"\"\" Return a list holding, for each k-mer, its left\n", 13 | " k-1-mer and its right k-1-mer in a pair \"\"\"\n", 14 | " edges = []\n", 15 | " nodes = set()\n", 16 | " for i in range(len(st) - k + 1):\n", 17 | " edges.append((st[i:i+k-1], st[i+1:i+k]))\n", 18 | " nodes.add(st[i:i+k-1])\n", 19 | " nodes.add(st[i+1:i+k])\n", 20 | " return nodes, edges" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 9, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "nodes, edges = de_bruijn_ize(\"ACGCGTCG\", 3)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 10, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "{'AC', 'CG', 'GC', 'GT', 'TC'}" 45 | ] 46 | }, 47 | "execution_count": 10, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "nodes" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 17, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "[('AC', 'CG'),\n", 67 | " ('CG', 'GC'),\n", 68 | " ('GC', 'CG'),\n", 69 | " ('CG', 'GT'),\n", 70 | " ('GT', 'TC'),\n", 71 | " ('TC', 'CG')]" 72 | ] 73 | }, 74 | "execution_count": 17, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | 
], 79 | "source": [ 80 | "edges" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 18, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "def visualize_de_bruijn(st, k):\n", 92 | " \"\"\" Visualize a directed multigraph using graphviz \"\"\"\n", 93 | " nodes, edges = de_bruijn_ize(st, k)\n", 94 | " dot_str = 'digraph \"DeBruijn graph\" {\\n'\n", 95 | " for node in nodes:\n", 96 | " dot_str += ' %s [label=\"%s\"] ;\\n' % (node, node)\n", 97 | " for src, dst in edges:\n", 98 | " dot_str += ' %s -> %s ;\\n' % (src, dst)\n", 99 | " return dot_str + '}\\n'" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 20, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "# might have to do this first:\n", 111 | "# %install_ext https://raw.github.com/cjdrake/ipython-magic/master/gvmagic.py\n", 112 | "%reload_ext gvmagic" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 21, 118 | "metadata": { 119 | "collapsed": false, 120 | "scrolled": true 121 | }, 122 | "outputs": [ 123 | { 124 | "ename": "WindowsError", 125 | "evalue": "[Error 2] The system cannot find the file specified", 126 | "output_type": "error", 127 | "traceback": [ 128 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 129 | "\u001b[1;31mWindowsError\u001b[0m Traceback (most recent call last)", 130 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mget_ipython\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmagic\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mu'dotstr visualize_de_bruijn(\"ACGCGTCG\", 3)'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 131 | "\u001b[1;32mC:\\Users\\Siddhant\\Anaconda2\\lib\\site-packages\\IPython\\core\\interactiveshell.pyc\u001b[0m in 
\u001b[0;36mmagic\u001b[1;34m(self, arg_s)\u001b[0m\n\u001b[0;32m 2334\u001b[0m \u001b[0mmagic_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmagic_arg_s\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marg_s\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpartition\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m' '\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2335\u001b[0m \u001b[0mmagic_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmagic_name\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprefilter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mESC_MAGIC\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2336\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmagic_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmagic_arg_s\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2337\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2338\u001b[0m \u001b[1;31m#-------------------------------------------------------------------------\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 132 | "\u001b[1;32mC:\\Users\\Siddhant\\Anaconda2\\lib\\site-packages\\IPython\\core\\interactiveshell.pyc\u001b[0m in \u001b[0;36mrun_line_magic\u001b[1;34m(self, magic_name, line)\u001b[0m\n\u001b[0;32m 2255\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'local_ns'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getframe\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstack_depth\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf_locals\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2256\u001b[0m \u001b[1;32mwith\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2257\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2258\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2259\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 133 | "\u001b[1;32mC:\\Users\\Siddhant\\.ipython\\extensions\\gvmagic.py\u001b[0m in \u001b[0;36mdotstr\u001b[1;34m(self, line)\u001b[0m\n", 134 | "\u001b[1;32mC:\\Users\\Siddhant\\Anaconda2\\lib\\site-packages\\IPython\\core\\magic.pyc\u001b[0m in \u001b[0;36m\u001b[1;34m(f, *a, **k)\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[1;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 192\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 193\u001b[1;33m \u001b[0mcall\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 194\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 135 | 
"\u001b[1;32mC:\\Users\\Siddhant\\.ipython\\extensions\\gvmagic.py\u001b[0m in \u001b[0;36mdotstr\u001b[1;34m(self, line)\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mline_magic\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdotstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 50\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_from_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'dot'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 51\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mline_magic\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 136 | "\u001b[1;32mC:\\Users\\Siddhant\\.ipython\\extensions\\gvmagic.py\u001b[0m in \u001b[0;36m_from_str\u001b[1;34m(self, line, layout_engine)\u001b[0m\n\u001b[0;32m 149\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_from_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mline\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlayout_engine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 150\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshell\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mev\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 151\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrun_graphviz\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlayout_engine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 152\u001b[0m \u001b[1;32mif\u001b[0m 
\u001b[0mdata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[0mdisplay_svg\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mraw\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 137 | "\u001b[1;32mC:\\Users\\Siddhant\\.ipython\\extensions\\gvmagic.py\u001b[0m in \u001b[0;36mrun_graphviz\u001b[1;34m(s, layout_engine)\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[0mcmd\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'dot'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'-Tsvg'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'-K'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlayout_engine\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 30\u001b[1;33m \u001b[0mdot\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcmd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstdin\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mPIPE\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstdout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mPIPE\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mPIPE\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 31\u001b[0m \u001b[0mstdoutdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstderrdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdot\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'utf-8'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdot\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 138 | 
"\u001b[1;32mC:\\Users\\Siddhant\\Anaconda2\\lib\\subprocess.pyc\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags)\u001b[0m\n\u001b[0;32m 708\u001b[0m \u001b[0mp2cread\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mp2cwrite\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[0mc2pread\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mc2pwrite\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 710\u001b[1;33m errread, errwrite)\n\u001b[0m\u001b[0;32m 711\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 712\u001b[0m \u001b[1;31m# Preserve original exception in case os.close raises.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 139 | "\u001b[1;32mC:\\Users\\Siddhant\\Anaconda2\\lib\\subprocess.pyc\u001b[0m in \u001b[0;36m_execute_child\u001b[1;34m(self, args, executable, preexec_fn, close_fds, cwd, env, universal_newlines, startupinfo, creationflags, shell, to_close, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite)\u001b[0m\n\u001b[0;32m 956\u001b[0m \u001b[0menv\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[0mcwd\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 958\u001b[1;33m startupinfo)\n\u001b[0m\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpywintypes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 960\u001b[0m \u001b[1;31m# Translate pywintypes.error to WindowsError, which is\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 140 | "\u001b[1;31mWindowsError\u001b[0m: [Error 2] The system cannot find the file specified" 141 | ] 142 | } 143 | ], 144 | 
"source": [ 145 | "%dotstr visualize_de_bruijn(\"ACGCGTCG\", 3)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 22, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "nodes, edges = de_bruijn_ize('a_long_long_long_time',5)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 23, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "{'_lon',\n", 170 | " '_tim',\n", 171 | " 'a_lo',\n", 172 | " 'g_lo',\n", 173 | " 'g_ti',\n", 174 | " 'long',\n", 175 | " 'ng_l',\n", 176 | " 'ng_t',\n", 177 | " 'ong_',\n", 178 | " 'time'}" 179 | ] 180 | }, 181 | "execution_count": 23, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "nodes" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 24, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "[('a_lo', '_lon'),\n", 201 | " ('_lon', 'long'),\n", 202 | " ('long', 'ong_'),\n", 203 | " ('ong_', 'ng_l'),\n", 204 | " ('ng_l', 'g_lo'),\n", 205 | " ('g_lo', '_lon'),\n", 206 | " ('_lon', 'long'),\n", 207 | " ('long', 'ong_'),\n", 208 | " ('ong_', 'ng_l'),\n", 209 | " ('ng_l', 'g_lo'),\n", 210 | " ('g_lo', '_lon'),\n", 211 | " ('_lon', 'long'),\n", 212 | " ('long', 'ong_'),\n", 213 | " ('ong_', 'ng_t'),\n", 214 | " ('ng_t', 'g_ti'),\n", 215 | " ('g_ti', '_tim'),\n", 216 | " ('_tim', 'time')]" 217 | ] 218 | }, 219 | "execution_count": 24, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "edges" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": { 232 | "collapsed": true 233 | }, 234 | "outputs": [], 235 | "source": [] 236 | } 237 | ], 238 | "metadata": { 239 | "kernelspec": { 240 | "display_name": "Python 2", 241 | 
"language": "python", 242 | "name": "python2" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 2 248 | }, 249 | "file_extension": ".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython2", 254 | "version": "2.7.11" 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 0 259 | } 260 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ads1-notebooks 2 | Copies of notebooks used in the practical sessions for Algorithms for DNA Sequencing 3 | # Algorithms-for-DNA-sequencing 4 | -------------------------------------------------------------------------------- /bm_preproc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """bm_preproc.py: Boyer-Moore preprocessing.""" 4 | 5 | __author__ = "Ben Langmead" 6 | 7 | import unittest 8 | 9 | 10 | def z_array(s): 11 | """ Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s """ 12 | assert len(s) > 1 13 | z = [len(s)] + [0] * (len(s)-1) 14 | 15 | # Initial comparison of s[1:] with prefix 16 | for i in range(1, len(s)): 17 | if s[i] == s[i-1]: 18 | z[1] += 1 19 | else: 20 | break 21 | 22 | r, l = 0, 0 23 | if z[1] > 0: 24 | r, l = z[1], 1 25 | 26 | for k in range(2, len(s)): 27 | assert z[k] == 0 28 | if k > r: 29 | # Case 1 30 | for i in range(k, len(s)): 31 | if s[i] == s[i-k]: 32 | z[k] += 1 33 | else: 34 | break 35 | r, l = k + z[k] - 1, k 36 | else: 37 | # Case 2 38 | # Calculate length of beta 39 | nbeta = r - k + 1 40 | zkp = z[k - l] 41 | if nbeta > zkp: 42 | # Case 2a: zkp wins 43 | z[k] = zkp 44 | else: 45 | # Case 2b: Compare characters just past r 46 | nmatch = 0 47 | for i in range(r+1, len(s)): 48 | if s[i] == s[i - k]: 49 | nmatch += 1 50 | else: 51 | break 52 | l, r = k, r + 
nmatch 53 | z[k] = r - k + 1 54 | return z 55 | 56 | 57 | def n_array(s): 58 | """ Compile the N array (Gusfield theorem 2.2.2) from the Z array """ 59 | return z_array(s[::-1])[::-1] 60 | 61 | 62 | def big_l_prime_array(p, n): 63 | """ Compile L' array (Gusfield theorem 2.2.2) using p and N array. 64 | L'[i] = largest index j less than n such that N[j] = |P[i:]| """ 65 | lp = [0] * len(p) 66 | for j in range(len(p)-1): 67 | i = len(p) - n[j] 68 | if i < len(p): 69 | lp[i] = j + 1 70 | return lp 71 | 72 | 73 | def big_l_array(p, lp): 74 | """ Compile L array (Gusfield theorem 2.2.2) using p and L' array. 75 | L[i] = largest index j less than n such that N[j] >= |P[i:]| """ 76 | l = [0] * len(p) 77 | l[1] = lp[1] 78 | for i in range(2, len(p)): 79 | l[i] = max(l[i-1], lp[i]) 80 | return l 81 | 82 | 83 | def small_l_prime_array(n): 84 | """ Compile lp' array (Gusfield theorem 2.2.4) using N array. """ 85 | small_lp = [0] * len(n) 86 | for i in range(len(n)): 87 | if n[i] == i+1: # prefix matching a suffix 88 | small_lp[len(n)-i-1] = i+1 89 | for i in range(len(n)-2, -1, -1): # "smear" them out to the left 90 | if small_lp[i] == 0: 91 | small_lp[i] = small_lp[i+1] 92 | return small_lp 93 | 94 | 95 | def good_suffix_table(p): 96 | """ Return tables needed to apply good suffix rule. """ 97 | n = n_array(p) 98 | lp = big_l_prime_array(p, n) 99 | return lp, big_l_array(p, lp), small_l_prime_array(n) 100 | 101 | 102 | def good_suffix_mismatch(i, big_l_prime, small_l_prime): 103 | """ Given a mismatch at offset i, and given L/L' and l' arrays, 104 | return amount to shift as determined by good suffix rule. 
""" 105 | length = len(big_l_prime) 106 | assert i < length 107 | if i == length - 1: 108 | return 0 109 | i += 1 # i points to leftmost matching position of P 110 | if big_l_prime[i] > 0: 111 | return length - big_l_prime[i] 112 | return length - small_l_prime[i] 113 | 114 | 115 | def good_suffix_match(small_l_prime): 116 | """ Given a full match of P to T, return amount to shift as 117 | determined by good suffix rule. """ 118 | return len(small_l_prime) - small_l_prime[1] 119 | 120 | 121 | def dense_bad_char_tab(p, amap): 122 | """ Given pattern string and list with ordered alphabet characters, create 123 | and return a dense bad character table. Table is indexed by offset 124 | then by character. """ 125 | tab = [] 126 | nxt = [0] * len(amap) 127 | for i in range(0, len(p)): 128 | c = p[i] 129 | assert c in amap 130 | tab.append(nxt[:]) 131 | nxt[amap[c]] = i+1 132 | return tab 133 | 134 | 135 | class BoyerMoore(object): 136 | """ Encapsulates pattern and associated Boyer-Moore preprocessing. """ 137 | 138 | def __init__(self, p, alphabet='ACGT'): 139 | # Create map from alphabet characters to integers 140 | self.amap = {alphabet[i]: i for i in range(len(alphabet))} 141 | # Make bad character rule table 142 | self.bad_char = dense_bad_char_tab(p, self.amap) 143 | # Create good suffix rule table 144 | _, self.big_l, self.small_l_prime = good_suffix_table(p) 145 | 146 | def bad_character_rule(self, i, c): 147 | """ Return # skips given by bad character rule at offset i """ 148 | assert c in self.amap 149 | assert i < len(self.bad_char) 150 | ci = self.amap[c] 151 | return i - (self.bad_char[i][ci]-1) 152 | 153 | def good_suffix_rule(self, i): 154 | """ Given a mismatch at offset i, return amount to shift 155 | as determined by (weak) good suffix rule. 
""" 156 | length = len(self.big_l) 157 | assert i < length 158 | if i == length - 1: 159 | return 0 160 | i += 1 # i points to leftmost matching position of P 161 | if self.big_l[i] > 0: 162 | return length - self.big_l[i] 163 | return length - self.small_l_prime[i] 164 | 165 | def match_skip(self): 166 | """ Return amount to shift in case where P matches T """ 167 | return len(self.small_l_prime) - self.small_l_prime[1] 168 | 169 | 170 | class TestBoyerMoorePreproc(unittest.TestCase): 171 | 172 | def test_z_1(self): 173 | s = 'abb' 174 | # -00 175 | z = z_array(s) 176 | self.assertEqual([3, 0, 0], z) 177 | 178 | def test_z_2(self): 179 | s = 'abababab' 180 | # 00604020 181 | z = z_array(s) 182 | self.assertEqual([8, 0, 6, 0, 4, 0, 2, 0], z) 183 | 184 | def test_z_3(self): 185 | s = 'abababab' 186 | # 00604020 187 | z = z_array(s) 188 | self.assertEqual([8, 0, 6, 0, 4, 0, 2, 0], z) 189 | 190 | def test_n_1(self): 191 | s = 'abb' 192 | # 01- 193 | n = n_array(s) 194 | self.assertEqual([0, 1, 3], n) 195 | 196 | def test_n_2(self): 197 | s = 'abracadabra' 198 | # 1004010100- 199 | n = n_array(s) 200 | self.assertEqual([1, 0, 0, 4, 0, 1, 0, 1, 0, 0, 11], n) 201 | 202 | def test_n_3(self): 203 | s = 'abababab' 204 | # 0204060- 205 | n = n_array(s) 206 | self.assertEqual([0, 2, 0, 4, 0, 6, 0, 8], n) 207 | 208 | def test_big_l_prime_1(self): 209 | s = 'abb' 210 | # 001 211 | big_l_prime = big_l_prime_array(s, n_array(s)) 212 | self.assertEqual([0, 0, 2], big_l_prime) 213 | 214 | def test_big_l_prime_2(self): 215 | s = 'abracadabra' 216 | # 01234567890 217 | # L' 00000003007 218 | # L 00000003337 219 | big_l_prime = big_l_prime_array(s, n_array(s)) 220 | self.assertEqual([0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 8], big_l_prime) 221 | 222 | def test_small_l_prime_1(self): 223 | s = 'abracadabra' 224 | # N 1004010100- 225 | # l' 1 226 | # l' 4 227 | # l' 44444444111 228 | small_l_prime = small_l_prime_array(n_array(s)) 229 | self.assertEqual([11, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1], 
small_l_prime) 230 | 231 | def test_good_suffix_match_mismatch_1(self): 232 | p = 'GGTAGGT' 233 | big_l_prime, big_l, small_l_prime = good_suffix_table(p) 234 | self.assertEqual([0, 0, 0, 0, 3, 0, 0], big_l_prime) 235 | self.assertEqual([0, 0, 0, 0, 3, 3, 3], big_l) 236 | self.assertEqual([7, 3, 3, 3, 3, 0, 0], small_l_prime) 237 | self.assertEqual(0, good_suffix_mismatch(6, big_l_prime, small_l_prime)) 238 | self.assertEqual(0, good_suffix_mismatch(6, big_l, small_l_prime)) 239 | # t: xT 240 | # p: GGTAGGT 241 | # L': -000300 242 | # L: -000333 243 | self.assertEqual(7, good_suffix_mismatch(5, big_l_prime, small_l_prime)) 244 | self.assertEqual(4, good_suffix_mismatch(5, big_l, small_l_prime)) 245 | # t: xGT 246 | # p: GGTAGGT 247 | # L': -000300 248 | # L: -000333 249 | self.assertEqual(7, good_suffix_mismatch(4, big_l_prime, small_l_prime)) 250 | self.assertEqual(4, good_suffix_mismatch(4, big_l, small_l_prime)) 251 | # t: xGGT 252 | # p: GGTAGGT 253 | # L': -000300 254 | # L: -000333 255 | self.assertEqual(4, good_suffix_mismatch(3, big_l_prime, small_l_prime)) 256 | self.assertEqual(4, good_suffix_mismatch(3, big_l, small_l_prime)) 257 | # t: xAGGT 258 | # p: GGTAGGT 259 | # L': -000300 260 | # L: -000333 261 | self.assertEqual(4, good_suffix_mismatch(2, big_l_prime, small_l_prime)) 262 | self.assertEqual(4, good_suffix_mismatch(2, big_l, small_l_prime)) 263 | # t: xTAGGT 264 | # p: GGTAGGT 265 | # L': -000300 266 | # L: -000333 267 | self.assertEqual(4, good_suffix_mismatch(1, big_l_prime, small_l_prime)) 268 | self.assertEqual(4, good_suffix_mismatch(1, big_l, small_l_prime)) 269 | # t: xGTAGGT 270 | # p: GGTAGGT 271 | # L': -000300 272 | # L: -000333 273 | self.assertEqual(4, good_suffix_mismatch(0, big_l_prime, small_l_prime)) 274 | self.assertEqual(4, good_suffix_mismatch(0, big_l, small_l_prime)) 275 | 276 | def test_good_suffix_table_1(self): 277 | s = 'abb' 278 | # 001 279 | big_l_prime, big_l, small_l_prime = good_suffix_table(s) 280 | 
self.assertEqual([0, 0, 2], big_l_prime) 281 | self.assertEqual([0, 0, 2], big_l) 282 | self.assertEqual([3, 0, 0], small_l_prime) 283 | 284 | def test_good_suffix_table_2(self): 285 | s = 'abracadabra' 286 | # 01234567890 287 | # L' 00000003007 288 | # L 00000003337 289 | # l' -4444444111 290 | big_l_prime, big_l, small_l_prime = good_suffix_table(s) 291 | self.assertEqual([0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 8], big_l_prime) 292 | self.assertEqual([0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 8], big_l) 293 | self.assertEqual([11, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1], small_l_prime) 294 | 295 | if __name__ == '__main__': 296 | unittest.main() 297 | -------------------------------------------------------------------------------- /bm_preproc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sidsriv/Algorithms-for-DNA-sequencing/2f93893379c067c19e08d69385a34acdd326852e/bm_preproc.pyc -------------------------------------------------------------------------------- /homework-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def reverseComplement(s):\n", 12 | " complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}\n", 13 | " t = ''\n", 14 | " for base in s:\n", 15 | " t = complement[base] + t\n", 16 | " return t" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "def match(s1, s2):\n", 28 | " if not len(s1) == len(s2):\n", 29 | " return False\n", 30 | " for i in range(0, len(s1)):\n", 31 | " if not s1[i] == s2[i]:\n", 32 | " return False\n", 33 | " return True" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | 
"source": [ 44 | "def readFastq(filename):\n", 45 | " sequences = []\n", 46 | " qualities = []\n", 47 | " with open(filename) as fh:\n", 48 | " while True:\n", 49 | " fh.readline() # skip name line\n", 50 | " seq = fh.readline().rstrip() # read base sequence\n", 51 | " fh.readline() # skip placeholder line\n", 52 | " qual = fh.readline().rstrip() #base quality line\n", 53 | " if len(seq) == 0:\n", 54 | " break\n", 55 | " sequences.append(seq)\n", 56 | " qualities.append(qual)\n", 57 | " return sequences, qualities" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "def readGenome(filename):\n", 69 | " genome = ''\n", 70 | " with open(filename, 'r') as f:\n", 71 | " for line in f:\n", 72 | " # ignore header line with genome information\n", 73 | " if not line[0] == '>':\n", 74 | " genome += line.rstrip()\n", 75 | " return genome" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "def naive(p, t):\n", 87 | " occurrences = []\n", 88 | " for i in range(len(t) - len(p) + 1):\n", 89 | " match = True\n", 90 | " for j in range(len(p)):\n", 91 | " if t[i+j] != p[j]:\n", 92 | " match = False\n", 93 | " break\n", 94 | " if match:\n", 95 | " occurrences.append(i)\n", 96 | " return occurrences" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "lambda_gene = readGenome('lambda_virus.fa')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "[122, 302, 383, 555, 729, 921, 2403, 2688, 3163, 3254, 4312, 4646, 4917, 5069, 5162, 5192, 5685, 6315, 
6507, 6585, 7397, 7715, 7822, 7987, 8103, 8121, 8296, 8762, 8986, 9621, 10419, 10836, 11196, 11205, 11919, 12183, 12489, 12994, 13090, 13163, 13346, 13415, 13516, 13562, 13747, 13816, 14384, 14621, 14758, 14953, 15068, 15421, 15611, 16165, 16195, 16234, 16702, 17104, 17131, 17410, 17437, 17776, 17939, 18554, 18736, 18856, 18914, 18955, 19035, 19313, 19470, 19591, 19836, 19950, 19983, 20067, 20492, 20802, 21158, 21243, 22052, 22384, 23507, 23936, 23967, 24598, 25181, 25197, 25230, 25384, 25773, 26196, 26442, 26579, 26656, 27367, 27456, 27525, 27588, 27784, 27963, 28325, 28835, 28979, 30003, 30315, 30919, 31622, 32984, 35148, 35190, 35703, 36481, 36622, 37004, 37046, 37366, 37576, 37589, 38033, 38200, 38482, 38652, 38720, 39675, 39684, 39831, 40407, 40425, 40430, 40801, 40818, 41157, 41190, 41267, 41720, 42266, 43692, 43866, 44111, 44165, 44705, 44738, 44792, 45036, 46864, 47103, 47855, 48304, 48494]\n", 122 | "150\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "print naive('AGGT', lambda_gene)\n", 128 | "print len(naive('AGGT', lambda_gene))" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 9, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "ACCT\n", 143 | "156\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "reverse_complement_AGGT = reverseComplement('AGGT')\n", 149 | "print reverse_complement_AGGT\n", 150 | "print len(naive(reverse_complement_AGGT,lambda_gene))\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 10, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "[46, 77, 90, 732, 1455, 2175, 3274, 3418, 4348, 4463, 5267, 5708, 6075, 6183, 6540, 7026, 7948, 8061, 8199, 8460, 9078, 9252, 9691, 9951, 11082, 11115, 11517, 11583, 12618, 12966, 14107, 14991, 15627, 15967, 16033, 16294, 
16858, 18868, 19136, 19481, 20799, 21440, 21634, 21701, 21902, 21947, 22723, 23047, 23083, 23110, 23205, 23278, 23284, 23478, 23497, 23554, 23692, 23886, 23978, 23986, 24072, 24101, 24750, 24986, 25169, 25254, 25352, 25436, 25577, 25745, 25967, 26132, 26425, 26450, 26482, 26522, 26584, 26650, 26665, 26678, 26692, 26762, 26809, 26983, 26992, 27037, 27091, 27202, 27316, 27682, 28455, 28568, 28921, 28971, 29373, 29980, 30381, 30619, 30638, 30721, 30734, 30755, 31168, 31807, 32193, 32217, 32640, 32703, 33221, 33350, 33687, 33735, 33747, 33800, 34201, 34365, 34405, 34589, 34617, 34686, 34734, 34752, 34830, 35061, 35259, 35533, 35540, 36274, 36281, 36302, 36429, 36491, 36530, 36542, 36603, 36735, 36756, 36878, 37031, 37441, 37765, 37782, 37881, 37915, 37943, 38163, 38286, 38411, 38833, 39606, 39834, 40146, 40505, 40559, 40880, 41016, 41422, 41651, 41714, 42316, 42630, 42650, 42672, 42813, 43409, 43491, 43506, 43527, 43592, 44423, 44537, 44595, 44867, 45150, 45787, 46032, 46407, 46555, 46803, 46842, 46852, 46895, 46951, 47255, 47273, 47286, 47429, 47575, 47698, 47798, 48058, 48124, 48226, 48358, 48377]\n", 165 | "195\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "print naive('TTAA', lambda_gene)\n", 171 | "print len(naive('TTAA', lambda_gene))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 11, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "TTAA\n", 186 | "195\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "reverse_complement_TTAA = reverseComplement('TTAA')\n", 192 | "print reverse_complement_TTAA\n", 193 | "print len(naive(reverse_complement_TTAA,lambda_gene))\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 12, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "[27733, 45382]\n", 
208 | "[26028]\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "print naive('ACTAAGT', lambda_gene)\n", 214 | "complement = reverseComplement('ACTAAGT')\n", 215 | "print naive(complement, lambda_gene)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 13, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "[18005, 23320, 33657, 44806]\n", 230 | "[450, 1908, 2472, 41927, 45369]\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print naive('AGTCGA', lambda_gene)\n", 236 | "complement = reverseComplement('AGTCGA')\n", 237 | "print naive(complement, lambda_gene)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 14, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "def hamming_distance(s1,s2):\n", 249 | " count = 0\n", 250 | " for i in range(len(s1)):\n", 251 | " if s1[i]!=s2[i]:\n", 252 | " count +=1\n", 253 | " return count" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 15, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "[0, 4]\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "def naive_2mm(p, t):\n", 273 | " occurrences = []\n", 274 | " for i in range(len(t) - len(p) + 1):\n", 275 | " subset_dna = t[i:i+len(p)]\n", 276 | " ham_dist = hamming_distance(subset_dna,p)\n", 277 | " if ham_dist <= 2:\n", 278 | " occurrences.append(i)\n", 279 | " return occurrences\n", 280 | "print naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT')" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 16, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "191\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | 
"print len(naive_2mm('TTCAAGCC',lambda_gene))" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 17, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [ 309 | { 310 | "name": "stdout", 311 | "output_type": "stream", 312 | "text": [ 313 | "[49, 282, 299, 302, 380, 1560, 1650, 2235, 2277, 2400, 2562, 2565, 2729, 2823, 3160, 3181, 3946, 4210, 4294, 4309, 4405, 4580, 5069, 5159, 5189, 5231, 5331, 5519, 5737, 5882, 5993, 5996, 6011, 6312, 6522, 6585, 6606, 7316, 7394, 7819, 7904, 7966, 7998, 8534, 8648, 8946, 9339, 9354, 9530, 9842, 9966, 10041, 10250, 10416, 10445, 10484, 10527, 10874, 11193, 11292, 11505, 11568, 11655, 11745, 11838, 12078, 12180, 12222, 12697, 12745, 12819, 12880, 12935, 13011, 13087, 13256, 13415, 13526, 13813, 14259, 15385, 15473, 16192, 17101, 17437, 17755, 17936, 17989, 18016, 18040, 18727, 18853, 18911, 19232, 19263, 19310, 19833, 19929, 19932, 19947, 19980, 20793, 20802, 21305, 21528, 21627, 21684, 22414, 22660, 22670, 22787, 23326, 24063, 24145, 24409, 24595, 24681, 25120, 25139, 25210, 25381, 25384, 25648, 25664, 25773, 25987, 26196, 26208, 26576, 26587, 26653, 26736, 27892, 27967, 28042, 28622, 28840, 28976, 29119, 30029, 30530, 30673, 30902, 31619, 31645, 31682, 31843, 31859, 32069, 33180, 33365, 33715, 33952, 34321, 34421, 34841, 34848, 34956, 35145, 35253, 35289, 35643, 36185, 36687, 36869, 38030, 38197, 38381, 38479, 38737, 39282, 39600, 39681, 39786, 39828, 39954, 40119, 40337, 40508, 40781, 40887, 40890, 40946, 41110, 41225, 41264, 41282, 41324, 41570, 41693, 41717, 41768, 42079, 42082, 42266, 42353, 43039, 43184, 43389, 43662, 43689, 45033, 45727, 45763, 45781, 45790, 46173, 46215, 47028, 47220, 47930, 48101, 48256, 48301, 48411]\n" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "print naive_2mm('AGGAGGTT',lambda_gene)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 18, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [ 328 | { 329 | 
"name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "[56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359]\n", 333 | "19\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "#function for parsing fasta file\n", 339 | "def readGenome(filename):\n", 340 | " genome = ''\n", 341 | " with open(filename, 'r') as f:\n", 342 | " for line in f:\n", 343 | " # ignore header line with genome information\n", 344 | " if not line[0] == '>':\n", 345 | " genome += line.rstrip()\n", 346 | " return genome\n", 347 | "genome = readGenome('chr1.GRCh38.excerpt.fasta')\n", 348 | "genome[:100]\n", 349 | "p = 'GGCGCGGTGGCTCACGCCTGTAAT'\n", 350 | "t = genome\n", 351 | "print naive_2mm(p,t)\n", 352 | "print len(naive_2mm(p,t))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [] 363 | } 364 | ], 365 | "metadata": { 366 | "kernelspec": { 367 | "display_name": "Python 2", 368 | "language": "python", 369 | "name": "python2" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 2 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython2", 381 | "version": "2.7.11" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 0 386 | } 387 | -------------------------------------------------------------------------------- /homework-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "data": { 12 | "text/plain": [ 13 | 
"'TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAGGTGCATAGGTCAACAATACTTGAGCCTAACTCAGTAGATCCTAAAA'" 14 | ] 15 | }, 16 | "execution_count": 6, 17 | "metadata": {}, 18 | "output_type": "execute_result" 19 | } 20 | ], 21 | "source": [ 22 | "#function for parsing fasta file\n", 23 | "def readGenome(filename):\n", 24 | " genome = ''\n", 25 | " with open(filename, 'r') as f:\n", 26 | " for line in f:\n", 27 | " # ignore header line with genome information\n", 28 | " if not line[0] == '>':\n", 29 | " genome += line.rstrip()\n", 30 | " return genome\n", 31 | "genome = readGenome('chr1.GRCh38.excerpt.fasta')\n", 32 | "genome[:100]" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "#function for naive pattern matching algorithm\n", 44 | "def naive(p, t):\n", 45 | " occurrences = []\n", 46 | " for i in range(len(t) - len(p) + 1):\n", 47 | " match = True\n", 48 | " for j in range(len(p)):\n", 49 | " if t[i+j] != p[j]:\n", 50 | " match = False\n", 51 | " break\n", 52 | " if match:\n", 53 | " occurrences.append(i)\n", 54 | " return occurrences" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 10, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "#prepocessor for boyer-moore fast pattern matching algorithm\n", 66 | "import string\n", 67 | "\n", 68 | "def z_array(s):\n", 69 | " \"\"\" Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s \"\"\"\n", 70 | " assert len(s) > 1\n", 71 | " z = [len(s)] + [0] * (len(s)-1)\n", 72 | " # Initial comparison of s[1:] with prefix\n", 73 | " for i in range(1, len(s)):\n", 74 | " if s[i] == s[i-1]:\n", 75 | " z[1] += 1\n", 76 | " else:\n", 77 | " break\n", 78 | " r, l = 0, 0\n", 79 | " if z[1] > 0:\n", 80 | " r, l = z[1], 1\n", 81 | " for k in range(2, len(s)):\n", 82 | " assert z[k] == 0\n", 83 | " if k > r:\n", 84 | " # Case 1\n", 85 | " for i in 
range(k, len(s)):\n", 86 | " if s[i] == s[i-k]:\n", 87 | " z[k] += 1\n", 88 | " else:\n", 89 | " break\n", 90 | " r, l = k + z[k] - 1, k\n", 91 | " else:\n", 92 | " # Case 2\n", 93 | " # Calculate length of beta\n", 94 | " nbeta = r - k + 1\n", 95 | " zkp = z[k - l]\n", 96 | " if nbeta > zkp:\n", 97 | " # Case 2a: Zkp wins\n", 98 | " z[k] = zkp\n", 99 | " else:\n", 100 | " # Case 2b: Compare characters just past r\n", 101 | " nmatch = 0\n", 102 | " for i in range(r+1, len(s)):\n", 103 | " if s[i] == s[i - k]:\n", 104 | " nmatch += 1\n", 105 | " else:\n", 106 | " break\n", 107 | " l, r = k, r + nmatch\n", 108 | " z[k] = r - k + 1\n", 109 | " return z\n", 110 | "\n", 111 | "\n", 112 | "def n_array(s):\n", 113 | " \"\"\" Compile the N array (Gusfield theorem 2.2.2) from the Z array \"\"\"\n", 114 | " return z_array(s[::-1])[::-1]\n", 115 | "\n", 116 | "\n", 117 | "def big_l_prime_array(p, n):\n", 118 | " \"\"\" Compile L' array (Gusfield theorem 2.2.2) using p and N array.\n", 119 | " L'[i] = largest index j less than n such that N[j] = |P[i:]| \"\"\"\n", 120 | " lp = [0] * len(p)\n", 121 | " for j in range(len(p)-1):\n", 122 | " i = len(p) - n[j]\n", 123 | " if i < len(p):\n", 124 | " lp[i] = j + 1\n", 125 | " return lp\n", 126 | "\n", 127 | "\n", 128 | "def big_l_array(p, lp):\n", 129 | " \"\"\" Compile L array (Gusfield theorem 2.2.2) using p and L' array.\n", 130 | " L[i] = largest index j less than n such that N[j] >= |P[i:]| \"\"\"\n", 131 | " l = [0] * len(p)\n", 132 | " l[1] = lp[1]\n", 133 | " for i in range(2, len(p)):\n", 134 | " l[i] = max(l[i-1], lp[i])\n", 135 | " return l\n", 136 | "\n", 137 | "\n", 138 | "def small_l_prime_array(n):\n", 139 | " \"\"\" Compile lp' array (Gusfield theorem 2.2.4) using N array. 
\"\"\"\n", 140 | " small_lp = [0] * len(n)\n", 141 | " for i in range(len(n)):\n", 142 | " if n[i] == i+1: # prefix matching a suffix\n", 143 | " small_lp[len(n)-i-1] = i+1\n", 144 | " for i in range(len(n)-2, -1, -1): # \"smear\" them out to the left\n", 145 | " if small_lp[i] == 0:\n", 146 | " small_lp[i] = small_lp[i+1]\n", 147 | " return small_lp\n", 148 | "\n", 149 | "\n", 150 | "def good_suffix_table(p):\n", 151 | " \"\"\" Return tables needed to apply good suffix rule. \"\"\"\n", 152 | " n = n_array(p)\n", 153 | " lp = big_l_prime_array(p, n)\n", 154 | " return lp, big_l_array(p, lp), small_l_prime_array(n)\n", 155 | "\n", 156 | "\n", 157 | "def good_suffix_mismatch(i, big_l_prime, small_l_prime):\n", 158 | " \"\"\" Given a mismatch at offset i, and given L/L' and l' arrays,\n", 159 | " return amount to shift as determined by good suffix rule. \"\"\"\n", 160 | " length = len(big_l_prime)\n", 161 | " assert i < length\n", 162 | " if i == length - 1:\n", 163 | " return 0\n", 164 | " i += 1 # i points to leftmost matching position of P\n", 165 | " if big_l_prime[i] > 0:\n", 166 | " return length - big_l_prime[i]\n", 167 | " return length - small_l_prime[i]\n", 168 | "\n", 169 | "\n", 170 | "def good_suffix_match(small_l_prime):\n", 171 | " \"\"\" Given a full match of P to T, return amount to shift as\n", 172 | " determined by good suffix rule. \"\"\"\n", 173 | " return len(small_l_prime) - small_l_prime[1]\n", 174 | "\n", 175 | "\n", 176 | "def dense_bad_char_tab(p, amap):\n", 177 | " \"\"\" Given pattern string and list with ordered alphabet characters, create\n", 178 | " and return a dense bad character table. Table is indexed by offset\n", 179 | " then by character. 
\"\"\"\n", 180 | " tab = []\n", 181 | " nxt = [0] * len(amap)\n", 182 | " for i in range(0, len(p)):\n", 183 | " c = p[i]\n", 184 | " assert c in amap\n", 185 | " tab.append(nxt[:])\n", 186 | " nxt[amap[c]] = i+1\n", 187 | " return tab\n", 188 | "\n", 189 | "\n", 190 | "class BoyerMoore(object):\n", 191 | " \"\"\" Encapsulates pattern and associated Boyer-Moore preprocessing. \"\"\"\n", 192 | " \n", 193 | " def __init__(self, p, alphabet='ACGT'):\n", 194 | " self.p = p\n", 195 | " self.alphabet = alphabet\n", 196 | " # Create map from alphabet characters to integers\n", 197 | " self.amap = {}\n", 198 | " for i in range(len(self.alphabet)):\n", 199 | " self.amap[self.alphabet[i]] = i\n", 200 | " # Make bad character rule table\n", 201 | " self.bad_char = dense_bad_char_tab(p, self.amap)\n", 202 | " # Create good suffix rule table\n", 203 | " _, self.big_l, self.small_l_prime = good_suffix_table(p)\n", 204 | " \n", 205 | " def bad_character_rule(self, i, c):\n", 206 | " \"\"\" Return # skips given by bad character rule at offset i \"\"\"\n", 207 | " assert c in self.amap\n", 208 | " ci = self.amap[c]\n", 209 | " assert i > (self.bad_char[i][ci]-1)\n", 210 | " return i - (self.bad_char[i][ci]-1)\n", 211 | " \n", 212 | " def good_suffix_rule(self, i):\n", 213 | " \"\"\" Given a mismatch at offset i, return amount to shift\n", 214 | " as determined by (weak) good suffix rule. 
\"\"\"\n", 215 | " length = len(self.big_l)\n", 216 | " assert i < length\n", 217 | " if i == length - 1:\n", 218 | " return 0\n", 219 | " i += 1 # i points to leftmost matching position of P\n", 220 | " if self.big_l[i] > 0:\n", 221 | " return length - self.big_l[i]\n", 222 | " return length - self.small_l_prime[i]\n", 223 | " \n", 224 | " def match_skip(self):\n", 225 | " \"\"\" Return amount to shift in case where P matches T \"\"\"\n", 226 | " return len(self.small_l_prime) - self.small_l_prime[1]" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "#function for boyer-moore fast pattern matching algorithm\n", 238 | "def boyer_moore(p, p_bm, t):\n", 239 | " \"\"\" Do Boyer-Moore matching \"\"\"\n", 240 | " i = 0\n", 241 | " occurrences = []\n", 242 | " while i < len(t) - len(p) + 1:\n", 243 | " shift = 1\n", 244 | " mismatched = False\n", 245 | " for j in range(len(p)-1, -1, -1):\n", 246 | " if p[j] != t[i+j]:\n", 247 | " skip_bc = p_bm.bad_character_rule(j, t[i+j])\n", 248 | " skip_gs = p_bm.good_suffix_rule(j)\n", 249 | " shift = max(shift, skip_bc, skip_gs)\n", 250 | " mismatched = True\n", 251 | " break\n", 252 | " if not mismatched:\n", 253 | " occurrences.append(i)\n", 254 | " skip_gs = p_bm.match_skip()\n", 255 | " shift = max(shift, skip_gs)\n", 256 | " i += shift\n", 257 | " return occurrences" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 13, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "[6, 14]" 271 | ] 272 | }, 273 | "execution_count": 13, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "#testing both functions\n", 280 | "#boyer-moore\n", 281 | "t = 'GCTAGCTCTACGAGTCTA'\n", 282 | "p = 'TCTA'\n", 283 | "p_bm = BoyerMoore(p, alphabet='ACGT')\n", 284 | 
"boyer_moore(p, p_bm, t)\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 15, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "[6, 14]" 298 | ] 299 | }, 300 | "execution_count": 15, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "#naive\n", 307 | "naive(p, t)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 4, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "([40], 46, 41)\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "def naive_with_counts(p, t):\n", 327 | " occurrences = []\n", 328 | " num_char_comp = 0\n", 329 | " num_aligments_tried = 0\n", 330 | " for i in range(len(t) - len(p) + 1): # loop over alignments\n", 331 | " match = True\n", 332 | " num_aligments_tried += 1\n", 333 | " for j in range(len(p)): # loop over characters\n", 334 | " num_char_comp += 1\n", 335 | " if t[i+j] != p[j]: # compare characters\n", 336 | " match = False\n", 337 | " break\n", 338 | " if match:\n", 339 | " occurrences.append(i) # all chars matched; record\n", 340 | " return occurrences, num_char_comp, num_aligments_tried\n", 341 | "p = 'word'\n", 342 | "t = 'there would have been a time for such a word'\n", 343 | "print naive_with_counts(p,t) " 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 6, 349 | "metadata": { 350 | "collapsed": false 351 | }, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "([40], 15, 12)\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "def boyer_moore_with_counts(p, p_bm, t):\n", 363 | " \"\"\" Do Boyer-Moore matching. 
p=pattern, t=text, p_bm=BoyerMoore object for p \"\"\"\n", 364 | " i = 0\n", 365 | " occurrences = []\n", 366 | " num_char_comp = 0\n", 367 | " num_aligments_tried = 0\n", 368 | " \n", 369 | " while i < len(t) - len(p) + 1:\n", 370 | " shift = 1\n", 371 | " mismatched = False\n", 372 | " num_aligments_tried += 1\n", 373 | "\n", 374 | " for j in range(len(p)-1, -1, -1):\n", 375 | " num_char_comp += 1\n", 376 | " if p[j] != t[i+j]: \n", 377 | " skip_bc = p_bm.bad_character_rule(j, t[i+j])\n", 378 | " skip_gs = p_bm.good_suffix_rule(j)\n", 379 | " shift = max(shift, skip_bc, skip_gs)\n", 380 | " mismatched = True\n", 381 | " break\n", 382 | "\n", 383 | " if not mismatched:\n", 384 | " occurrences.append(i)\n", 385 | " skip_gs = p_bm.match_skip()\n", 386 | " shift = max(shift, skip_gs)\n", 387 | " i += shift\n", 388 | " \n", 389 | " return occurrences, num_char_comp, num_aligments_tried\n", 390 | "from bm_preproc import BoyerMoore\n", 391 | "p = 'word'\n", 392 | "t = 'there would have been a time for such a word'\n", 393 | "lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz '\n", 394 | "p_bm = BoyerMoore(p, lowercase_alphabet)\n", 395 | "print boyer_moore_with_counts(p,p_bm,t)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 8, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [ 405 | { 406 | "ename": "NameError", 407 | "evalue": "name 'genome' is not defined", 408 | "output_type": "error", 409 | "traceback": [ 410 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 411 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 412 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mt\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mgenome\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mtotal_char_comp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtotal_align_comp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 413 | "\u001b[1;31mNameError\u001b[0m: name 'genome' is not defined" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'\n", 419 | "t = genome\n", 420 | "\n", 421 | "total_char_comp = 0\n", 422 | "total_align_comp = 0\n", 423 | "\n", 424 | "for t in reads:\n", 425 | " occurrences, num_char_comp, num_aligments_tried = naive_with_counts(p, t)\n", 426 | " total_char_comp += num_char_comp\n", 427 | " total_align_comp += num_aligments_tried\n", 428 | "print total_align_com" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [] 439 | } 440 | ], 441 | "metadata": { 442 | "kernelspec": { 443 | "display_name": "Python 2", 444 | "language": "python", 445 | "name": "python2" 446 | }, 447 | "language_info": { 448 | "codemirror_mode": { 449 | "name": "ipython", 450 | "version": 2 451 | }, 452 | "file_extension": ".py", 453 | "mimetype": "text/x-python", 454 | "name": "python", 455 | "nbconvert_exporter": "python", 456 | "pygments_lexer": "ipython2", 457 | "version": "2.7.11" 458 | } 459 | }, 460 | "nbformat": 4, 461 | "nbformat_minor": 0 462 | } 463 | -------------------------------------------------------------------------------- /homework-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "data": { 12 | "text/plain": [ 13 | 
"'TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAGGTGCATAGGTCAACAATACTTGAGCCTAACTCAGTAGATCCTAAAA'" 14 | ] 15 | }, 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "output_type": "execute_result" 19 | } 20 | ], 21 | "source": [ 22 | "def readGenome(filename):\n", 23 | " genome = ''\n", 24 | " with open(filename, 'r') as f:\n", 25 | " for line in f:\n", 26 | " # ignore header line with genome information\n", 27 | " if not line[0] == '>':\n", 28 | " genome += line.rstrip()\n", 29 | " return genome\n", 30 | "chr1 = readGenome('chr1.GRCh38.excerpt.fasta')\n", 31 | "chr1[:100]\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "def approximate_match(p, t):\n", 43 | " # Create distance matrix\n", 44 | " D = []\n", 45 | " for i in range(len(p)+1):\n", 46 | " D.append([0]*(len(t)+1))\n", 47 | " \n", 48 | " # Initialize first row and column of matrix\n", 49 | " for i in range(len(p)+1):\n", 50 | " D[i][0] = i\n", 51 | " for i in range(len(t)+1):\n", 52 | " D[0][i] = 0\n", 53 | " \n", 54 | " # Fill in the rest of the matrix\n", 55 | " for i in range(1, len(p)+1):\n", 56 | " for j in range(1, len(t)+1):\n", 57 | " distHor = D[i][j-1] + 1\n", 58 | " distVer = D[i-1][j] + 1\n", 59 | " if p[i-1] == t[j-1]:\n", 60 | " distDiag = D[i-1][j-1]\n", 61 | " else:\n", 62 | " distDiag = D[i-1][j-1] + 1\n", 63 | " D[i][j] = min(distHor, distVer, distDiag)\n", 64 | " \n", 65 | " # Edit distance is the value in the bottom right corner of the matrix\n", 66 | " return min(D[-1])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "def overlap(a, b, min_length=3):\n", 78 | " \"\"\" Return length of longest suffix of 'a' matching\n", 79 | " a prefix of 'b' that is at least 'min_length'\n", 80 | " characters long. 
If no such overlap exists,\n", 81 | " return 0. \"\"\"\n", 82 | " start = 0 # start all the way at the left\n", 83 | " while True:\n", 84 | " start = a.find(b[:min_length], start) # look for b's prefix in a\n", 85 | " if start == -1: # no more occurrences to right\n", 86 | " return 0\n", 87 | " # found occurrence; check for full suffix/prefix match\n", 88 | " if b.startswith(a[start:]):\n", 89 | " return len(a)-start\n", 90 | " start += 1 # move just past previous match" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "from collections import defaultdict\n", 102 | "def overlap_graph(reads, k):\n", 103 | " # Make index\n", 104 | " index = defaultdict(set)\n", 105 | " for read in reads:\n", 106 | " for i in range(len(read) - k + 1):\n", 107 | " index[read[i:i+k]].add(read)\n", 108 | "\n", 109 | " # Make graph\n", 110 | " graph = defaultdict(set)\n", 111 | " for r in reads:\n", 112 | " for o in index[r[-k:]]:\n", 113 | " if r != o:\n", 114 | " if overlap(r, o, k):\n", 115 | " graph[r].add(o)\n", 116 | "\n", 117 | " edges = 0\n", 118 | " for read in graph:\n", 119 | " edges += len(graph[read])\n", 120 | " return(edges, len(graph))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 7, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "3\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "p = 'GCTGATCGATCGTACG'\n", 140 | "print(approximate_match(p, chr1))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "2\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "p = 'GATTTACCAGATTGAG'\n", 160 | "print(approximate_match(p, chr1))" 161 | ]
162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 9, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "\n", 172 | "def readFastq(filename):\n", 173 | " sequences = []\n", 174 | " qualities = []\n", 175 | " with open(filename) as fh:\n", 176 | " while True:\n", 177 | " fh.readline() # skip name line\n", 178 | " seq = fh.readline().rstrip() # read base sequence\n", 179 | " fh.readline() # skip placeholder line\n", 180 | " qual = fh.readline().rstrip() # base quality line\n", 181 | " if len(seq) == 0:\n", 182 | " break\n", 183 | " sequences.append(seq)\n", 184 | " qualities.append(qual)\n", 185 | " return sequences, qualities" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 10, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "904746\n", 200 | "7161\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "seqs, quals = readFastq('ERR266411_1.for_asm.fastq')\n", 206 | "edges, suffixes = overlap_graph(seqs, 30)\n", 207 | "print(edges)\n", 208 | "print(suffixes)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 2", 224 | "language": "python", 225 | "name": "python2" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 2 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython2", 237 | "version": "2.7.11" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 0 242 | } 243 | -------------------------------------------------------------------------------- /homework-4.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "def overlap(a, b, min_length=3):\n", 12 | " \"\"\" Return length of longest suffix of 'a' matching\n", 13 | " a prefix of 'b' that is at least 'min_length'\n", 14 | " characters long. If no such overlap exists,\n", 15 | " return 0. \"\"\"\n", 16 | " start = 0 # start all the way at the left\n", 17 | " while True:\n", 18 | " start = a.find(b[:min_length], start) # look for b's prefix in a\n", 19 | " if start == -1: # no more occurrences to right\n", 20 | " return 0\n", 21 | " # found occurrence; check for full suffix/prefix match\n", 22 | " if b.startswith(a[start:]):\n", 23 | " return len(a)-start\n", 24 | " start += 1 # move just past previous match" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import itertools\n", 36 | "\n", 37 | "def scs(ss):\n", 38 | " \"\"\" Returns shortest common superstring of given strings,\n", 39 | " assuming no string is a strict substring of another \"\"\"\n", 40 | " shortest_sup = None\n", 41 | " for ssperm in itertools.permutations(ss):\n", 42 | " sup = ssperm[0]\n", 43 | " for i in range(len(ss)-1):\n", 44 | " olen = overlap(ssperm[i], ssperm[i+1], min_length=1)\n", 45 | " sup += ssperm[i+1][olen:]\n", 46 | " if shortest_sup is None or len(sup) < len(shortest_sup):\n", 47 | " shortest_sup = sup\n", 48 | " return shortest_sup" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "11" 62 | ] 63 | }, 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "len(scs(['CCT', 
'CTT', 'TGC', 'TGG', 'GAT', 'ATT']))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 16, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "import itertools\n", 82 | "\n", 83 | "def scs_with_counts(ss):\n", 84 | " \"\"\" Returns shortest common superstring of given strings,\n", 85 | " assuming no string is a strict substring of another \"\"\"\n", 86 | " all_scs = []\n", 87 | " shortest_sup = None\n", 88 | " for ssperm in itertools.permutations(ss):\n", 89 | " sup = ssperm[0]\n", 90 | " for i in range(len(ss)-1):\n", 91 | " olen = overlap(ssperm[i], ssperm[i+1], min_length=1)\n", 92 | " sup += ssperm[i+1][olen:]\n", 93 | " if shortest_sup is None or len(sup) <= len(shortest_sup):\n", 94 | " shortest_sup = sup\n", 95 | " all_scs.append(sup)\n", 96 | " return shortest_sup,all_scs" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 17, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "('GATTGCCTTGG',\n", 110 | " ['CCTTGCTGGATT',\n", 111 | " 'CCTTGCGATTGG',\n", 112 | " 'CCTTGGATTGC',\n", 113 | " 'TGCCTTGGATT',\n", 114 | " 'TGGATTGCCTT',\n", 115 | " 'GATTGCCTTGG'])" 116 | ] 117 | }, 118 | "execution_count": 17, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "scs_with_counts(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 18, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "('CABCA', ['ABCAB', 'BCABC', 'CABCA'])" 138 | ] 139 | }, 140 | "execution_count": 18, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "scs_with_counts(['ABC', 'BCA', 'CAB'])" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 19, 152 | "metadata": { 153 | 
"collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "def readFastq(filename):\n", 158 | " sequences = []\n", 159 | " qualities = []\n", 160 | " with open(filename) as fh:\n", 161 | " while True:\n", 162 | " fh.readline() # skip name line\n", 163 | " seq = fh.readline().rstrip() # read base sequence\n", 164 | " fh.readline() # skip placeholder line\n", 165 | " qual = fh.readline().rstrip() # base quality line\n", 166 | " if len(seq) == 0:\n", 167 | " break\n", 168 | " sequences.append(seq)\n", 169 | " qualities.append(qual)\n", 170 | " return sequences, qualities" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 20, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "seqs, quals = readFastq('ads1_week4_reads.fq')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 21, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "def overlap_graph(reads, k):\n", 193 | " # Make index\n", 194 | " index = defaultdict(set)\n", 195 | " for read in reads:\n", 196 | " for i in range(len(read) - k + 1):\n", 197 | " index[read[i:i+k]].add(read)\n", 198 | "\n", 199 | " # Make graph\n", 200 | " graph = defaultdict(set)\n", 201 | " for r in reads:\n", 202 | " for o in index[r[-k:]]:\n", 203 | " if r != o:\n", 204 | " if overlap(r, o, k):\n", 205 | " graph[r].add(o)\n", 206 | "\n", 207 | " return graph\n", 208 | "\n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 22, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "def pick_maximal_overlap(reads, k):\n", 220 | " \"\"\" Return a pair of reads from the list with a\n", 221 | " maximal suffix/prefix overlap >= k. 
Returns\n", 222 | " overlap length 0 if there are no such overlaps.\"\"\"\n", 223 | " reada, readb = None, None\n", 224 | " best_olen = 0\n", 225 | "\n", 226 | " # Make index\n", 227 | " index = defaultdict(set)\n", 228 | " for read in reads:\n", 229 | " for i in range(len(read) - k + 1):\n", 230 | " index[read[i:i+k]].add(read)\n", 231 | "\n", 232 | " for r in reads:\n", 233 | " for o in index[r[-k:]]:\n", 234 | " if r != o:\n", 235 | " olen = overlap(r, o, k)\n", 236 | " if olen > best_olen:\n", 237 | " reada, readb = r, o\n", 238 | " best_olen = olen\n", 239 | "\n", 240 | " return reada, readb, best_olen" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 23, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "def greedy_scs(reads, k):\n", 252 | " \"\"\" Greedy shortest-common-superstring merge.\n", 253 | " Repeat until no edges (overlaps of length >= k)\n", 254 | " remain. \"\"\"\n", 255 | " read_a, read_b, olen = pick_maximal_overlap(reads, k)\n", 256 | " while olen > 0:\n", 257 | " reads.remove(read_a)\n", 258 | " reads.remove(read_b)\n", 259 | " reads.append(read_a + read_b[olen:])\n", 260 | " read_a, read_b, olen = pick_maximal_overlap(reads, k)\n", 261 | " return ''.join(reads)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 25, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "4633\n", 276 | "3723\n", 277 | 
"ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCTAGTACACTTAGGATTCAAGATCCTATTATCAGGGACAAGAGCAGGATTAGGGATATCCGAGATGGCCACACTTTTGAGGAGCTTAGCATTGTTCAAAAGAAACAAGGACAAACCACCCATTACATCAGGATCCGGTGGAGCCATCAGAGGAATCAAACACATTATTATAGTACCAATTCCTGGAGATTCCTCAATTACCACTCGATCCAGACTACTGGACCGGTTGGTCAGGTTAATTGGAAACCCGGATGTGAGCGGGCCCAAACTAACAGGGGCACTAATAGGTATATTATCCTTATTTGTGGAGTCTCCAGGTCAATTGATTCAGAGGATCACCGATGACCCTGACGTTAGCATCAGGCTGTTAGAGGTTGTTCAGAGTGACCAGTCACAATCTGGCCTTACCTTCGCATCAAGAGGTACCAACATGGAGGATGAGGCGGACCAATACTTTTCACATGATGATCCAAGCAGTAGTGATCAATCCAGGTCCGGATGGTTCGAGAACAAGGAAATCTCAGATATTGAAGTGCAAGACCCTGAGGGATTCAACATGATTCTGGGTACCATTCTAGCCCAGATCTGGGTCTTGCTCGCAAAGGCGGTTACGGCCCCAGACACGGCAGCTGATTCGGAGCTAAGAAGGTGGATAAAGTACACCCAACAAAGAAGGGTAGTTGGTGAATTTAGATTGGAGAGAAAATGGTTGGATGTGGTGAGGAACAGGATTGCCGAGGACCTCTCTTTACGCCGATTCATGGTGGCTCTAATCCTGGATATCAAGAGGACACCCGGGAACAAACCTAGGATTGCTGAAATGATATGTGACATTGATACATATATCGTAGAGGCAGGATTAGCCAGTTTTATCCTGACTATTAAGTTTGGGATAGAAACTATGTATCCTGCTCTTGGACTGCATGAATTTGCTGGTGAGTTATCCACACTTGAGTCCTTGATGAATCTTTACCAGCAAATGGGAGAAACTGCACCCTACATGGTAATCCTAGAGAACTCAATTCAGAACAAGTTCAGTGCAGGATCATACCCTCTGCTCTGGAGCTATGCCATGGGAGTAGGAGTGGAACTTGAAAACTCCATGGGAGGTTTGAACTTTGGTCGATCTTACTTTGATCCAGCATATTTTAGATTAGGGCAAGAGATGGTGAGGAGGTCAGCTGGAAAGGTCAGTTCCACATTGGCATCCGAACTCGGTATCACTGCCGAGGATGCAAGGCTTGTTTCAGAGATTGCAATGCATACTACTGAGGACAGGATCAGTAGAGCGGTCGGACCCAGACAAGCCCAAGTGTCATTTCTACACGGTGATCAAAGTGAGAATGAGCTACCAGGATTGGGGGGCAAGGAAGATAGGAGGGTCAAACAGGGTCGGGGAGAAGCCAGGGAGAGCTACAGAGAAACCGGGTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCCTCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGTCGAAGGTCAGCTGACGCCCTGCTCAGGCTGCAAGCCATGGCAGGAATCTTGGAAGAACAAGGCTCAGACACGGACACCCCTAGGGTATACAATGACAGAGATCTTCTAGACTAGGTGCGAGAGGCCGAGGACCAGAACAACATCCGCCTACCCTCCATCATTGTTATAAAAAACTTAGGAACCAGGTCCACACAGCCGCCAGCCAACCAACCATCCACTCCCACGACTGGAGCCGATGGCAGAAGAGCAGGCACGCCATGTCAAAAACGGACTGGAATGCATCCGGGCTCTCAAGGCCGAGCCCATCGGCTCACTGGCCGTCGAGGAAGCCATGGCAGCATGGTCAGAAATATCAGACAACCCAGGACAGGACCGAGCCACCTGCAAGGAAGAGGAGGCAGGCAGTTCGGGTCTCAGCAAACCATGCC
TCTCAGCAATTGGATCAACTGAAGGCGGTGCACCTCGCATCCGCGGTCAGGGATCTGGAGAAAGCGATGACGACGCTGAAACTTTGGGAATCCCCTCAAGAAATCTCCAGGCATCAAGCACTGGGTTACAGTGTTATCATGTTTATGATCACAGCGGTGAAGCGGTTAAGGGAATCCAAGATGCTGACTCTATCATGGTTCAATCAGGCCTTGATGGTGATAGCACCCTCTCAGGAGGAGACGATGAATCTGAAAACAGCGATGTGGATATTGGCGAACCTGATACCGAGGGATATGCTATCACTGACCGGGGATCTGCTCCCATCTCTATGGGGTTCAGGGCTTCTGATGTTGAAACTGCAGAAGGAGGGGAGATCCACGAGCTCCTGAAACTCCAATCCAGAGGCAACAACTTTCCGAAGCTTGGGAAAACTCTCAATGTTCCTCCGCCCCCGAACCCCAGTAGGGCCAGCACTTCCGAGACACCCATTAAAAAGGGCACAGACGCGAGATTGGCCTCATTTGGAACGGAGATCGCGTCTTTATTGACAGGTGGTGCAACCCAATGTGCTCGAAAGTCACCCTCGGAACCATCAGGGCCAGGTGCACCTGCGGGGAATGTCCCCGAGTGTGTGAGCAATGCCGCACTGATACAGGAGTGGACACCCGAATCTGGTACCACAATCTCCCCGAGATCCCAGAATAATGAAGAAGGGGGAGACTATTATGATGATGAGCTGTTCTCCGATGTCCAAGACATCAAAACAGCCTTGGCCAAAATACACGAGGATAATCAGAAGATAATCTCCAAGCTAGAATCATTGCTGTTATTGAAGGGAGAAGTTGAGTCAATTAAGAAGCAGATCAACAGGCAAAATATCAGCATATCCACCCTGGAAGGACACCTCTCAAGCATCATGATTGCCATTCCTGGACTTGGGAAGGATCCCAACGACCCCACTGCAGATGTCGAACTCAATCCCGACCTGAAACCCATCATAGGCAGAGATTCAGGCCGAGCACTGGCCGAAGTTCTCAAGAAGCCCGTTGCCAGCCGACAACTCCAGGGAATGACTAATGGACGGACCAGTTCCAGAGGACAGCTGCTGAAGGAATTTCAACTAAAGCCGATCGGGAAAAAGGTGAGCTCAGCCGTCGGGTTTGTTCCTGACACCGGCCCTGCATCACGCAGTGTAATCCGCTCCATTATAAAATCCAGCCGGCTAGAGGAGGATCGGAAGCGTTACCTGATGACTCTCCTTGATGATATCAAAGGAGCCAACGATCTTGCCAAGTTCCACCAGATGCTGATGAAGATAATAATGAAGTAGCTACAGCTCAACTTACCTGCCAACCCCATGCCAGTCGACCTAATTAGTACAACCTAAATCCATTATAAAAAACTTAGGAGCAAAGTGATTGCCTCCTAAGTTCCACAATGACAGAGATCTACGATTTCGACAAGTCGGCATGGGACATCAAAGGGTCGATCGCTCCGATACAACCTACCACCTACAGTGATGGCAGGCTGGTGCCCCAGGTCAGAGTCATAGATCCTGGTCTAGGTGATAGGAAGGATGAATGCTTTATGTACATGTTTCTGCTGGGGGTTGTTGAGGACAGCGATCCCCTAGGGCCTCCAATCGGGCGAGCATTCGGGTCCCTGCCCTTAGGTGTTGGTAGATCCACAGCAAAACCCGAGGAACTCCTCAAAGAGGCCACTGAGCTTGACATAGTTGTTAGACGTACAGCAGGGCTCAATGAAAAACTGGTGTTCTACAACAACACCCCACTAACCCTCCTCACACCTTGGAGAAAGGTCCTAACAACAGGGAGTGTCTTCAATGCAAACCAAGTGTGCAATGCGGTTAATCTAATACCGCTGGACACCCCGCAGAGGTTCCGTGTTGTTTATATGAGCATCACCCGTCTTTCGGATAACGGGTATTACACCGTTCCCAGAAGAATGCTGGAATTCAGATCGGTCAATGCAGTGGCCT
TCAACCTGCTAGTGACCCTTAGGATTGACAAGGCGATTGGCCCTGGGAAGATCATCGACAATGCAGAGCAACTTCCTGAGGCAACATTTATGGTCCACATCGGGAACTTCAGGAGAAAGAAGAGTGAAGTCTACTCTGCCGATTATTGCAAAATGAAAATCGAAAAGATGGGCCTGGTTTTTGCACTTGGTGGGATAGGGGGCACCAGTCTTCACATTAGAAGCACAGGCAAAATGAGCAAGACTCTCCATGCACAACTCGGGTTCAAGAAGACCTTATGTTACCCACTGATGGATATCAATGAAGACCTTAATCGGTTACTCTGGAGGAGCAGATGCAAGATAGTAAGAATCCAGGCAGTTTTGCAGCCATCAGTTCCTCAAGAATTCCGCATTTACGACGACGTGATCATAAATGATGACCAAGGACTATTCAAAGTTCTGTAGACCGCAGTGCCCAGCAATACCCGAAAACGACCCCCCTCATAATGACAGCCAGAAGGCCCGGACAAAAAAGCCCCCTCCAGAAGACTCCACGGACCAAGCGAGAGGCCAGCCAGCAGCCGACAGCAAGTGTGGACACCAGGCGGCCCAAGCACAGAACAGCCCCGACACAAGGCCACCACCAGCCATCCCAATCTGCGTCCTCCTCGTGGGACCCCCGAGGACCAACCCCGAAGGTCGCTCCGAACACAGACCACCAACCGCATCCCCACAGCTCCCGGGAAAGGAACCCCCAGCAACTGGAAGGCCCCTCCCCCCCTCCCCCAACGCAAGAACCCCACAACCGAACCGCACAAGCGACCGAGGTGACCCAACCGCAGGCATCCGACTCCTTAGACAGATCCTCTCCCCCCGGCATACTAAACAAAACTTAGGGCCAAGGAACACACACACTCGACAGAACCCAGACCCCGGCCCGCGGCACCGCGCCCCCACCCCCCGAAAACCAGAGGGAGCCCCCAACCAAACCCGCCGGCCCCCCCGGTGCCCACAGGTAGGCACACCAACCCCCGACCAGACCCAGCACCCAGCCACCGACAATCCAAGACGGGGGGCCCCCCCCAAAAAAAGGCCCCCAGGGGCCGACAGCCAGCATCGCGAGGAAGCACACCCACCCCACACACGACCACGGCAACCGAACCAGAGTCCAGACCACCCTGGGCCACCAGCTCCCAGACTCGGCCATCACCCCGCAAAAAGGAAAGGCCACAACCCGCGCACCCCAGCCCCGATCCGGCGGGCAGCCACTCAACCCGAACCAGCACCCAAGAGCGATCCCTGGGGGACCCCCAAACCGCAAAAGACATCAGTATCCCACAGCCTCTCCAAGTCCCCCGGTCTCCTCCTCTTCTCGAAGGGACCAAAAGATCAATCCACCACATCCGACGACACTCAATTCCCCACCCCCAAAGGAGACACCGGGAATCCCAGAATCAAGACTCATCCAGTGTCCATCATGGGTCTCAAGGTGAACGTCTCTGCCATATTCATGGCAGTACTGTTAACTCTCCAAACACCCACCGGTCAAATCCATTGGGGCAATCTCTCTAAGATAGGGGTGGTAGGGATAGGAAGTGCAAGCTACAAAGTTATGACTCGTTCCAGCCATCAATCATTGGTCATAAAATTAATGCCCAATATAACTCTCCTCAATAACTGCACGAGGGTAGAGATCGCAGAATACAGGAGACTACTGAGAACAGTTTTGGAACCAATTAGAGATGCACTTAATGCAATGACCCAGAATATAAGACCGGTTCAGAGTGTAGCTTCAAGTAGGAGACACAAGAGATTTGCGGGAGTTGTCCTGGCAGGTGCGGCCCTAGGCGTTGCCACAGCTGCTCAGATAACAGCCGGCATTGCACTTCACCAGTCCATGCTGAACTCTCAAGCCATCGACAATCTGAGAGCAAGCCTGGAAACTACTAATCAGGCAATTGAGGCAATCAGACAAGCAGGGCAGGAGATGATATTGGCTGTTCAGGGTGTCCAAGACTA
CATCAATAATGAGCTGATACCGTCTATGAACCAACTATCTTGTGATTTAATCGGCCAGAAGCTAGGGCTCAAATTGCTCAGATACTATACAGAAATCCTGTCATTATTTGGCCCCAGCTTACGGGACCCCATATCTGCGGAGATATCCATCCAGGCTTTGAGCTATGCGCTTGGAGGAGATATCAATAAGGTATTAGAAAAGCTCGGATACAGTGGAGGTGATTTACTGGGCATCTTAGAGAGCAGAGGAATAAAGGCCCGGATAACTCACGTCGACACAGAGTCCTACTTCATTGTACTCAGTATAGCCTATCCGACGCTGTCCGAGATTAAGGGGGTGATTGTCCACCGGCTAGAGGGGGTCTCGTACAATATAGGCTCTCAAGAGTGGTATACCACTGTGCCCAAGTATGTTGCAACCCAAGGGTACCTTATCTCGAATTTTGATGAGTCATCGTGTACTTTCATGCCAGAGGGGACTGTGTGCAGCCAAAATGCCTTGTACCCGATGAGTCCTCTGCTCCAAGAATGCCTCCGGGGGTCCACCAAGTCCTGTGCTCGTACACTCGTATCCGGGTCTTTTGGGAACCGGTTCATTTTATCACAAGGGAACCTAATAGCCAATTGTGCATCAATCCTCTGCAAGTGTTACACAACAGGAACGATCATTAATCAAGACCCTGACAAGATCCTAACATACATTGCTGCCGATCACTGCCCGGTGGTCGAGGTGAACGGTGTGACCATCCAAGTCGGGAGCAGGAGGTATCCGGACGCGGTGTACCTGCACAGAATTGACCTCGGTCCTCCCATATCATTGGAGAGGTTGGACGTAGGGACAAATCTGGGGAATGCAATTGCTAAGTTGGAGGATGCCAAGGAATTGTTGGAGTCATCGGACCAGATATTGAGGAGTATGAAAGGTTTATCGAGCACTAGCATAGTTTACATCCTGATTGCAGTGTGTCTTGGAGGGTTGATAGGGATCCCCGCTTTAATATGTTGCTGCAGGGGGCGCTGTAACAAAAAGGGAGAACAAGTTGGTATGTCAAGACCAGGCCTAAAGCCTGATCTTACAGGGACATCAAAATCCTATGTAAGGTCGCTCTGATCCTCTACAACTCTTGAAACACAGATTTCCCACAAGTCTCCTCTTCGTCATCAAGCAACCACCGCATCCAGCATCAAGCCCACCTGAAATTGTCTCCGGCTTCCCTCTGGCCGAACGATATCGGTAGTTAATTAAAACTTAGGGTGCAAGATCATCCACAATGTCACCACAACGAGACCGAATAAATGCCTTCTACAAAGACAACCCACATCCTAAGGGAAGTAGGATAGTTATTAACAGAGAACATCTTATGATTGATAGACCTTATGTTTTGCTGGCTGTTCTATTCGTCATGTTTCTGAGCTTGATCGGGTTGCTAGCCATTGCAGGCATTAGACTCCATCGTGCAGCCATCTACACCGCAGAGATCCATAAGAGCCTCAGCACCAATCTAGATGTAACTAACTCGATCGAGCATCAGGTCAAGGACGTGCTGACACCACTCTTCAAGATCATTGGTGATGAAGTGGGCCTGAGGACACCTCAGAGATTCACTGACCTAGTGAAATTCATCTCTGACAAAATTAAATTCCTTAATCCGGATAGGGAGTACGACTTCAGAGATCTCACTTGGTGTATCAACCCGCCAGAGAGAATCAAATTGGATTATGATCAATACTGTGCAGATGTGGCTGCTGAAGAACTCATGAATGCATTGGTGAACTCAACTCTACTGGAGGCCAGGGCAACCAATCAGTTCCTAGCTGTCTCAAAGGGAAACTGCTCAGGGCCCACTACAATCAGAGGTCAATTCTCAAACATGTCGCTGTCCCTGTTGGACTTGTATTTAAGTCGAGGTTACAATGTGTCATCTATAGTCACTATGACATCCCAGGGAATGTACGGGGGAACTTACCTAGTGGGAAAGCCTAATCTGAGCAGTAAAGGG
TCAGAGTTGTCACAACTGAGCATGCACCGAGTGTTTGAAGTAGGGGTTATCAGAAATCCGGGTTTGGGGGCTCCGGTGTTCCATATGACAAACTATTTTGAGCAACCAGTCAGTAATGATTTCAGCAACTGCATGGTGGCTTTGGGGGAGCTTAAATTCGCAGCCCTCTGTCACAGGGAAGATTCTATCACAATTCCCTATCAGGGGTCAGGGAAAGGTGTCAGCTTCCAGCTCGTCAAGCTAGGTGTCTGGAAATCCCCAACCGACATGCGATCCTGGGTCCCCCTATCAACGGATGATCCAGTGATAGATAGGCTTTACCTCTCATCTCACAGAGGTGTTATCGCTGACAATCAAGCAAAATGGGCTGTCCCGACAACACGGACAGATGACAAGTTGCGAATGGAGACATGCTTCCAGCAGGCGTGTAAGGGTAAAAACCAAGCACTCTGCGAGAATCCCGAGTGGGCACCATTGAAGGATAACAGGATTCCTTCATACGGGGTCTTGTCTGTTAATCTGAGTCTGACAGTTGAGCTTAAAATCAAAATTGCTTCAGGATTCGGGCCATTGATCACACACGGTTCAGGGATGGACCTATACAAAACCAACCACAACAATGTGTATTGGCTGACTATCCCGCCAATGAAGAACCTAGCCTTAGGTGTAATCAACACATTGGAGTGGATACCGAGATTCAAGGTTAGTCCCAACCTCTTCACTGTTCCAATCAAGGAAGCAGGCGAGGACTGCCATGCCCCAACATACCTACCTGCGGAGGTGGATGGTGATGTCAAACTCAGTTCCAATCTGGTAATTCTACCTGGTCAGGATCTCCAATATGTTTTGGCAACCTACGATACTTCCAGGGTTGAACATGCTGTGGTTTATTATGTTTACAGCCCAAGCCGCTCATTTTCTTACTTTTATCCTTTTAGGTTGCCTATAAAGGGGGTCCCAATCGAATTACAAGTGGAATGCTTCACATGGGACAAAAAACTCTGGTGCCGTCACTTCTGTGTGCTTGCGGACTCAGAATCTGGTGGACATATCACTCACTCTGGGATGGTGGGCATGGGAGTCAGCTGCACAGTCACTCGGGAAGATGGAACCAATCGCAGATAGGGCTGCCAGTGAACCGATCACATGATGTCACCCAGACATCAGGCATACCCACTAGTGTGAAATAGACATCAGAATTAAGAAAAACGTAGGGTCCAAGTGGTTTCCCGTTATGGACTCGCTATCTGTCAACCAGATCTTATACCCTGAAGTTCACCTAGATAGCCCGATAGTTACCAATAAGATAGTAGCTATCCTGGAGTATGCTCGAGTCCCTCACGCTTACAGCCTGGAGGACCCTACACTGTGTCAGAACATCAAGCACCGCCTAAAAAACGGATTCTCCAACCAAATGATTATAAACAATGTGGAAGTTGGGAATGTCATCAAGTCCAAGCTTAGGAGTTATCCGGCCCACTCTCATATTCCATATCCAAATTGTAATCAGGATTTATTTAACATAGAAGACAAAGAGTCAACAAGGAAGATCCGTGAGCTCCTAAAAAAGGGAAATTCGCTGTACTCCAAAGTCAGTGATAAGGTTTTCCAATGCCTGAGGGACACTAACTCACGGCTTGGCCTAGGCTCCGAATTGAGGGAGGACATCAAGGAGAAAATTATTAACTTGGGAGTTTACATGCACAGCTCCCAATGGTTTGAGCCCTTTCTGTTTTGGTTTACAGTCAAGACTGAGATGAGGTCAGTGATTAAATCACAAACCCATACTTGCCATAGGAGGAGACACACACCTGTATTCTTCACTGGTAGTTCAGTTGAGCTGTTAATCTCTCGTGACCTTGTTGCTATAATCAGTAAGGAGTCTCAACATGTATATTACCTGACGTTTGAACTGGTTTTGATGTATTGTGATGTCATAGAGGGGAGGTTAATGACAGAGACCGCTATGACCATTGATGCTAGGTATGCAGAACTTCTAG
GAAGAGTCAGATACATGTGGAAACTGATAGATGGTTTCTTCCCTGCACTCGGGAATCCAACTTATCAAATTGTAGCCATGCTGGAGCCACTTTCACTTGCTTACCTGCAACTGAGGGATATAACAGTAGAACTCAGAGGTGCTTTCCTTAACCACTGCTTTACTGAAATACATGATGTTCTTGACCAAAACGGGTTTTCTGATGAAGGTACTTATCATGAGTTAATTGAAGCCCTAGATTACATTTTCATAACTGATGACATACATCTGACAGGGGAGATTTTCTCATTTTTCAGAAGTTTCGGCCACCCCAGACTTGAAGCAGTAACGGCTGCTGAAAATGTCAGGAAATACATGAATCAGCCTAAAGTCATTGTGTATGAGACTCTGATGAAAGGTCATGCCATATTTTGTGGAATCATAATCAACGGCTATCGTGACAGGCACGGAGGCAGTTGGCCACCCCTGACCCTCCCCCTGCATGCTGCAGACACAATCCGGAATGCTCAAGCTTCAGGTGAAGGGTTAACACATGAGCAGTGCGTTGATAACTGGAAATCATTTGCTGGAGTGAGATTTGGCTGTTTTATGCCTCTTAGCCTGGACAGTGATCTGACAATGTACCTAAAGGACAAGGCACTTGCTGCTCTCCAAAGGGAATGGGATTCAGTTTACCCGAAAGAGTTCCTGCGTTACGATCCTCCCAAGGGAACCGGGTCACGGAGGCTTGTAGATGTTTTCCTTAATGATTCGAGCTTTGACCCATATGATATGATAATGTATGTCGTAAGTGGAGCCTACCTCCATGACCCTGAGTTCAACCTGTCTTACAGCCTGAAAGAAAAGGAGATCAAGGAAACAGGTAGACTTTTCGCTAAAATGACTTACAAAATGAGGGCATGCCAAGTGATCGCTGAAAATCTAATCTCAAACGGGATTGGCAAGTATTTTAAGGACAATGGGATGGCCAAGGATGAGCACGATTTGACTAAGGCACTCCACACTCTGGCTGTCTCAGGAGTCCCCAAAGATCTCAAAGAAAGTCACAGGGGGGGGCCAGTCTTAAAAACCTACTCCCGAAGCCCAGTCCACACAAGTACCAGGAACGTTAAAGCAGAAAAAGGGTTTGTAGGATTCCCTCATGTAATTCGGCAGAATCAAGACACTGATCATCCGGAGAATATAGAAACCTACGAGACAGTCAGCGCATTTATCACGACTGATCTCAAGAAGTACTGCCTTAATTGGAGATATGAGACCATCAGCTTATTTGCACAGAGGCTAAATGAGATTTACGGATTACCCTCATTTTTTCAGTGGCTGCATAAGAGGCTTGAAACCTCTGTCCTCTATGTAAGTGACCCTCATTGCCCCCCCGACCTTGACGCCCATGTCCCGTTATGCAAAGTCCCCAATGACCAAATCTTCATCAAGTACCCTATGGGAGGTATAGAAGGGTATTGTCAGAAGCTGTGGACCATCAGCACCATTCCCTACTTATACCTGGCTGCTTATGAGAGCGGGGTAAGGATTGCTTCGTTAGTGCAAGGGGACAATCAGACCATAGCCGTAACAAAAAGGGTACCCAGCACATGGCCTTACAACCTTAAGAAACGGGAAGCTGCTAGAGTAACTAGAGATTACTTTGTAATTCTTAGGCAAAGGCTACATGACATTGGCCATCACCTCAAGGCAAATGAGACAATTGTTTCATCACATTTTTTTGTCTATTCAAAAGGAATATATTATGATGGGCTACTTGTGTCCCAATCACTCAAGAGCATCGCAAGATGTGTATTCTGGTCAGAGACTATAGTTGATGAAACAAGGGCAGCATGCAGTAATATTGCTACAACAATGGCTAAAAGCATCGAGAGAGGTTATGACCGTTATCTTGCATATTCCCTGAACGTCCTAAAAGTGATACAGCAAATTTTGATCTCTCTTGGCTTCACAATCAATTCAACCATGACCCGAGATGTAGTCATACCCCTCCTCACA
AACAACGATCTCTTAATAAGGATGGCACTGTTGCCCGCTCCTATTGGGGGGATGAATTATCTGAATATGAGCAGGCTGTTTGTCAGAAACATCGGTGATCCAGTAACATCATCAATTGCTGATCTCAAGAGAATGATTCTCGCATCACTAATGCCTGAAGAGACCCTCCATCAAGTAATGACACAACAACCGGGGGACTCTTCATTCCTAGACTGGGCTAGCGACCCTTACTCAGCAAATCTTGTATGCGTCCAGAGCATCACTAGACTCCTCAAGAACATAACTGCAAGGTTTGTCCTAATCCATAGTCCAAACCCAATGTTAAAAGGGTTATTCCATGATGACAGTAAAGAAGAGGACGAGAGACTGGCGGCATTCCTCATGGACAGGCATATTATAGTACCTAGGGCAGCTCATGAAATCCTGGATCATAGTGTCACAGGGGCAAGAGAGTCTATTGCAGGCATGCTAGATACCACAAAAGGCCTGATTCGAGCCAGCATGAGGAAGGGGGGGTTAACCTCTCGAGTGATAACCAGATTGTCCAATTATGACTATGAACAATTTAGAGCAGGGATGGTGCTATTGACAGGAAGAAAGAGAAATGTCCTCATTGACAAAGAGTCATGTTCAGTGCAGCTGGCTAGAGCCCTAAGAAGCCATATGTGGGCAAGACTAGCTCGAGGACGGCCTATTTACGGCCTTGAGGTCCCTGATGTACTAGAATCTATGCGAGGCCACCTTATTCGGCGTCATGAGACATGTGTCATCTGCGAGTGTGGATCAGTCAACTACGGATGGTTTTTTGTCCCCTCGGGTTGCCAACTGGATGATATTGACAAGGAAACATCATCCTTGAGAGTCCCATATATTGGTTCTACCACTGATGAGAGAACAGACATGAAGCTCGCCTTCGTAAGAGCCCCAAGTAGATCCTTGCGATCTGCCGTTAGAATAGCAACAGTGTACTCATGGGCTTACGGTGATGATGATAGCTCTTGGAACGAAGCCTGGTTGTTGGCAAGGCAAAGGGCCAATGTGAGCCTGGAGGAGCTAAGGGTGATCACTCCCATCTCGACTTCGACTAATTTAGCGCATAGGTTGAGGGATCGTAGCACTCAAGTGAAATACTCAGGTACATCCCTTGTCCGAGTGGCAAGGTATACCACAATCTCCAACGACAATCTCTCATTTGTCATATCAGATAAGAAGGTTGATACTAACTTTATATACCAACAAGGAATGCTTCTAGGGTTGGGTGTTTTAGAAACATTGTTTCGACTCGAGAAAGATACTGGATCATCTAACACGGTATTACATCTTCACGTCGAAACAGATTGTTGCGTGATCCCGATGATAGATCATCCCAGGATACCCAGCTCCCGCAAGCTAGAGCTGAGGGCAGAGCTATGTACCAACCCATTGATATATGATAATGCACCTTTAATTGACAGAGATGCAACAAGGCTATACACCCAGAGCCATAGGAGGCACCTTGTGGAATTTGTTACATGGTCCACACCCCAACTATATCACATTCTAGCTAAGTCCACAGCACTATCTATGATTGACCTGGTAACAAAATTTGAGAAGGACCATATGAATGAAATTTCAGCTCTCATAGGGGATGACGATATCAATAGTTTCATAACTGAGTTTCTGCTTATAGAGCCAAGATTATTCACCATCTACTTGGGCCAGTGTGCAGCCATCAATTGGGCATTTGATGTACATTATCATAGACCATCAGGGAAATATCAGATGGGTGAGCTGTTGTCTTCGTTCCTTTCTAGAATGAGCAAAGGAGTGTTTAAGGTGCTTGTCAATGCTCTAAGCCACCCAAAGATCTACAAGAAATTCTGGCATTGTGGTATTATAGAGCCTATCCATGGTCCTTCACTTGATGCTCAAAACTTGCACACAACTGTGTGCAACATGGTTTACACATGCTATATGACCTACCTCGACCTGTTGTTGAATGAAGAGTTAGAAGAGTTCAC
ATTTCTTTTGTGTGAAAGCGATGAGGATGTAGTACCGGACAGATTCGACAACATCCAGGCAAAACACTTGTGTGTTCTGGCAGATTTGTACTGTCAACCAGGGACCTGCCCACCGATTCGAGGTCTAAGGCCGGTAGAGAAATGTGCAGTTCTAACCGATCATATCAAGGCAGAGGCTAGGTTATCTCCAGCAGGATCTTCGTGGAACATAAATCCAATTATTGTAGACCATTACTCATGCTCTCTGACTTATCTCCGTCGAGGATCTATCAAACAGATAAGATTGAGAGTTGATCCAGGATTCATTTTTGACGCCCTCGCTGAGGTAAATGTCAGTCAGCCAAAGGTCGGCAGCAACAACATCTCAAATATGAGCATCAAGGATTTCAGACCTCCACACGATGATGTTGCAAAATTGCTCAAAGATATCAACACAAGCAAGCACAATCTTCCCATTTCAGGGGGTAGTCTCGCCAATTATGAAATCCATGCTTTCCGCAGAATCGGGTTAAACTCATCTGCTTGCTACAAAGCTGTTGAGATATCAACATTAATTAGGAGATGCCTTGAGCCAGGGGAAGACGGCTTGTTCTTGGGTGAGGGGTCGGGTTCTATGTTGATCACTTATAAGGAGATACTAAAACTAAACAAGTGCTTCTATAATAGTGGGGTTTCCGCCAATTCTAGATCTGGTCAAAGGGAATTAGCACCCTATCCCTCCGAAGTTGGCCTTGTCGAACACAGAATGGGAGTAGGTAATATTGTCAAGGTGCTCTTTAACGGGAGGCCCGAAGTCACGTGGGTAGGCAGTATAGATTGCTTCAATTTCATAGTCAGTAATATCCCTACCTCTAGTGTGGGGTTTATCCATTCAGATATAGAGACCTTACCTAACAAAGATACTATAGAGAAGCTAGAGGAATTGGCAGCCATCTTATCGATGGCTCTACTCCTTGGCAAAATAGGATCAATACTGGTGATTAAGCTTATGCCTTTCAGCGGGGATTTTGTTCAGGGATTTATAAGCTATGTAGGGTCTCATTATAGAGAAGTGAACCTTGTCTACCCTAGGTACAGCAACTTCATATCTACTGAATCTTATTTAGTTATGACAGATCTCAAAGCTAACCGGCTAATGAATCCTGAAAAGATCAAGCAGCAGATAATTGAATCATCTGTGCGGACTTCACCTGGACTTATAGGTCACATCCTATCCATTAAGCAACTAAGCTGCATACAAGCAATTGTGGGAGGCGCAGTTAGTAGAGGTGATATCAACCCTATTCTGAAAAAACTTACACCTATAGAGCAGGTGCTGATCAGTTGCGGGTTGGCAATTAACGGACCTAAACTGTGCAAAGAATTAATCCACCATGATGTTGCCTCAGGGCAAGATGGATTGCTTAACTCTATACTCATCCTCTACAGGGAGTTGGCAAGATTCAAAGACAACCAAAGAAGTCAACAAGGGATGTTCCACGCTTACCCCGTATTGGTAAGTAGTAGGCAACGAGAACTTGTATCTAGGATCACTCGCAAATTTTGGGGGCATATTCTTCTTTACTCCGGGAACAGAAAGTTGATAAATCGGTTTATCCAGAATCTCAAGTCCGGTTATCTAGTACTAGACTTACACCAGAATATCTTCGTTAAGAATCTATCCAAGTCAGAGAAACAGATTATTATGACGGGGGGTTTAAAACGTGAGTGGGTTTTTAAGGTAACAGTCAAGGAGACCAAAGAATGGTACAAGTTAGTCGGATACAGCGCTCTGATTAAGGATTAATTGGTTGAACTCCGGAACCCTAATCCTGCCCTAGGTAGTTAGGCATTATTTGCAATATATTAAAGAAAACTTTGAAAATACGAAGTTTCTATTCCCAGCTTTGTCTGGT\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "from collections import defaultdict\n", 283 | "for 
k in range(100, 1, -1):\n", 284 | " genome = greedy_scs(seqs, k)\n", 285 | " if len(genome) == 15894:\n", 286 | " print(genome.count('A'))\n", 287 | " print(genome.count('T'))\n", 288 | " print(genome)\n", 289 | " break" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 2", 305 | "language": "python", 306 | "name": "python2" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 2 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython2", 318 | "version": "2.7.11" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 0 323 | } 324 | -------------------------------------------------------------------------------- /phix.fa: -------------------------------------------------------------------------------- 1 | >gi|216019|gb|J02482.1|PX1CG Coliphage phi-X174, complete genome 2 | GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT 3 | GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA 4 | ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG 5 | TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA 6 | GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC 7 | TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT 8 | TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT 9 | CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT 10 | TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG 11 | TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC 12 | GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA 13 | 
CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAG 14 | TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT 15 | AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC 16 | CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA 17 | TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC 18 | TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA 19 | CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA 20 | GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT 21 | GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA 22 | ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC 23 | TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT 24 | TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC 25 | ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC 26 | CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT 27 | GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC 28 | CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC 29 | TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG 30 | TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT 31 | TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA 32 | AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT 33 | TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT 34 | ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC 35 | GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC 36 | TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT 37 | TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA 38 | TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG 39 | 
TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC 40 | CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG 41 | AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC 42 | CGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT 43 | TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG 44 | CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA 45 | AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT 46 | GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG 47 | GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA 48 | TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT 49 | CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG 50 | TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA 51 | GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC 52 | CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA 53 | TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA 54 | AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC 55 | TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT 56 | CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA 57 | TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG 58 | TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT 59 | CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT 60 | TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC 61 | ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG 62 | TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA 63 | ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG 64 | GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC 65 | 
CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT 66 | GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG 67 | GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT 68 | ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG 69 | CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC 70 | CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC 71 | GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT 72 | CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG 73 | CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA 74 | TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT 75 | TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG 76 | TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC 77 | AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC 78 | TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA 79 | 80 | --------------------------------------------------------------------------------