├── .gitignore ├── ProgrammingHomework1 ├── ERR037900_1.first1000.fastq ├── homework1.py ├── lambda_virus.fa └── phix.fa ├── ProgrammingHomework2 ├── 1110.txt.utf-8 ├── bm_preproc.py ├── chr1.GRCh38.excerpt.fasta ├── homework2.py └── kmer_index.py ├── ProgrammingHomework3 ├── 0440_approx__editdist3.pdf ├── ERR266411_1.for_asm.fastq ├── chr1.GRCh38.excerpt.fasta └── homework3.py ├── ProgrammingHomework4 ├── ads1_week4_reads.fq └── homework4.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
"""Programming Homework 1: naive exact and approximate matching on DNA strings.

Implements the naive exact-matching algorithm, a strand-aware variant that
also searches for the reverse complement of the pattern, and a variant that
tolerates a bounded number of mismatches.  The ``example*`` functions are
self-checks and the ``question*`` functions print the homework answers.

All output goes through single-argument ``print(...)`` calls, which behave
the same on Python 2 and Python 3.
"""

# Watson-Crick complement of each base; 'N' (unknown base) maps to itself.
_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}


def naive(p, t):
    """Return every 0-based offset at which pattern ``p`` occurs exactly in ``t``.

    Offsets are returned in ascending order.  The result is empty when ``p``
    is longer than ``t``.
    """
    width = len(p)
    return [i for i in range(len(t) - width + 1) if t[i:i + width] == p]


def naive_2mm(p, t, max_mismatches=2):
    """Return the offsets where ``p`` matches ``t`` with few enough mismatches.

    Only substitutions are counted (no insertions or deletions).

    p              -- pattern to look for
    t              -- text to search
    max_mismatches -- largest number of mismatching positions still accepted
                      (defaults to 2, the value the homework asks for)
    """
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        mismatches = 0
        for j, base in enumerate(p):  # loop over characters
            if t[i + j] != base:
                mismatches += 1
                if mismatches > max_mismatches:
                    break  # too many mismatches; give up on this alignment
        else:
            # Inner loop finished without exceeding the mismatch budget.
            occurrences.append(i)
    return occurrences


def naive_with_rc_first(p, t):
    """First attempt at strand-aware matching.

    Returns the offsets of ``p`` in ``t`` followed by the offsets of the
    reverse complement of ``p`` in ``t``.  If P is ACT, this finds occurrences
    of both ACT and its reverse complement AGT.

    When ``p`` equals its own reverse complement every offset is reported
    twice; ``naive_with_rc_then`` fixes that, and ``question2`` relies on the
    difference between the two.
    """
    return naive(p, t) + naive(reverseComplement(p), t)


def naive_with_rc_then(p, t):
    """Strand-aware matching that reports each offset only once.

    If ``p`` and its reverse complement are identical (e.g. AACGTT) the result
    is the same as ``naive(p, t)``.  Otherwise the offsets of ``p`` are
    followed by the offsets of its reverse complement.
    """
    occurrences = naive(p, t)
    rev_p = reverseComplement(p)
    if p == rev_p:
        return occurrences
    return occurrences + naive(rev_p, t)


def naive_with_rc(p, t):
    """Return the offsets of ``p`` or its reverse complement in ``t``."""
    return naive_with_rc_then(p, t)


def reverseComplement(s):
    """Return the reverse complement of the DNA string ``s``.

    Raises KeyError if ``s`` contains a character other than A, C, G, T or N.
    """
    return ''.join(_COMPLEMENT[base] for base in reversed(s))


def readGenome(filename):
    """Read a FASTA file and return its sequence as one string.

    Lines starting with '>' (header lines) are skipped; trailing whitespace
    is stripped from every other line before the lines are concatenated.
    """
    with open(filename, 'r') as f:
        return ''.join(line.rstrip() for line in f if not line.startswith('>'))


def readFastq(filename):
    """Read a FASTQ file and return ``(sequences, qualities)``.

    Records are read four lines at a time (name, bases, placeholder, quality).
    Reading stops at the first record whose base-sequence line is empty, which
    is what happens at end of file.
    """
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip()  # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


def phred33ToQ(qual):
    """Convert one Phred+33 encoded quality character to its integer score."""
    return ord(qual) - 33


def findPoorQualityReads(qualities, threshold=10, min_reads=2):
    """Find the reads whose best base quality is unusually low.

    A read is "poor" at a given threshold when its highest Phred+33 base
    quality is at or below that threshold.  Starting from ``threshold``, the
    threshold is raised one step at a time until at least ``min_reads`` reads
    are poor.  The list is rebuilt on every pass, so no read index appears
    more than once.

    qualities -- list of quality strings, one per read
    threshold -- first threshold to try
    min_reads -- number of poor reads required before stopping

    Returns ``(threshold, read_indices)`` where ``threshold`` is the value at
    which the search stopped and ``read_indices`` is in ascending order.

    Raises ValueError if there are fewer than ``min_reads`` reads (the search
    could never finish) or if a quality string is empty.
    """
    if len(qualities) < min_reads:
        raise ValueError('need at least %d reads, got %d'
                         % (min_reads, len(qualities)))
    # The best score of each read does not depend on the threshold.
    best_scores = [max(phred33ToQ(ch) for ch in qual) for qual in qualities]
    while True:
        poor = [i for i, best in enumerate(best_scores) if best <= threshold]
        if len(poor) >= min_reads:
            return threshold, poor
        threshold += 1


def example1():
    """Self-check: pattern and its reverse complement both occur once."""
    p = 'CCC'
    ten_as = 'AAAAAAAAAA'
    t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
    occurrences = naive_with_rc(p, t)
    assert occurrences == [10, 23]


def example2():
    """Self-check: a pattern equal to its reverse complement is not doubled."""
    p = 'CGCG'
    ten_as = 'AAAAAAAAAA'
    t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
    occurrences = naive_with_rc(p, t)
    assert occurrences == [10, 24]


def example3():
    """Self-check against the phi-X genome; reads 'phix.fa' from the cwd."""
    phix_genome = readGenome('phix.fa')
    occurrences = naive_with_rc('ATTA', phix_genome)
    assert min(occurrences) == 62
    assert len(occurrences) == 60


def question1(genome):
    """Print how often AGGT or its reverse complement occurs in ``genome``."""
    p = 'AGGT'
    assert reverseComplement(p) == 'ACCT'
    occurrences = naive_with_rc(p, genome)
    print('How many times does AGGT or its reverse complement (ACCT) occur in the lambda virus genome?')
    print(len(occurrences))


def question2(genome):
    """Print how often TTAA (its own reverse complement) occurs in ``genome``."""
    p = 'TTAA'
    assert reverseComplement(p) == 'TTAA'
    occurrences = naive_with_rc(p, genome)
    # The first attempt double-counts a pattern that is its own reverse complement.
    assert len(naive_with_rc_first(p, genome)) == 2 * len(naive_with_rc_then(p, genome))
    print('How many times does TTAA or its reverse complement occur in the lambda virus genome?')
    print('Hint: TTAA and its reverse complement are equal, so remember not to double count.')
    print(len(occurrences))


def question3(genome):
    """Print the leftmost offset of ACTAAGT or its reverse complement.

    Raises ValueError if neither occurs in ``genome``.
    """
    p = 'ACTAAGT'
    occurrences = naive_with_rc(p, genome)
    print('What is the offset of the leftmost occurrence of ACTAAGT or its reverse complement in the\n'
          'Lambda virus genome? E.g. if the leftmost occurrence of ACTAAGT is at offset 40 (0-based) and the\n'
          'leftmost occurrence of its reverse complement ACTTAGT is at offset 29, then report 29.')
    print(min(occurrences))


def question4(genome):
    """Print the leftmost offset of AGTCGA or its reverse complement.

    Raises ValueError if neither occurs in ``genome``.
    """
    p = 'AGTCGA'
    rev_p = reverseComplement(p)
    assert p != rev_p
    occurrences = naive_with_rc(p, genome)
    # The question text names the pattern that is actually searched for
    # (it previously repeated the ACTAAGT wording of question 3).
    print('What is the offset of the leftmost occurrence of AGTCGA or its reverse complement in the\n'
          'Lambda virus genome? E.g. if the leftmost occurrence of AGTCGA is at offset 40 (0-based) and the\n'
          'leftmost occurrence of its reverse complement TCGACT is at offset 29, then report 29.')
    print(min(occurrences))


def example10():
    """naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT') should return the list [0, 4]."""
    assert naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT') == [0, 4]


def example11():
    """Self-check: matches with 0, 1 and 2 mismatches are all reported."""
    p = 'CTGT'
    ten_as = 'AAAAAAAAAA'
    t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
    occurrences = naive_2mm(p, t)
    assert occurrences == [10, 24, 38]


def example12():
    """Self-check against the phi-X genome; reads 'phix.fa' from the cwd."""
    phix_genome = readGenome('phix.fa')
    occurrences = naive_2mm('GATTACA', phix_genome)
    assert min(occurrences) == 10
    assert len(occurrences) == 79


def question5(genome):
    """Print how often TTCAAGCC occurs in ``genome`` with up to 2 mismatches."""
    p = 'TTCAAGCC'
    occurrences = naive_2mm(p, genome)
    print('How many times does TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?')
    print(len(occurrences))


def question6(genome):
    """Print the leftmost offset of AGGAGGTT in ``genome`` with up to 2 mismatches.

    Raises IndexError if there is no such occurrence.
    """
    p = 'AGGAGGTT'
    occurrences = naive_2mm(p, genome)
    print('What is the offset of the leftmost occurrence of AGGAGGTT in the Lambda virus genome when allowing up to 2 mismatches?')
    # naive_2mm reports offsets in ascending order, so the first is the leftmost.
    print(occurrences[0])


def question7():
    """Print the poor-quality reads of 'ERR037900_1.first1000.fastq'.

    For each poor read one line is printed with its best score, all of its
    scores and its raw quality string; the last line is the threshold at
    which the search stopped followed by the list of poor read indices.
    """
    _, qualities = readFastq('ERR037900_1.first1000.fastq')
    # Phred+33 encoding, according to https://en.wikipedia.org/wiki/FASTQ_format
    #   !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
    #   0........................26...31.......40
    assert len(qualities) == 1000

    threshold, poor_quality_reads = findPoorQualityReads(qualities, threshold=10)
    for i in poor_quality_reads:
        scores = [phred33ToQ(ch) for ch in qualities[i]]
        print('{} {} {}'.format(max(scores), scores, qualities[i]))
    print('{} {}'.format(threshold, poor_quality_reads))


def main():
    """Run the self-checks and print the answers to all homework questions."""
    example1()
    example2()
    example3()
    print("All tests passed successfully for examples in set1")
    genome = readGenome('lambda_virus.fa')
    question1(genome)
    question2(genome)
    question3(genome)
    question4(genome)
    example10()
    example11()
    example12()
    print("All tests passed successfully for examples in set2")
    question5(genome)
    question6(genome)
    print("Question 7:")
    question7()


if __name__ == "__main__":
    main()
| CCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGGGGAGGAGCAGTATCTTAAATTTGGCGACA 23 | AAGAGACGCCGTTTGGCCTCAAATGGACGCCGGATGACCCCTCCAGCGTGTTTTATCTCTGCGAGCATAA 24 | TGCCTGCGTCATCCGCCAGCAGGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAGACCGGGATC 25 | TGGACCCGTGATGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCT 26 | TTCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGATGAAAACGAA 27 | AGGGGATACGGGAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGACGTGGGAGGCGAAAATTGGC 28 | GAACGTCCGGATGCTGAAGTGATGGCAGAGCGGAAAGAGCATTATTCAGCGCCCGTTCCTGACCGTGTGG 29 | CTTACCTGACCGCCGGTATCGACTCCCAGCTGGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGG 30 | TGAGGAAAGCTGGCTGATTGACCGGCAGATTATTATGGGCCGCCACGACGATGAACAGACGCTGCTGCGT 31 | GTGGATGAGGCCATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATGTCGATATCCCGTATCTGCT 32 | GGGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTTCCGGGTGAT 33 | CCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACGTAAGCGAAACAAAAACGGG 34 | GTTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCAGATTTATAACCGCTTCACACTGACGCCGG 35 | AAGGGGATGAACCGCTTCCCGGTGCCGTTCACTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGC 36 | GCAGCAGCTGACTGCTGAAGAGCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGC 37 | AAAAAGCGACGCAATGAGGCACTCGACTGCTTCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCC 38 | GCTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAGGAAGAGGATGGTGCAGCAACCAACAAGAA 39 | AACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGACAGGAAGAACTTGCCGCTG 40 | CCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTGGCAACAGTACAGAAAGACGGACGAAGGGT 41 | GGAGTTTACGGCCACTTCCGTGTCTGACCTGAAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATG 42 | ACACAGCGACGCAGGGGACCTGCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCC 43 | GGACGGCATGACATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTG 44 | CGGTCGTGGAACCCACCGAGTGAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTGGCAATGCCC 45 | GCGCAGACGATCTGGTACGCAATAACGGCTATGCCGCCAACGCCATCCAGCTGCATCAGGATCATATCGT 46 | CGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCTATCTGGGCATCGGGGAGGAAGAAGCCCGT 47 | GCCTTTTCCCGCGAGGTTGAAGCGGCATGGAAAGAGTTTGCCGAGGATGACTGCTGCTGCATTGACGTTG 48 | 
AGCGAAAACGCACGTTTACCATGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGGTGAACTGTT 49 | CGTTCAGGCCACCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAG 50 | CGCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTAATGACAGCG 51 | GTGCGGCGCTGGGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGCCGCAGAAATGGACATGGAT 52 | ACCCCGTGAGTTACCCGGCGGGCGCGCCTCGTTCATTCACGTTTTTGAACCCGTGGAGGACGGGCAGACT 53 | CGCGGTGCAAATGTGTTTTACAGCGTGATGGAGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGC 54 | TGCAGAGCGCCATTGTGAAGGCGATGTATGCCGCCACCATTGAGAGTGAGCTGGATACGCAGTCAGCGAT 55 | GGATTTTATTCTGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGGCTGGATTGGTGAAATTGCC 56 | GCGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGGGTGACTCAC 57 | TGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGCAGTCACTGCTGCGGTATAT 58 | CGCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGAATTACGCCCAGATGAGCTACTCCACGGCA 59 | CGGGCCAGTGCGAACGAGTCGTGGGCGTACTTTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGA 60 | GCCAGATGTTTCTGTGCTGGCTGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCG 61 | CTTCAGTTTTCAGGAAGCCCGCAGTGCCTGGGGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATC 62 | GATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGAAGCCGGACTGAGTACCTACGAGAAAGAGT 63 | GCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTGAAACGATGGAGCGCCGTGC 64 | AGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTGAATCCGGGCTGCGACAATCAACAGAGGAG 65 | GAGAAGAGTGACAGCAGAGCTGCGTAATCTCCCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATG 66 | CTTGAACCCGCCTATGCGCGGGTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGG 67 | ATGCGGTGTCCGGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGA 68 | CGGACCACGACAGGCCCGCAGTTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCCGGCACGCTG 69 | GTCAGCCGGACGCGGGCGCTGCAGCCGTACTCGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGC 70 | AACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTCGATATGGACACGCCCGGCGGGATGGTGGC 71 | GGGGGCATTTGACTGCGCTGACATCATCGCCCGTGTGCGTGACATAAAACCGGTATGGGCGCTTGCCAAC 72 | GACATGAACTGCAGTGCAGGTCAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGCAGACCGCCC 73 | GGACAGGCTCCATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGA 74 | 
AATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCGGATGACGTC 75 | CGGGAGACACTGCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAGAAGGTGTCGGCATATACCG 76 | GCCTGTCCGTGCAGGTTGTGCTGGATACCGAGGCTGCAGTGTACAGCGGTCAGGAGGCCATTGATGCCGG 77 | ACTGGCTGATGAACTTGTTAACAGCACCGATGCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAA 78 | TCCCGTCTCTCAGGAGGGCGAATGACCAAAGAGACTCAATCAACAACTGTTTCAGCCACTGCTTCGCAGG 79 | CTGACGTTACTGACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGGCGCAGCCGGACGTGAACGC 80 | GCAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAGGAGGCTCAC 81 | GGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTGAAAACGGCCCGCCGCATTC 82 | TGGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACTGCGCTGGATCGTCTGATGCAGGGGGCACC 83 | GGCACCGCTGGCTGCAGGTAACCCGGCATCTGATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGG 84 | ATGTTTATGACGAGCAAAGAAACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCG 85 | CAACCGCGCCCGGCGGATTGAGTGCGAAAGCGCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCG 86 | TAAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCGTTGGCATTCTTGCGGTTGCTGCTGACCAG 87 | ACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGATGTGCTCTGGCCGGAGGCTG 88 | CCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACGGCAATCAGCATCGTTTAACTTTACCCTTC 89 | ATCACTAAAGGCCGCCTGTGCGGCTTTTTTTACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACT 90 | GCTGGCGGCAAATGAGCAGAAATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTAT 91 | CCCTTCACCACGGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGC 92 | CGATTGTTTCCGGTGAGGTTATCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGGATATGTCAA 93 | GCCGAAGCATGAAGTGAATCCGCAGATGACCCTGCGTCGCCTGCCGGATGAAGATCCGCAGAATCTGGCG 94 | GACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACATGCGTGACGAAGAGCTGGCCATTGCTCAGG 95 | TCGAAGAGATGCAGGCAGTTTCTGCCGTGCTTAAGGGCAAATACACCATGACCGGTGAAGCCTTCGATCC 96 | GGTTGAGGTGGATATGGGCCGCAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAGTGGAGCAAG 97 | CGTGACAAGTCCACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGA 98 | ATATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGAGAAGCTGGA 99 | TACCCGTCGTGGCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGGCAAAGCGGTGTCCTATAAG 100 | 
GGGATGTATGGCGATGTGGCCATCGTCGTGTATTCCGGACAGTACGTGGAAAACGGCGTCAAAAAGAACT 101 | TCCTGCCGGACAACACGATGGTGCTGGGGAACACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCA 102 | GGATGCGGACGCACAGCGCGAAGGCATTAACGCCTCTGCCCGTTACCCGAAAAACTGGGTGACCACCGGC 103 | GATCCGGCGCGTGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTGGCTGACCCTGATGAGTTCG 104 | TGTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCCATGACGAAA 105 | GATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGATGTCAGCCTGACGGGGACGA 106 | AAGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAGCTTGATGACACGGATGAAACTGCCGGTCA 107 | GGACACCCCTCTCAGCCGGGAAAATGTGCTGACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGAT 108 | ACCGTGATTCTGGATACGTCTGAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTC 109 | ACGCCACGCGGGATGAACCTGTGGCATTTGTGCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGC 110 | AGCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAACGGGAGGCGCTGTGGCTGATTTCGATAACC 111 | TGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGGGAACGTCAGCCACCATTAC 112 | ATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTGATGACCCTGAAAATATCAGCTATGCCGGA 113 | CAGGGCGTGCGCGTTGAAGGCTCCAGCCCGTCCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGC 114 | GTGGAGACACGCTGACCATCGGTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAG 115 | TTGTCATCTCTGGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGG 116 | CCATAAAAGGTCTTGAGCAGGCCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCCTGGTGCCGC 117 | CGCAATGGCCATTAACCGCGTTGCTTCATCCGCGATATCGCAGTCGGCGTCACAGGTTGCCCGTGAGACA 118 | AAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAAAAGGGCCACGGTCAAAAATCCGCAGGCCA 119 | GAATCAAAGTTAACCGGGGGGATTTGCCCGTAATCAAGCTGGGTAATGCGCGGGTTGTCCTTTCGCGCCG 120 | CAGGCGTCGTAAAAAGGGGCAGCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTGGGTAACCGT 121 | CGTATTCCCGGCGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGA 122 | AAAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTTTAAACAAAA 123 | TATTGAGCGGATACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCTGCAGCATCAACTGAGGATG 124 | GTAATAAAGCGATGAAACATACTGAACTCCGTGCAGCCGTACTGGATGCACTGGAGAAGCATGACACCGG 125 | GGCGACGTTTTTTGATGGTCGCCCCGCTGTTTTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTC 126 
| ACCGGCGCTGAATACACGGGCGAAGAGCTGGACAGCGATACCTGGCAGGCGGAGCTGCATATCGAAGTTT 127 | TCCTGCCTGCTCAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCCGGATTTATCCGGTGATGAG 128 | CGATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGGCGCGACGAT 129 | GATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCT 130 | GTACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCACCCTGTGGGTTTATAAGGGGAGCGGTGACC 131 | CTTACGCGAATCCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGA 132 | ACTGACCGCTGAGTCCTATGACGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAG 133 | GGGCAGAAATCTGCCGGAGATACCAGCTTCACGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGC 134 | TGCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAAATCCGCTTCCCGAACGGCACGGTCGATGT 135 | GTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGTGATCACCCGCACGGTGAAA 136 | GTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGTAACAGCGGCAACCGGCATGACCG 137 | TGACGCCTGCCAGCACCTCGGTGGTGAAAGGGCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGG 138 | CGTAACCGACAAGAGCTTTCGTGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATG 139 | ACCATCACCGTGAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTG 140 | CTGCGGTTGCAGAAATTACCGTCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAAAACCGAATC 141 | ATTTGAACATAACGGTGTGACCGTCACGCTTTCTGAACTGTCAGCCCTGCAGCGCATTGAGCATCTCGCC 142 | CTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAACCGGAAGTTTACTGTGGAAGACGCCATCA 143 | GAACCGGCGCGTTTCTGGTGGCGATGTCCCTGTGGCATAACCATCCGCAGAAGACGCAGATGCCGTCCAT 144 | GAATGAAGCCGTTAAACAGATTGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATTTCTCATGCT 145 | GAAAACGTGGTGTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGG 146 | ACGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCCCTGAAACTG 147 | GCGCGTGAGATGGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCATCCACGGAGTATGCCGACT 148 | GGCACCGCTTTTACAGTACCCATTATTTTCATGATGTTCTGCTGGATATGCACTTTTCCGGGCTGACGTA 149 | CACCGTGCTCAGCCTGTTTTTCAGCGATCCGGATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGC 150 | GAGGCTGACGAAGAGCCTGAAGATGATGTGCTGATGCAGAAAGCGGCAGGGCTTGCCGGAGGTGTCCGCT 151 | TTGGCCCGGACGGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACATGACGGAGGATGACGTAAT 
152 | GCTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGATCTGGTCGT 153 | TGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAGGCGTCATTTTTCTGGTACG 154 | GAAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTCGCTGAGCCGACAGGCGCTGGCTGCACAGA 155 | AAGCGGGGATTTCCGTCGGGCAGTATAAAGCCGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGC 156 | CACGCAGCTTGCAGGCGGGCAAAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCC 157 | TTCGGCGGGATGATCCCCATGTTCAGGGGGCTTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCT 158 | CGCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTATCAGGGCAACTCAACCCTGTCCGATTTCAA 159 | CAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTATGCTGGTCCTGTCCAGAGCC 160 | GGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTCACTCAGCGCACTGGTTAAGGCGGGGGTAA 161 | GCGGTGAGGCTCAGATTGCGTCCATCAGCCAGAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGT 162 | GGACAAGGTCGCTGAAGCCTTCGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGC 163 | CAGTTCCATAACGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCG 164 | GGGCATTGCAGGCGGCGAACGAGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCTGAAAGAGAA 165 | CATGGGCACGCTGGAGACCTGGGCAGACAGGACTGCGCGGGCATTCAAATCCATGTGGGATGCGGTGCTG 166 | GATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAAGGCAGAGGCTGCGTATAAGAAAGCAGACG 167 | ACATCTGGAATCTGCGCAAGGATGATTATTTTGTTAACGATGAAGCGCGGGCGCGTTACTGGGATGATCG 168 | TGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAGGACAAAAAT 169 | GCGCAGCAGCAGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAAC 170 | GGCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAAAGACGGGAA 171 | AATCCTGCAGGCGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGAAAAAG 172 | CCGAAACAGTCCAGCGTGAAGGTGTCTGCGGGCGATCGTCAGGAAGACAGTGCTCATGCTGCCCTGCTGA 173 | CGCTTCAGGCAGAACTCCGGACGCTGGAGAAGCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCG 174 | GGATTTGTGGAAGGCGGAGAGTCAGTTCGCGGTACTGGAGGAGGCGGCGCAACGTCGCCAGCTGTCTGCA 175 | CAGGAGAAATCCCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGCCAGCTGGCTGCACTTGGCG 176 | ACAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACAGCAGCAACG 177 | 
GGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCAGGCAGAACGGGAAGCCACG 178 | GAACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGCGCTGAATAACGTCATGTCAGAGCAGAAAA 179 | AGACCTGGGCGGCTGAAGACCAGCTTCGCGGGAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTG 180 | GGAAGAGAGCGCCACGGACAGTATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCA 181 | CAGAATATGGCGGCGATGCTGACCGGCAGTGAGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCA 182 | TGATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATTGTCGGGAGTATCGGCAGCGCCATTGGCGG 183 | GGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGCTGCGGCGAAATTCCATTTT 184 | GCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCCAGCGGGGATTGTTCACCGTGGTGAGTTTG 185 | TCTTCACGAAGGAGGCAACCAGCCGGATTGGCGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCAC 186 | CGGCGGTTATGTCGGTACACCGGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAG 187 | AATAACCATGTGGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGT 188 | ATGACATGGCCCGCAAGGGTGCCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCTGTTCTCCGG 189 | AGGTGGACGATGAAGACCTTCCGCTGGAAAGTGAAACCCGGTATGGATGTGGCTTCGGTCCCTTCTGTAA 190 | GAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCTGCCGGGCTGAATGCCAACCTGAAAACGTA 191 | CAGCGTGACGCTTTCTGTCCCCCGTGAGGAGGCCACGGTACTGGAGTCGTTTCTGGAAGAGCACGGGGGC 192 | TGGAAATCCTTTCTGTGGACGCCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAAAATGGTCGT 193 | CGCGGGTCAGTATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATC 194 | CGGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGGAAATCGACC 195 | TGACAGAGGTCGGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAAAAGGTGAGCCGGTCACCTG 196 | GCAGGGGCGACAGTATCAGCCGTATCCCATTCAGGGGAGCGGTTTTGAACTGAATGGCAAAGGCACCAGT 197 | ACGCGCCCCACGCTGACGGTTTCTAACCTGTACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTC 198 | TGGTCGGCGGAACGGTGGTCCGGCGTAAGGTTTACGCCCGTTTTCTGGATGCGGTGAACTTCGTCAACGG 199 | AAACAGTTACGCCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGAGCAGTGCAGCGAACTGAGC 200 | GCGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGGGACGTATCA 201 | TGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATAGCGGTCCGGCTGTCGCGGA 202 | TGAATATGACCAGCCAACGTCCGATATCACGAAGGATAAATGCAGCAAATGCCTGAGCGGTTGTAAGTTC 203 
| CGCAATAACGTCGGCAACTTTGGCGGCTTCCTTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACA 204 | GACAGAATCAGCGATTCTGGCGCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGC 205 | ACGCCGGAGGGGGAAAGATATTTCCCCTGCGTGAATATCTCCGGTGAGCCGGAGGCTATTTCCGTATGTC 206 | GCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATTGTGGCGCTGGTCCACAGCCACCCCGGTGGT 207 | CTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTTGCCGTGGTGGCTGGTCTGCC 208 | GGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCACCGGGCGGCGCTTTGAGCACGGTGTGACGGA 209 | CTGTTACACACTGTTCCGGGATGCTTATCATCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGAT 210 | GACTGGTGGCGTAACGGCCAGAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGT 211 | TGTCAGCGGCACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAAT 212 | TTACTGCGGCGACGGCGAGCTGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAGGTACACCGAC 213 | AAATGGCAGCGACGCACACACTCCCTCTGGCGTCACCGGGCATGGCGCGCATCTGCCTTTACGGGGATTT 214 | ACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACGGGGGCTGAAGCCATCCGGGCACTGGCCACA 215 | CAGCTCCCGGCGTTTCGTCAGAAACTGAGCGACGGCTGGTATCAGGTACGGATTGCCGGGCGGGACGTCA 216 | GCACGTCCGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATATTGTTCCCAG 217 | AGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC 218 | TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACCGGCATCCTGT 219 | TTTCTCTCGGTGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCCG 220 | TATACAGACAACGGATAACGGTAAGCAGAACACCTATTTCTCCTCACTGGATAACATGGTTGCCCAGGGC 221 | AATGTTCTGCCTGTTCTGTACGGGGAAATGCGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGG 222 | CAGACGAAGGGGACGGTGGTCAGGTTGTGGTGATTGGTCGCTGATGCAAAATGTTTTATGTGAAACCGCC 223 | TGCGGGCGGTTTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCAGTAAGGGGCATACCCCGCGC 224 | GAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAAGGGCCGATTG 225 | AAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTGCTGGACACTGAGGGGAATAC 226 | CAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAGCAGGAGCAGACTCCGCCGGAGGGATTTGAA 227 | TCCTCCGGCTCCGAGACGGTGCTGGGTACGGAAGTGAAATATGACACGCCGATCACCCGCACCATTACGT 228 | CTGCAAACATCGACCGTCTGCGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGA 
229 | CAGGAATCCGTCGGAAGTCCGCCTGCTGGTTCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGAC 230 | ATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCTCGGTGGTGATGGGTAACCTGCCGCCGCGCC 231 | CGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAGCTGCAGAACAAAACGCTCTG 232 | GTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTACCCGAACACGGCACTGGTCGGCGTGCAGGTG 233 | GACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGC 234 | CGTCGAACTATAACCCGCAGACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAG 235 | CAACAACATGGCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGT 236 | GCGGCGGATGTGGATAAATGGGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTGCCGGACGGCT 237 | TTGGCGGCACGGAGCCGCGCATCACCTGTAATGCGTACCTGACCACACAGCGTAAGGCGTGGGATGTGCT 238 | CAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGGAACGGGCAGACGCTGACGTTCGTGCAGGAC 239 | CGACCGTCGGATAAGACGTGGACCTATAACCGCAGTAATGTGGTGATGCCGGATGATGGCGCGCCGTTCC 240 | GCTACAGCTTCAGCGCCCTGAAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACCCGAACAACGG 241 | CTGGGAGACGGCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG 242 | ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATTAAAACAGAAC 243 | TGCTGGAAACGCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGCCATGTACCGGGCGATGTTAT 244 | TGAAATCTGCGATGATGACTATGCCGGTATCAGCACCGGTGGTCGTGTGCTGGCGGTGAACAGCCAGACC 245 | CGGACGCTGACGCTCGACCGTGAAATCACGCTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACG 246 | GAAGTGGCAATCCGGTCAGCGTGGAGGTTCAGTCCGTCACCGACGGCGTGAAGGTAAAAGTGAGCCGTGT 247 | TCCTGACGGTGTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGCTGCGCCAGCGACTGTTCCGC 248 | TGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTGCCGGAAAAAG 249 | AGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACGGTGAATGGTGTCACGCCGCC 250 | AGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGCGGGGAATATCAGGTGCTGGCGCGATGGGAC 251 | ACACCGAAGGTGGTGAAGGGCGTGAGTTTCCTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGC 252 | GGCTGGTCAGCACGGCCCGGACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAG 253 | GCTGACAGTCCGGGCGGTAAATGCGTGGGGGCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCC 254 | 
GCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCTATTTTCAGATAACCGCCACGCCGCATCTTG 255 | CCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAGATTGCGGATATCAGACAGGT 256 | TGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGGATAGCCGCCAGTATCAATATCAAACCGGGC 257 | CATGATTATTACTTTTATATCCGCAGTGTGAACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTC 258 | GGGCGAGCGATGATGCGGAAGGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAA 259 | GGAGCTGCTGGAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGG 260 | AAGGATGCCAGTGATAAGTGGAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGACGGCAAACATT 261 | ATGTCGCGGGTATTGGCCTCAGCATGGAGGACACGGAGGAAGGCAAACTGAGCCAGTTTCTGGTTGCCGC 262 | CAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAAACGCCGATGTTTGTGGCGCAGGGCAACCAG 263 | ATATTCATGAACGACGTGTTCCTGAAGCGCCTGACGGCCCCCACCATTACCAGCGGCGGCAATCCTCCGG 264 | CCTTTTCCCTGACACCGGACGGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTGTGAATGCGAA 265 | CTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA 266 | ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGTGTGGACTGGC 267 | CGTCAGGTACCCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGCCAGATAGTGGTGCTTCCGCT 268 | GACGTTTCGCGGAAGTAAGCGTACTGTCAGCGGCAGGACAACGTATTCGATGTGTTATCTGAAAGTACTG 269 | ATGAACGGTGCGGTGATTTATGATGGCGCGGCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACA 270 | TGCCAGCGGGTCGGGGAAACGTGATCCTGACGTTCACGCTTACGTCCACACGGCATTCGGCAGATATTCC 271 | GCCGTATACGTTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGGCGCTGGGCATCAGCGTGGTC 272 | TGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTGAACGATGCGT 273 | AATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCACTCCGGCCCGTGCGGAAGGTG 274 | GACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACCGGGTACATTGCCGTCGTTGTCGGGCGGGGA 275 | TACCGGTGTGAGTCATCTGAAAGGGATTAACGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTG 276 | ATGGCTTCCCTGGGGTTCGCCGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACT 277 | ATGAGAGCCTGCGTGGACGTTATGTGAGCGTGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAG 278 | TGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCCGGCAGTACAATGGATTACCGTAAGACGGAA 279 | ATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGCAATGCGGCATACCTCAGTGG 280 
| CGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGTCGTTGTTGATATTGCTTATGAAGGCTCCGG 281 | CAGTGGCGACTGGCGTACTGACGGATTCATCGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACAC 282 | AGTGTTATGACAGCCCGCCGGAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGA 283 | GTCCTGAAAGACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCA 284 | CCACGGTGGTGGTGAACACGGTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCATGGATGTGGA 285 | GTACGGTCAGTACAGTGTCATCCTGCAGGTTGACGGTTTTCCACCATCGCACGCCGGGACCATCACCGTG 286 | TATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCTGTGCCATGACGGAGGATGATGCCCGGCCGG 287 | AGGTGCTGCGTCGTCTTGAACTGATGGTGGAAGAGGTGGCGCGTAACGCGTCCGTGGTGGCACAGAGTAC 288 | GGCAGACGCGAAGAAATCAGCCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCTTGTGACTGAT 289 | GCAACTGACTCAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT 290 | CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAGAGTCCTCAAA 291 | AAACGCGGCGGCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATGCTGCAGCGTCACAACAATCA 292 | GCCGCCACGTCTGCCTCCACCGCGGCCACGAAAGCGTCAGAGGCCGCCACTTCAGCACGAGATGCGGTGG 293 | CCTCAAAAGAGGCAGCAAAATCATCAGAAACGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGC 294 | AACGGCGGCAGAAAATTCTGCCAGGGCGGCAAAAACGTCCGAGACGAATGCCAGGTCATCTGAAACAGCA 295 | GCGGAACGGAGCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGGGAGTGCGTCAACGGCATCCA 296 | CGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAGAAGCGGCGGC 297 | AATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTGTCGCGCTTGAGGATGCGGAC 298 | ACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCAACAGCACGTCTGAAACGCTTGCTGCAACGC 299 | CAAAGGCGGTTAAGGTGGTAATGGATGAAACGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGA 300 | ACGCCAACAGCACCAACCGCGCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGG 301 | CCGCGATTGCAGATGTTATCGACGCGTCACCTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCT 302 | CGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCGCTTGCGGGTAAACAACCGAAGAATGCGACA 303 | CTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTTTGCGGAAAATGATGCCGCCA 304 | GCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGCAAAAAATTCCGTTGCAGATGTTCTTGAATA 305 | CCTTGGGGCCGGTGAGAATTCGGCCTTTCCGGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCG 
306 | TCTGGCTACGTCCTGATGCAGGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATC 307 | CATCGGGTGTGCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATT 308 | GTCTCAGGAACAGGATGGAATTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGATTTGGGGACG 309 | AAAACCACATCGTCGTTTGATTACGGGACGAAAACAACAGGCAGTTTCGATTACGGCACCAAATCGACGA 310 | ATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAACAGGGGCCGCGGGTGCTCATGCCCACACAAG 311 | TGGTTTAAGGATGAACAGTTCTGGCTGGAGTCAGTATGGAACAGCAACCATTACAGGAAGTTTATCCACA 312 | GTTAAAGGAACCAGCACACAGGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGCCACAGTCACT 313 | CATTGTCCGGTACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC 314 | GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCACCGTTAACGCT 315 | GCGGGTAACGCGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGTGAGGCTTGCATAATGGCATT 316 | CAGAATGAGTGAACAACCACGGACCATAAAAATTTATAATCTGCTGGCCGGAACTAATGAATTTATTGGT 317 | GAAGGTGACGCATATATTCCGCCTCATACCGGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATA 318 | TTCCGGCTGGCTTTGTGGCTGTTTTCAACAGTGATGAGGCATCGTGGCATCTCGTTGAAGACCATCGGGG 319 | TAAAACCGTCTATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAACTCGGTCCGTTACCGGAAAAT 320 | TTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAGGATACGGAAG 321 | CAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTGATGCAGGTAGCCAGTGAGCA 322 | TATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCAACGAAGGAAGAAACCTCGTTGCTGGAAGCC 323 | TGGAAGAAGTATCGGGTGTTGCTGAACCGTGTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTG 324 | TCCCTGTTATGGAGTAATCGTTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAG 325 | TTAGTATATTGTAAAGCTGAGTATTGGTTTATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCT 326 | ATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTGCTTTTAAGACTGAACGCATGAAATATGGTT 327 | TTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTTTCTTCGTTTTCTCTAACTAT 328 | TTTCCATGAAATACATTTTTGATTATTATTTGAATCAATTCCAATTACCTGAAGTCTTTCATCTATAATT 329 | GGCATTGTATGTATTGGTTTATTGGAGTAGATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGA 330 | AGCCATAGGCATTTGTTATTTTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATC 331 | 
TTCAAATGTTGTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGA 332 | TAACCGTCCTTTAAAAAAGTCGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCTGTCGAAGTGA 333 | TATTTTTAGGCTTATCTACCAGTTTTAGACGCTCTTTAATATCTTCAGGAATTATTTTATTGTCATATTG 334 | TATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTTAATTTTAAATAAGTTATTCTCCTGGCTTCA 335 | TCAAATAAAGAGTCGAATGATGTTGGCGAAATCACATCGTCACCCATTGGATTGTTTATTTGTATGCCAA 336 | GAGAGTTACAGCAGTTATACATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGTAATCTTTTAG 337 | CGTATTAGCGACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT 338 | TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAACTCAATGTCAT 339 | TTTCTTCAATGTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATTTCTAAATCGCCTTGTTTTTC 340 | TATCGTATTGCGAGAATTTTTAGCCCAAGCCATTAATGGATCATTTTTCCATTTTTCAATAACATTATTG 341 | TTATACCAAATGTCATATCCTATAATCTGGTTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGG 342 | TTTGGAGGAATTGATTCAAATTCAAGCGAAATAATTCAGGGTCAAAATATGTATCAATGCAGCATTTGAG 343 | CAAGTGCGATAAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAAAACTCTCCATTTTGATAGGT 344 | TGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGTAATACAAGTT 345 | GTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTACACAGGAAATTTTTAATATT 346 | ATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGCTTGGCTCTGCTAACACGTTGCTCATAGGAG 347 | ATATGGTAGAGCCGCAGACACGTCGTATGCAGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGC 348 | GGGTGTTGAATGATTTCCAGTTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCC 349 | CACCGACCATCTATGACTGTACGCCACTGTCCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGT 350 | CCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAGGAATTGGTTAGCAAGTTACTACCGGATTTT 351 | GTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGCGTCGTAATTTTCTATCTTTC 352 | ATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTGCTAGGCACTGATACATAACTCTTTTCCAAT 353 | AATTGGGGAAGTCATTCAAATCTATAATAGGTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTG 354 | AAACGTTGCGGTTGAACTATATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTC 355 | AAGTGCTTCCCTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGT 356 | GTTTGTCTTCCTGCCTCCAGTTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAGTTCCGGAAAC 357 
| GAAATTTGCATATACCCATTGCTCACGAAAAAAAATGTCCTTGTCGATATAGGGATGAATCGCTTGGTGT 358 | ACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATATTGCAGTCGCGGCACGATGGAACTAAATTAA 359 | TAGGCATCACCGAAAATTCAGGATAATGTGCAATAGGAAGAAAATGATCTATATTTTTTGTCTGTCCTAT 360 | ATCACCACAAAATGGACATTTTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCTAGCGGGTTTG 361 | TTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG 362 | TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATATTCTATATTG 363 | TTGTTCTTTCTGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAACATTTATCTGCATCATACCT 364 | TCCGAGCATTTATTAAGCATTTCGCTATAAGTTCTCGCTGGAAGAGGTAGTTTTTTCATTGTACTTTACC 365 | TTCATCTCTGTTCATTATCATCGCTTTTAAAACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTT 366 | TTTAGAATGGTTTCATAAGAAAGCTCTGAATCAACGGACTGCGATAATAAGTGGTGGTATCCAGAATTTG 367 | TCACTTCAAGTAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCGAATGTCTCAATATCCGGACG 368 | GATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTCTTGCAGTACT 369 | ACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAAGTCGCTTAATGTTCGTAAAA 370 | AAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTCATCGAATAAAACTAATGACTTTTCGCCAAC 371 | GACATCTACTAATCTTGTGATAGTAAATAAAACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTC 372 | TGGATATTGTCATAAAACAATTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTA 373 | ACTCTTCATATTTAGAAATGAGGCTGATGAGTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTT 374 | GATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTCATACAACCAATAAATGCTGAAATGAATTCT 375 | AAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAGTCCAATATAAAAGTATTGTG 376 | TACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAAAGGATCAAATGCACTAAACGAAACTGAAAC 377 | AAGCGATCGAAAATATCCCTTTGGGATTCTTGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCA 378 | TTGTTTTCTGGGTTGGTGATTGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCC 379 | CGATAAAAGCATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACT 380 | GAATCCGGGAGCACTTTTTCTATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACCATTTAACACA 381 | CGTGCGAACTGTCCATGAATTTCTGAAAGAGTTACCCCTCTAAGTAATGAGGTGTTAAGGACGCTTTCAT 382 | TTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATCCTGAATAGCTTTAAGAAGGTTATGTTTAAA 
383 | ACCATCGCTTAATTTGCTGAGATTAACATAGTAGTCAATGCTTTCACCTAAGGAAAAAAACATTTCAGGG 384 | AGTTGACTGAATTTTTTATCTATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACAAAACCAATTT 385 | TAACATTTCCGATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA 386 | ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATACCCGCCTCTT 387 | TCAATAACACTAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATAACCGCAATTTATTTGGCGGC 388 | AACACAGGATCTCTCTTTTAAGTTACTCTCTATTACATACGTTTTCCATCTAAAAATTAGTAGTATTGAA 389 | CTTAACGGGGCATCGTATTGTAGTTTTCCATATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTAT 390 | TCATGTTGCATGGTGCACTGTTTATACCAACGATATAGTCTATTAATGCATATATAGTATCGCCGAACGA 391 | TTAGCTCTTCAGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATAGATAGCCACGGACTTCGTAG 392 | CCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTATGGCGGAAAGG 393 | TATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGTCGTTTGACATCACTGCTATC 394 | TTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAAGCTTTGCACTGGATTGCGAGGCTTTGTGCT 395 | TCTCTGGAGTGCGACAGGTTTGATGACAAAAAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGC 396 | TCTGTTACAGGTCACTAATACCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTA 397 | TTATGTAGTCTGTTTTTTATGCAAAATCTAATTTAATATATTGATATTTATATCATTTTACGTTTCTCGT 398 | TCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATTGCTTATCAATTTGTTGCAACGAACAGGTCA 399 | CTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCCTGCCTCTGTCATCACGATAC 400 | TGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTTGAGCAAACTTATCGCTTATCTGCTTCTCAT 401 | AGAGTCTTGCAGACAAACTGCGCAACTCGTGAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGC 402 | TTTTCGTGCGCGCATAAAATACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATT 403 | ATGGTTTCTCCGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAA 404 | TATGCAATGCTGTTGGGATGGCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGATATCCATCTAC 405 | GATATCAGACCACTTCATTTCGCATAAATCACCAACTCGTTGCCCGGTAACAACAGCCAGTTCCATTGCA 406 | AGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAATTTTCAGGTATTCGTCAGCCGTAAGTCTTG 407 | ATCTCCTTACCTCTGATTTTGCTGCGCGAGTGGCAGCGACATGGTTTGTTGTTATATGGCCTTCAGCTAT 408 | 
TGCCTCTCGGAATGCATCGCTCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCCCTCGTCTATG 409 | TATCCATTGAGCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC 410 | TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGGCCAGGATTTT 411 | TTCGTAGCGATCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGATTCTCGCTGTCAGAGGCTTG 412 | TGTTTGTGTCCTGAAAATAACTCAATGTTGGCCTGTATAGCTTCAGTGATTGCGATTCGCCTGTCTCTGC 413 | CTAATCCAAACTCTTTACCCGTCCTTGGGTCCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTT 414 | AGGGGGTAAATCCCGGCGCTCATGACTTCGCCTTCTTCCCATTTCTGATCCTCTTCAAAAGGCCACCTGT 415 | TACTGGTCGATTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACTCTCTTCCATCCTTAACCGGA 416 | GGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGTCGCTGGCGTG 417 | CGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACACGCAAGAAAAAACCGCCATCA 418 | GGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTGGTTACGTCTGCATGTGCTATCTGCGCCCAT 419 | ATCATCCAGTGGTCGTAGCAGTCGTTGATGTTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCC 420 | ATTCTCCTGTGACTCGGAAGTGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAA 421 | AATAAATCCCCGCGAGTGCGAGGATTGTTATGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAG 422 | AGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTTTGCACGGTATCAGTCATTTCTCGCACATTG 423 | CAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTTGTATGCCGACTCTATATCTA 424 | TACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGGAGACAAGACACCGGATCTGCACAACATTGA 425 | TAACGCCCAATCTTTTTGCTCAGACTCTAACTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTC 426 | GTTTCAGCTAAACGGTATCAGCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGA 427 | GTACGGTCATCATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCAC 428 | AAGCGTTATCTTTTACAAAACCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGACATCATATGCA 429 | GATACTCACCTGCATCCTGAACCCATTGACCTCCAACCCCGTAATAGCGATGCGTAATGATGTCGATAGT 430 | TACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCTTCCAGGTCACCAGTGCAGTGCTTGATAACA 431 | GGAGTCTTCCCAGGATGGCGAACAACAAGAAACTGGTTTCCGTCTTCACGGACTTCGTTGCTTTCCAGTT 432 | TAGCAATACGCTTACTCCCATCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGAGTTTTGATTT 433 | TGCTGTTTCAAGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT 434 
| TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTTACCAATTCAT 435 | GGAAAAGGTCTGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCCGCTTCACGCAGTGCCTGAGA 436 | GTTAATTTCGCTCACTTCGAACCTCTCTGTTTACTGATAAGTTCCAGATCCTCCTGGCAACTTGCACAAG 437 | TCCGACAACCCTGAACGACCAGGCGTCTTCGTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGC 438 | AGATATAGCCTGGTGGTTCAGGCGGCGCATTTTTATTGCTGTGTTGCGCTGTAATTCTTCTATTTCTGAT 439 | GCTGAATCAATGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTTAATACGCTTGAGGGTGAATG 440 | CGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTCCTTAAAGACG 441 | CCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCATCAGCGTTACCGTTTCGCGGT 442 | GCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTTATCCGGAAACTGCTGTCTGGCTTTTTTTGA 443 | TTTCAGAATTAGCCTGACGGGCAATGCTGCGAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCC 444 | ATGTCGGCAAGCATAAGCACACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATAC 445 | CTGGTTTCTCTCATCTGCTTCTGCTTTCGCCACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAAC 446 | GTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAAACCTGATTCCAATTTGAGCAAGGCTATGTG 447 | CCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAGCCCCTCGTTTATTATTTATC 448 | TCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAACAGAAAGGCCGGGAAATACCCAGCCTCGCT 449 | TTGTAACGGAGTAGACGAAAGTGATTGCGCCTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGC 450 | TCCCCAAATACAAAACCAATTTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGT 451 | CAAAACTCGCCATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGG 452 | GTCATAGTTGGCAAAGTACCAGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTGGGCCATGTAA 453 | GCTGACTTTATGGCCTCGAAACCACCGAGCCGGAACTTCATGAAATCCCGGGAGGTAAACGGGCATTTCA 454 | GTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGAGCAGGCGGTACGCATACTTTCGTCGCGATA 455 | GATGATCGGGGATTCAGTAACATTCACGCCGGAAGTGAATTCAAACAGGGTTCTGGCGTCGTTCTCGTAC 456 | TGTTTTCCCCAGGCCAGTGCTTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCAGCAAGCAGGG 457 | TGTGGAAGTAGGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC 458 | TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGACAGCTCTCACA 459 | TCGATCCCGGTACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTCTGCTCTGCGGCTTTCTGTTT 
460 | CAGGAATCCAAGAGCTTTTACTGCTTCGGCCTGTGTCAGTTCTGACGATGCACGAATGTCGCGGCGAAAT 461 | ATCTGGGAACAGAGCGGCAATAAGTCGTCATCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCT 462 | CCTGCATGGTTTCATCGTTAACCGGAGTGATGTCGCGTTCCGGCTGACGTTCTGCAGTGTATGCAGTATT 463 | TTCGACAATGCGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGAAGGCCAGACGGGCACACTGA 464 | ATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCTCTGCCTTCGC 465 | GAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAGATCGGATGATTACGGTCCTT 466 | GCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCAAAGTCCATGCCATCAAACTGCTGGTTTTCA 467 | TTGATGATGCGGGACCAGCCATCAACGCCCACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGT 468 | AAATTTCTTTCGTCCACGGATTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATC 469 | GCTGGCATCACCTTTAAATGCCGTCTGGCGAAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATG 470 | CCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTGCAGTACTCATTCGTTTTATACCTCTGAATC 471 | AATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTTCTGCCATGCGCTCCTGAAAC 472 | TCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGCTGGCCCCGTGGCGTTGCAAATGATCGATGC 473 | ATAGCGATTCAAACAGGTGCTGGGGCAGGCCTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTC 474 | ACGGGCGAGCTGCTGGTAGTGACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTC 475 | ATGGCTGAACTCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAA 476 | AGGGGGTTAGTGAATGCTTTTGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATAAGCGTCGACG 477 | GCTTCACGAAACATCTTTTCATCGCCAATAAAAGTGGCGATAGTGAATTTAGTCTGGATAGCCATAAGTG 478 | TTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGTCGATAAGGCGTTTCCATCCGTCACGTAATT 479 | TACGGGTGATTCGTTCAAGTAAAGATTCGGAAGGGCAGCCAGCAACAGGCCACCCTGCAATGGCATATTG 480 | CATGGTGTGCTCCTTATTTATACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTATGCGGTAAAAC 481 | CGCACTCAGGCGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC 482 | TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGTCGAACTCACA 483 | CACAACACCATATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCATGTTGCGCCAGCATGATTAAT 484 | ACAGCATTTAATACAGAGCCGTGTTTATTGAGTCGGTATTCAGAGTCTGACCAGAAATTATTAATCTGGT 485 | 
GAAGTTTTTCCTCTGTCATTACGTCATGGTCGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAAT 486 | GATGTATTTTTTGATGTTTGACATCTGTTCATATCCTCACAGATAAAAAATCGCCCTCACACTGGAGGGC 487 | AAAGAAGATTTCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAGCGACATTGCTCCGTGTATTC 488 | ACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGGCCACTATCAG 489 | GCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGTTCGTATTGCCCATTTATCGA 490 | CATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGGCTTAACCATGCATTCCGATTGCAGCTTGCA 491 | TCCATTGCATCGCTTGAATTGTCCACACCATTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTG 492 | GTATTGTTCCATCACATCCTGAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAA 493 | TAGTGGATTGCGGTAGTAAAGATTGTGCCTGTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCC 494 | CTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAGTTTGATGAGTATAGAAATGGATCCACTCGT 495 | TATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATATGATCGTTATCTGGGTTGGA 496 | CTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTAATGAGAGAATCGGTATTCCTCATGTGTGGC 497 | ATGTTTTCGTCTTTGCTCTTGCATTTTCGCTAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGC 498 | CAGATATAAGCGATTTAAGCTAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAA 499 | AACCTTACAGAAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACAT 500 | CAAAACAATTCCCATACATTAGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTTCCCGATGGAA 501 | TGGAAAGCATATATTATTCCCTATTGAGGATATTTACTGGACTGAATTAGTTGCCAGCTATGATCCATAT 502 | AATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAGATAAGAGGAATCGATTTTCCCTTAATTTTC 503 | TGGCGTCCACTGCATGTTATGCCGCGTTCGCCAGGCTTGCTGTACCATGTGCGCTGATTCTTGCGCTCAA 504 | TACGTTGCAGGTTGCTTTCAATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCGGATTTAGTGC 505 | GCTTTCTACTCGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC 506 | CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGGCGGCGTGTTT 507 | GTGCATCCATCTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAGTTGTAGTCCTGAACGAAAAC 508 | CCCCCGCGATTGGCACATTGGCAGCTAATCCGGAATCGCACTTACGGCCAATGCTTCGTTTCGTATCACA 509 | CACCCCAAAGCCTTCTGCTTTGAATGCTGCCCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATG 510 | GTGGTCAGTGCGTCCTGCTGATGTGCTCAGTATCACCGCCAGTGGTATTTATGTCAACACCGCCAGAGAT 511 
| AATTTATCACCGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTTTTTGCAGGGGGGCATTGTTT 512 | GGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAAATACAATTGG 513 | TTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGCCGGGTTATTCTTGTTCTCTG 514 | GTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAATGAACGATGCAGAGGCAATGCCGATGGCGA 515 | TAGTGGGTATCATGTAGCCGCTTATGCTGGAAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTC 516 | AACAAAACTAAGGGCATAGACAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGG 517 | CGCGTTCTGCTTCCGATTAGAAACGTCAAGGCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATG 518 | ACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGAAACACCAGGAATGTAGTGGCGGAAAAGGAG 519 | ATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAGGCATGATTCTGTTCCGCATA 520 | ATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCCTTTTAAAACATTCCAGTATATCACTTTTCA 521 | TTCTTGCGTAGCAATATGCCATCTCTTCAGCTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAG 522 | ATGGCCTTTTTCTGATAGATAATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATG 523 | TCTGAAAATTGAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTG 524 | GCTTAATGACTATATCCAATGAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGACCTTTAATAT 525 | ATCGCCAAATACAGGTAGCTTGGCTTCTACCTTCACCGTTGTTCGGCCGATGAAATGCATATGCATAACA 526 | TCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAACGCGCTCTCCACTGCTTAATGACATTCCTT 527 | TCCCGATTAAAAAATCTGTCAGATCGGATGTGGTCGGCCCGAAAACAGTTCTGGCAAAACCAATGGTGTC 528 | GCCTTCAACAAACAAAAAAGATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTTAATATCTTCA 529 | ACTGAAGCTTTAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT 530 | TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAGCCAGCAGAGA 531 | ATTAAGGAAAACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTTTGCTGCGGTAAGTCGCATAA 532 | AAACCATTCTTCATAATTCAATCCATTTACTATGTTATGTTCTGAGGGGAGTGAAAATTCCCCTAATTCG 533 | ATGAAGATTCTTGCTCAATTGTTATCAGCTATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCT 534 | CTTCAGGCCACTGACTAGCGATAACTTTCCCCACAACGGAACAACTCTCATTGCATGGGATCATTGGGTA 535 | CTGTGGGTTTAGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTTTCTTGAAGGTAAACTCATCA 536 | CCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGAATTAACATTC 
537 | CGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCTTCAACCTCAAGCCAGAATGC 538 | AGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCCGCATCACCTTTGGTAAAGGTTCTAAGCTCA 539 | GGTGAGAACATCCCTGCCTGAACATGAGAAAAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCA 540 | TACTAACCGCTTCATACATCTCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTT 541 | GAGAATTTTTGCAAGCAATGCGGCGTTATAAGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACG 542 | CCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCTGGGATAAGCCAAGTTCATTTTTCTTTTTTT 543 | CATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAATGGTTTCTTTTTTGTGCTCAT 544 | ACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACCGTGCGTGTTGACTATTTTACCTCTGGCGGT 545 | GATAATGGTTGCATGTACTAAGGAGGTTGTATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCT 546 | TTGGGCAAACCAAGACAGCTAAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGG 547 | CCGAAAGATTTTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGT 548 | AACAAAAAAACAACAGCATAAATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGCATCAAATTAA 549 | ACCACACCTATGGTGTATGCATTTATTTGCATACATTCAATCAATTGTTATCTAAGGAAATACTTACATA 550 | TGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGAGAGTGCGTTGCTTAACAAAATCGCAATGCT 551 | TGGAACTGAGAAGACAGCGGAAGCTGTGGGCGTTGATAAGTCGCAGATCAGCAGGTGGAAGAGGGACTGG 552 | ATTCCAAAGTTCTCAATGCTGCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATGGCTCGATTGG 553 | CGCGACAAGTTGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA 554 | GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCAAAAATACTCA 555 | ACTTCGGCAGAGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTCGATGATGGTTACGCCAGACT 556 | ATCAAATATGCTGCTTGAGGCTTATTCGGGCGCAGATCTGACCAAGCGACAGTTTAAAGTGCTGCTTGCC 557 | ATTCTGCGTAAAACCTATGGGTGGAATAAACCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTA 558 | CAAAGTTACCTGTCAAACGGTGCAATGAAGCCAAGTTAGAACTCGTCAGAATGAATATTATCAAGCAGCA 559 | AGGCGGCATGTTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTCAAAACGAGGGAAAATCCCCT 560 | AAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGACACAAAAGACA 561 | CTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAATCCTCTGACCAGCCAGAAAA 562 | 
CGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGCGGCAGCAAGTGGGGGACAGCAGAAGACCTG 563 | ACCGCCGCAGAGTGGATGTTTGACATGGTGAAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTG 564 | GGTGGGCTAACGATATCCGCCTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTT 565 | CCGCTGGGCATGCCAGGACAACTTCTGGTCCGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGG 566 | ACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCGTGACAGCCAGCAAACCAAAACTCGACCTGA 567 | CAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCACAGATGGTTAACTTTGACCGT 568 | GAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGTACGACGAAAAGCCGCAGGTACAGCAGGTAG 569 | CGCAGATCATCAACGGTGTGTTCAGCCAGTTACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCA 570 | GAACGAAGTGAACGAAATCCGTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAA 571 | CAGGTTAACGCAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTG 572 | TTGCATGGTGCCGGGAAGAAGCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGGTTGATATGGT 573 | TTACGAGTATTGCCGGAAGCGAGGCCTGTATCCGGATGCGGAGTCTTATCCGTGGAAATCAAACGCGCAC 574 | TACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCAATGCGCTTACTGATGCGGAATTACGCCGTA 575 | AGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACCGTGGTGAGGCGATCCCTGAACCAGTAAA 576 | ACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGCAGAAATCAAA 577 | GCTAAGTTCGGACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA 578 | TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAATCAGGCCGCG 579 | GCTAAAATGGCACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCGAACGGTGTATTACCGGTTTG 580 | CTACCAGGGAAGAACGGGAAGGAAAGATGAGCACGAACCTGGTTTTTAAGGAGTGTCGCCAGAGTGCCGC 581 | GATGAAACGGGTATTGGCGGTATATGGAGTTAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCC 582 | TGCTGGTAATCGCAGGCCTTTTTATTTGGGGGAGAGGGAAGTCATGAAAAAACTAACCTTTGAAATTCGA 583 | TCTCCAGCACATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCCAGACCCAACCAAACCAATCG 584 | TAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCTTAGGTGACGT 585 | CTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGAAGTGTGTGTTTACCGCAGCA 586 | TTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATGGCTTTGTGGTAATAGGCCAGTCAACCAGCA 587 | GGATGCGTGTAGGCGAATTTGCGGAGCTATTAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAA 588 
| GTGGTCAGACGAAGCGAGACTGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATG 589 | TCGTTAGTTTCTCCGGTGGCAGGACGTCAGCATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGG 590 | TAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAACATCCAATGACATATCGGTTTGTCAGGGAA 591 | GTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAACCCGGAGCTTGGACAGCCAA 592 | ATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCGAATGCCTGTTCTGAAGCCATTTATCGATAT 593 | GGTAAAGAAATATGGCACTCCATACGTCGGCGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTC 594 | ACCAAATACTGTGATGACCATTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAAC 595 | CGAAGCGGCTAAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATAT 596 | CCTCGCATGGTGGAAGCAACAACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGC 597 | ATTAAAAAATCAACGCAAAAAATCGGACTTGCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATG 598 | AGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGAAACGCCAAAGGAGATTATGTACCGAGGAAG 599 | AATGTCGCTGGACGGTATCGCGAAAATGTATTCAGAAAATGATTATCAAGCCCTGTATCAGGACATGGTA 600 | CGAGCTAAAAGATTCGATACCGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAGCTTGATTTCG 601 | ACTTCGGGAGGGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA 602 | CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTGTTACCACTAC 603 | CGCAGGAAAAGGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACATAATCTGCGAAAACTGCAAA 604 | TACCTTCCAACGAAACGCACCAGAAATAAACCCAAGCCAATCCCAAAAGAATCTGACGTAAAAACCTTCA 605 | ACTACACGGCTCACCTGTGGGATATCCGGTGGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAA 606 | AATCGAAGTTACGAACAAGAAAGCGTCGAGCGAGCTTTAACGTGCGCTAACTGCGGTCAGAAGCTGCATG 607 | TGCTGGAAGTTCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGATCCGAATAGCTCGATGCACGA 608 | GGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAATGGTTTCACCC 609 | TGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAGCACTCGAACGACGAAGTAAA 610 | GAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGACGACGAGAGGAGCAGAAACAGAAAGATAAAC 611 | TTAAGATTCGAAAACTCGCCTTAAAGCCCCGCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGC 612 | CTTCATCAGAGAAAGAGACCGCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGAT 613 | GCCGGACATTACCGGACAACTGCTGCGGCACCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAAT 
614 | GCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCCGTATCGCGTCGAACTGATTAGCCGCATCGG 615 | GCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGACTATCGAAGAGTGCAAGGCG 616 | ATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAAATAGCAGAAGTGAGGCCGCATGACGTTCTC 617 | AGTAAAAACCATTCCAGACATGCTCGTTGAAGCATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAA 618 | TGTAGTCGCGGTACGGTCAGAAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACG 619 | TTCTCATGGTTCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACC 620 | GAAATATTTGGGTAGTTGGCGATCTGCACGGATGCTACACGAACCTGATGAACAAACTGGATACGATTGG 621 | ATTCGACAACAAAAAAGACCTGCTTATCTCGGTGGGCGATTTGGTTGATCGTGGTGCAGAGAACGTTGAA 622 | TGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTACGTGGAAACCATGAGCAAATGATGATTGATG 623 | GCTTATCAGAGCGTGGAAACGTTAATCACTGGCTGCTTAATGGCGGTGGCTGGTTCTTTAATCTCGATTA 624 | CGACAAAGAAATTCTGGCTAAAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCATCGAACTGGTG 625 | AGCAAAGATAAAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG 626 | TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCGTGAAAGAAAT 627 | CAAAGGCGCGGACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCACTCAAGTTTGCCAACCAAATG 628 | TATATCGATACCGGCGCAGTGTTCTGCGGAAACCTAACATTGATTCAGGTACAGGGAGAAGGCGCATGAG 629 | ACTCGAAAGCGTAGCTAAATTTCATTCGCCAAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCT 630 | TCTGACTCTCTTTCCGGTACTGATGTGATGGCTGCTATGGGGATGGCGCAATCACAAGCCGGATTCGGTA 631 | TGGCTGCATTCTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAGGCTATCAACTATCTGATGCA 632 | ATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAAGGCAAAGGTA 633 | CTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGCGACGCCGGGGGCAAGATGCA 634 | GAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAAAACAGAGCTGTGGGGGAGAGTTGTCGAGAA 635 | AGAGTGCGGAAGATGCAAAGGCGTCGGCTATTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACG 636 | ATGCTAATCCCAAACCTTACCCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGG 637 | TGCAATGCCACAAAGAAGAGTCAATCGCAGACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTG 638 | CCACGGATGGCAACATATTAACGGCATGATATTGACTTATTGAATAAAATTGGGTAAATTTGACTCAACG 639 | 
ATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTACTACCGATTCCGCCTAGTTG 640 | GTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGGCAGAGAGGTCTGCAAAATGCAATCCCGAAA 641 | CAGTTCGCAGGTAATAGTTAGAGCCTGCATAACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGA 642 | GCATTGAGTCGATAATCGTGAAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTA 643 | AGCGAATACCGGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAA 644 | CCCAAACTGAGCCGTAGCCACTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGCCTTTTACACA 645 | TGACCTTCGTGAAAGCGGGTGGCAGGAGGTCGCGCTAACAACCTCCTGCCGTTTTGCCCGTGCATATCGG 646 | TCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATTTGTTCTATCAGTAATCGACCTTATTCCTAA 647 | TTAAATAGAGCAAATCCCCTTATTGGGGGTAAGACATGAAGATGCCAGAAAAACATGACCTGTTGGCCGC 648 | CATTCTCGCGGCAAAGGAACAAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCTTCGCGGCAGA 649 | TATAATGGCGGTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTGGTTCATTC 650 | GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTATCGGCTACAT 651 | CGGTACTGACTCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAGCCGGAGTAGAAGATGGTAGA 652 | AATCAATAATCAACGTAAGGCGTTCCTCGATATGCTGGCGTGGTCGGAGGGAACTGATAACGGACGTCAG 653 | AAAACCAGAAATCATGGTTATGACGTCATTGTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTC 654 | GCAAACTTGTCACGCTAAACCCAAAACTCAAATCAACAGGCGCCGGACGCTACCAGCTTCTTTCCCGTTG 655 | GTGGGATGCCTACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAAGTCAGGACGCTGTGGCATTG 656 | CAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCAATCGACCGTT 657 | GCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAGCATAAGGCTGACAGCCTGAT 658 | TGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATTGATGTATGAGCAGAGTCACCGCGATTATCT 659 | CCGCTCTGGTTATCTGCATCATCGTCTGCCTGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTAC 660 | CTACAAAGCCCAGCGCGACAAAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAG 661 | ATGCGTCAGCGTGATGTTGCTGCGCTCGATGCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAA 662 | ATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCGGTTGCACATCAAAGCAGTCTGTCAGTCAGT 663 | GCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGACTGGCAGACACCGCTGAACGG 664 | GATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAAAACAACTGGAAGGAACCCAGAAGTATATTA 665 
| ATGAGCAGTGCAGATAGAGTTGCCCATATCGATGGGCAACTCATGCAATTATTGTGAGCAATACACACGC 666 | GCTTCCAGCGGAGTATAAATGCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTT 667 | CTGTTTTAACAACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCC 668 | AGAAACGAAGAAATGATGGGTGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTGAACAGTAAAC 669 | GTCTGTTGAGCACATCCTGTAATAAGCAGGGCCAGCGCAGTAGCGAGTAGCATTTTTTTCATGGTGTTAT 670 | TCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGAAAATTAAACAAACCCTAAACAATGAGTTGA 671 | AATTTCATATTGTTAATATTTATTAATGTATGTCAGGTGCGATGAATCGTCATTGTATTCCCGGATTAAC 672 | TATGTCCACAGCCCTGACGGGGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGATGCACACAGG 673 | GTTTAGCGCGTACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA 674 | TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTATCAAAGGTATA 675 | GTAATATCTTTTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTGCTTTAGCAAGATTTTCCCTG 676 | TATTGCTGAAATGTGATTTCTCTTGATTTCAACCTATCATAGGACGTTTCTATAAGATGCGTGTTTCTTG 677 | AGAATTTAACATTTACAACCTTTTTAAGTCCTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTG 678 | AATATTATCTGTGGCTAGATAGTAAATATAATGTGAGACGTTGTGACGTTTTAGTTCAGAATAAAACAAT 679 | TCACAGTCTAAATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGCAACCTGAGCCATTGGTAAAA 680 | CCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGTCTGACCTCCT 681 | TGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAATAGTATTGGTTGCGTAACAAA 682 | GTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGACAGATGTATGTAAGGCCAACGTGCTCAAATC 683 | TTCATACAGAAAGATTTGAAGTAATATTTTAACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATG 684 | AATAAAGAACAATCTGCTGATGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCA 685 | TTTCTATGAGTTACCCTGATGTTGTAATTGCATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGC 686 | AATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAGGATTATTCCCTGGTGGTTGACTGATCACCA 687 | TAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAGTCTGTCACTGTCAGGAAAGT 688 | GGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGTCCTGTTCGGA 689 | GGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGT 690 | CTCCGACGGCAGGCTTCAATGACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAA 
691 | TTTGTTCAATCATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGA 692 | CATGAGGTTGCCCCGTATTCAGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTTAAGTTGATGC 693 | AGATCAATTAATACGATACCTGCGTCATAATTGATTATTTGACGTGGTTTGATGGCCTCCACGCACGTTG 694 | TGATATGTAGATGATAATCATTATCACTTTACGGGTCCTTTCCGGTGATCCGACAGGTTACG 695 | 696 | -------------------------------------------------------------------------------- /ProgrammingHomework1/phix.fa: -------------------------------------------------------------------------------- 1 | >gi|216019|gb|J02482.1|PX1CG Coliphage phi-X174, complete genome 2 | GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT 3 | GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA 4 | ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG 5 | TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA 6 | GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC 7 | TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT 8 | TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT 9 | CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT 10 | TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG 11 | TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC 12 | GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA 13 | CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAG 14 | TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT 15 | AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC 16 | CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA 17 | TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC 18 | TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA 19 | CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA 20 | 
GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT 21 | GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA 22 | ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC 23 | TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT 24 | TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC 25 | ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC 26 | CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT 27 | GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC 28 | CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC 29 | TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG 30 | TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT 31 | TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA 32 | AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT 33 | TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT 34 | ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC 35 | GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC 36 | TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT 37 | TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA 38 | TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG 39 | TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC 40 | CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG 41 | AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC 42 | CGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT 43 | TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG 44 | CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA 45 | AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT 46 | 
GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG 47 | GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA 48 | TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT 49 | CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG 50 | TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA 51 | GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC 52 | CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA 53 | TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA 54 | AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC 55 | TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT 56 | CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA 57 | TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG 58 | TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT 59 | CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT 60 | TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC 61 | ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG 62 | TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA 63 | ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG 64 | GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC 65 | CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT 66 | GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG 67 | GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT 68 | ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG 69 | CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC 70 | CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC 71 | GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT 72 | 
def z_array(s):
    """ Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s.

    Returns a list z of len(s) integers where z[0] == len(s) and z[k] is the
    length of the longest substring starting at k that matches a prefix of s.
    """
    assert len(s) > 1
    size = len(s)
    z = [size] + [0] * (size - 1)

    # z[1]: count leading positions whose character equals its predecessor
    pos = 1
    while pos < size and s[pos] == s[pos - 1]:
        z[1] += 1
        pos += 1

    # [l, r] is the right-most Z-box found so far
    r, l = (z[1], 1) if z[1] > 0 else (0, 0)

    for k in range(2, size):
        assert z[k] == 0
        if k > r:
            # Case 1: k is outside the current Z-box; compare explicitly
            span = 0
            while k + span < size and s[k + span] == s[span]:
                span += 1
            z[k] = span
            r, l = k + span - 1, k
            continue

        # Case 2: k is inside the current Z-box
        nbeta = r - k + 1          # length of the box remainder starting at k
        zkp = z[k - l]
        if nbeta > zkp:
            # Case 2a: the earlier value fits strictly inside the box
            z[k] = zkp
        else:
            # Case 2b: extend the match past r character by character
            extra = 0
            while r + 1 + extra < size and s[r + 1 + extra] == s[r + 1 + extra - k]:
                extra += 1
            l, r = k, r + extra
            z[k] = r - k + 1
    return z


def n_array(s):
    """ Compile the N array (Gusfield theorem 2.2.2) from the Z array """
    reversed_z = z_array(s[::-1])
    reversed_z.reverse()
    return reversed_z
def big_l_prime_array(p, n):
    """ Compile L' array (Gusfield theorem 2.2.2) using p and N array.
    L'[i] = largest index j less than n such that N[j] = |P[i:]| """
    m = len(p)
    lp = [0] * m
    for j in range(m - 1):
        i = m - n[j]
        if i < m:
            lp[i] = j + 1
    return lp


def big_l_array(p, lp):
    """ Compile L array (Gusfield theorem 2.2.2) using p and L' array.
    L[i] = largest index j less than n such that N[j] >= |P[i:]|

    L[0] is always 0; every later entry is the running maximum of L'.
    Patterns of length 0 or 1 are handled (the result is all zeros). """
    l = [0] * len(p)
    # Starting at 1 with l[0] == 0 gives l[1] == lp[1] for the non-negative
    # L' values and avoids indexing position 1 of a one-element list.
    for i in range(1, len(p)):
        l[i] = max(l[i-1], lp[i])
    return l


def small_l_prime_array(n):
    """ Compile lp' array (Gusfield theorem 2.2.4) using N array. """
    small_lp = [0] * len(n)
    for i in range(len(n)):
        if n[i] == i+1:  # prefix matching a suffix
            small_lp[len(n)-i-1] = i+1
    for i in range(len(n)-2, -1, -1):  # "smear" them out to the left
        if small_lp[i] == 0:
            small_lp[i] = small_lp[i+1]
    return small_lp


def good_suffix_table(p):
    """ Return tables needed to apply good suffix rule: (L', L, l'). """
    n = n_array(p)
    lp = big_l_prime_array(p, n)
    return lp, big_l_array(p, lp), small_l_prime_array(n)


def good_suffix_mismatch(i, big_l_prime, small_l_prime):
    """ Given a mismatch at offset i, and given L/L' and l' arrays,
    return amount to shift as determined by good suffix rule. """
    length = len(big_l_prime)
    assert i < length
    if i == length - 1:
        return 0
    i += 1  # i points to leftmost matching position of P
    if big_l_prime[i] > 0:
        return length - big_l_prime[i]
    return length - small_l_prime[i]


def good_suffix_match(small_l_prime):
    """ Given a full match of P to T, return amount to shift as
    determined by good suffix rule. """
    # A pattern shorter than two characters has no position 1 to consult;
    # treat its l'[1] as 0 so the shift is the whole pattern length.
    second = small_l_prime[1] if len(small_l_prime) > 1 else 0
    return len(small_l_prime) - second


def dense_bad_char_tab(p, amap):
    """ Given pattern string and a map from alphabet characters to column
    numbers, create and return a dense bad character table.  Table is
    indexed by offset then by character. """
    tab = []
    nxt = [0] * len(amap)
    for i, c in enumerate(p):
        assert c in amap
        tab.append(nxt[:])      # snapshot before recording position i
        nxt[amap[c]] = i + 1
    return tab


class BoyerMoore(object):
    """ Encapsulates pattern and associated Boyer-Moore preprocessing. """

    def __init__(self, p, alphabet='ACGT'):
        # Create map from alphabet characters to integers
        self.amap = {c: i for i, c in enumerate(alphabet)}
        # Make bad character rule table
        self.bad_char = dense_bad_char_tab(p, self.amap)
        # Create good suffix rule table
        _, self.big_l, self.small_l_prime = good_suffix_table(p)

    def bad_character_rule(self, i, c):
        """ Return # skips given by bad character rule at offset i """
        assert c in self.amap
        assert i < len(self.bad_char)
        ci = self.amap[c]
        return i - (self.bad_char[i][ci]-1)

    def good_suffix_rule(self, i):
        """ Given a mismatch at offset i, return amount to shift
        as determined by (weak) good suffix rule. """
        # Same computation as the module-level helper, applied to the L array
        return good_suffix_mismatch(i, self.big_l, self.small_l_prime)

    def match_skip(self):
        """ Return amount to shift in case where P matches T """
        return good_suffix_match(self.small_l_prime)
class TestBoyerMoorePreproc(unittest.TestCase):
    """ Checks for the Z/N arrays and the good suffix tables built on them. """

    def test_z_1(self):
        self.assertEqual([3, 0, 0], z_array('abb'))

    def test_z_2(self):
        self.assertEqual([8, 0, 6, 0, 4, 0, 2, 0], z_array('abababab'))

    def test_z_3(self):
        # NOTE(review): same input and expectation as test_z_2 -- confirm
        # whether a different case was intended here.
        self.assertEqual([8, 0, 6, 0, 4, 0, 2, 0], z_array('abababab'))

    def test_n_1(self):
        self.assertEqual([0, 1, 3], n_array('abb'))

    def test_n_2(self):
        expected = [1, 0, 0, 4, 0, 1, 0, 1, 0, 0, 11]
        self.assertEqual(expected, n_array('abracadabra'))

    def test_n_3(self):
        self.assertEqual([0, 2, 0, 4, 0, 6, 0, 8], n_array('abababab'))

    def test_big_l_prime_1(self):
        text = 'abb'
        self.assertEqual([0, 0, 2], big_l_prime_array(text, n_array(text)))

    def test_big_l_prime_2(self):
        text = 'abracadabra'
        expected = [0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 8]
        self.assertEqual(expected, big_l_prime_array(text, n_array(text)))

    def test_small_l_prime_1(self):
        expected = [11, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1]
        self.assertEqual(expected, small_l_prime_array(n_array('abracadabra')))

    def test_good_suffix_match_mismatch_1(self):
        big_l_prime, big_l, small_l_prime = good_suffix_table('GGTAGGT')
        self.assertEqual([0, 0, 0, 0, 3, 0, 0], big_l_prime)
        self.assertEqual([0, 0, 0, 0, 3, 3, 3], big_l)
        self.assertEqual([7, 3, 3, 3, 3, 0, 0], small_l_prime)
        # (mismatch offset, shift expected with L', shift expected with L)
        expectations = [
            (6, 0, 0),
            (5, 7, 4),
            (4, 7, 4),
            (3, 4, 4),
            (2, 4, 4),
            (1, 4, 4),
            (0, 4, 4),
        ]
        for offset, with_l_prime, with_l in expectations:
            self.assertEqual(with_l_prime, good_suffix_mismatch(offset, big_l_prime, small_l_prime))
            self.assertEqual(with_l, good_suffix_mismatch(offset, big_l, small_l_prime))

    def test_good_suffix_table_1(self):
        big_l_prime, big_l, small_l_prime = good_suffix_table('abb')
        self.assertEqual([0, 0, 2], big_l_prime)
        self.assertEqual([0, 0, 2], big_l)
        self.assertEqual([3, 0, 0], small_l_prime)

    def test_good_suffix_table_2(self):
        big_l_prime, big_l, small_l_prime = good_suffix_table('abracadabra')
        self.assertEqual([0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 8], big_l_prime)
        self.assertEqual([0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 8], big_l)
        self.assertEqual([11, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1], small_l_prime)


if __name__ == '__main__':
    unittest.main()
def naive_with_counts(p, t):
    """Naive exact matching of p against t that also counts its own work.

    Returns (occurrences, character comparisons made, alignments tried).
    """
    occurrences = []
    comparisons = 0
    alignments = 0
    for start in range(len(t) - len(p) + 1):  # every alignment of p in t
        alignments += 1
        for offset, pattern_char in enumerate(p):
            comparisons += 1
            if t[start + offset] != pattern_char:
                break
        else:
            # inner loop ran to completion: every character matched
            occurrences.append(start)
    return occurrences, comparisons, alignments


def example_1_1():
    """Check naive_with_counts on the 'word' example."""
    pattern = 'word'
    text = 'there would have been a time for such a word'
    assert naive_with_counts(pattern, text) == ([40], 46, 41)


def example_1_2():
    """Check naive_with_counts on the 'needle' example."""
    pattern = 'needle'
    text = 'needle need noodle needle'
    assert naive_with_counts(pattern, text) == ([0, 19], 35, 20)
def boyer_moore_with_counts(p, p_bm, t):
    """ Do Boyer-Moore matching. p=pattern, t=text, p_bm=BoyerMoore object for p.

    Returns (occurrences, character comparisons made, alignments tried).
    """
    occurrences = []
    comparisons = 0
    alignments = 0
    last_start = len(t) - len(p)

    i = 0
    while i <= last_start:
        shift = 1
        alignments += 1
        for j in reversed(range(len(p))):  # compare right to left
            comparisons += 1
            if p[j] != t[i + j]:
                skip_bc = p_bm.bad_character_rule(j, t[i + j])
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                break
        else:
            # no mismatch: record the hit and apply the full-match shift
            occurrences.append(i)
            shift = max(shift, p_bm.match_skip())
        i += shift

    return occurrences, comparisons, alignments


def readFastq(filename):
    """Read a FASTQ file; return (sequences, qualities) as two lists."""
    sequences, qualities = [], []
    with open(filename) as fh:
        while True:
            fh.readline()                  # name line, ignored
            seq = fh.readline().rstrip()   # base sequence
            fh.readline()                  # placeholder line, ignored
            qual = fh.readline().rstrip()  # base qualities
            if not seq:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


def example_2_1():
    """Check boyer_moore_with_counts on the 'word' example."""
    from bm_preproc import BoyerMoore
    pattern = 'word'
    text = 'there would have been a time for such a word'
    matcher = BoyerMoore(pattern, 'abcdefghijklmnopqrstuvwxyz ')
    assert boyer_moore_with_counts(pattern, matcher, text) == ([40], 15, 12)


def example_2_2():
    """Check boyer_moore_with_counts on the 'needle' example."""
    from bm_preproc import BoyerMoore
    pattern = 'needle'
    text = 'needle need noodle needle'
    matcher = BoyerMoore(pattern, 'abcdefghijklmnopqrstuvwxyz ')
    assert boyer_moore_with_counts(pattern, matcher, text) == ([0, 19], 18, 5)
def question1_and_2():
    """Print the number of alignments (Question1) and character comparisons
    (Question2) that naive exact matching performs for the Alu-derived
    pattern against the excerpt of human chromosome 1."""
    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'

    # The excerpt is a FASTA file: drop '>' header lines and join the rest.
    # Parsing it with the 4-lines-per-record FASTQ reader would keep only
    # every fourth line and would never see matches that span lines.
    with open('chr1.GRCh38.excerpt.fasta') as fh:
        t = ''.join(line.rstrip() for line in fh if not line.startswith('>'))

    _, total_char_comp, total_align_comp = naive_with_counts(p, t)

    # How many alignments does the naive exact matching algorithm try when
    # matching the pattern to the excerpt of human chromosome 1?
    # (Don't consider reverse complements.)
    print('Question1:  %s' % total_align_comp)

    # How many character comparisons does the naive exact matching algorithm
    # try for the same pattern and text?  (Don't consider reverse complements.)
    print('Question2:  %s' % total_char_comp)


def question3():
    """Print the number of alignments Boyer-Moore tries for the Alu-derived
    pattern against the excerpt of human chromosome 1."""
    from bm_preproc import BoyerMoore

    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'

    # FASTA, not FASTQ: skip header lines and concatenate the sequence lines
    with open('chr1.GRCh38.excerpt.fasta') as fh:
        t = ''.join(line.rstrip() for line in fh if not line.startswith('>'))

    p_bm = BoyerMoore(p)
    _, _, total_align_comp = boyer_moore_with_counts(p, p_bm, t)

    # How many alignments does Boyer-Moore try when matching the pattern to
    # the excerpt of human chromosome 1?  (Don't consider reverse complements.)
    print('Question3:  %s' % total_align_comp)
class Index(object):
    """ Holds a sorted k-mer index for a text T """

    def __init__(self, t, k):
        ''' Create index from all substrings of t of length k '''
        self.k = k  # k-mer length (k)
        # (k-mer, offset) pairs, alphabetized by k-mer then offset
        self.index = sorted((t[i:i+k], i) for i in range(len(t) - k + 1))

    def query(self, p):
        ''' Return index hits for first k-mer of P '''
        # Imported here, after the docstring, so the string above is the
        # method's docstring and not a discarded expression.
        import bisect
        kmer = p[:self.k]  # query with first k-mer
        i = bisect.bisect_left(self.index, (kmer, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != kmer:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits
def question4():
    """Find all approximate occurrences (up to 2 mismatches, no insertions or
    deletions, no reverse complements) of GGCGCGGTGGCTCACGCCTGTAAT in the
    excerpt of human chromosome 1, using an Index built on 8-mers.

    The pattern is split into 3 segments of 8 characters.  With at most 2
    mismatches at least one segment must match exactly (pigeonhole), so every
    index hit of a segment gives a candidate offset that is then verified
    against the text.

    Prints the number of occurrences and returns
    (sorted occurrence offsets, total number of index hits).
    """
    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    mismatches_allowed = 2
    num_segments = mismatches_allowed + 1
    k_mer_size = 8

    # The excerpt is FASTA: skip '>' header lines, join the sequence lines
    with open('chr1.GRCh38.excerpt.fasta') as fh:
        t = ''.join(line.rstrip() for line in fh if not line.startswith('>'))

    index = Index(t, k_mer_size)

    occurrences = set()  # a set, so a match reached via several segments counts once
    index_hits = 0
    for segment_number in range(num_segments):
        segment_start = segment_number * k_mer_size
        segment = p[segment_start:segment_start + k_mer_size]
        for hit in index.query(segment):
            index_hits += 1
            offset = hit - segment_start  # where p would start in t
            if offset < 0 or offset + len(p) > len(t):
                continue  # p would run off either end of t
            if offset in occurrences:
                continue  # already verified through another segment
            mismatches = sum(1 for a, b in zip(p, t[offset:offset + len(p)]) if a != b)
            if mismatches <= mismatches_allowed:
                occurrences.add(offset)

    results = sorted(occurrences)
    print(len(results))
    return results, index_hits
# Copy paste from homework1.py
def naive_2mm(p, t):
    """Return offsets where p occurs in t with at most 2 mismatches."""
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        mismatches = 0
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:
                mismatches += 1
                if mismatches > 2:
                    break
        else:
            occurrences.append(i)  # at most 2 mismatches; record
    return occurrences


def question4_check():
    """Reference answer for question4 computed with naive_2mm."""
    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    # The excerpt is FASTA: skip '>' header lines, join the sequence lines
    with open('chr1.GRCh38.excerpt.fasta') as fh:
        t = ''.join(line.rstrip() for line in fh if not line.startswith('>'))
    return naive_2mm(p, t)


class SubseqIndex(object):
    """ Holds a subsequence index for a text T """

    def __init__(self, t, k, ival):
        """ Create index from all subsequences consisting of k characters
            spaced ival positions apart.  E.g., SubseqIndex("ATAT", 2, 2)
            extracts ("AA", 0) and ("TT", 1). """
        self.k = k  # num characters per subsequence extracted
        self.ival = ival  # space between them; 1=adjacent, 2=every other, etc
        self.index = []      # sorted (subseq, offset) pairs, used by query()
        self.dictIndex = {}  # subseq -> list of offsets, used by queryDictIndex()
        self.span = 1 + ival * (k - 1)
        for i in range(len(t) - self.span + 1):  # for each subseq
            subseq = t[i:i+self.span:self.ival]
            self.index.append((subseq, i))  # add (subseq, offset)
            self.dictIndex.setdefault(subseq, []).append(i)
        self.index.sort()  # alphabetize by subseq

    def get_subseq(self, p):
        """ Return the subsequence starting at every offset of p that has
            room for a full span. """
        return [p[i:i+self.span:self.ival] for i in range(len(p) - self.span + 1)]

    def queryDictIndex(self, p):
        """ Return index hits for first subseq of p using the dict index;
            empty list when p is too short to yield k characters. """
        subseq = p[:self.span:self.ival]  # query with first subseq
        if len(subseq) != self.k:
            return []
        return self.dictIndex.get(subseq, [])

    def query(self, p):
        """ Return index hits for first subseq of p """
        import bisect
        subseq = p[:self.span:self.ival]  # query with first subseq
        i = bisect.bisect_left(self.index, (subseq, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != subseq:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits
Querying for: '{2}' (length= {3})".format(p, len(p), subseq, len(subseq)) 274 | i = bisect.bisect_left(self.index, (subseq, -1)) # binary search 275 | hits = [] 276 | while i < len(self.index): # collect matching index entries 277 | if self.index[i][0] != subseq: 278 | print "'", self.index[i][0], "' != '", subseq, "'" 279 | break 280 | hits.append(self.index[i][1]) 281 | i += 1 282 | return hits 283 | 284 | def query_subseq(p, t, subseq_ind): 285 | """Write a function that, given a length-24 pattern P and given a SubseqIndex object built with k = 8 and ival = 3, 286 | finds all approximate occurrences of P within T with up to 2 mismatches.""" 287 | #number of mistmatches = 2, so we need to split into 3 (2+1) 288 | mistmatches_allowed = 2 289 | num_segments_required = mistmatches_allowed + 1 290 | 291 | pattern_size = 24 292 | 293 | reads, qualities = readFastq('chr1.GRCh38.excerpt.fasta') 294 | consolidated_read = ''.join([read for read in reads]) 295 | 296 | #p_segments = [ p[i:] for i in range(0, pattern_size / subseq_ind.ival) ] 297 | p_segments = [] 298 | for i in range(0, pattern_size): 299 | if len(p[i::subseq_ind.ival]) == subseq_ind.k: 300 | p_segments.append(p[i:]) 301 | else: 302 | break 303 | #print p_segments 304 | 305 | #print [subseq_ind.get_subseq(segment) for segment in p_segments] 306 | checksize = reduce(lambda x,y : x and y, [map(lambda x: len(x)==8, subseq_ind.get_subseq(segment)) for segment in p_segments])[0] 307 | assert checksize 308 | 309 | hits_lists = [] 310 | hits_per_segment = {} 311 | i = 0 312 | index_hits = 0 313 | segment_number = 0 314 | for segment in p_segments: 315 | 316 | #hits = subseq_ind.query(segment) 317 | hits = subseq_ind.queryDictIndex(segment) 318 | i += 1 319 | #print "hits: ", hits 320 | index_hits += 1 321 | if len(hits) > 0: 322 | #print segment, hits 323 | hits_lists.append(hits) 324 | hits_per_segment[segment_number] = set([hit-segment_number for hit in hits]) 325 | segment_number += 1 326 | #print 
def example_3_1():
    """Run query_subseq on the 'to-morrow' example and print the result."""
    t = 'to-morrow and to-morrow and to-morrow creeps in this petty pace'
    p = 'to-morrow and to-morrow '
    subseq_ind = SubseqIndex(t, 8, 3)
    occurrences, num_index_hits = query_subseq(p, t, subseq_ind)
    print("occurrences, num_index_hits: %s %s" % (occurrences, num_index_hits))
    # NOTE(review): the assignment's expected values are occurrences == [0, 14]
    # and num_index_hits == 6; the checks were disabled by the author.


def example_3_2():
    """Run query_subseq on the '1110.txt.utf-8' example and print the result."""
    with open('1110.txt.utf-8') as fh:  # close the file once it has been read
        t = fh.read()
    p = 'English measure backward'
    subseq_ind = SubseqIndex(t, 8, 3)
    occurrences, num_index_hits = query_subseq(p, t, subseq_ind)
    print("occurrences, num_index_hits: %s %s" % (occurrences, num_index_hits))
    # NOTE(review): the assignment's expected values are
    # occurrences == [135249] and num_index_hits == 3; checks were disabled.


def question6():
    """Search the chromosome 1 excerpt for GGCGCGGTGGCTCACGCCTGTAAT with up to
    2 substitutions through a SubseqIndex (k = 8, ival = 3).

    Returns whatever query_subseq returns: (occurrences, index hits).
    """
    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    k_mer_size = 8
    ival = 3

    # The excerpt is FASTA: skip '>' header lines, join the sequence lines
    with open('chr1.GRCh38.excerpt.fasta') as fh:
        t = ''.join(line.rstrip() for line in fh if not line.startswith('>'))

    subseq_ind = SubseqIndex(t, k_mer_size, ival)
    return query_subseq(p, t, subseq_ind)


def main():
    """Run every example and print the answer to every homework question."""
    example_1_1()
    example_1_2()
    print("All tests passed successfully for examples in set1")
    example_2_1()
    example_2_2()
    print("All tests passed successfully for examples in set2")
    question1_and_2()
    question3()
    print("Question4: How many times does the string GGCGCGGTGGCTCACGCCTGTAAT, which is derived from a human Alu sequence, occur with up to 2 substitutions in the excerpt of human chromosome 1?")
    res, num_index_hits = question4()
    check_res = question4_check()
    assert res == check_res
    print("check with naive_2mm validated")
    print("Question5: how many total index hits are there when searching for occurrences of GGCGCGGTGGCTCACGCCTGTAAT with up to 2 substitutions in the excerpt of human chromosome 1?")
    print(num_index_hits)
    example_3_1()
    example_3_2()
    print("All tests passed successfully for examples in set3")
    print("Question 6: how many total index hits are there when searching for GGCGCGGTGGCTCACGCCTGTAAT with up to 2 substitutions in the excerpt of human chromosome 1?")
    occurrences, num_index_hits = question6()
    print("%s %s" % (occurrences, num_index_hits))


if __name__ == "__main__":
    main()
class Index(object):
    """ Holds a substring index for a text T """

    def __init__(self, t, k):
        """ Create index from all substrings of t of length k """
        self.k = k  # k-mer length
        # (k-mer, offset) pairs in sorted order
        self.index = sorted((t[start:start + k], start)
                            for start in range(len(t) - k + 1))

    def query(self, p):
        """ Return index hits for first k-mer of p """
        kmer = p[:self.k]
        # (kmer, -1) sorts before every real entry for this k-mer
        pos = bisect.bisect_left(self.index, (kmer, -1))
        end = len(self.index)
        hits = []
        while pos < end and self.index[pos][0] == kmer:
            hits.append(self.index[pos][1])
            pos += 1
        return hits
def readGenome(filename):
    """Read a FASTA file and return its sequence lines joined into one string.

    Lines starting with '>' (header lines) are ignored; trailing whitespace
    is stripped from every other line.
    """
    with open(filename, 'r') as f:
        # join once instead of growing a string line by line
        return ''.join(line.rstrip() for line in f if not line.startswith('>'))


def editDistance(x, y):
    """Returns the edit distance between two strings, x and y"""
    rows, cols = len(x) + 1, len(y) + 1
    # Distance matrix; first column and first row hold the cost of building
    # a prefix from the empty string
    D = [[0] * cols for _ in range(rows)]
    for i in range(rows):
        D[i][0] = i
    for j in range(cols):
        D[0][j] = j

    # Fill in the rest of the matrix
    for i in range(1, rows):
        for j in range(1, cols):
            distHor = D[i][j-1] + 1
            distVer = D[i-1][j] + 1
            distDiag = D[i-1][j-1] + (0 if x[i-1] == y[j-1] else 1)
            D[i][j] = min(distHor, distVer, distDiag)
    # Edit distance is the value in the bottom right corner of the matrix
    return D[-1][-1]
zero so we simply just comment the following two lines. 48 | #for i in range(len(p)+1): 49 | # D[0][i] = i 50 | 51 | # Fill in the rest of the matrix 52 | for i in range(1, len(p)+1): 53 | for j in range(1, len(t)+1): 54 | distHor = D[i][j-1] + 1 55 | distVer = D[i-1][j] + 1 56 | if p[i-1] == t[j-1]: 57 | distDiag = D[i-1][j-1] 58 | else: 59 | distDiag = D[i-1][j-1] + 1 60 | D[i][j] = min(distHor, distVer, distDiag) 61 | 62 | # Best Approximate Match Distance is the smallest value of the last row 63 | return min(D[-1]) 64 | 65 | def test1(): 66 | """P = GCGTATGC within T = TATTGGCTATACGGTT had 2 edits.""" 67 | assert bestApproximateMatchEditDistance('GCGTATGC', 'TATTGGCTATACGGTT') == 2 68 | 69 | def question1(t): 70 | """What is the edit distance of the best match between pattern GCTGATCGATCGTACG and the excerpt of human chromosome 1? (Don't consider reverse complements.)""" 71 | print("Question1: " + str(bestApproximateMatchEditDistance('GCTGATCGATCGTACG', t))) 72 | 73 | def question2(t): 74 | """What is the edit distance of the best match between pattern GATTTACCAGATTGAG and the excerpt of human chromosome 1? (Don't consider reverse complements.)""" 75 | print("Question2: " + str(bestApproximateMatchEditDistance('GATTTACCAGATTGAG', t))) 76 | 77 | 78 | 79 | def overlap(a, b, min_length=3): 80 | """ Return length of longest suffix of 'a' matching a prefix of 'b' that is at least 'min_length' characters long. If no such overlap exists, return 0. 
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching a prefix of 'b' that is at least 'min_length' characters long. If no such overlap exists, return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move just past previous match


def readFastq(filename):
    """Parse the FASTQ file 'filename' and return (sequences, qualities).

    The two lists are parallel: entry i of each belongs to the same record.
    Reading stops at the first record whose sequence line is empty, which is
    what readline() yields at end of file.
    """
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip()  # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


def overlap_all_pairs(reads, min_length):
    """Find every ordered pair of distinct reads whose suffix/prefix overlap is >= min_length.

    Rather than calling overlap() on all pairs, every read is indexed under
    each of its k-mers (k = min_length); a read 'a' is then only compared with
    the reads that contain a's length-k suffix.

    Returns a tuple (overlap_pairs, overlap_map, overlap_graph):
      overlap_pairs -- list of (a, b) tuples, one per overlap found
      overlap_map   -- dict mapping (a, b) to the overlap length
      overlap_graph -- dict mapping a to ONE read b it overlaps (the last one
                       found); its size is the number of reads with at least
                       one outgoing edge

    Reads shorter than min_length have no length-k suffix and therefore never
    appear as the left-hand side of a pair.
    """
    overlap_map = {}
    overlap_graph = {}
    overlap_pairs = []

    # (1) Associate each k-mer with the reads containing it. The inner dict is
    # used as a set that remembers insertion order (guaranteed from Python 3.7),
    # so the pairs come out in a reproducible order instead of hash order.
    kmerIndex = {}
    for read in reads:
        for kmer in getkmers(read, min_length):
            kmerIndex.setdefault(kmer, {})[read] = None

    # (2) For each read a, find all overlaps involving a suffix of a.
    for read in reads:
        suffix = read[-min_length:]
        # A read shorter than min_length yields a "suffix" that was never
        # indexed; .get() makes that an empty candidate list, not a KeyError.
        for read2 in kmerIndex.get(suffix, ()):
            if read2 != read:
                val = overlap(read, read2, min_length)
                if val > 0:
                    overlap_map[(read, read2)] = val
                    overlap_graph[read] = read2
                    overlap_pairs.append((read, read2))

    return overlap_pairs, overlap_map, overlap_graph


def getkmers(read, kmer_length):
    """Return every substring of 'read' of length 'kmer_length', left to right.

    If our read is GATTA and k=3, the result is GAT, ATT and TTA. A read
    shorter than kmer_length yields an empty list.
    """
    return [read[i:i+kmer_length] for i in range(len(read)+1-kmer_length)]


def test2():
    kmers = getkmers('GATTA', 3)
    assert kmers == ['GAT', 'ATT', 'TTA']


def example1():
    reads = ['ABCDEFG', 'EFGHIJ', 'HIJABC']
    assert overlap_all_pairs(reads, 3)[0] == [('ABCDEFG', 'EFGHIJ'), ('EFGHIJ', 'HIJABC'), ('HIJABC', 'ABCDEFG')]
    assert overlap_all_pairs(reads, 4)[0] == []


def example2():
    reads = ['CGTACG', 'TACGTA', 'GTACGT', 'ACGTAC', 'GTACGA', 'TACGAT']

    results4 = overlap_all_pairs(reads, 4)[0]
    expected4 = [('CGTACG', 'TACGTA'),
                 ('CGTACG', 'GTACGT'),
                 ('CGTACG', 'GTACGA'),
                 ('CGTACG', 'TACGAT'),
                 ('TACGTA', 'ACGTAC'),
                 ('TACGTA', 'CGTACG'),
                 ('GTACGT', 'TACGTA'),
                 ('GTACGT', 'ACGTAC'),
                 ('ACGTAC', 'GTACGA'),
                 ('ACGTAC', 'GTACGT'),
                 ('ACGTAC', 'CGTACG'),
                 ('GTACGA', 'TACGAT')]

    assert sorted(results4) == sorted(expected4), "example2, first assert failed"

    results5 = overlap_all_pairs(reads, 5)[0]
    expected5 = [('CGTACG', 'GTACGT'),
                 ('CGTACG', 'GTACGA'),
                 ('TACGTA', 'ACGTAC'),
                 ('GTACGT', 'TACGTA'),
                 ('ACGTAC', 'CGTACG'),
                 ('GTACGA', 'TACGAT')]

    assert sorted(results5) == sorted(expected5), "example2, second assert failed"


def question3and4(reads):
    """Print the answers to questions 3 and 4 for the overlaps of length >= 30 among 'reads'."""
    _, overlap_map, overlap_graph = overlap_all_pairs(reads, 30)

    # Picture the overlap graph corresponding to the overlaps just calculated.
    # How many edges are in the graph? In other words, how many distinct pairs
    # of reads overlap?
    print('Question3: ')
    print(len(overlap_map))

    # Picture the overlap graph corresponding to the overlaps computed for the
    # previous question. How many nodes in this graph have at least one
    # outgoing edge? (In other words, how many reads have a suffix involved in
    # an overlap?)
    print('Question4: ')
    print(len(overlap_graph))


def main():
    test1()
    print("All tests completed successfully")
    genome = readGenome('chr1.GRCh38.excerpt.fasta')
    question1(genome)
    question2(genome)

    test2()
    example1()
    example2()
    print("All example tests completed successfully")

    reads, qualities = readFastq('ERR266411_1.for_asm.fastq')
    question3and4(reads)
    print("All done")


if __name__ == "__main__":
    main()
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching a prefix of 'b' that is at least 'min_length'
    characters long. If no such overlap exists, return 0.
    """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move just past previous match


def _merge_in_order(strings):
    """Glue a non-empty sequence of strings left to right, collapsing the overlap between neighbours."""
    sup = strings[0]  # superstring starts as first string
    for left, right in zip(strings, strings[1:]):
        # overlap adjacent strings A and B in the permutation
        olen = overlap(left, right, min_length=1)
        # add non-overlapping portion of B to superstring
        sup += right[olen:]
    return sup


def scs(ss):
    """ Returns shortest common superstring of given strings, which must be the same length.

    Every ordering of the strings is tried; the first ordering (in
    itertools.permutations order) that reaches the minimum length wins.
    An empty input yields the empty string.
    """
    import itertools
    if not ss:
        return ''
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = _merge_in_order(ssperm)
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest


def scs_list(ss):
    """ Returns the alphabetically sorted list of shortest common superstrings of given strings,
    which must be the same length.

    One entry is recorded per ordering of the input that reaches the minimum
    length. An empty input yields [''].
    """
    import itertools
    if not ss:
        return ['']
    shortest_sup = []
    shortest_length = None  # None until the first candidate has been seen
    for ssperm in itertools.permutations(ss):
        sup = _merge_in_order(ssperm)
        if shortest_length is None or len(sup) < shortest_length:
            shortest_length = len(sup)
            shortest_sup = [sup]
        elif len(sup) == shortest_length:
            shortest_sup.append(sup)
    return sorted(shortest_sup)


def test01():
    """Consider the input strings ABC, BCA, CAB. One shortest common superstring is ABCAB but another is BCABC and another is CABCA."""
    inputStrings = ['ABC', 'BCA', 'CAB']
    shortestCommonSuperstring = scs(inputStrings)
    assert shortestCommonSuperstring in ['ABCAB', 'BCABC', 'CABCA']


def question1():
    """What is the length of the shortest common superstring of the following strings? CCT, CTT, TGC, TGG, GAT, ATT"""
    inputStrings = ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
    shortestCommonSuperstring = scs(inputStrings)

    print("What is the length of the shortest common superstring of the following strings? CCT, CTT, TGC, TGG, GAT, ATT")
    print(len(shortestCommonSuperstring))


def example1():
    strings = ['ABC', 'BCA', 'CAB']
    # Returns just one shortest superstring
    assert scs(strings) == 'ABCAB'
    # Returns list of all superstrings that are tied for shortest
    shortestList = scs_list(strings)
    assert shortestList == ['ABCAB', 'BCABC', 'CABCA'], 'found ' + str(shortestList)


def example2():
    strings = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']
    # Returns just one shortest superstring
    assert scs(strings) == 'TCGATGCAATAG'
    # Returns list of all superstrings that are tied for shortest
    shortestList = scs_list(strings)

    assert shortestList == ['AATAGATCGTGC',
                            'AATAGATGCTCG',
                            'AATAGTCGATGC',
                            'AATCGATAGTGC',
                            'AATGCTCGATAG',
                            'TCGAATAGATGC',
                            'TCGATAGAATGC',
                            'TCGATGCAATAG',
                            'TGCAATAGATCG',
                            'TGCAATCGATAG'], 'found ' + str(shortestList)


def question2():
    """How many different shortest common superstrings are there for the input strings given in the previous question?
    Hint 1: You can modify the scs function to keep track of this."""
    shortestList = scs_list(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])
    print("How many different shortest common superstrings are there for the input strings given in the previous question?")
    print(len(shortestList))


def readFastq(filename):
    """Parse the FASTQ file 'filename' and return (sequences, qualities) as parallel lists.

    Reading stops at the first record whose sequence line is empty. Every
    read is asserted to be exactly 100 bases long.
    """
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip()  # base quality line
            if len(seq) == 0:
                break
            # All the reads are the same length (100 bases)
            assert len(seq) == 100
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


# Memo of positive overlap lengths, keyed by the ordered pair (a, b).
# Shared by every call to pick_maximal_overlap and never pruned.
overlap_cache = {}


# adapted from: http://nbviewer.ipython.org/github/Benlangmead/ads1-notebooks/blob/master/4.02_GreedySCS.ipynb
def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k.  Returns
        overlap length 0 if there are no such overlaps.

        Returns (read_a, read_b, overlap_length); the reads are None when no
        pair overlaps. Ties go to the first pair in
        itertools.permutations(reads, 2) order.

        overlap_cache only saves recomputing overlap(); every pair is still
        examined, so a cached pair can never pre-empt a better one. """
    import itertools
    reada, readb = None, None
    best_olen = 0

    for a, b in itertools.permutations(reads, 2):
        cached = overlap_cache.get((a, b))
        if cached is not None and cached >= k:
            # overlap() reports the longest overlap, so a stored length >= k
            # is also the answer for this k.
            olen = cached
        else:
            olen = overlap(a, b, min_length=k)
            if olen > 0:
                overlap_cache[(a, b)] = olen
        if olen > best_olen:
            reada, readb = a, b
            best_olen = olen

    return reada, readb, best_olen


# adapted from: http://nbviewer.ipython.org/github/Benlangmead/ads1-notebooks/blob/master/4.02_GreedySCS.ipynb
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain.

        NOTE: 'reads' is modified in place (merged reads replace their two
        parts); pass a copy if the original list is still needed. The
        remaining reads are concatenated to form the result. """
    read_a, read_b, olen = pick_maximal_overlap(reads, k)

    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])

        read_a, read_b, olen = pick_maximal_overlap(reads, k)

    return ''.join(reads)


def validated_greedy_scs():
    res1 = greedy_scs(['ABC', 'BCA', 'CAB'], 2)
    assert res1 == 'CABCA'

    res2 = greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)
    assert res2 == 'CDBCABCDA'
def question3and4():
    """Assemble the reads in ads1_week4_reads.fq and print the answers to questions 3 and 4.

    greedy_scs is tried for k = 30..99 and then for k = 30 down to 2; the
    search stops at the first k whose assembly has the expected genome length.
    """
    from datetime import datetime

    reads, _qualities = readFastq('ads1_week4_reads.fq')

    def assembles_at(k):
        """Run one greedy assembly for k, report on it, and tell whether it hit the target length."""
        print("timestamp: ", datetime.now())
        # greedy_scs modifies the list it is given, so hand it a copy of the reads
        result = greedy_scs(list(reads), k)
        print("Found result which is {0} bases long for k={1}".format(len(result), k))

        # Hint: the virus genome you are assembling is exactly 15,894 bases long
        if len(result) != 15894:
            return False
        print("Question3: ", result.count('A'))
        print("Question4: ", result.count('T'))
        return True

    for k in range(30, 100):
        if assembles_at(k):
            return

    for k in range(30, 1, -1):
        if assembles_at(k):
            return


def main():
    """Run the checks and print the answers for homework 4, in order."""
    test01()
    question1()
    example1()
    example2()
    question2()
    validated_greedy_scs()

    question3and4()


if __name__ == '__main__':
    main()
14 | 15 | It would be much more productive to focus on finding the actual pattern matches or results than on trying to match 16 | the number of character comparisons... which `homework2.py` actually does: it does find the same answers as far as 17 | the actual pattern matching is concerned :-) 18 | 19 | While I have not tried to optimise or to use any kind of clever tricks, and while `homework2.py` does considerably more 20 | than expected (many validations, tests, etc.) in an unoptimised way, it's still much faster than the expected code. 21 | 22 | Question | Answer expected from Quiz | Answer from ProgrammingHomework2 23 | -------- | --------------------------| --------------------------------- 24 | 1 | 799954 | 85000 25 | 2 | 984143 | 104266 26 | 3 | 127974 | 15291 27 | 4 | ? | 2 28 | 5 | 90 | 3 29 | 6 | 79 | 3 30 | RunTime | 6.847s | 1.917s and 1.159s w/o tests 31 | 32 | ## Homework3 33 | Score 4/4 34 | 35 | ## Homework4 36 | Score 4/4 37 | --------------------------------------------------------------------------------