├── .coveragerc ├── .gitignore ├── .travis.yml ├── HISTORY.rst ├── MANIFEST.in ├── README.rst ├── align ├── __init__.py ├── align.py ├── calign.pyx ├── data │ ├── BLOSUM62 │ └── DNA ├── matrix.py └── tests │ └── test_align.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = align 4 | 5 | [report] 6 | exclude_lines = 7 | if self.debug: 8 | pragma: no cover 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | omit = 12 | align/tests/* 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build/ 3 | dist/ 4 | align.egg-info/ 5 | .coverage 6 | *.c 7 | *.so 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "3.5" 7 | - "2.7" 8 | 9 | before_install: 10 | - pip install setuptools==18.0 11 | - pip install -r requirements-dev.txt 12 | 13 | install: 14 | - python setup.py install 15 | - pip install codecov 16 | 17 | script: 18 | - nosetests -v --with-coverage align/tests 19 | 20 | after_success: 21 | - codecov 22 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ======= 5 | 6 | Version 0.0.3 7 | ------------- 8 | 9 | Backwards incompatible changes since version 0.0.2: 10 | 11 | * Alignment results are now stored in an object called ``AlignmentResult``. 12 | The pure Python implementation is based on the ``namedtuple`` class. 
13 | 14 | * The ``AlignmentResult`` object, in addition to containing the aligned 15 | sequences, also contains the alignment start positions, end positions, 16 | number of gaps on each sequence, number of mismatches, and the alignment 17 | score. 18 | 19 | * A new argument, ``max_hits``, is added to the ``aligner``. This 20 | determines the maximum number of alignments to return in case there 21 | are multiple optimal alignments (i.e. alignments with the same maximum 22 | score). The default value is 1. When set to ``None``, all optimal 23 | alignments are returned, except for when the method is ``global``. 24 | In this case, only one optimal alignment is returned. This also changes 25 | the return type of ``aligner`` to be a list, regardless of how many 26 | alignments are returned. 27 | 28 | Additionally, several internal code refactorings were done. 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include HISTORY.rst 3 | include LICENSE 4 | include README.rst 5 | include requirements.txt 6 | include requirements-dev.txt 7 | 8 | recursive-include * *.pyx 9 | recursive-include tests * 10 | recursive-exclude * __pycache__ 11 | recursive-exclude * *.py[co] 12 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ++++++++++++++++++++++++++++++++++++++++ 2 | Align: polite, proper sequence alignment 3 | ++++++++++++++++++++++++++++++++++++++++ 4 | 5 | :Authors: Marcin Cieślik, Brent Pedersen (brentp), Wibowo Arindrarto (bow) 6 | :Email: (marcin), bpederse@gmail.com, bow@bow.web.id 7 | :License: BSD 8 | 9 | .. contents :: 10 | 11 | 12 | About 13 | ===== 14 | Various packages implement sequence alignment algorithms with various levels of 15 | success. 
This package is an attempt to handle the sequence alignment properly, 16 | including edge-cases. 17 | 18 | 19 | Usage 20 | ===== 21 | 22 | usage will change. currently :: 23 | 24 | >>> from align.matrix import DNAFULL 25 | >>> from align import aligner 26 | 27 | >>> aligner('WW','WEW', method= 'global') 28 | ('W-W', 'WEW') 29 | 30 | >>> aligner('WW','WEWWEW', method= 'glocal') 31 | ('WW', 'WW') 32 | 33 | 34 | >>> aligner('TAATTC', 'TAAT', method='global', matrix=DNAFULL, gap_open=-10, gap_extend=-1) 35 | ('TAATTC', 'TAAT--') 36 | 37 | >>> aligner('PYNCHAN', 'YNCH', method='local') 38 | ('YNCH', 'YNCH') 39 | 40 | >>> a, b = aligner('AAAAAAAAAAAAACCTGCGCCCCAAAAAAAAAAAAAAAAAAAA', 'CCTGCGCACCCC', method='global_cfe') 41 | >>> print "%s\n%s" % (a, b) 42 | AAAAAAAAAAAAACCTGCGC-CCCAAAAAAAAAAAAAAAAAAAA 43 | -------------CCTGCGCACCCC------------------- 44 | -------------------------------------------------------------------------------- /align/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | try: 4 | from calign import aligner, score_alignment 5 | except ImportError: 6 | from . 
def aligner(seqj, seqi, method='global', gap_open=-7, gap_extend=-7,
            gap_double=-7, matrix=BLOSUM62, max_hits=1):
    '''Calculates the alignment of two sequences.

    The supported 'methods' are:
        * 'global' for a global Needleman-Wunsch algorithm
        * 'local' for a local Smith-Waterman alignment
        * 'global_cfe' for a global alignment with cost-free ends
        * 'glocal' for an alignment which is 'global' only with respect to
          the shorter sequence (also known as a 'semi-global' alignment)

    Returns a list of ``AlignmentResult`` objects. The list is returned
    regardless of how many alignments were found; the aligned
    (sub)sequences inside each result are byte strings.

    Gotoh, O. (1982). J. Mol. Biol. 162, 705-708.
    Needleman, S. & Wunsch, C. (1970). J. Mol. Biol. 48(3), 443-53.
    Smith, T.F. & Waterman M.S. (1981). J. Mol. Biol. 147, 195-197.

    Arguments:

        - seqj (``sequence``) First aligned iterable object of symbols.
        - seqi (``sequence``) Second aligned iterable object of symbols.
        - method (``str``) Type of alignment: 'global', 'global_cfe', 'local',
          'glocal'.
        - gap_open (``float``) The gap-opening cost.
        - gap_extend (``float``) The cost of extending an open gap.
        - gap_double (``float``) The gap-opening cost if a gap is already open
          in the other sequence.
        - matrix (``dict``) A score matrix dictionary, indexed as
          ``matrix[symbol][symbol]`` with one-byte ``bytes`` keys.
        - max_hits (``int``) The maximum number of results to return in
          case multiple alignments with the same score are found. If set to
          `None`, all alignments with the maximum score are returned. The
          'global' method always yields exactly one alignment.

    Raises:

        - ``AssertionError`` if ``max_hits`` is neither `None` nor positive.
        - ``RuntimeError`` if the 'global_cfe' end-point selection hits a
          combination of maxima it does not know how to break ties for.
    '''
    assert max_hits is None or max_hits > 0
    NONE, LEFT, UP, DIAG = range(4)  # NONE is 0
    # Indexing ``bytes`` yields ints on Python 3 but one-char strings on
    # Python 2, so the gap symbol has to follow the same convention.
    GAP_CHAR = ord('-') if not IS_PY2 else '-'

    # Encode first so the lengths below describe the byte strings that are
    # actually indexed while filling the matrices.
    seqi = seqi.encode() if not isinstance(seqi, bytes) else seqi
    seqj = seqj.encode() if not isinstance(seqj, bytes) else seqj
    max_j = len(seqj)
    max_i = len(seqi)

    # Keep the longer sequence on the i (row) axis; ``flip`` remembers that
    # the caller's order has to be restored in the results.
    if max_j > max_i:
        flip = 1
        seqi, seqj = seqj, seqi
        max_i, max_j = max_j, max_i
    else:
        flip = 0

    shape = (max_i + 1, max_j + 1)
    F = np.zeros(shape, dtype=np.float32)           # best score per cell
    I = np.full(shape, -np.inf, dtype=np.float32)   # best score ending LEFT
    J = np.full(shape, -np.inf, dtype=np.float32)   # best score ending UP
    pointer = np.zeros(shape, dtype=np.uint)        # NONE

    if method == 'global':
        pointer[0, 1:] = LEFT
        pointer[1:, 0] = UP
        F[0, 1:] = gap_open + gap_extend * \
            np.arange(0, max_j, dtype=np.float32)
        F[1:, 0] = gap_open + gap_extend * \
            np.arange(0, max_i, dtype=np.float32)
    elif method == 'global_cfe':
        pointer[0, 1:] = LEFT
        pointer[1:, 0] = UP
    elif method == 'glocal':
        pointer[0, 1:] = LEFT
        F[0, 1:] = gap_open + gap_extend * \
            np.arange(0, max_j, dtype=np.float32)

    for i in range(1, max_i + 1):
        ci = seqi[i - 1:i]
        for j in range(1, max_j + 1):
            cj = seqj[j - 1:j]
            # I
            I[i, j] = max(
                F[i, j - 1] + gap_open,
                I[i, j - 1] + gap_extend,
                J[i, j - 1] + gap_double)
            # J
            J[i, j] = max(
                F[i - 1, j] + gap_open,
                J[i - 1, j] + gap_extend,
                I[i - 1, j] + gap_double)
            # F
            diag_score = F[i - 1, j - 1] + matrix[cj][ci]
            left_score = I[i, j]
            up_score = J[i, j]
            max_score = max(diag_score, up_score, left_score)

            F[i, j] = max(0, max_score) if method == 'local' else max_score

            if method == 'local':
                if F[i, j] == 0:
                    pass  # pointer[i, j] stays NONE
                elif max_score == diag_score:
                    pointer[i, j] = DIAG
                elif max_score == up_score:
                    pointer[i, j] = UP
                elif max_score == left_score:
                    pointer[i, j] = LEFT
            elif method == 'glocal':
                # In a semi-global alignment we want to consume as much as
                # possible of the longer sequence.
                if max_score == up_score:
                    pointer[i, j] = UP
                elif max_score == diag_score:
                    pointer[i, j] = DIAG
                elif max_score == left_score:
                    pointer[i, j] = LEFT
            else:
                # global
                if max_score == up_score:
                    pointer[i, j] = UP
                elif max_score == left_score:
                    pointer[i, j] = LEFT
                else:
                    pointer[i, j] = DIAG

    # container for traceback coordinates
    # NOTE: the corner cell is addressed as (max_i, max_j) rather than through
    # the loop variables, which are undefined when a sequence is empty.
    ij_pairs = []
    if method == 'local':
        # max anywhere
        maxv_indices = np.argwhere(F == F.max())[:max_hits]
        for index in maxv_indices:
            ij_pairs.append(index)
    elif method == 'glocal':
        # max in last col
        col_max = F[:, -1].max()
        maxi_indices = np.argwhere(F[:, -1] == col_max)\
            .flatten()[:max_hits]
        for row in maxi_indices:
            ij_pairs.append((row, max_j))
    elif method == 'global_cfe':
        # from i,j to max(max(last row), max(last col)) for free
        row_max = F[-1].max()
        col_max = F[:, -1].max()
        # expecting max to exist on either last column or last row
        if row_max > col_max:
            col_idces = np.argwhere(F[-1] == row_max).flatten()[:max_hits]
            for cid in col_idces:
                ij_pairs.append((max_i, cid))
        elif row_max < col_max:
            row_idces = np.argwhere(F[:, -1] == col_max).flatten()[:max_hits]
            for rid in row_idces:
                ij_pairs.append((rid, max_j))
        # special case: max is on last row, last col
        elif row_max == col_max == F[max_i, max_j]:
            # check if max score also exist on other cells in last row
            # or last col. we expect only one of the case.
            col_idces = np.argwhere(F[-1] == row_max).flatten()
            row_idces = np.argwhere(F[:, -1] == col_max).flatten()
            ncol_idces = len(col_idces)
            nrow_idces = len(row_idces)

            # tiebreaker between row/col is whichever has more max scores
            if ncol_idces > nrow_idces:
                for cid in col_idces[:max_hits]:
                    ij_pairs.append((max_i, cid))
            elif ncol_idces < nrow_idces:
                for rid in row_idces[:max_hits]:
                    ij_pairs.append((rid, max_j))
            elif ncol_idces == nrow_idces == 1:
                ij_pairs.append((max_i, max_j))
            else:
                raise RuntimeError('Unexpected multiple maximum global_cfe'
                                   ' scores.')
        else:
            raise RuntimeError('Unexpected global_cfe scenario.')
    else:
        # method must be global at this point
        ij_pairs.append((max_i, max_j))

    results = []
    for i, j in ij_pairs:
        # Both alignment rows are built back-to-front and reversed at the end.
        align_j = []
        align_i = []
        score = F[i, j]
        p = pointer[i, j]
        # mimic Python's coord system
        if method.startswith("global"):
            end_i, end_j = max_i, max_j
        else:
            end_i, end_j = i, j
        n_gaps_i, n_gaps_j, n_mmatch = 0, 0, 0

        # special case for global_cfe ~ one cell may contain multiple pointer
        # directions
        if method == 'global_cfe':
            # The free overhang past the trace-back start is emitted as one
            # gap run opposite the unaligned tail of the other sequence.
            if i < max_i:
                align_i.extend(seqi[i:][::-1])
                align_j.extend([GAP_CHAR] * (max_i - i))
                n_gaps_j += 1
            elif j < max_j:
                align_i.extend([GAP_CHAR] * (max_j - j))
                align_j.extend(seqj[j:][::-1])
                n_gaps_i += 1

        while p != NONE:
            if p == DIAG:
                i -= 1
                j -= 1
                ichar = seqi[i]
                jchar = seqj[j]
                if ichar != jchar:
                    n_mmatch += 1
                align_j.append(jchar)
                align_i.append(ichar)
            elif p == LEFT:
                j -= 1
                align_j.append(seqj[j])
                # Count a gap only when a new run of gap symbols starts.
                if not align_i or align_i[-1] != GAP_CHAR:
                    n_gaps_i += 1
                align_i.append(GAP_CHAR)
            elif p == UP:
                i -= 1
                align_i.append(seqi[i])
                if not align_j or align_j[-1] != GAP_CHAR:
                    n_gaps_j += 1
                align_j.append(GAP_CHAR)
            else:
                raise Exception('wtf!')
            p = pointer[i, j]

        align_i = bytes(align_i[::-1]) \
            if not IS_PY2 else ''.join(align_i[::-1])
        align_j = bytes(align_j[::-1]) \
            if not IS_PY2 else ''.join(align_j[::-1])

        # Undo the initial swap so seq1/seq2 follow the caller's
        # (seqj, seqi) argument order.
        aln = (AlignmentResult(align_i, align_j, i, j, end_i, end_j,
                               n_gaps_i, n_gaps_j, n_mmatch, score)
               if flip else
               AlignmentResult(align_j, align_i, j, i, end_j, end_i,
                               n_gaps_j, n_gaps_i, n_mmatch, score))

        results.append(aln)

    return results
cdef list caligner(
        const unsigned char* seqj, const unsigned char* seqi, const int imethod,
        const DTYPE_FLOAT gap_open, const DTYPE_FLOAT gap_extend, const DTYPE_FLOAT gap_double,
        const DTYPE_FLOAT[:, :] amatrix, const bint flipped, max_hits):
    """Aligns two NUL-terminated byte sequences and returns the results.

    ``imethod`` is the integer code from ``METHODS`` (0 global, 1 local,
    2 glocal, 3 global_cfe). ``amatrix`` is indexed by the byte values of the
    two symbols being compared. ``flipped`` tells whether the caller swapped
    the sequences, so that each ``AlignmentResult`` can be reported in the
    caller's original order. ``max_hits`` limits the number of trace-back
    starts taken (``None`` takes all of them).

    Returns a list of ``AlignmentResult`` objects. Raises ``MemoryError`` if
    the trace-back buffers cannot be allocated and ``RuntimeError`` for
    global_cfe maxima combinations without a defined tie-break.
    """
    cdef:
        unsigned char* align_j
        unsigned char* align_i
        unsigned char ci, cj
        size_t max_j = strlen(seqj)
        size_t max_i = strlen(seqi)
        size_t align_counter = 0
        size_t i = 1, j = 1
        int ncol_idces, nrow_idces, idx
        int end_i, end_j, n_gaps_i, n_gaps_j, n_mmatch
        DTYPE_UINT p
        DTYPE_FLOAT diag_score, left_score, up_score, max_score, aln_score
        np.ndarray[DTYPE_FLOAT, ndim=2] agap_i = np.empty((max_i + 1, max_j + 1), dtype=np.float32)
        np.ndarray[DTYPE_FLOAT, ndim=2] agap_j = np.empty((max_i + 1, max_j + 1), dtype=np.float32)
        np.ndarray[DTYPE_FLOAT, ndim=2] score = np.zeros((max_i + 1, max_j + 1), dtype=np.float32)
        np.ndarray[DTYPE_UINT, ndim=2] pointer = np.zeros((max_i + 1, max_j + 1), dtype=np.uint)
        list indices = [], row_idces = [], col_idces = []
        list results = [], tracestart_coords = []

    agap_i.fill(-np.inf)
    agap_j.fill(-np.inf)

    # START HERE:
    if imethod == 0:
        pointer[0, 1:] = LEFT
        pointer[1:, 0] = UP
        score[0, 1:] = gap_open + gap_extend * np.arange(0, max_j, dtype=np.float32)
        score[1:, 0] = gap_open + gap_extend * np.arange(0, max_i, dtype=np.float32)
    elif imethod == 3:
        pointer[0, 1:] = LEFT
        pointer[1:, 0] = UP
    elif imethod == 2:
        pointer[0, 1:] = LEFT
        score[0, 1:] = gap_open + gap_extend * np.arange(0, max_j, dtype=np.float32)

    cdef DTYPE_FLOAT matrix_max = 0, row_max = score[-1, 0], col_max = score[0, -1]

    for i in range(1, max_i + 1):
        ci = seqi[i - 1]
        for j in range(1, max_j + 1):
            cj = seqj[j - 1]
            # agap_i
            agap_i[i,j] = max3(
                score[i, j - 1] + gap_open,
                agap_i[i, j - 1] + gap_extend,
                agap_j[i, j - 1] + gap_double)
            # agap_j
            agap_j[i,j] = max3(
                score[i - 1, j] + gap_open,
                agap_j[i - 1, j] + gap_extend,
                agap_i[i - 1, j] + gap_double)
            # score
            diag_score = score[i - 1, j - 1] + amatrix[ci, cj]
            left_score = agap_i[i, j]
            up_score = agap_j[i, j]
            max_score = max3(diag_score, up_score, left_score)
            if imethod == 1:
                max_score = max2(0, max_score)

            score[i, j] = max_score

            if imethod == 1:
                if score[i,j] == 0:
                    pass  # pointer[i,j] stays NONE
                elif max_score == diag_score:
                    pointer[i,j] = DIAG
                elif max_score == up_score:
                    pointer[i,j] = UP
                elif max_score == left_score:
                    pointer[i,j] = LEFT

                # Manual tracking of [i, j] coordinates where score is max
                if max_score > matrix_max:
                    matrix_max = max_score
                    indices = [(i, j)]
                elif max_score == matrix_max:
                    indices.append((i, j))

            elif imethod == 2:
                # In a semi-global alignment we want to consume as much as
                # possible of the longer sequence.
                if max_score == up_score:
                    pointer[i,j] = UP
                elif max_score == diag_score:
                    pointer[i,j] = DIAG
                elif max_score == left_score:
                    pointer[i,j] = LEFT

                # Manual tracking of [i, j] coordinates where col score is max
                if j == max_j:
                    if max_score > col_max:
                        col_max = max_score
                        col_idces = [(i, j)]
                    elif max_score == col_max:
                        col_idces.append((i, j))

            else:
                # global
                if max_score == up_score:
                    pointer[i,j] = UP
                elif max_score == left_score:
                    pointer[i,j] = LEFT
                else:
                    pointer[i,j] = DIAG

                if imethod == 3:
                    # Manual tracking of [i, j] coordinates where col score is max
                    if j == max_j:
                        if max_score > col_max:
                            col_max = max_score
                            col_idces = [(i, j)]
                        elif max_score == col_max:
                            col_idces.append((i, j))
                    # ... and where the score on the last row is max
                    if i == max_i:
                        if max_score > row_max:
                            row_max = max_score
                            row_idces = [(i, j)]
                        elif max_score == row_max:
                            row_idces.append((i, j))

    if imethod == 1:
        for index in islice(indices, max_hits):
            tracestart_coords.append(index)

    elif imethod == 2:
        for index in islice(col_idces, max_hits):
            tracestart_coords.append(index)

    elif imethod == 3:
        # from i,j to max(max(last row), max(last col)) for free
        # expecting max to exist on either last column or last row
        if row_max > col_max:
            for index in islice(row_idces, max_hits):
                tracestart_coords.append(index)
        elif row_max < col_max:
            for index in islice(col_idces, max_hits):
                tracestart_coords.append(index)
        # special case: max is on last row, last col
        elif row_max == col_max == score[max_i, max_j]:
            ncol_idces = len(col_idces)
            nrow_idces = len(row_idces)
            # tiebreaker between row/col is whichever has more max scores
            if ncol_idces > nrow_idces:
                for index in islice(col_idces, max_hits):
                    tracestart_coords.append(index)
            elif ncol_idces < nrow_idces:
                for index in islice(row_idces, max_hits):
                    tracestart_coords.append(index)
            elif ncol_idces == nrow_idces == 1:
                tracestart_coords.append((max_i, max_j))
            else:
                raise RuntimeError('Unexpected multiple maximum global_cfe'
                                   ' scores.')
        else:
            raise RuntimeError('Unexpected global_cfe scenario.')
    else:
        # method must be global at this point
        tracestart_coords.append((max_i, max_j))

    # An alignment can never be longer than both sequences laid end to end.
    seqlen = max_i + max_j
    for i, j in tracestart_coords:
        if imethod == 0 or imethod == 3:
            end_i, end_j = max_i, max_j
        else:
            end_i, end_j = i, j
        aln_score = score[i, j]
        p = pointer[i, j]
        n_gaps_i, n_gaps_j, n_mmatch = 0, 0, 0
        align_counter = 0
        align_i = <unsigned char*>PyMem_Malloc(seqlen * sizeof(unsigned char))
        align_j = <unsigned char*>PyMem_Malloc(seqlen * sizeof(unsigned char))
        if align_i == NULL or align_j == NULL:
            # PyMem_Free accepts NULL, so whichever buffer did get allocated
            # is released before reporting the failure.
            PyMem_Free(align_i)
            PyMem_Free(align_j)
            raise MemoryError()

        # The buffers are released in ``finally`` so that an exception during
        # the trace-back does not leak them.
        try:
            # special case for global_cfe ~ one cell may contain multiple
            # pointer directions
            if imethod == 3:
                if i < max_i:
                    n_gaps_j += 1
                    align_counter = max_i - i
                    for idx in range(align_counter):
                        align_j[idx] = GAP_CHAR
                        align_i[idx] = seqi[max_i - 1 * idx - 1]
                elif j < max_j:
                    n_gaps_i += 1
                    align_counter = max_j - j
                    for idx in range(align_counter):
                        align_i[idx] = GAP_CHAR
                        align_j[idx] = seqj[max_j - 1 * idx - 1]

            while p != NONE:
                if p == DIAG:
                    i -= 1
                    j -= 1
                    if seqi[i] != seqj[j]:
                        n_mmatch += 1
                    align_j[align_counter] = seqj[j]
                    align_i[align_counter] = seqi[i]
                elif p == LEFT:
                    j -= 1
                    align_j[align_counter] = seqj[j]
                    # Test the counter first: it is unsigned, so indexing
                    # with ``align_counter - 1`` at 0 would read outside the
                    # buffer.
                    if align_counter == 0 or align_i[align_counter - 1] != GAP_CHAR:
                        n_gaps_i += 1
                    align_i[align_counter] = GAP_CHAR
                elif p == UP:
                    i -= 1
                    align_i[align_counter] = seqi[i]
                    if align_counter == 0 or align_j[align_counter - 1] != GAP_CHAR:
                        n_gaps_j += 1
                    align_j[align_counter] = GAP_CHAR
                else:
                    raise Exception('wtf!')
                p = pointer[i, j]
                align_counter += 1

            # The buffers were filled back-to-front; reverse while copying
            # them into Python bytes objects.
            alns_i = bytes(align_i[:align_counter][::-1])
            alns_j = bytes(align_j[:align_counter][::-1])
        finally:
            PyMem_Free(align_i)
            PyMem_Free(align_j)

        aln = (AlignmentResult(alns_i, alns_j, i, j, end_i, end_j,
                               n_gaps_i, n_gaps_j, n_mmatch, aln_score)
               if flipped else
               AlignmentResult(alns_j, alns_i, j, i, end_j, end_i,
                               n_gaps_j, n_gaps_i, n_mmatch, aln_score))

        results.append(aln)

    return results
326 | 327 | Arguments: 328 | 329 | - seqj (``sequence``) First aligned iterable object of symbols. 330 | - seqi (``sequence``) Second aligned iterable object of symbols. 331 | - method (``str``) Type of alignment: 'global', 'global_cfe', 'local', 332 | 'glocal'. 333 | - gap_open (``float``) The gap-opening cost. 334 | - gap_extend (``float``) The cost of extending an open gap. 335 | - gap_double (``float``) The gap-opening cost if a gap is already open 336 | in the other sequence. 337 | - matrix (``dict``) A score matrix dictionary. 338 | - max_hits (``int``) The maximum number of results to return in 339 | case multiple alignments with the same score are found. If set to 1, 340 | a single ``AlignmentResult`` object is returned. If set to values 341 | larger than 1, a list containing ``AlignmentResult`` objects are 342 | returned. If set to `None`, all alignments with the maximum score 343 | are returned. 344 | ''' 345 | assert gap_extend <= 0, "gap_extend penalty must be <= 0" 346 | assert gap_open <= 0, "gap_open must be <= 0" 347 | assert max_hits is None or max_hits > 0 348 | 349 | max_j = len(seqj) 350 | max_i = len(seqi) 351 | seq1 = seqj if isinstance(seqj, bytes) else bytes(seqj, 'ascii') 352 | seq2 = seqi if isinstance(seqi, bytes) else bytes(seqi, 'ascii') 353 | 354 | if max_j > max_i: 355 | flipped = 1 356 | seq1, seq2 = seq2, seq1 357 | max_i, max_j = max_j, max_i 358 | else: 359 | flipped = 0 360 | 361 | if isinstance(matrix, dict): 362 | score_matrix = make_cmatrix(matrix) 363 | elif matrix == "BLOSUM62": 364 | score_matrix = m_BLOSUM62 365 | elif matrix == "DNAFULL": 366 | score_matrix = m_DNAFULL 367 | else: 368 | raise ValueError("Invalid premade scoring matrix:" 369 | " {0}.".format(matrix)) 370 | 371 | return caligner(seq1, seq2, METHODS[method], 372 | gap_open, gap_extend, gap_double, 373 | score_matrix, flipped, max_hits) 374 | -------------------------------------------------------------------------------- /align/data/BLOSUM62: 
-------------------------------------------------------------------------------- 1 | # Entries for the BLOSUM62 matrix at a scale of ln(2)/2.0. 2 | A R N D C Q E G H I L K M F P S T W Y V B J Z X * 3 | A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -1 -4 4 | R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2 0 -1 -4 5 | N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 4 -3 0 -1 -4 6 | D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 -3 1 -1 -4 7 | C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4 8 | Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -2 4 -1 -4 9 | E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 -3 4 -1 -4 10 | G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -4 -2 -1 -4 11 | H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -3 0 -1 -4 12 | I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 3 -3 -1 -4 13 | L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 3 -3 -1 -4 14 | K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -3 1 -1 -4 15 | M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 2 -1 -1 -4 16 | F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 0 -3 -1 -4 17 | P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4 18 | S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -2 0 -1 -4 19 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -1 -4 20 | W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -2 -2 -1 -4 21 | Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -1 -2 -1 -4 22 | V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 2 -2 -1 -4 23 | B -2 -1 4 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 -3 0 -1 -4 24 | J -1 -2 -3 -3 -1 -2 -3 -4 -3 3 3 -3 2 0 -3 -2 -1 -2 -1 2 -3 3 -3 -1 -4 25 | Z -1 0 0 1 -3 4 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -2 -2 -2 0 -3 4 -1 -4 26 | X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
-4 27 | * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 28 | -------------------------------------------------------------------------------- /align/data/DNA: -------------------------------------------------------------------------------- 1 | A T G C S W R Y K M B V H D N U 2 | A 5 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2 -4 3 | T -4 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 5 4 | G -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2 -4 5 | C -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2 -4 6 | S -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1 -4 7 | W 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1 1 8 | R 1 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1 -4 9 | Y -4 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1 1 10 | K -4 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1 1 11 | M 1 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1 -4 12 | B -4 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1 -1 13 | V -1 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1 -4 14 | H -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1 -1 15 | D -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1 -1 16 | N -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 17 | U -4 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 5 18 | -------------------------------------------------------------------------------- /align/matrix.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __all__ = ['DNAFULL', 'BLOSUM62'] 4 | 5 | _NUCL_LETTERS = [ 6 | b'A', b'T', b'G', b'C', b'U', # non-ambiguous letters 7 | b'R', b'Y', b'S', b'W', b'K', b'M', # 2-base ambiguity 8 | b'B', b'D', b'H', b'V', # 3-base ambiguity 9 | b'N', # 4-base ambiguity 10 | ] 11 | 12 | _NUCL_SCORES = { 13 | # A T G C U R Y S W K M B D H V N 14 | b'A': [ 5, -4, -4, -4, -4, 1, -4, -4, 1, -4, 1, -4, -1, -1, -1, -2], 15 | b'T': [-4, 5, -4, -4, 5, -4, 1, -4, 1, 1, -4, -1, -1, -1, -4, -2], 16 | b'G': [-4, -4, 5, -4, -4, 1, -4, 1, -4, 1, -4, -1, -1, -4, -1, -2], 17 | b'C': [-4, -4, -4, 5, -4, -4, 1, 1, -4, -4, 1, -1, -4, -1, -1, -2], 18 | b'U': [-4, 5, 
-4, -4, 5, -4, 1, -4, 1, 1, -4, -1, -1, -1, -4, -2], 19 | 20 | b'R': [ 1, -4, 1, -4, -4, -1, -4, -2, -2, -2, -2, -3, -1, -3, -1, -1], 21 | b'Y': [-4, 1, -4, 1, 1, -4, -1, -2, -2, -2, -2, -1, -3, -1, -3, -1], 22 | b'S': [-4, -4, 1, 1, -4, -2, -2, -1, -4, -2, -2, -1, -3, -3, -1, -1], 23 | b'W': [ 1, 1, -4, -4, 1, -2, -2, -4, -1, -2, -2, -3, -1, -1, -3, -1], 24 | b'K': [-4, 1, 1, -4, 1, -2, -2, -2, -2, -1, -4, -1, -1, -3, -3, -1], 25 | b'M': [ 1, -4, -4, 1, -4, -2, -2, -2, -2, -4, -1, -3, -3, -1, -1, -1], 26 | 27 | b'B': [-4, -1, -1, -1, -1, -3, -1, -1, -3, -1, -3, -1, -2, -2, -2, -1], 28 | b'D': [-1, -1, -1, -4, -1, -1, -3, -3, -1, -1, -3, -2, -1, -2, -2, -1], 29 | b'H': [-1, -1, -4, -1, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1], 30 | b'V': [-1, -4, -1, -1, -4, -1, -3, -1, -3, -3, -1, -2, -2, -2, -1, -1], 31 | 32 | b'N': [-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 33 | } 34 | 35 | # NOTE: This relies on the ordering of _NUCL_LETTERS and the lists inside 36 | # _NUCL_SCORES above. 
37 | DNAFULL = {letter: dict(zip(_NUCL_LETTERS, scores)) 38 | for letter, scores in _NUCL_SCORES.items()} 39 | 40 | _AA_LETTERS = [ 41 | b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'K', b'L', b'M', 42 | b'N', b'P', b'Q', b'R', b'S', b'T', b'V', b'W', b'X', b'Y', b'Z', b'*', 43 | ] 44 | 45 | _BLOSUM62_SCORES = { 46 | # A B C D E F G H I K L M N P Q R S T V W X Y Z * 47 | b'A': [ 4, -2, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, 0, -2, -1, -4], 48 | b'B': [-2, 4, -3, 4, 1, -3, -1, 0, -3, 0, -4, -3, 3, -2, 0, -1, 0, -1, -3, -4, -1, -3, 1, -4], 49 | b'C': [ 0, -3, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2, -2, -3, -4], 50 | b'D': [-2, 4, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -1, -3, 1, -4], 51 | b'E': [-1, 1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -1, -2, 4, -4], 52 | b'F': [-2, -3, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, -1, 3, -3, -4], 53 | b'G': [ 0, -1, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -1, -3, -2, -4], 54 | b'H': [-2, 0, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, -1, 2, 0, -4], 55 | b'I': [-1, -3, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1, -1, -3, -4], 56 | b'K': [-1, 0, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -1, -2, 1, -4], 57 | b'L': [-1, -4, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1, -1, -3, -4], 58 | b'M': [-1, -3, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1, -1, -1, -4], 59 | b'N': [-2, 3, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -1, -2, 0, -4], 60 | b'P': [-1, -2, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -2, -3, -1, -4], 61 | b'Q': [-1, 0, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1, -1, 3, -4], 62 | b'R': [-1, -1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, 
-3, -1, -2, 0, -4], 63 | b'S': [ 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, 0, -2, 0, -4], 64 | b'T': [ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, 0, -2, -1, -4], 65 | b'V': [ 0, -3, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1, -1, -2, -4], 66 | b'W': [-3, -4, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, -2, 2, -3, -4], 67 | b'X': [ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, 0, 0, -1, -2, -1, -1, -1, -4], 68 | b'Y': [-2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, - 1, 7, -2, -4], 69 | b'Z': [-1, 1, -3, 1, 4, -3, -2, 0, -3, 1, -3, -1, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4, -4], 70 | b'*': [-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1], 71 | } 72 | 73 | # NOTE: This relies on the ordering of _AA_LETTERS and the lists inside 74 | # _BLOSUM62_SCORES above. 75 | BLOSUM62 = {letter: dict(zip(_AA_LETTERS, scores)) 76 | for letter, scores in _BLOSUM62_SCORES.items()} 77 | -------------------------------------------------------------------------------- /align/tests/test_align.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy 4 | import pyximport 5 | pyximport.install(setup_args={ 6 | "include_dirs": numpy.get_include(), 7 | }) # noqa 8 | 9 | import unittest 10 | 11 | from align import AlignmentResult 12 | from align.align import aligner as pyaligner 13 | from align.calign import aligner as caligner 14 | from align.matrix import DNAFULL, BLOSUM62 15 | 16 | 17 | class AlignmentTests(unittest.TestCase): 18 | 19 | @property 20 | def f(self): 21 | return pyaligner 22 | 23 | 24 | class TestGlobalPy(AlignmentTests): 25 | 26 | method = 'global' 27 | 28 | def test_global1(self): 29 | alns = self.f('WW', 'WEW', method=self.method, matrix=BLOSUM62, 30 | max_hits=None) 31 | assert len(alns) == 1, 
alns 32 | aln = alns.pop() 33 | assert aln.seq1 == b'W-W', aln 34 | assert aln.seq2 == b'WEW', aln 35 | assert aln.start1 == 0, aln 36 | assert aln.start2 == 0, aln 37 | assert aln.end1 == 2, aln 38 | assert aln.end2 == 3, aln 39 | assert aln.n_gaps1 == 1, aln 40 | assert aln.n_gaps2 == 0, aln 41 | assert aln.n_mismatches == 0, aln 42 | assert aln.score == 15.0, aln 43 | 44 | def test_global2(self): 45 | alns = self.f('WW', 'WEW', method=self.method, matrix=BLOSUM62, 46 | gap_open=-100, max_hits=None) 47 | assert len(alns) == 1, alns 48 | aln = alns.pop() 49 | assert aln.seq1 == b'W-W', aln 50 | assert aln.seq2 == b'WEW', aln 51 | assert aln.start1 == 0, aln 52 | assert aln.start2 == 0, aln 53 | assert aln.end1 == 2, aln 54 | assert aln.end2 == 3, aln 55 | assert aln.n_gaps1 == 1, aln 56 | assert aln.n_gaps2 == 0, aln 57 | assert aln.n_mismatches == 0, aln 58 | assert aln.score == -78.0, aln 59 | 60 | def test_global3(self): 61 | aln, = self.f('A', 'A', method=self.method, matrix=BLOSUM62, 62 | gap_open=-7, max_hits=None) 63 | assert aln.seq1 == b'A', aln 64 | assert aln.seq2 == b'A', aln 65 | assert aln.start1 == 0, aln 66 | assert aln.start2 == 0, aln 67 | assert aln.end1 == 1, aln 68 | assert aln.end2 == 1, aln 69 | assert aln.n_gaps1 == 0, aln 70 | assert aln.n_gaps2 == 0, aln 71 | assert aln.n_mismatches == 0, aln 72 | assert aln.score == 4.0, aln 73 | 74 | def test_global4(self): 75 | aln, = self.f('R', 'K', method=self.method, matrix=BLOSUM62, 76 | gap_open=-7, max_hits=None) 77 | assert aln.seq1 == b'R', aln 78 | assert aln.seq2 == b'K', aln 79 | assert aln.start1 == 0, aln 80 | assert aln.start2 == 0, aln 81 | assert aln.end1 == 1, aln 82 | assert aln.end2 == 1, aln 83 | assert aln.n_gaps1 == 0, aln 84 | assert aln.n_gaps2 == 0, aln 85 | assert aln.n_mismatches == 1, aln 86 | assert aln.score == 2.0, aln 87 | 88 | def test_global5(self): 89 | alns = self.f('R', 'AR', method=self.method, matrix=BLOSUM62, 90 | gap_open=-7, max_hits=None) 91 | assert 
len(alns) == 1, alns 92 | aln = alns.pop() 93 | assert aln.seq1 == b'-R', aln 94 | assert aln.seq2 == b'AR', aln 95 | assert aln.start1 == 0, aln 96 | assert aln.start2 == 0, aln 97 | assert aln.end1 == 1, aln 98 | assert aln.end2 == 2, aln 99 | assert aln.n_gaps1 == 1, aln 100 | assert aln.n_gaps2 == 0, aln 101 | assert aln.n_mismatches == 0, aln 102 | assert aln.score == -2.0, aln 103 | 104 | def test_global6(self): 105 | alns = self.f('AR', 'R', method=self.method, matrix=BLOSUM62, 106 | gap_open=-7, max_hits=None) 107 | assert len(alns) == 1, alns 108 | aln = alns.pop() 109 | assert aln.seq1 == b'AR', aln 110 | assert aln.seq2 == b'-R', aln 111 | assert aln.start1 == 0, aln 112 | assert aln.start2 == 0, aln 113 | assert aln.end1 == 2, aln 114 | assert aln.end2 == 1, aln 115 | assert aln.n_gaps1 == 0, aln 116 | assert aln.n_gaps2 == 1, aln 117 | assert aln.n_mismatches == 0, aln 118 | assert aln.score == -2.0, aln 119 | 120 | def test_global7(self): 121 | alns = self.f('AR', 'RA', method=self.method, matrix=BLOSUM62, 122 | gap_open=-7, max_hits=None) 123 | assert len(alns) == 1, alns 124 | aln = alns.pop() 125 | assert aln.seq1 == b'AR', aln 126 | assert aln.seq2 == b'RA', aln 127 | assert aln.start1 == 0, aln 128 | assert aln.start2 == 0, aln 129 | assert aln.end1 == 2, aln 130 | assert aln.end2 == 2, aln 131 | assert aln.n_gaps1 == 0, aln 132 | assert aln.n_gaps2 == 0, aln 133 | assert aln.n_mismatches == 2, aln 134 | assert aln.score == -2.0, aln 135 | 136 | def test_global8(self): 137 | alns = self.f('AR', 'RA', method=self.method, matrix=BLOSUM62, 138 | gap_open=-3, max_hits=None) 139 | assert len(alns) == 1, alns 140 | aln = alns.pop() 141 | assert aln.seq1 == b'AR-', aln 142 | assert aln.seq2 == b'-RA', aln 143 | assert aln.start1 == 0, aln 144 | assert aln.start2 == 0, aln 145 | assert aln.end1 == 2, aln 146 | assert aln.end2 == 2, aln 147 | assert aln.n_gaps1 == 1, aln 148 | assert aln.n_gaps2 == 1, aln 149 | assert aln.n_mismatches == 0, aln 150 | 
assert aln.score == -1.0, aln 151 | 152 | def test_global9(self): 153 | alns = self.f('RAR', 'RR', method=self.method, matrix=BLOSUM62, 154 | gap_open=-3, max_hits=None) 155 | assert len(alns) == 1, alns 156 | aln = alns.pop() 157 | assert aln.seq1 == b'RAR', aln 158 | assert aln.seq2 == b'R-R', aln 159 | assert aln.start1 == 0, aln 160 | assert aln.start2 == 0, aln 161 | assert aln.end1 == 3, aln 162 | assert aln.end2 == 2, aln 163 | assert aln.n_gaps1 == 0, aln 164 | assert aln.n_gaps2 == 1, aln 165 | assert aln.n_mismatches == 0, aln 166 | assert aln.score == 7.0, aln 167 | 168 | def test_global10(self): 169 | alns = self.f('RAR', 'RR', method=self.method, matrix=BLOSUM62, 170 | gap_open=-10, max_hits=None) 171 | assert len(alns) == 1, alns 172 | aln = alns.pop() 173 | assert aln.seq1 == b'RAR', aln 174 | assert aln.seq2 == b'R-R', aln 175 | assert aln.start1 == 0, aln 176 | assert aln.start2 == 0, aln 177 | assert aln.end1 == 3, aln 178 | assert aln.end2 == 2, aln 179 | assert aln.n_gaps1 == 0, aln 180 | assert aln.n_gaps2 == 1, aln 181 | assert aln.n_mismatches == 0, aln 182 | assert aln.score == 0.0, aln 183 | 184 | def test_global11(self): 185 | alns = self.f('RAAR', 'RR', method=self.method, matrix=BLOSUM62, 186 | gap_open=-5, max_hits=None) 187 | assert len(alns) == 1, alns 188 | aln = alns.pop() 189 | assert aln.seq1 == b'RAAR', aln 190 | assert aln.seq2 == b'R--R', aln 191 | assert aln.start1 == 0, aln 192 | assert aln.start2 == 0, aln 193 | assert aln.end1 == 4, aln 194 | assert aln.end2 == 2, aln 195 | assert aln.n_gaps1 == 0, aln 196 | assert aln.n_gaps2 == 1, aln 197 | assert aln.n_mismatches == 0, aln 198 | assert aln.score == 0.0, aln 199 | 200 | def test_global12(self): 201 | alns = self.f('RLR', 'RER', method=self.method, matrix=BLOSUM62, 202 | gap_open=-9, max_hits=None) 203 | assert len(alns) == 1, alns 204 | aln = alns.pop() 205 | assert aln.seq1 == b'RLR', aln 206 | assert aln.seq2 == b'RER', aln 207 | assert aln.start1 == 0, aln 208 | assert 
aln.start2 == 0, aln 209 | assert aln.end1 == 3, aln 210 | assert aln.end2 == 3, aln 211 | assert aln.n_gaps1 == 0, aln 212 | assert aln.n_gaps2 == 0, aln 213 | assert aln.n_mismatches == 1, aln 214 | assert aln.score == 7., aln 215 | 216 | def test_global13(self): 217 | alns = self.f('RLR', 'RER', method=self.method, matrix=BLOSUM62, 218 | gap_open=-1, max_hits=None) 219 | assert len(alns) == 1, alns 220 | aln = alns.pop() 221 | assert aln.seq1 == b'RL-R', aln 222 | assert aln.seq2 == b'R-ER', aln 223 | assert aln.start1 == 0, aln 224 | assert aln.start2 == 0, aln 225 | assert aln.end1 == 3, aln 226 | assert aln.end2 == 3, aln 227 | assert aln.n_gaps1 == 1, aln 228 | assert aln.n_gaps2 == 1, aln 229 | assert aln.n_mismatches == 0, aln 230 | assert aln.score == 8.0, aln 231 | 232 | def test_global14(self): 233 | alns = self.f('RLR', 'REER', method=self.method, matrix=BLOSUM62, 234 | gap_open=-1, max_hits=None) 235 | assert len(alns) == 1, alns 236 | aln = alns.pop() 237 | assert aln.seq1 == b'RL--R', aln 238 | assert aln.seq2 == b'R-EER', aln 239 | assert aln.start1 == 0, aln 240 | assert aln.start2 == 0, aln 241 | assert aln.end1 == 3, aln 242 | assert aln.end2 == 4, aln 243 | assert aln.n_gaps1 == 1, aln 244 | assert aln.n_gaps2 == 1, aln 245 | assert aln.n_mismatches == 0, aln 246 | assert aln.score == 7.0, aln 247 | 248 | def test_global15(self): 249 | alns = self.f('AGEBAM', 'AGEBAMAM', method=self.method, 250 | matrix=BLOSUM62, gap_open=-6, max_hits=None) 251 | assert len(alns) == 1, alns 252 | aln = alns.pop() 253 | assert aln.seq1 == b'AGEBAM--', aln 254 | assert aln.seq2 == b'AGEBAMAM', aln 255 | assert aln.start1 == 0, aln 256 | assert aln.start2 == 0, aln 257 | assert aln.end1 == 6, aln 258 | assert aln.end2 == 8, aln 259 | assert aln.n_gaps1 == 1, aln 260 | assert aln.n_gaps2 == 0, aln 261 | assert aln.n_mismatches == 0, aln 262 | assert aln.score == 16.0, aln 263 | 264 | def test_global16(self): 265 | alns = self.f('CPELIRKNCANTH', 'PREKRLICAN', 
method=self.method, 266 | matrix=BLOSUM62, gap_open=-0.5, max_hits=None) 267 | assert len(alns) == 1, alns 268 | aln = alns.pop() 269 | assert aln.seq1 == b'CP-E--LIRKNCANTH', aln 270 | assert aln.seq2 == b'-PREKRLI---CAN--', aln 271 | assert aln.start1 == 0, aln 272 | assert aln.start2 == 0, aln 273 | assert aln.end1 == 13, aln 274 | assert aln.end2 == 10, aln 275 | assert aln.n_gaps1 == 2, aln 276 | assert aln.n_gaps2 == 3, aln 277 | assert aln.n_mismatches == 0, aln 278 | assert aln.score == 34.5, aln 279 | 280 | def test_global17(self): 281 | alns = self.f('CPEL', 'PREK', method=self.method, matrix=BLOSUM62, 282 | gap_open=-5, max_hits=None) 283 | assert len(alns) == 1, alns 284 | aln = alns.pop() 285 | assert aln.seq1 == b'CP-EL', aln 286 | assert aln.seq2 == b'-PREK', aln 287 | assert aln.start1 == 0, aln 288 | assert aln.start2 == 0, aln 289 | assert aln.end1 == 4, aln 290 | assert aln.end2 == 4, aln 291 | assert aln.n_gaps1 == 1, aln 292 | assert aln.n_gaps2 == 1, aln 293 | assert aln.n_mismatches == 1, aln 294 | assert aln.score == 0.0, aln 295 | 296 | def test_global18(self): 297 | alns = self.f('RLRR', 'RRER', method=self.method, matrix=BLOSUM62, 298 | gap_open=-1, max_hits=None) 299 | assert len(alns) == 1, alns 300 | aln = alns.pop() 301 | assert aln.seq1 == b'RLR-R', aln 302 | assert aln.seq2 == b'R-RER', aln 303 | assert aln.start1 == 0, aln 304 | assert aln.start2 == 0, aln 305 | assert aln.end1 == 4, aln 306 | assert aln.end2 == 4, aln 307 | assert aln.n_gaps1 == 1, aln 308 | assert aln.n_gaps2 == 1, aln 309 | assert aln.n_mismatches == 0, aln 310 | assert aln.score == 13.0, aln 311 | 312 | def test_global19(self): 313 | alns = self.f('TAAT', 'TAATTC', method=self.method, matrix=DNAFULL, 314 | max_hits=None) 315 | assert len(alns) == 1, alns 316 | aln = alns.pop() 317 | assert aln.seq1 == b'TAAT--', aln 318 | assert aln.seq2 == b'TAATTC', aln 319 | assert aln.start1 == 0, aln 320 | assert aln.start2 == 0, aln 321 | assert aln.end1 == 4, aln 322 | 
assert aln.end2 == 6, aln 323 | assert aln.n_gaps1 == 1, aln 324 | assert aln.n_gaps2 == 0, aln 325 | assert aln.n_mismatches == 0, aln 326 | assert aln.score == 6.0, aln 327 | 328 | def test_global20(self): 329 | alns = self.f('WR', 'WRR', method=self.method, matrix=BLOSUM62, 330 | max_hits=None) 331 | assert len(alns) == 1, alns 332 | aln = alns.pop() 333 | assert aln.seq1 == b'WR-', aln 334 | assert aln.seq2 == b'WRR', aln 335 | assert aln.start1 == 0, aln 336 | assert aln.start2 == 0, aln 337 | assert aln.end1 == 2, aln 338 | assert aln.end2 == 3, aln 339 | assert aln.n_gaps1 == 1, aln 340 | assert aln.n_gaps2 == 0, aln 341 | assert aln.n_mismatches == 0, aln 342 | assert aln.score == 9.0, aln 343 | 344 | def test_global21(self): 345 | alns = self.f('AIP', 'AP', method=self.method, matrix=BLOSUM62, 346 | max_hits=None) 347 | assert len(alns) == 1, alns 348 | aln = alns.pop() 349 | assert aln.seq1 == b'AIP', aln 350 | assert aln.seq2 == b'A-P', aln 351 | assert aln.start1 == 0, aln 352 | assert aln.start2 == 0, aln 353 | assert aln.end1 == 3, aln 354 | assert aln.end2 == 2, aln 355 | assert aln.n_gaps1 == 0, aln 356 | assert aln.n_gaps2 == 1, aln 357 | assert aln.n_mismatches == 0, aln 358 | assert aln.score == 4.0, aln 359 | 360 | def test_global22(self): 361 | alns = self.f('PAA', 'PA', method=self.method, matrix=BLOSUM62, 362 | max_hits=None) 363 | assert len(alns) == 1, alns 364 | aln = alns.pop() 365 | assert aln.seq1 == b'PAA', aln 366 | assert aln.seq2 == b'PA-', aln 367 | assert aln.start1 == 0, aln 368 | assert aln.start2 == 0, aln 369 | assert aln.end1 == 3, aln 370 | assert aln.end2 == 2, aln 371 | assert aln.n_gaps1 == 0, aln 372 | assert aln.n_gaps2 == 1, aln 373 | assert aln.n_mismatches == 0, aln 374 | assert aln.score == 4.0, aln 375 | 376 | def test_global23(self): 377 | alns = self.f('TAATTC', 'TAAT', method=self.method, matrix=DNAFULL, 378 | gap_open=-10, gap_extend=-1, max_hits=None) 379 | assert len(alns) == 1, alns 380 | aln = alns.pop() 
381 | assert aln.seq1 == b'TAATTC', aln 382 | assert aln.seq2 == b'TAAT--', aln 383 | assert aln.start1 == 0, aln 384 | assert aln.start2 == 0, aln 385 | assert aln.end1 == 6, aln 386 | assert aln.end2 == 4, aln 387 | assert aln.n_gaps1 == 0, aln 388 | assert aln.n_gaps2 == 1, aln 389 | assert aln.n_mismatches == 0, aln 390 | assert aln.score == 9.0, aln 391 | 392 | def test_global24(self): 393 | alns = self.f('CELECANTH', 'PELICAN', method=self.method, 394 | matrix=BLOSUM62, max_hits=None) 395 | assert len(alns) == 1, alns 396 | aln = alns.pop() 397 | assert aln.seq1 == b'CELECANTH', aln 398 | assert aln.seq2 == b'PELICAN--', aln 399 | assert aln.start1 == 0, aln 400 | assert aln.start2 == 0, aln 401 | assert aln.end1 == 9, aln 402 | assert aln.end2 == 7, aln 403 | assert aln.n_gaps1 == 0, aln 404 | assert aln.n_gaps2 == 1, aln 405 | assert aln.n_mismatches == 2, aln 406 | assert aln.score == 8.0, aln 407 | 408 | def test_global25(self): 409 | alns = self.f('PELICAN', 'CELECANTH', method=self.method, 410 | matrix=BLOSUM62, max_hits=None) 411 | assert len(alns) == 1, alns 412 | aln = alns.pop() 413 | assert aln.seq1 == b'PELICAN--', aln 414 | assert aln.seq2 == b'CELECANTH', aln 415 | assert aln.start1 == 0, aln 416 | assert aln.start2 == 0, aln 417 | assert aln.end1 == 7, aln 418 | assert aln.end2 == 9, aln 419 | assert aln.n_gaps1 == 1, aln 420 | assert aln.n_gaps2 == 0, aln 421 | assert aln.n_mismatches == 2, aln 422 | assert aln.score == 8.0, aln 423 | 424 | def test_global26(self): 425 | alns = self.f('AGEBANAN', 'ACEBAN', method=self.method, 426 | matrix=BLOSUM62, gap_open=-2, gap_extend=-1, 427 | max_hits=None) 428 | assert len(alns) == 1, alns 429 | aln = alns.pop() 430 | assert aln.seq1 == b'AGEBANAN', aln 431 | assert aln.seq2 == b'ACEBAN--', aln 432 | assert aln.start1 == 0, aln 433 | assert aln.start2 == 0, aln 434 | assert aln.end1 == 8, aln 435 | assert aln.end2 == 6, aln 436 | assert aln.n_gaps1 == 0, aln 437 | assert aln.n_gaps2 == 1, aln 438 | 
assert aln.n_mismatches == 1, aln 439 | assert aln.score == 17.0, aln 440 | 441 | def test_global27(self): 442 | alns = self.f('AATCAAG', 'AATGAATGAGTCAATG', method=self.method, 443 | matrix=DNAFULL, max_hits=None) 444 | assert len(alns) == 1, alns 445 | aln = alns.pop() 446 | assert aln.seq1 == b'AAT--------CAA-G', aln 447 | assert aln.seq2 == b'AATGAATGAGTCAATG', aln 448 | assert aln.start1 == 0, aln 449 | assert aln.start2 == 0, aln 450 | assert aln.end1 == 7, aln 451 | assert aln.end2 == 16, aln 452 | assert aln.n_gaps1 == 2, aln 453 | assert aln.n_gaps2 == 0, aln 454 | assert aln.n_mismatches == 0, aln 455 | assert aln.score == -28.0, aln 456 | 457 | 458 | class TestGlobalCFEPy(AlignmentTests): 459 | 460 | method = 'global_cfe' 461 | 462 | def test_global_cfe1(self): 463 | alns = self.f('TAAT', 'TAATTC', method=self.method, matrix=DNAFULL, 464 | max_hits=None) 465 | assert len(alns) == 1, alns 466 | aln = alns.pop() 467 | assert aln.seq1 == b'TAAT--', aln 468 | assert aln.seq2 == b'TAATTC', aln 469 | assert aln.start1 == 0, aln 470 | assert aln.start2 == 0, aln 471 | assert aln.end1 == 4, aln 472 | assert aln.end2 == 6, aln 473 | assert aln.n_gaps1 == 1, aln 474 | assert aln.n_gaps2 == 0, aln 475 | assert aln.n_mismatches == 0, aln 476 | assert aln.score == 20.0, aln 477 | 478 | def test_global_cfe2(self): 479 | alns = self.f('TCTAAT', 'TAAT', method=self.method, matrix=DNAFULL, 480 | max_hits=None) 481 | assert len(alns) == 1, alns 482 | aln = alns.pop() 483 | assert aln.seq1 == b'TCTAAT', aln 484 | assert aln.seq2 == b'--TAAT', aln 485 | assert aln.start1 == 0, aln 486 | assert aln.start2 == 0, aln 487 | assert aln.end1 == 6, aln 488 | assert aln.end2 == 4, aln 489 | assert aln.n_gaps1 == 0, aln 490 | assert aln.n_gaps2 == 1, aln 491 | assert aln.n_mismatches == 0, aln 492 | assert aln.score == 20.0, aln 493 | 494 | def test_global_cfe3(self): 495 | alns = self.f('PAA', 'PA', method=self.method, matrix=BLOSUM62, 496 | max_hits=None) 497 | assert len(alns) == 
1, alns 498 | aln = alns.pop() 499 | assert aln.seq1 == b'PAA', aln 500 | assert aln.seq2 == b'PA-', aln 501 | assert aln.start1 == 0, aln 502 | assert aln.start2 == 0, aln 503 | assert aln.end1 == 3, aln 504 | assert aln.end2 == 2, aln 505 | assert aln.n_gaps1 == 0, aln 506 | assert aln.n_gaps2 == 1, aln 507 | assert aln.n_mismatches == 0, aln 508 | assert aln.score == 11.0, aln 509 | 510 | def test_global_cfe4(self): 511 | alns = self.f('AATGAA', 'AATGAATGAA', method=self.method, 512 | matrix=DNAFULL, max_hits=None) 513 | assert len(alns) == 2, alns 514 | aln1 = AlignmentResult( 515 | seq1=b'AATGAA----', seq2=b'AATGAATGAA', start1=0, start2=0, 516 | end1=6, end2=10, n_gaps1=1, n_gaps2=0, 517 | n_mismatches=0, score=30.0) 518 | aln2 = AlignmentResult( 519 | seq1=b'----AATGAA', seq2=b'AATGAATGAA', start1=0, start2=0, 520 | end1=6, end2=10, n_gaps1=1, n_gaps2=0, 521 | n_mismatches=0, score=30.0) 522 | assert aln1 in alns, alns 523 | assert aln2 in alns, alns 524 | 525 | def test_global_cfe5(self): 526 | alns = self.f('AAAAAAAAAAAAACCTGCGCCCCAAAAAAAAAAAAAAAAAAAA', 527 | 'CCTGCGCACCCC', method='global_cfe', matrix=DNAFULL, 528 | max_hits=None) 529 | assert len(alns) == 1, alns 530 | aln = alns.pop() 531 | assert aln.seq1 == b'AAAAAAAAAAAAACCTGCGC-CCCAAAAAAAAAAAAAAAAAAAA', aln 532 | assert aln.seq2 == b'-------------CCTGCGCACCCC-------------------', aln 533 | assert aln.start1 == 0, aln 534 | assert aln.start2 == 0, aln 535 | assert aln.end1 == 43, aln 536 | assert aln.end2 == 12, aln 537 | assert aln.n_gaps1 == 1, aln 538 | assert aln.n_gaps2 == 2, aln 539 | assert aln.n_mismatches == 1, aln 540 | assert aln.score == 39.0, aln 541 | 542 | def test_global_cfe6(self): 543 | alns = self.f('AATCAAG', 'AATGAATGAGTCAATG', method=self.method, 544 | matrix=DNAFULL, max_hits=None) 545 | assert len(alns) == 2, alns 546 | aln1 = AlignmentResult(seq1=b'AATCAA-G--------', 547 | seq2=b'AATGAATGAGTCAATG', 548 | start1=0, start2=0, end1=7, end2=16, 549 | n_gaps1=2, n_gaps2=0, 
n_mismatches=1, 550 | score=19.0) 551 | aln2 = AlignmentResult(seq1=b'--------AATCAA-G', 552 | seq2=b'AATGAATGAGTCAATG', 553 | start1=0, start2=0, end1=7, end2=16, 554 | n_gaps1=2, n_gaps2=0, n_mismatches=1, 555 | score=19.0) 556 | assert aln1 in alns, alns 557 | assert aln2 in alns, alns 558 | 559 | 560 | class TestLocalPy(AlignmentTests): 561 | 562 | method = 'local' 563 | 564 | def test_local1(self): 565 | alns = self.f('TCTAAT', 'TAAT', method=self.method, matrix=DNAFULL, 566 | max_hits=None) 567 | assert len(alns) == 1, alns 568 | aln = alns.pop() 569 | assert aln.seq1 == b'TAAT', aln 570 | assert aln.seq2 == b'TAAT', aln 571 | assert aln.start1 == 2, aln 572 | assert aln.start2 == 0, aln 573 | assert aln.end1 == 6, aln 574 | assert aln.end2 == 4, aln 575 | assert aln.n_gaps1 == 0, aln 576 | assert aln.n_gaps2 == 0, aln 577 | assert aln.n_mismatches == 0, aln 578 | assert aln.score == 20.0, aln 579 | 580 | def test_local2(self): 581 | alns = self.f('TCTAAT', 'TAATCT', method=self.method, matrix=DNAFULL, 582 | max_hits=None) 583 | assert len(alns) == 1, alns 584 | aln = alns.pop() 585 | assert aln.seq1 == b'TAAT', aln 586 | assert aln.seq2 == b'TAAT', aln 587 | assert aln.start1 == 2, aln 588 | assert aln.start2 == 0, aln 589 | assert aln.end1 == 6, aln 590 | assert aln.end2 == 4, aln 591 | assert aln.n_gaps1 == 0, aln 592 | assert aln.n_gaps2 == 0, aln 593 | assert aln.n_mismatches == 0, aln 594 | assert aln.score == 20.0, aln 595 | 596 | def test_local3(self): 597 | alns = self.f('A', 'A', method=self.method, matrix=BLOSUM62) 598 | assert len(alns) == 1, alns 599 | aln = alns.pop() 600 | assert aln.seq1 == b'A', aln 601 | assert aln.seq2 == b'A', aln 602 | assert aln.start1 == 0, aln 603 | assert aln.start2 == 0, aln 604 | assert aln.end1 == 1, aln 605 | assert aln.end2 == 1, aln 606 | assert aln.n_gaps1 == 0, aln 607 | assert aln.n_gaps2 == 0, aln 608 | assert aln.n_mismatches == 0, aln 609 | assert aln.score == 4.0, aln 610 | 611 | def test_local4(self): 
612 | alns = self.f('RA', 'AR', method=self.method, matrix=BLOSUM62, 613 | max_hits=None) 614 | assert len(alns) == 1, alns 615 | aln = alns.pop() 616 | assert aln.seq1 == b'R', aln 617 | assert aln.seq2 == b'R', aln 618 | assert aln.start1 == 0, aln 619 | assert aln.start2 == 1, aln 620 | assert aln.end1 == 1, aln 621 | assert aln.end2 == 2, aln 622 | assert aln.n_gaps1 == 0, aln 623 | assert aln.n_gaps2 == 0, aln 624 | assert aln.n_mismatches == 0, aln 625 | assert aln.score == 5.0, aln 626 | 627 | def test_local5(self): 628 | alns = self.f('RRR', 'RR', method=self.method, matrix=BLOSUM62, 629 | max_hits=None) 630 | assert len(alns) == 2, alns 631 | aln1 = AlignmentResult(seq1=b'RR', seq2=b'RR', start1=0, start2=0, 632 | end1=2, end2=2, n_gaps1=0, n_gaps2=0, 633 | n_mismatches=0, score=10.0) 634 | aln2 = AlignmentResult(seq1=b'RR', seq2=b'RR', start1=1, start2=0, 635 | end1=3, end2=2, n_gaps1=0, n_gaps2=0, 636 | n_mismatches=0, score=10.0) 637 | assert aln1 in alns, alns 638 | assert aln2 in alns, alns 639 | 640 | def test_local6(self): 641 | alns = self.f('PYNCHAN', 'YNCH', method=self.method, matrix=BLOSUM62, 642 | max_hits=None) 643 | assert len(alns) == 1, alns 644 | aln = alns.pop() 645 | assert aln.seq1 == b'YNCH', aln 646 | assert aln.seq2 == b'YNCH', aln 647 | assert aln.start1 == 1, aln 648 | assert aln.start2 == 0, aln 649 | assert aln.end1 == 5, aln 650 | assert aln.end2 == 4, aln 651 | assert aln.n_gaps1 == 0, aln 652 | assert aln.n_gaps2 == 0, aln 653 | assert aln.n_mismatches == 0, aln 654 | assert aln.score == 30.0, aln 655 | 656 | def test_local7(self): 657 | alns = self.f('AIP', 'AP', method=self.method, matrix=BLOSUM62, 658 | max_hits=None) 659 | assert len(alns) == 1, alns 660 | aln = alns.pop() 661 | assert aln.seq1 == b'P', aln 662 | assert aln.seq2 == b'P', aln 663 | assert aln.start1 == 2, aln 664 | assert aln.start2 == 1, aln 665 | assert aln.end1 == 3, aln 666 | assert aln.end2 == 2, aln 667 | assert aln.n_gaps1 == 0, aln 668 | assert 
aln.n_gaps2 == 0, aln 669 | assert aln.n_mismatches == 0, aln 670 | assert aln.score == 7.0, aln 671 | 672 | def test_local8(self): 673 | alns = self.f('PAA', 'PA', method=self.method, matrix=BLOSUM62, 674 | max_hits=None) 675 | assert len(alns) == 1, alns 676 | aln = alns.pop() 677 | assert aln.seq1 == b'PA', aln 678 | assert aln.seq2 == b'PA', aln 679 | assert aln.start1 == 0, aln 680 | assert aln.start2 == 0, aln 681 | assert aln.end1 == 2, aln 682 | assert aln.end2 == 2, aln 683 | assert aln.n_gaps1 == 0, aln 684 | assert aln.n_gaps2 == 0, aln 685 | assert aln.n_mismatches == 0, aln 686 | assert aln.score == 11.0, aln 687 | 688 | def test_local9(self): 689 | alns = self.f('AATCAAG', 'AATGAATGAGTCAATG', method=self.method, 690 | matrix=DNAFULL, max_hits=None) 691 | assert len(alns) == 2, alns 692 | aln1 = AlignmentResult(seq1=b'AATCAA', seq2=b'AATGAA', start1=0, 693 | start2=0, end1=6, end2=6, 694 | n_gaps1=0, n_gaps2=0, n_mismatches=1, 695 | score=21.0) 696 | aln2 = AlignmentResult(seq1=b'AATCAA', seq2=b'AGTCAA', start1=0, 697 | start2=8, end1=6, end2=14, 698 | n_gaps1=0, n_gaps2=0, n_mismatches=1, 699 | score=21.0) 700 | assert aln1 in alns, alns 701 | assert aln2 in alns, alns 702 | 703 | def test_local10(self): 704 | alns = self.f('AT', 'ATCATCATC', method=self.method, 705 | matrix=DNAFULL, max_hits=None) 706 | assert len(alns) == 3, alns 707 | aln1 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=0, 708 | end1=2, end2=2, n_gaps1=0, n_gaps2=0, 709 | n_mismatches=0, score=10.0) 710 | aln2 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=3, 711 | end1=2, end2=5, n_gaps1=0, n_gaps2=0, 712 | n_mismatches=0, score=10.0) 713 | aln3 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=6, 714 | end1=2, end2=8, n_gaps1=0, n_gaps2=0, 715 | n_mismatches=0, score=10.0) 716 | assert aln1 in alns, alns 717 | assert aln2 in alns, alns 718 | assert aln3 in alns, alns 719 | 720 | def test_local11(self): 721 | alns = self.f('AT', 'ATCATCATC', 
method=self.method, 722 | matrix=DNAFULL, max_hits=1) 723 | assert len(alns) == 1, alns 724 | aln = alns.pop() 725 | aln1 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=0, 726 | end1=2, end2=2, n_gaps1=0, n_gaps2=0, 727 | n_mismatches=0, score=10.0) 728 | aln2 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=3, 729 | end1=2, end2=5, n_gaps1=0, n_gaps2=0, 730 | n_mismatches=0, score=10.0) 731 | aln3 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=6, 732 | end1=2, end2=8, n_gaps1=0, n_gaps2=0, 733 | n_mismatches=0, score=10.0) 734 | assert aln == aln1 or aln == aln2 or aln == aln3, alns 735 | 736 | def test_local12(self): 737 | alns = self.f('AT', 'ATCATCATC', method=self.method, 738 | matrix=DNAFULL, max_hits=2) 739 | assert len(alns) == 2, alns 740 | aln1 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=0, 741 | end1=2, end2=2, n_gaps1=0, n_gaps2=0, 742 | n_mismatches=0, score=10.0) 743 | aln2 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=3, 744 | end1=2, end2=5, n_gaps1=0, n_gaps2=0, 745 | n_mismatches=0, score=10.0) 746 | aln3 = AlignmentResult(seq1=b'AT', seq2=b'AT', start1=0, start2=6, 747 | end1=2, end2=8, n_gaps1=0, n_gaps2=0, 748 | n_mismatches=0, score=10.0) 749 | assert len(set([aln1, aln2, aln3]).intersection(alns)) == 2, alns 750 | 751 | 752 | class TestGlocalPy(AlignmentTests): 753 | 754 | method = 'glocal' 755 | 756 | def test_glocal1(self): 757 | alns = self.f('AAATAATAAA', 'TAAT', method=self.method, 758 | matrix=DNAFULL, max_hits=None) 759 | assert len(alns) == 1, alns 760 | aln = alns.pop() 761 | assert aln.seq1 == b'TAAT', aln 762 | assert aln.seq2 == b'TAAT', aln 763 | assert aln.start1 == 3, aln 764 | assert aln.start2 == 0, aln 765 | assert aln.end1 == 7, aln 766 | assert aln.end2 == 4, aln 767 | assert aln.n_gaps1 == 0, aln 768 | assert aln.n_gaps2 == 0, aln 769 | assert aln.n_mismatches == 0, aln 770 | assert aln.score == 20.0, aln 771 | 772 | def test_glocal2(self): 773 | alns = 
self.f('AAATAATAAA', 'TATAT', method=self.method, 774 | matrix=DNAFULL, gap_open=-1, max_hits=None) 775 | assert len(alns) == 1, alns 776 | aln = alns.pop() 777 | assert aln.seq1 == b'TA-AT', aln 778 | assert aln.seq2 == b'TATAT', aln 779 | assert aln.start1 == 3, aln 780 | assert aln.start2 == 0, aln 781 | assert aln.end1 == 7, aln 782 | assert aln.end2 == 5, aln 783 | assert aln.n_gaps1 == 1, aln 784 | assert aln.n_gaps2 == 0, aln 785 | assert aln.n_mismatches == 0, aln 786 | assert aln.score == 19.0, aln 787 | 788 | def test_glocal3(self): 789 | alns = self.f('TATATAAA', 'CCTATAT', method=self.method, 790 | matrix=DNAFULL, gap_open=-8, gap_double=-8, 791 | max_hits=None) 792 | assert len(alns) == 1, alns 793 | aln = alns.pop() 794 | assert aln.seq1 == b'--TATAT', aln 795 | assert aln.seq2 == b'CCTATAT', aln 796 | assert aln.start1 == 0, aln 797 | assert aln.start2 == 0, aln 798 | assert aln.end1 == 5, aln 799 | assert aln.end2 == 7, aln 800 | assert aln.n_gaps1 == 1, aln 801 | assert aln.n_gaps2 == 0, aln 802 | assert aln.n_mismatches == 0, aln 803 | assert aln.score == 10.0, aln 804 | 805 | def test_glocal4(self): 806 | alns = self.f('CCTATAT', 'TATATAAA', method=self.method, 807 | matrix=DNAFULL, gap_open=-8, gap_double=-8, 808 | max_hits=None) 809 | assert len(alns) == 1, alns 810 | aln = alns.pop() 811 | assert aln.seq1 == b'CCTATAT', aln 812 | assert aln.seq2 == b'--TATAT', aln 813 | assert aln.start1 == 0, aln 814 | assert aln.start2 == 0, aln 815 | assert aln.end1 == 7, aln 816 | assert aln.end2 == 5, aln 817 | assert aln.n_gaps1 == 0, aln 818 | assert aln.n_gaps2 == 1, aln 819 | assert aln.n_mismatches == 0, aln 820 | assert aln.score == 10.0, aln 821 | 822 | def test_glocal5(self): 823 | alns = self.f('AATCAAG', 'AATGAATGAGTCAATG', method=self.method, 824 | matrix=DNAFULL, max_hits=None) 825 | assert len(alns) == 2, alns 826 | aln1 = AlignmentResult(seq1=b'AATCAA-G', seq2=b'AATGAATG', start1=0, 827 | start2=0, end1=7, end2=8, 828 | n_gaps1=1, n_gaps2=0, 
n_mismatches=1, 829 | score=19.0) 830 | aln2 = AlignmentResult(seq1=b'AATCAA-G', seq2=b'AGTCAATG', start1=0, 831 | start2=8, end1=7, end2=16, 832 | n_gaps1=1, n_gaps2=0, n_mismatches=1, 833 | score=19.0) 834 | assert aln1 in alns, alns 835 | assert aln2 in alns, alns 836 | 837 | 838 | class TestGlobalC(TestGlobalPy): 839 | 840 | @property 841 | def f(self): 842 | return caligner 843 | 844 | 845 | class TestLocalC(TestLocalPy): 846 | 847 | @property 848 | def f(self): 849 | return caligner 850 | 851 | 852 | class TestGlocalC(TestGlocalPy): 853 | 854 | @property 855 | def f(self): 856 | return caligner 857 | 858 | 859 | class TestGlobalCFEC(TestGlobalCFEPy): 860 | 861 | @property 862 | def f(self): 863 | return caligner 864 | 865 | 866 | if __name__ == '__main__': 867 | unittest.main() 868 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | nose 3 | tox 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | cython 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | with-doctest=1 3 | doctest-extension=.rst,pyx 4 | nocapture=1 5 | detailed-errors=1 6 | with-coverage=1 7 | cover-package=align 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from setuptools import setup 4 | from distutils.extension import Extension 5 | 6 | version = '0.0.2' 7 | 8 | with open('README.rst') as src: 9 | doc = src.read() 10 | 11 | with open('HISTORY.rst') as src: 12 | history = 
src.read().replace('.. :changelog:', '').strip() 13 | 14 | with open('requirements.txt') as src: 15 | requirements = [line.strip() for line in src] 16 | 17 | with open('requirements-dev.txt') as src: 18 | test_requirements = [line.strip() for line in src] 19 | 20 | # Get numpy dependency from requirements 21 | numpy_req, = [req for req in requirements if 22 | req.lower().startswith("numpy")] 23 | 24 | # Attempt to import ~ this should raise an error when doing setup_requires 25 | # (preinstall) but should be ok during actual install 26 | try: 27 | import numpy 28 | np_includes = [numpy.get_include()] 29 | except ImportError: 30 | np_includes = [] 31 | 32 | # Similar logic to numpy import for Cython 33 | 34 | cython_req, = [req for req in requirements 35 | if req.lower().startswith("cython")] 36 | 37 | 38 | setup(name='align', 39 | version=version, 40 | description='polite, proper sequence alignment', 41 | long_description=doc + '\n\n' + history, 42 | keywords='sequence bioinformatics alignment text', 43 | url='http://github.com/brentp/align/', 44 | author='brentp', 45 | author_email='bpederse@gmail.com', 46 | license='BSD', 47 | test_suite='nose.collector', 48 | include_package_data=True, 49 | zip_safe=False, 50 | packages=['align'], 51 | package_dir={'align': 'align'}, 52 | package_data={'align': ['data/*']}, 53 | setup_requires=["setuptools>=18.0", numpy_req, cython_req], 54 | install_requires=requirements, 55 | tests_require=test_requirements, 56 | ext_modules=[ 57 | Extension('align.calign', 58 | ['align/calign.pyx'], 59 | include_dirs=np_includes)], 60 | classifiers=[ 61 | 'Development Status :: 3 - Alpha', 62 | 'Intended Audience :: Developers', 63 | 'Intended Audience :: Science/Research', 64 | 'License :: OSI Approved :: BSD License', 65 | 'Operating System :: OS Independent', 66 | 'Programming Language :: Python', 67 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 68 | 'Topic :: Scientific/Engineering', 69 | 'Topic :: Text Processing' 70 | ]) 71 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py35 3 | 4 | [testenv] 5 | deps = 6 | numpy 7 | cython 8 | -rrequirements.txt 9 | -rrequirements-dev.txt 10 | commands = 11 | nosetests -v --with-coverage align/tests 12 | --------------------------------------------------------------------------------