├── .gitignore ├── LICENSE.txt ├── README.rst ├── alfpy ├── __init__.py ├── bbc.py ├── fcgr.py ├── graphdna.py ├── lempelziv.py ├── ncd.py ├── utils │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── seqcontent.py │ │ └── subsmat.py │ ├── distance.py │ ├── distmatrix.py │ ├── fasta.py │ └── seqrecords.py ├── version.py ├── wmetric.py ├── word_bool_distance.py ├── word_d2.py ├── word_distance.py ├── word_pattern.py ├── word_rtd.py ├── word_sets_distance.py └── word_vector.py ├── bin ├── calc_bbc.py ├── calc_fcgr.py ├── calc_graphdna.py ├── calc_lempelziv.py ├── calc_ncd.py ├── calc_wmetric.py ├── calc_word.py ├── calc_word_bool.py ├── calc_word_cv.py ├── calc_word_d2.py ├── calc_word_ffp.py ├── calc_word_rtd.py ├── calc_word_sets.py └── create_wordpattern.py ├── example_data ├── input │ ├── aminoacid.freqs.swissprot.txt │ ├── aminoacid.weights.txt │ ├── bears.dna.fasta │ ├── gp120.pep.fasta │ ├── hiv.pep.fasta │ ├── sample.dna.fasta │ └── sample.pep.fasta └── output │ ├── bears.dna.fasta.1mer │ ├── bears.dna.fasta.2mer │ ├── bears.dna.fasta.3mer │ ├── bears.dna.fasta.pairwise │ ├── bears.dna.fasta.phylip │ ├── bears.dna.fasta.teiresias.2mer │ ├── bears.dna.fasta.teiresias.3mer │ ├── gp120.pep.fasta.1mer │ ├── gp120.pep.fasta.2mer │ ├── gp120.pep.fasta.3mer │ ├── gp120.pep.fasta.pairwise │ ├── gp120.pep.fasta.phylip │ ├── gp120.pep.fasta.teiresias.2mer │ ├── gp120.pep.fasta.teiresias.3mer │ ├── hiv.pep.fasta.1mer │ ├── hiv.pep.fasta.2mer │ ├── hiv.pep.fasta.3mer │ ├── hiv.pep.fasta.pairwise │ ├── hiv.pep.fasta.phylip │ ├── hiv.pep.fasta.teiresias.2mer │ └── hiv.pep.fasta.teiresias.3mer ├── setup.py └── tests ├── __init__.py ├── data ├── char_freqs.txt ├── char_weights.txt ├── dna.fa ├── dna.fa.1mer.txt ├── dna.fa.1mer.wordpos.txt ├── dna.fa.2mer.txt ├── dna.fa.2mer.wordpos.txt ├── pep.fa ├── pep.fa.1mer.txt ├── pep.fa.1mer.wordpos.txt ├── pep.fa.2mer.txt ├── pep.fa.2mer.wordpos.txt ├── pep.fa.3mer.txt └── pep.fa.3mer.wordpos.txt ├── test_bbc.py ├── 
test_calc_bbc.py ├── test_calc_fcgr.py ├── test_calc_graphdna.py ├── test_calc_lempelziv.py ├── test_calc_ncd.py ├── test_calc_wmetric.py ├── test_calc_word.py ├── test_calc_word_bool.py ├── test_calc_word_cv.py ├── test_calc_word_d2.py ├── test_calc_word_ffp.py ├── test_calc_word_rtd.py ├── test_calc_word_sets.py ├── test_create_wordpattern.py ├── test_distance.py ├── test_distmatrix.py ├── test_fasta.py ├── test_fcgr.py ├── test_graphdna.py ├── test_lempelziv.py ├── test_ncd.py ├── test_seqrecords.py ├── test_wmetric.py ├── test_word_bool_distance.py ├── test_word_d2.py ├── test_word_distance.py ├── test_word_pattern.py ├── test_word_rtd.py ├── test_word_sets_distance.py ├── test_word_vector.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # IPython Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # My 95 | test.py 96 | 97 | 98 | # cache files for sublime text 99 | *.tmlanguage.cache 100 | *.tmPreferences.cache 101 | *.stTheme.cache 102 | 103 | # workspace files are user-specific 104 | *.sublime-workspace 105 | 106 | # project files should be checked into the repository, unless a significant 107 | # proportion of contributors will probably not be using SublimeText 108 | # *.sublime-project 109 | 110 | # sftp configuration file 111 | sftp-config.json 112 | 113 | # Package control specific files 114 | Package Control.last-run 115 | Package Control.ca-list 116 | Package Control.ca-bundle 117 | Package Control.system-ca-bundle 118 | Package Control.cache/ 119 | Package Control.ca-certs/ 120 | bh_unicode_properties.cache 121 | 122 | # Sublime-github package stores a github token in this file 123 | # https://packagecontrol.io/packages/sublime-github 124 | GitHub.sublime-settings 125 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 
1 | The MIT License 2 | 3 | Copyright (c) 2016 Andrzej Zielezinski, combio.pl, http://combio.pl/alfree 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | alfpy 2 | ===== 3 | 4 | alfpy is a bioinformatics Python package that provides an alignment-free framework 5 | to compare biological sequences (DNA/RNA/protein) and infer their 6 | phylogenetic relationships. 7 | 8 | alfpy also contains Python scripts with user-friendly command-line interfaces 9 | that let you compare unaligned FASTA sequences with more than 40 distance methods. 
10 | 11 | 12 | Latest source code 13 | ------------------ 14 | The official source code repository is at: https://github.com/aziele/alfpy 15 | 16 | 17 | Web sites 18 | --------- 19 | alfpy is also available as a web app: http://www.combio.pl/alfree 20 | 21 | 22 | Requirements 23 | ============ 24 | 25 | 1. Python (https://www.python.org/) version 2.7 or >= 3.3 26 | 2. NumPy (http://www.numpy.org/). 27 | 28 | 29 | Installation 30 | ============ 31 | 32 | Option 1: Get the latest official version 33 | ----------------------------------------- 34 | 35 | Install the latest official version with `pip `_ 36 | :: 37 | 38 | sudo pip install alfpy 39 | 40 | If you are not allowed to use `sudo`, install alfpy as user:: 41 | 42 | pip install --user alfpy 43 | 44 | 45 | 46 | Option 2: Get the latest development version 47 | -------------------------------------------- 48 | 49 | Get it using this shell command, which requires Git:: 50 | 51 | git clone https://github.com/aziele/alfpy.git 52 | 53 | If you don't feel like using git, just download the package manually as a `gzipped tarball `_. 54 | 55 | Unpack the downloaded archive, go to the directory and run the installation:: 56 | 57 | cd alfpy 58 | python setup.py install 59 | 60 | or:: 61 | 62 | python setup.py install --user 63 | 64 | Alfpy usage 65 | =========== 66 | 67 | The examples of using Alfpy are available at: http://www.combio.pl/alfree/download/. 68 | 69 | 70 | Testing 71 | ======= 72 | 73 | To run tests, go to the alfpy source code directory and type:: 74 | 75 | python -m unittest discover 76 | 77 | 78 | If you want to test a specific file (e.g. ``test_word_distance.py``), type:: 79 | 80 | python -m unittest tests.test_word_distance 81 | 82 | 83 | Contact 84 | ======= 85 | 86 | Drop us any feedback at: bioinfo@amu.edu.pl or on twitter `@a_zielezinski `_. 87 | 88 | License 89 | ======= 90 | 91 | alfpy is under the MIT license; see ``LICENSE.txt``. 
def base_base_correlation(seq, k, alphabet=None):
    """Compute the base base correlation (BBC) vector for a sequence.

    Args:
        seq (str) : sequence
        k (int) : parameter of the BBC. Intuitively, it represents
            the maximum distance to observe correlation between bases.
            Gaps l = 1 .. k + 1 are accumulated.
        alphabet (str/list) : List of possible characters. This can be used to
            avoid autodetection of the alphabet in the case where
            sequences with missing letters are to be compared.
            Characters of `seq` that are not in `alphabet` are removed
            before anything is counted; repeated characters in `alphabet`
            are ignored.

    Returns:
        numpy.ndarray: shape (1, L * L) where L is the alphabet size,
            i.e. (1, 16) for DNA and (1, 400) for protein.

    Raises:
        Exception: if the sequence has fewer than k + 2 characters, either
            as given or after removing characters not in `alphabet`.

    Examples:
        >>> print(base_base_correlation('ATGCATGC', 1, 'ATGC'))
        [[
        -0.12547302 -0.12547302  0.2281059   0.17169665  0.01815213
        -0.12547302 -0.12547302  0.04258163  0.04258163  0.17169665
        -0.12547302 -0.12547302 -0.12547302  0.2281059   0.17169665
        -0.12547302
        ]]

    Note:
        A description of the method can be found here:
        http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=4272582

        This implementation is generalized for any sequence type.
    """
    too_short = "Sequence too short to compute BBC with k={}".format(k)

    if k > len(seq) - 2:
        raise Exception(too_short)

    if alphabet is None:
        s = seq
        symbols = sorted(set(seq))
    else:
        # De-duplicate so that the index map and L always agree.
        symbols = sorted(set(alphabet))
        allowed = set(symbols)
        s = "".join([c for c in seq if c in allowed])
        # Filtering may have shortened the sequence; without this check the
        # pair counts for large gaps are empty and the result is all NaN.
        if k > len(s) - 2:
            raise Exception(too_short)

    index = dict(zip(symbols, range(len(symbols))))
    L = len(symbols)
    codes = np.array([index[c] for c in s], dtype=np.intp)

    # Compute the base probabilities for every character.
    p = np.zeros(L)
    np.add.at(p, codes, 1)
    p /= np.sum(p)
    p.shape = (1, L)

    # Both products depend only on p, so compute them once.
    independent = np.dot(p.T, p)
    weight = np.dot(p.T ** 2, p ** 2)

    bbc = np.zeros((L, L))
    for l in range(1, k + 2):
        # Compute $p_{ij}(l)$ representing the probability of
        # observing the bases i and j separated by l "gaps".
        l_dist_correlations = np.zeros((L, L))
        np.add.at(l_dist_correlations, (codes[:-l], codes[l:]), 1)
        l_dist_correlations /= np.sum(l_dist_correlations)

        # Compute the D_{ij}(l) which is the deviation from
        # statistical independence.
        # $D_{ij}(l) = p_{ij}(l) - p_i p_j$
        D = l_dist_correlations - independent

        # NOTE(review): `D ** 2 / 2 * weight` evaluates as
        # `(D ** 2 / 2) * weight`. Kept unchanged because the documented
        # example values depend on it -- confirm against the reference
        # formula before altering.
        bbc += D + (D ** 2 / 2 * weight) + D ** 3

    # Flatten the bbc into an L * L feature vector.
    bbc.shape = (1, L * L)
    return bbc
def fcgr_vector(dnaseq, word_size):
    """Create a FCGR vector representing a DNA sequence.

    The walk starts at (0.5, 0.5) and, for every nucleotide, moves halfway
    towards that nucleotide's corner of the unit square: A=(0, 1),
    T=(1, 1), G=(1, 0), C=(0, 0). Any other symbol puts the walk at (0, 0).
    The square is split into a 2^word_size x 2^word_size grid and every
    visited point, including the start point, is counted in its cell.

    Args:
        dnaseq (str/list): dna sequence
        word_size (int): word size (>= 1)

    Returns:
        list of float: cell counts in row-major order with the first cell
            removed, so the length equals 4^word_size - 1.

    Examples:
        >>> s = 'ATGCTGATGGATG'
        >>> print(fcgr_vector(s, 1))
        [5.0, 3.0, 5.0]

        >>> print(fcgr_vector(s, 2))
        [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 4.0, 0.0, 2.0, 2.0, 0.0, 0.0, 1.0, 3.0, 0.0]

    """
    side = pow(2, word_size)
    last = side - 1
    cell = 1.0 / side
    corners = {
        'A': (0.0, 1.0),
        'T': (1.0, 1.0),
        'G': (1.0, 0.0),
        'C': (0.0, 0.0),
    }
    vectors = [0.0] * (side * side)

    def count(x, y):
        # A coordinate can become exactly 1.0 (long T/G runs round up in
        # floating point), which would index one cell past the grid edge.
        # Clamp both axes to the last row/column.
        xx = min(int(x / cell), last)
        yy = min(int(y / cell), last)
        vectors[yy * side + xx] += 1

    x = y = 0.5
    count(x, y)
    for base in dnaseq:
        corner = corners.get(base)
        if corner is None:
            x = y = 0.0
        else:
            x = 0.5 * (x + corner[0])
            y = 0.5 * (y + corner[1])
        count(x, y)

    # The first cell is dropped; callers size their arrays accordingly.
    vectors.pop(0)
    return vectors
def complexity(s):
    """Compress string and return the size of the compression.

    Args:
        s (str): sequence

    Returns:
        float: number of bytes of the zlib-compressed, UTF-8 encoded string
    """
    encoded = s.encode("utf-8")  # Python 3 fix.
    return float(len(zlib.compress(encoded)))


class Distance(object):
    """Normalized compression distance between sequence records.

    Attributes:
        seq_records: object providing `seq_list` (list of sequences) and
            `count` (number of sequences)
        numseqs (int): number of sequences
        _complexity (dict): compressed sizes keyed by `(i,)` for single
            sequences and by `(i, j)` with i < j for sequence i
            concatenated with sequence j
    """

    def __init__(self, seq_records):
        self.seq_records = seq_records
        self.numseqs = seq_records.count
        # Precomputed complexity for input sequences
        # as well as all pairwise concatenated sequences.
        self._complexity = self.__precompute_complexity()

    def __precompute_complexity(self):
        """Return the dict of compressed sizes described on the class."""
        d = {}
        seqs = self.seq_records.seq_list
        # Complexity for single input sequences.
        for seqidx, seq in enumerate(seqs):
            d[(seqidx,)] = complexity(seq)
        # Complexity for pairwise concatenated sequences.
        for i, j in itertools.combinations(range(self.numseqs), 2):
            d[(i, j)] = complexity(seqs[i] + seqs[j])
        return d

    def pairwise_distance(self, seq1idx, seq2idx):
        r"""Compute NCD between two sequences.

        Formula:
            NCD_Z(x,y) = \frac{Z(xy) - \min \{Z(x),Z(y)\}}{\max \{Z(x),Z(y)\}}.

        where:
            Z(x) is the binary length of the sequence `x` compressed
            with compressor Z

        The concatenation is only precomputed with the lower index first,
        so the indices may be given in either order and yield the same
        value.

        Raises:
            KeyError: if an index is out of range or both indices are equal
                (a sequence is not paired with itself).
        """
        zx = self._complexity[(seq1idx,)]
        zy = self._complexity[(seq2idx,)]
        if seq1idx < seq2idx:
            pair = (seq1idx, seq2idx)
        else:
            pair = (seq2idx, seq1idx)
        zxy = self._complexity[pair]
        return (zxy - min([zx, zy])) / max([zx, zy])
| 'A': 0.25, 37 | 'C': 0.25, 38 | 'G': 0.25, 39 | 'U': 0.25 40 | } 41 | } 42 | 43 | WEIGHTS = { 44 | 'protein': { 45 | 'A': 1.2106537530266344, 46 | 'C': 7.299270072992702, 47 | 'E': 1.4836795252225519, 48 | 'D': 1.8315018315018312, 49 | 'G': 1.4124293785310733, 50 | 'F': 2.590673575129534, 51 | 'I': 1.6835016835016834, 52 | 'H': 4.405286343612334, 53 | 'K': 1.7152658662092626, 54 | 'M': 4.149377593360996, 55 | 'L': 1.0362694300518134, 56 | 'N': 2.4630541871921183, 57 | 'Q': 2.5445292620865136, 58 | 'P': 2.123142250530785, 59 | 'S': 1.5174506828528072, 60 | 'R': 1.8083182640144662, 61 | 'T': 1.8726591760299625, 62 | 'W': 9.174311926605505, 63 | 'V': 1.4556040756914121, 64 | 'Y': 3.4246575342465753 65 | }, 66 | 'dna': { 67 | 'A': 1, 68 | 'C': 1, 69 | 'G': 1, 70 | 'T': 1 71 | }, 72 | 'rna': { 73 | 'A': 1, 74 | 'C': 1, 75 | 'G': 1, 76 | 'U': 1 77 | } 78 | 79 | } 80 | 81 | 82 | ALPHABET = { 83 | 'dna': 'ATGC', 84 | 'protein': 'ACDEFGHIKLMNPQRSTVWY' 85 | } 86 | 87 | REDUCED_ALPHABET = { 88 | 'dna': { 89 | 'A': 'R', 90 | 'G': 'R', 91 | 'T': 'Y', 92 | 'C': 'Y' 93 | }, 94 | 'protein': { 95 | 'T': 'S', 96 | 'E': 'D', 97 | 'Q': 'K', 98 | 'R': 'K', 99 | 'V': 'I', 100 | 'L': 'I', 101 | 'M': 'I', 102 | 'W': 'F', 103 | 'Y': 'F' 104 | } 105 | } 106 | 107 | 108 | def get_alphabet(mol): 109 | return ALPHABET[mol] 110 | 111 | 112 | def get_freqs(mol): 113 | return FREQS[mol] 114 | 115 | 116 | def get_weights(mol): 117 | return WEIGHTS[mol] 118 | 119 | 120 | def get_reduced_alphabet(mol): 121 | return REDUCED_ALPHABET[mol] 122 | -------------------------------------------------------------------------------- /alfpy/utils/distance.py: -------------------------------------------------------------------------------- 1 | """This module contains a `Distance` class that combines vector 2 | with distance function. 
class Distance(object):
    """Combine sequences-representing 2-D array of vectors
    with a distance function.

    Attributes:
        _vector (ndarray): one row per sequence
        _disttype (str): name of the currently selected distance method
        pairwise_distance (func): the selected `pwdist_*` method, called as
            `pairwise_distance(seq1idx, seq2idx)`

    """

    def __getitem__(self, seqnum):
        """Return the vector (row of `_vector`) for sequence `seqnum`."""
        return self._vector[seqnum]

    @classmethod
    def get_disttypes(cls):
        """Return a list of available distance function names.

        Only `pwdist_*` methods defined directly on `cls` are listed;
        methods inherited from a parent class are not included.

        Returns:
            list of strings (sorted)
        """
        prefix = 'pwdist_'
        return sorted(name[len(prefix):] for name in cls.__dict__
                      if name.startswith(prefix))

    def set_disttype(self, disttype):
        """Select the distance method used by `pairwise_distance`.

        Args:
            disttype (str): method name without the `pwdist_` prefix

        Raises:
            ValueError: if no `pwdist_<disttype>` method exists.
        """
        try:
            pwdist_func = getattr(self, 'pwdist_{}'.format(disttype))
        # Method does not exist.
        except AttributeError:
            msg = 'unknown disttype "{}"'.format(disttype)
            raise ValueError(msg)
        self.pairwise_distance = pwdist_func
        # Keep the recorded name in step with the selected method.
        self._disttype = disttype

    def __init__(self, vector, disttype):
        """Create instance of Distance.

        Args:
            vector (ndarray)
            disttype (str)

        Raises:
            ValueError: if `disttype` is not a known distance method.

        Examples:
            >>> vector
            [[ 3.  6.  4.  1.  3.  4.  3.  0.  1.  1.  6.  4.  5.  0.  3.  4.]
             [ 0.  3.  0.  3.  0.  0.  0.  2.  9.  0.  3.  3.  0.  6.  3.  6.]
             [ 9.  0.  0.  3.  0.  0.  0.  2.  6.  0.  3.  3.  0.  3.  3.  3.]]
            >>> disttype = 'euclid_norm'
            >>> dist = Distance(vector, disttype)

        """
        self._vector = vector
        self.set_disttype(disttype)

    def pwdist_euclid_squared(self, seq1idx, seq2idx):
        """Squared Euclidean distance

        References:
            1. Blaisdell BE (1986) Proc Natl Acad Sci U S A 83: 5155-5159.
               doi: 10.1073/pnas.83.14.5155

        """
        diff = self[seq1idx] - self[seq2idx]
        return np.sum(diff ** 2)

    def pwdist_euclid_norm(self, seq1idx, seq2idx):
        """Euclidean distance

        References:
            1. Vinga & Almeida (2003) Bioinformatics 19(4): 513-523.
               doi: 10.1093/bioinformatics/btg005
            2. http://web.ist.utl.pt/susanavinga/NASC/

        """
        return math.sqrt(self.pwdist_euclid_squared(seq1idx, seq2idx))

    def pwdist_google(self, seq1idx, seq2idx):
        """Normalized Google Distance (NGD).

        The maximum values for NGD is 1.0, which means two sequences are
        totally not similar to each other, and the minimum values for
        NGD is 0.0. Therefore, the similarity of the two sequences can be
        obtained by NGS = 1 - NGD. Two sequences are treated as two different
        web pages and the each word frequency represents terms found in each
        webpage.

        Two vectors that both sum to zero are reported as distance 0.0
        instead of dividing by zero.

        References:
            1. Lee & Rashid (2008) Information Technology, ITSim 2008.
               doi:10.1109/ITSIM.2008.4631601

        """
        v1 = self[seq1idx]
        v2 = self[seq2idx]

        sumwx = float(np.sum(v1))
        sumwy = float(np.sum(v2))

        summin = float(np.sum(np.minimum(v1, v2)))

        denominator = (sumwx + sumwy) - min([sumwx, sumwy])
        if denominator == 0:
            return 0.0
        return (max([sumwx, sumwy]) - summin) / denominator
def read_highcharts_matrix(id_list, data):
    """Build a symmetric distance matrix from Highcharts-style entries.

    Args:
        id_list (list): list of sequence identifiers
        data (list of 4-element tuples): one entry per sequence pair; the
            first two elements are the row and column indices and the
            fourth element is the value stored in the matrix (the third
            element is not used),
            e.g. [[0, 1, 0.35, 0.19], [0, 2, 1.0, 0.55], [1, 2, 0.88, 0.48]]

    Returns:
        Matrix object
    """
    n = len(id_list)
    values = np.zeros([n, n])
    for row, col, _unused, raw in data:
        # Mirror every entry so the matrix stays symmetric.
        values[row][col] = raw
        values[col][row] = raw
    return Matrix(id_list, values)
]] 81 | >>> matrix = Matrix(id_list, data) 82 | 83 | """ 84 | self.id_list = id_list 85 | self.data = data 86 | 87 | def normalize(self): 88 | """Normalize distance values to 0-1 range.""" 89 | self.data /= self.max() 90 | 91 | def __iter__(self): 92 | """Iterate over a distance matrix.""" 93 | size = self.data.shape[0] 94 | for i, j in itertools.combinations(range(size), 2): 95 | yield i, j, self.id_list[i], self.id_list[j], self.data[i][j] 96 | 97 | def writer(self, handle, f, decimal_places): 98 | """Return a distance matrix as a string in `phylip` or `pairwise` 99 | formats. 100 | 101 | Args: 102 | handle : output file / sys.stdout 103 | f (str): phylip / pairwise 104 | decimal_places (int): round distance value to decimal places 105 | 106 | """ 107 | if f == 'phylip': 108 | handle.write(" {0}\n".format(len(self.id_list))) 109 | for i, line in enumerate(self.data): 110 | # PHYLIP requires that each sequence identifier 111 | # is maximum 10 characters long. 112 | seqid = self.id_list[i][:10] 113 | l = ['{0:.{1}f}'.format(line[i], decimal_places) 114 | for i in range(0, len(line))] 115 | l.insert(0, '{0: <10}'.format(seqid)) 116 | handle.write(" ".join(l) + "\n") 117 | elif f == 'pairwise': 118 | for _, _, seqid1, seqid2, distval in self: 119 | handle.write("{0}\t{1}\t{2:.{3}f}\n".format(seqid1, seqid2, 120 | distval, 121 | decimal_places)) 122 | 123 | def display(self, f="phylip", decimal_places=7): 124 | """Write a distance matrix to the screen.""" 125 | return self.writer(sys.stdout, f, decimal_places) 126 | 127 | def write_to_file(self, handle, f="phylip", decimal_places=7): 128 | """Write a distance matrix to a file.""" 129 | return self.writer(handle, f, decimal_places) 130 | 131 | def highcharts(self): 132 | """Return a distance matrix as a list in the Highcharts format.""" 133 | data = [] 134 | maxval = self.max() 135 | for i, j, _, _, distval in self: 136 | data.append([i, j, distval / maxval, distval]) 137 | return data 138 | 139 | def format(self, 
decimal_places=7): 140 | lines = [" {0}".format(len(self.id_list))] 141 | for i, line in enumerate(self.data): 142 | seqid = self.id_list[i][:10] 143 | l = ['{0:.{1}f}'.format(line[i], decimal_places) 144 | for i in range(0, len(line))] 145 | l.insert(0, '{0: <10}'.format(seqid)) 146 | lines.append("\n" + " ".join(l)) 147 | return "".join(lines) 148 | 149 | def min(self): 150 | """Return minimum distance value in matrix""" 151 | return np.amin(self.data) 152 | 153 | def max(self): 154 | """Return maximum distance value in matrix""" 155 | return np.amax(self.data) 156 | 157 | def is_zero(self): 158 | """Return True if matrix contains only zeros""" 159 | return not np.count_nonzero(self.data) 160 | 161 | def __repr__(self): 162 | return str(self.data) 163 | 164 | 165 | 166 | if __name__ == '__main__': 167 | id_list = ['seq1', 'seq2', 'seq3'] 168 | l = [[0, 0.3531587, 0.35509333], 169 | [0.3531587, 0, 0.295394], 170 | [0.35509333, 0.295394, 0.] 171 | ] 172 | data = np.array(l) 173 | matrix = Matrix(id_list, data) 174 | print(matrix.format()) 175 | print(matrix.highcharts()) 176 | -------------------------------------------------------------------------------- /alfpy/utils/fasta.py: -------------------------------------------------------------------------------- 1 | """Reading and writing FASTA format files""" 2 | 3 | from itertools import groupby 4 | 5 | 6 | class FastaRecord(): 7 | """Object representing a Fasta (aka Pearson) record. 8 | 9 | Attributes: 10 | seq (str) : Sequence 11 | id (str) : Sequence identifier 12 | description (str) : Sequence description 13 | """ 14 | 15 | def __init__(self, seq, seqid, description=False): 16 | """Create a FastaRecord. 17 | 18 | Example: 19 | >>> import Fasta 20 | >>> record = FastaRecord(seq='MRELEAKAT', 21 | ... seqid='NP_055309.2', 22 | ... 
def __str__(self):
    """Render the record as FASTA text with 70-character sequence lines.

    Example:
        >>> record = FastaRecord(seq='MRELEAKAT',
        ...                      seqid='NP_055309.2',
        ...                      description='TNRC6A')
        >>> print(record)
        >NP_055309.2 TNRC6A
        MRELEAKAT
    """
    fasta_text = self.format(wrap=70)
    return fasta_text
def parse(handle):
    """Iterate over the FASTA records in `handle` as FastaRecord objects.

    Args:
        handle: iterable of text lines, e.g. an open file or a list of
            strings.

    Yields:
        FastaRecord: one record per header line (a line starting with
        ``>``), in input order. The first whitespace-delimited token of
        the header becomes the record id and the remainder its
        description; the following lines, stripped and joined, become
        the sequence.

    A header that is not followed by any sequence line yields a record
    with an empty sequence. Lines preceding the first header are ignored.
    """
    def build(header, seq_parts):
        # Split "id description ..." into its two components.
        fields = header.split(None, 1)
        seqid = fields[0] if fields else ""
        desc = fields[1].strip() if len(fields) > 1 else ""
        return FastaRecord("".join(seq_parts), seqid, description=desc)

    # A single pass with explicit state is used instead of pairing
    # itertools.groupby groups: pairing merged consecutive headers into
    # one group and called next() on an exhausted iterator when the input
    # ended with a header (a RuntimeError inside a generator).
    header = None
    seq_parts = []
    for line in handle:
        if line.startswith(">"):
            if header is not None:
                yield build(header, seq_parts)
            header = line[1:].strip()
            seq_parts = []
        elif header is not None:
            seq_parts.append(line.strip())
    if header is not None:
        yield build(header, seq_parts)
class SeqRecords:
    """Object representing an ordered collection of sequence records.

    Attributes:
        id_list (list) : List of sequence record identifiers
        seq_list (list) : List of sequence strings (stored uppercased)
        count (int) : Number of sequence records

    """

    def __init__(self, id_list=None, seq_list=None):
        """Create a collection (may be empty) of sequence records.

        Args:
            id_list (list) : sequence identifiers (optional)
            seq_list (list) : sequence strings, parallel to `id_list`

        Raises:
            ValueError: if the two lists differ in length.

        Example:
            >>> ids = ['seq1', 'seq2']
            >>> seqs = ['ATGCTG', 'TGCTGATAGTA']
            >>> seq_records = SeqRecords(id_list=ids, seq_list=seqs)
            >>> print(seq_records)
            SeqRecords (noseqs: 2)

        """
        # Copy the identifiers so that add() never mutates the caller's list.
        ids = list(id_list) if id_list else []
        # Make all sequences uppercased.
        seqs = [s.upper() for s in seq_list] if seq_list else []
        if len(ids) != len(seqs):
            raise ValueError(
                "id_list and seq_list differ in length "
                "({0} != {1})".format(len(ids), len(seqs)))
        self.id_list = ids
        self.seq_list = seqs
        self.count = len(seqs)

    def add(self, seqid, seq):
        """Add a sequence record to the existing collection.

        Args:
            seqid (str) : sequence identifier
            seq (str) : sequence string (stored uppercased)

        Example:
            >>> seq_records.add("seq3", "TGCTGA")
        """
        self.id_list.append(seqid)
        self.seq_list.append(seq.upper())
        self.count += 1

    def fasta(self, wrap=70):
        """Return sequence records as a multi-FASTA string.

        Args:
            wrap (int) : maximum number of sequence characters per line

        Example:
            >>> ids = ['seq1', 'seq2']
            >>> seqs = ['ATGCTG', 'TGCTGATAGTA']
            >>> seq_records = SeqRecords(id_list=ids, seq_list=seqs)
            >>> print(seq_records.fasta())
            >seq1
            ATGCTG
            >seq2
            TGCTGATAGTA
        """
        formatted = [
            fasta.FastaRecord(seq=seq, seqid=seqid).format(wrap=wrap)
            for seqid, seq in self
        ]
        return "\n".join(formatted)

    @property
    def length_list(self):
        """Return a list with the length of every sequence."""
        return [len(seq) for seq in self.seq_list]

    def __iter__(self):
        """
        Iterate over (identifier, sequence) pairs in the collection.

        Example:
            >>> for seqid, seq in seq_records:
            ...     print(seqid)
            ...     print(seq)
            seq1
            ATGCTG
            seq2
            TGCTGATAGTA
        """
        for i in range(self.count):
            yield self.id_list[i], self.seq_list[i]

    def __len__(self):
        """
        Return the number of sequence records in the collection.

        Example:
            >>> len(seq_records)
            3
        """
        return len(self.seq_list)

    def __repr__(self):
        return "{0} (noseqs: {1})".format(self.__class__.__name__,
                                          self.count)
def main():
    """Build a small demo collection and round-trip it through a FASTA file.

    The records are written to a temporary file, read back with
    `read_fasta`, and the temporary file is removed even if reading or
    writing fails.

    Returns:
        SeqRecords: the three demo records as read back from disk.
    """
    import os
    import tempfile

    seq_records = SeqRecords()
    seq_records.add(
        'seq1', 'AACGTACCATTGAACGTACCATTGAACGTACCATTGATGCATGGTAGAT')
    seq_records.add('seq2', 'CTAGGGGACTTATCTAGGGGACTTATCTAGGGGACTTAT')
    seq_records.add('seq3', 'CTAGGGAAAATTCTAGGGAAAATTCTAGGGAAAATT')

    # mkstemp creates the file in the system temp directory, so the demo
    # does not depend on the current directory being writable.
    fd, outfilename = tempfile.mkstemp(suffix='.fasta')
    try:
        with os.fdopen(fd, 'w') as oh:
            oh.write(seq_records.fasta())
        with open(outfilename) as fh:
            seq_records = read_fasta(fh)
    finally:
        os.remove(outfilename)

    return seq_records
def freq_seq_chars(counts):
    """Calculate frequencies of characters (symbols) in a sequence based on
    characters' counts.

    Args:
        counts (list): result of the `count_seq_chars` function

    Returns:
        A list of frequencies corresponding to alphabet. When all counts
        are zero (e.g. an empty sequence), every frequency is 0.0.

    Examples:
        >>> l = [0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 0]
        >>> print(freq_seq_chars(l))
        [0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.1,
         0.0, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0,
         0.0, 0.2, 0.1, 0.0, 0.1, 0.0]

    """
    total = float(sum(counts))
    if not total:
        # No counted characters: avoid dividing by zero.
        return [0.0 for _ in counts]
    return [c / total for c in counts]
def pairwise_distance(self, seqnum1, seqnum2):
    r"""Compute W-metric between two proteins.

    The distance is defined by one-tuple frequencies
    fx and fy of two proteins, weighted by matrix W.

    Args:
        seqnum1 (int): row index of the first sequence in `self.freqs`
        seqnum2 (int): row index of the second sequence in `self.freqs`

    Returns:
        distance value (sum over the weighted outer product of the
        frequency differences)

    Formula:
        d^{w} = \sum_{i\in A}\sum_{j\in A}(f_{i}^{X}-f_{i}^{Y})
                \cdot (f_{j}^{X}-f_{j}^{Y})\cdot w_{ij}

    """
    # The docstring is a raw string: the LaTeX backslashes are not valid
    # Python escape sequences.
    diff = self.freqs[seqnum1] - self.freqs[seqnum2]
    weighted = np.outer(diff, diff) * self.matrix.data
    return np.sum(weighted)
def pwdist_dice(self, seq1idx, seq2idx):
    """Dice (Sorensen-Dice) dissimilarity between two boolean 1-D arrays.

    Args:
        seq1idx (int): index of the first boolean vector
        seq2idx (int): index of the second boolean vector

    Returns:
        distance value (double)

    """
    first = self[seq1idx]
    second = self[seq2idx]
    both = (first & second).sum()
    # Positions set in exactly one of the two vectors.
    only_second = (~first & second).sum()
    only_first = (first & ~second).sum()
    mismatches = only_first + only_second
    return float(mismatches) / float(2.0 * both + mismatches)
def pwdist_russellrao(self, seq1idx, seq2idx):
    """Russell-Rao dissimilarity between two boolean 1-D arrays.

    Args:
        seq1idx (int): index of the first boolean vector
        seq2idx (int): index of the second boolean vector

    Returns:
        distance value (double)

    """
    first = self[seq1idx]
    second = self[seq2idx]
    size = len(first)
    shared = (first & second).sum()
    # Fraction of positions that are not True in both vectors.
    not_shared = size - shared
    return float(not_shared) / float(size)
def pwdist_jaccard(self, seq1idx, seq2idx):
    """Compute the Jaccard-Needham dissimilarity
    between two boolean 1-D arrays.

    Args:
        seq1idx (int): index of the first vector
        seq2idx (int): index of the second vector

    Returns:
        distance value (double). When neither vector has a non-zero
        entry the distance is 0.0 (instead of the NaN produced by a
        0/0 division).

    """
    u = self[seq1idx]
    v = self[seq2idx]
    # Positions where at least one of the vectors is non-zero.
    nonzero = np.bitwise_or(u != 0, v != 0)
    union = nonzero.sum()
    if union == 0:
        return np.double(0.0)
    unequal = np.bitwise_and(u != v, nonzero).sum()
    return np.double(unequal) / np.double(union)
class Distance:
    """Pair a list of sequence-representing vectors with the d2 distance.

    Attributes:
        vector_list (list): one vector object per word size; each is
            indexed by sequence number
        pairwise_distance (callable): currently selected distance method

    """

    def __init__(self, vector_list):
        """Store the vectors and select the plain d2 distance."""
        self.vector_list = vector_list
        self.pairwise_distance = self.pwdist_d2

    def pwdist_d2(self, seqidx1, seqidx2):
        """Return the sum of squared differences over all vectors."""
        return sum(
            np.sum((vec[seqidx1] - vec[seqidx2]) ** 2)
            for vec in self.vector_list
        )

    def pwdist_d2_squareroot(self, seqidx1, seqidx2):
        """Return the square root of the d2 distance."""
        squared = self.pwdist_d2(seqidx1, seqidx2)
        return math.sqrt(squared)

    def set_disttype(self, disttype):
        """Select the distance method named ``pwdist_<disttype>``.

        Raises:
            ValueError: if no such method exists.
        """
        method_name = 'pwdist_{}'.format(disttype)
        try:
            self.pairwise_distance = getattr(self, method_name)
        except AttributeError:
            # Method does not exist.
            raise ValueError('unknown disttype "{}"'.format(disttype))
def calc_rtd(word_positions):
    """Summarize the return times of a word as (mean, standard deviation).

    A return time is the gap between two successive positions of the word.

    Args:
        word_positions (list) : list of sequence positions of a given word

    Returns:
        mean, stdev (tuple); (0.0, 0.0) when the word occurs fewer than
        two times.

    Examples:
        >>> seq = 'CTACACAACTTTGCGGGTAGCCGGAAACATTGTGAATGCGGTGAACA'
        >>> apos = [i for i, nt in enumerate(seq) if nt == 'A']
        >>> print(apos)
        [2, 4, 6, 7, 18, 24, 25, 26, 28, 34, 35, 43, 44, 46]
        >>> print(calc_rtd(apos))
        (3.3846153846153846, 3.1510306381944679)

    """
    if len(word_positions) < 2:
        return 0.0, 0.0
    gaps = [
        later - earlier
        for earlier, later in zip(word_positions, word_positions[1:])
    ]
    return np.mean(gaps), np.std(gaps)
def _getwords(seq, word_size):
    """Collect the distinct words of a given size found in a sequence.

    Args:
        seq (str)
        word_size (int): >= 1

    Returns:
        set of all substrings of length `word_size` (empty when the
        sequence is shorter than `word_size`).

    Example:
        >>> seq = 'ATGCGTA'
        >>> sorted(_getwords(seq, 2))
        ['AT', 'CG', 'GC', 'GT', 'TA', 'TG']

    """
    last_start = len(seq) - word_size
    return {seq[start:start + word_size] for start in range(last_start + 1)}
def pwdist_dice(self, seq1idx, seq2idx):
    """Sorensen-Dice coefficient (Czekanowski's binary index)

    Args:
        seq1idx (int): index of the first word set
        seq2idx (int): index of the second word set

    Returns:
        distance value (float). Two empty word sets (e.g. sequences
        shorter than the word size) are identical, so their distance
        is 0.0.

    """
    s1 = self[seq1idx]
    s2 = self[seq2idx]
    total = len(s1) + len(s2)
    if not total:
        # Both sets are empty: avoid dividing by zero.
        return 0.0
    return 1 - (2 * len(s1 & s2) / float(total))
def get_parser():
    """Build the command-line parser of calc_bbc.py.

    When the script is started without any argument, the usage line is
    printed and the program exits.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distance between DNA/protein sequences
        based on Base-Base Correlation (BBC).''',
        add_help=False, prog='calc_bbc.py'
    )
    group = parser.add_argument_group('REQUIRED ARGUMENTS')
    group.add_argument('--fasta', '-f',
                       help='input FASTA sequence filename', required=True,
                       type=argparse.FileType('r'), metavar="FILE")
    group.add_argument('--molecule', '-m', choices=['dna', 'rna', 'protein'],
                       help='choose sequence alphabet', required=True)

    group = parser.add_argument_group('OPTIONAL ARGUMENTS')
    group.add_argument('--k', '-k', help='''maximum distance to observe
                       correlation between bases [default: %(default)s]''',
                       type=int, default=10, metavar="INT")
    group.add_argument('--out', '-o', help="output filename",
                       metavar="FILE")
    group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                       default='phylip',
                       help='distances output format [default: %(default)s]')

    group = parser.add_argument_group("OTHER OPTIONS")
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    if not sys.argv[1:]:
        # No arguments at all: show the short usage and stop.
        parser.print_usage()
        parser.exit()
    return parser
def main():
    """Run calc_bbc: read sequences, compute BBC distances, write the matrix.

    The matrix is written to the file given by --out, or displayed on
    standard output when no output file was requested.
    """
    parser = get_parser()
    args = validate_args(parser)

    try:
        seq_records = seqrecords.read_fasta(args.fasta)
    finally:
        # argparse.FileType opened the input; release it once it is read.
        if args.fasta is not sys.stdin:
            args.fasta.close()
    vector = bbc.create_vectors(seq_records, args.k, alphabet=args.alphabet)
    dist = bbc.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def validate_args(parser):
    """Parse the command line and make sure the word size is positive.

    Args:
        parser (argparse.ArgumentParser): parser defining --word_size

    Returns:
        argparse.Namespace with the parsed arguments. An invalid word
        size is reported through `parser.error`.
    """
    parsed = parser.parse_args()
    if parsed.word_size >= 1:
        return parsed
    parser.error('--word_size must be >= 1')
    return parsed
/usr/bin/env python 2 | 3 | # Copyright (c) 2016 Zielezinski A, combio.pl 4 | 5 | import argparse 6 | import sys 7 | 8 | from alfpy import graphdna 9 | from alfpy.utils import distmatrix 10 | from alfpy.utils import seqrecords 11 | from alfpy.version import __version__ 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser( 16 | description='''Calculate distance between DNA sequences based on 17 | the two-dimensional (2D) graphical DNA curve''', 18 | add_help=False, prog='calc_graphdna.py' 19 | ) 20 | group = parser.add_argument_group('REQUIRED ARGUMENTS') 21 | group.add_argument('--fasta', '-f', 22 | help='input FASTA sequence filename', required=True, 23 | type=argparse.FileType('r'), metavar="FILE") 24 | 25 | group = parser.add_argument_group('OPTIONAL ARGUMENTS') 26 | group.add_argument('--vector', '-v', choices=['2DSV', '2DNV', '2DMV'], 27 | help='vector type [default: %(default)s]', 28 | default='2DNV') 29 | group.add_argument('--ndim', '-n', type=int, metavar='N', 30 | help='''number of dimensions representing a sequence. 
def validate_args(parser):
    """Parse the command line and validate the vector options.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments.

    With ``--vector 2DMV`` the ``--ndim`` value is required and must be at
    least 1; violations are reported through ``parser.error``. ``--ndim``
    is not checked for the other vector types, which do not use it.
    """
    args = parser.parse_args()
    if args.vector == '2DMV':
        if args.ndim is None:
            parser.error("--vector 2DMV requires the --ndim")
        # --ndim is a number of dimensions, so zero or a negative value
        # cannot describe a sequence.
        if args.ndim < 1:
            parser.error("--ndim must be >= 1")
    # TODO: mk as a range
    # stackoverflow.com/questions/18700634/python-argparse-integer-condition-12
    return args
def get_parser():
    """Build the command-line parser for calc_lempelziv.py.

    If the script was started without any arguments, the usage line is
    printed and the parser exits.

    Returns:
        The configured argparse.ArgumentParser.
    """
    distance_names = ['d', 'd_star', 'd1', 'd1_star', 'd1_star2']
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distance_names))

    parser = argparse.ArgumentParser(
        description='''Calculate distance between DNA/protein sequences based
        on Lempel-Ziv complexity.''',
        add_help=False, prog='calc_lempelziv.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f',
                          help='input FASTA sequence filename',
                          required=True,
                          type=argparse.FileType('r'), metavar="FILE")

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional.add_argument('--distance', '-d', choices=distance_names,
                          help=distance_help,
                          metavar='', default="d1_star2")

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', help="output filename",
                        metavar="FILE")
    output.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                        default='phylip',
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments at all: show only the usage line and stop.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
def validate_args(parser):
    """Parse the command line; this script applies no extra checks.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments.
    """
    return parser.parse_args()
def main():
    """Run the calc_ncd command-line workflow.

    Parses and validates the arguments, reads the FASTA records, builds
    the NCD distance object and the distance matrix, then either writes
    the matrix to the file given by ``--out`` or displays it.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = ncd.Distance(seq_records)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the output file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def validate_args(parser):
    """Parse the command line and resolve the substitution matrix.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments, with ``matrix`` replaced by the
        object that ``subsmat.get`` returns for the chosen name.

    A KeyError from ``subsmat.get`` is reported through ``parser.error``.
    """
    parsed = parser.parse_args()
    matrix_name = parsed.matrix
    try:
        loaded = subsmat.get(matrix_name)
    except KeyError:
        parser.error("Unknown matrix {}".format(matrix_name))
    else:
        parsed.matrix = loaded
    return parsed
/usr/bin/env python 2 | 3 | # Copyright (c) 2016 Zielezinski A, combio.pl 4 | 5 | import argparse 6 | import sys 7 | 8 | from alfpy import word_bool_distance 9 | from alfpy import word_pattern 10 | from alfpy import word_vector 11 | from alfpy.utils import distmatrix 12 | from alfpy.utils import seqrecords 13 | from alfpy.version import __version__ 14 | 15 | 16 | def get_parser(): 17 | parser = argparse.ArgumentParser( 18 | description='''Calculate distances between DNA/protein sequences based 19 | on boolean 1-D vectors of word counting occurrences.''', 20 | add_help=False, prog='calc_word_bool.py' 21 | ) 22 | group = parser.add_argument_group('REQUIRED ARGUMENTS') 23 | group.add_argument('--fasta', '-f', 24 | help='input FASTA sequence filename', required=True, 25 | type=argparse.FileType('r'), metavar="FILE") 26 | 27 | group = parser.add_argument_group(' Choose between the two options') 28 | g1 = group.add_mutually_exclusive_group() 29 | g1.add_argument('--word_size', '-s', metavar="N", 30 | help='word size for creating word patterns', 31 | type=int) 32 | g1.add_argument('--word_pattern', '-w', 33 | help='input filename w/ pre-computed word patterns', 34 | type=argparse.FileType('r'), metavar="FILE") 35 | 36 | group = parser.add_argument_group('OPTIONAL ARGUMENTS') 37 | distlist = word_bool_distance.Distance.get_disttypes() 38 | group.add_argument('--distance', '-d', choices=distlist, 39 | help='choose from: {} [DEFAULT: %(default)s]'.format( 40 | ", ".join(distlist)), 41 | metavar='', default="jaccard") 42 | 43 | group = parser.add_argument_group('OUTPUT ARGUMENTS') 44 | group.add_argument('--out', '-o', help="output filename", 45 | metavar="FILE") 46 | group.add_argument('--outfmt', choices=['phylip', 'pairwise'], 47 | default='phylip', 48 | help='distances output format [DEFAULT: %(default)s]') 49 | 50 | group = parser.add_argument_group("OTHER OPTIONS") 51 | group.add_argument("-h", "--help", action="help", 52 | help="show this help message and exit") 53 | 
def validate_args(parser):
    """Parse the command line and check the word options.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments.

    Reports through ``parser.error`` when ``--word_size`` is below 1 or
    when neither ``--word_size`` nor ``--word_pattern`` was given.
    """
    args = parser.parse_args()
    # Compare against None rather than testing truthiness, so an explicit
    # ``--word_size 0`` is rejected as an invalid size and not misreported
    # as a missing option.
    if args.word_size is not None:
        if args.word_size < 1:
            parser.error('Word size must be >= 1.')
    elif args.word_pattern is None:
        parser.error("Specify either: --word_size or --word_pattern.")
    return args
def get_parser():
    """Build the command-line parser for calc_word_cv.py.

    If the script was started without any arguments, the usage line is
    printed and the parser exits.

    Returns:
        The configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(
        description='''Calculate compositional distances between DNA/protein
        sequences based on word (of length k) occurrences using a Markov model
        of k-2.''',
        add_help=False, prog='calc_word_cv.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f',
                          help='input FASTA sequence filename',
                          required=True,
                          type=argparse.FileType('r'), metavar="FILE")

    # --word_size and --word_patterns cannot be given together.
    choice = parser.add_argument_group(' Choose between the two options')
    exclusive = choice.add_mutually_exclusive_group()
    exclusive.add_argument('--word_size', '-s', metavar="k", type=int,
                           help='''word size (k-mer) for creating word patterns
                           (must be >= 3)'''
                           )
    exclusive.add_argument('--word_patterns', '-w', nargs=3,
                           help='''3 input word pattern files (k-, [k-1]-,
                           [k-2]-mers)''',
                           type=argparse.FileType('r'), metavar="FILE")

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', help="output filename",
                        metavar="FILE")
    output.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                        default='phylip',
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments at all: show only the usage line and stop.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
def validate_args(parser):
    """Parse the command line and check the word size / pattern options.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments. When ``--word_patterns`` was
        given, ``word_patterns`` is replaced by the list of objects read
        by ``word_pattern.read`` from the three files.

    Reports through ``parser.error`` when the word size is below 3, a
    pattern file cannot be read, the pattern word lengths are not k, k-1
    and k-2, or neither option was given.
    """
    args = parser.parse_args()
    # Compare against None rather than testing truthiness, so an explicit
    # ``--word_size 0`` is rejected as too small and not misreported as a
    # missing option.
    if args.word_size is not None:
        if args.word_size < 3:
            parser.error('Word size must be >= 3')

    elif args.word_patterns:
        patterns = []
        for handle in args.word_patterns:
            try:
                patterns.append(word_pattern.read(handle))
            except Exception:
                parser.error('Invalid format for word pattern: {0}'.format(
                             handle.name))

        if len(patterns) == 3:
            # The word length of each file is taken from its first pattern;
            # the lengths must be k, k-1 and k-2, in that order.
            k, k1, k2 = [len(p.pat_list[0]) for p in patterns]
            if not (k == k1 + 1 == k2 + 2):
                parser.error(
                    '''Word pattern lengths do not follow k, k-1, k-2''')

        args.word_patterns = patterns
    else:
        # The option is spelled --word_patterns (plural) in get_parser().
        parser.error("Specify either: --word_size or --word_patterns.")
    return args
/usr/bin/env python 2 | 3 | # Copyright (c) 2016 Zielezinski A, combio.pl 4 | 5 | import argparse 6 | import sys 7 | 8 | from alfpy import word_d2 9 | from alfpy import word_pattern 10 | from alfpy import word_vector 11 | from alfpy.utils import distmatrix 12 | from alfpy.utils import seqrecords 13 | from alfpy.version import __version__ 14 | 15 | 16 | def get_parser(): 17 | parser = argparse.ArgumentParser( 18 | description='''Calculate d2 distance between DNA/protein sequences based 19 | on subsequence (words) occurrences.''', 20 | add_help=False, prog='calc_word_d2.py' 21 | ) 22 | group = parser.add_argument_group('REQUIRED ARGUMENTS') 23 | group.add_argument('--fasta', '-f', 24 | help='input FASTA sequence filename', required=True, 25 | type=argparse.FileType('r'), metavar="FILE") 26 | 27 | group = parser.add_argument_group('OPTIONAL ARGUMENTS') 28 | group.add_argument('--min_word_size', '-l', 29 | help='minimum word size [default: %(default)s]', 30 | type=int, metavar="WORD_SIZE", default=1, 31 | ) 32 | group.add_argument('--max_word_size', '-u', 33 | help='maximum word size [default: %(default)s]', 34 | type=int, metavar="WORD_SIZE", default=3, 35 | ) 36 | veclist = ['counts', 'freqs'] 37 | group.add_argument('--vector', '-v', choices=veclist, 38 | help='choose from: {} [DEFAULT: %(default)s]'.format( 39 | ", ".join(veclist)), 40 | metavar='', default="counts") 41 | group.add_argument('--char_weights', '-W', metavar="FILE", 42 | help='''file w/ weights of background sequence characters 43 | (nt/aa)''', 44 | type=argparse.FileType('r')) 45 | 46 | group = parser.add_argument_group('OUTPUT ARGUMENTS') 47 | group.add_argument('--out', '-o', help="output filename", 48 | metavar="FILE") 49 | group.add_argument('--outfmt', choices=['phylip', 'pairwise'], 50 | default='phylip', 51 | help='distances output format [DEFAULT: %(default)s]') 52 | 53 | group = parser.add_argument_group("OTHER OPTIONS") 54 | group.add_argument("-h", "--help", action="help", 55 | help="show 
def validate_args(parser):
    """Parse the command line and check word sizes and character weights.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments. When ``--char_weights`` was
        given, ``char_weights`` is replaced by the value returned by
        ``word_vector.read_weightfile``.

    Reports through ``parser.error`` when ``min_word_size`` is below 1,
    when ``max_word_size`` is not greater than ``min_word_size``, or when
    the weights file cannot be read.
    """
    args = parser.parse_args()
    # An explicit ``< 1`` also rejects negative sizes; a truth test would
    # only catch 0.
    if args.min_word_size < 1:
        parser.error("min_word_size must be greater than 0")
    elif args.min_word_size >= args.max_word_size:
        parser.error("max_word_size must be greater than min_word_size")
    if args.char_weights:
        try:
            weights = word_vector.read_weightfile(args.char_weights)
        except Exception:
            e = 'Invalid format for --char_weights {0}'.format(
                args.char_weights.name)
            parser.error(e)
        else:
            args.char_weights = weights
    return args
| if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /bin/calc_word_ffp.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Copyright (c) 2016 Zielezinski A, combio.pl 4 | 5 | import argparse 6 | import sys 7 | 8 | from alfpy import word_vector 9 | from alfpy import word_distance 10 | from alfpy.utils import distmatrix 11 | from alfpy.utils import seqrecords 12 | from alfpy import word_pattern 13 | from alfpy.utils.data import seqcontent 14 | from alfpy.version import __version__ 15 | 16 | 17 | def get_parser(): 18 | parser = argparse.ArgumentParser( 19 | description='''Calculate distance between DNA/protein sequences based 20 | on feature frequency profiles (FFPs) of words.''', 21 | add_help=False, prog='calc_word_ffp.py' 22 | ) 23 | group = parser.add_argument_group('REQUIRED ARGUMENTS') 24 | group.add_argument('--fasta', '-f', 25 | help='input FASTA sequence filename', required=True, 26 | type=argparse.FileType('r'), metavar="FILE") 27 | group.add_argument('--molecule', '-m', choices=['dna', 'rna', 'protein'], 28 | help='choose sequence alphabet', required=True) 29 | 30 | group = parser.add_argument_group(' Choose between the two options') 31 | g1 = group.add_mutually_exclusive_group() 32 | g1.add_argument('--word_size', '-s', metavar="N", 33 | help='word size for creating word patterns', 34 | type=int) 35 | g1.add_argument('--word_pattern', '-w', 36 | help='input filename w/ pre-computed word patterns', 37 | type=argparse.FileType('r'), metavar="FILE") 38 | 39 | group = parser.add_argument_group('OPTIONAL ARGUMENTS') 40 | distlist = word_distance.Distance.get_disttypes() 41 | group.add_argument('--distance', '-d', choices=distlist, 42 | help='choose from: {} [DEFAULT: %(default)s]'.format( 43 | ", ".join(distlist)), 44 | metavar='', default="jsd") 45 | group.add_argument('--reduce_alphabet', '-r', action="store_true", 
def validate_args(parser):
    """Parse the command line and check the word and molecule options.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments.

    Reports through ``parser.error`` when ``--word_size`` is below 1, when
    neither ``--word_size`` nor ``--word_pattern`` was given, or when
    ``--merge_revcomp`` is combined with ``-m protein``.
    """
    args = parser.parse_args()
    # Compare against None rather than testing truthiness, so an explicit
    # ``--word_size 0`` is rejected as an invalid size and not misreported
    # as a missing option.
    if args.word_size is not None:
        if args.word_size < 1:
            parser.error('word size must be >= 1')
    elif args.word_pattern is None:
        parser.error("Specify either: --word_size or --word_pattern.")

    if args.molecule == 'protein' and args.merge_revcomp:
        parser.error("Incompatible arguments: -m protein --merge_revcomp")

    return args
distmatrix.create(seq_records.id_list, dist) 108 | 109 | if args.out: 110 | oh = open(args.out, 'w') 111 | matrix.write_to_file(oh, args.outfmt) 112 | oh.close() 113 | else: 114 | matrix.display(args.outfmt) 115 | 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /bin/calc_word_rtd.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Copyright (c) 2016 Zielezinski A, combio.pl 4 | 5 | import argparse 6 | import sys 7 | 8 | from alfpy import word_distance 9 | from alfpy import word_pattern 10 | from alfpy import word_rtd 11 | from alfpy.utils import distmatrix 12 | from alfpy.utils import seqrecords 13 | from alfpy.version import __version__ 14 | 15 | 16 | def get_parser(): 17 | parser = argparse.ArgumentParser( 18 | description='''Calculate distances between protein/DNA sequences based 19 | on Return Time Distribution (RTD) of words\' occurrences and their 20 | relative orders''', 21 | add_help=False, prog='calc_word_rtd.py' 22 | ) 23 | group = parser.add_argument_group('REQUIRED ARGUMENTS') 24 | group.add_argument('--fasta', '-f', 25 | help='input FASTA sequence filename', required=True, 26 | type=argparse.FileType('r'), metavar="FILE") 27 | 28 | group = parser.add_argument_group(' Choose between the two options') 29 | g1 = group.add_mutually_exclusive_group() 30 | g1.add_argument('--word_size', '-s', metavar="N", 31 | help='word size for creating word patterns', 32 | type=int) 33 | g1.add_argument('--word_pattern', '-w', 34 | help='input filename w/ pre-computed word patterns', 35 | type=argparse.FileType('r'), metavar="FILE") 36 | 37 | group = parser.add_argument_group('OPTIONAL ARGUMENTS') 38 | distlist = word_distance.Distance.get_disttypes() 39 | group.add_argument('--distance', '-d', choices=distlist, 40 | help='choose from: {} [DEFAULT: %(default)s]'.format( 41 | ", ".join(distlist)), 42 | metavar='', 
def validate_args(parser):
    """Parse the command line and check the word size / pattern options.

    Args:
        parser: the argparse parser returned by get_parser().

    Returns:
        The namespace of parsed arguments. When ``--word_pattern`` was
        given and the pattern carries word positions, ``word_pattern`` is
        replaced by the object read by ``word_pattern.read``.

    Reports through ``parser.error`` when ``--word_size`` is below 1, when
    the pattern read from the file has an empty ``pos_list``, or when
    neither option was given.
    """
    args = parser.parse_args()
    # Compare against None rather than testing truthiness, so an explicit
    # ``--word_size 0`` is rejected as an invalid size and not misreported
    # as a missing option.
    if args.word_size is not None:
        if args.word_size < 1:
            parser.error('word size must be >= 1')
    elif args.word_pattern:
        p = word_pattern.read(args.word_pattern)
        if not p.pos_list:
            e = "{0} does not contain info on word positions.\n"
            e += "Please use: create_wordpattern.py with"
            e += " --word_position option."
            parser.error(e.format(args.word_pattern.name))
        else:
            args.word_pattern = p
    else:
        parser.error("Specify either: --word_size or --word_pattern.")
    return args
def get_parser():
    """Build the command-line parser for calc_word_sets.py.

    If the script was started without any arguments, the usage line is
    printed and the parser exits.

    Returns:
        The configured argparse.ArgumentParser.
    """
    distance_names = ['dice', 'hamming', 'jaccard']
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distance_names))

    parser = argparse.ArgumentParser(
        description='''Calculate distances between DNA/protein sequences based
        on boolean 1-D vectors of word counting occurrences.''',
        add_help=False, prog='calc_word_sets.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f',
                          help='input FASTA sequence filename',
                          required=True,
                          type=argparse.FileType('r'), metavar="FILE")
    required.add_argument('--word_size', '-s', metavar="N", required=True,
                          help='word size for creating word patterns',
                          type=int)

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional.add_argument('--distance', '-d', choices=distance_names,
                          help=distance_help,
                          metavar='', default="dice")

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', help="output filename",
                        metavar="FILE")
    output.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                        default='phylip',
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments at all: show only the usage line and stop.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
def main():
    """Run the calc_word_sets command-line workflow.

    Parses and validates the arguments, reads the FASTA records, builds
    the word-sets distance object and the distance matrix, then either
    writes the matrix to the file given by ``--out`` or displays it.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = word_sets_distance.Distance(seq_records, args.word_size,
                                       args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the output file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
group = parser.add_argument_group(t, d) 36 | group.add_argument('--teiresias', '-t', action="store_true", 37 | help='''Teiresias program creates word patterns. 38 | [by default: disabled]''', 39 | ) 40 | group.add_argument('--l', '-l', type=int, 41 | help='minimum number of literals and/or brackets') 42 | group.add_argument('--k', '-k', type=int, 43 | help='minimum support that any word can have') 44 | 45 | group = parser.add_argument_group("OTHER OPTIONS") 46 | group.add_argument("-h", "--help", action="help", 47 | help="show this help message and exit") 48 | group.add_argument('--version', action='version', 49 | version='%(prog)s {}'.format(__version__)) 50 | 51 | if len(sys.argv[1:]) == 0: 52 | # parser.print_help() 53 | parser.print_usage() # for just the usage line 54 | parser.exit() 55 | 56 | return parser 57 | 58 | 59 | def validate_args(parser): 60 | args = parser.parse_args() 61 | if args.teiresias: 62 | if args.l is None: 63 | parser.error("Teiresias requires --l") 64 | if args.k is None: 65 | parser.error("Teiresias requires --k") 66 | if args.word_size < 2: 67 | parser.error("Teiresias requires --word_size to be >= 2") 68 | if args.l < 2: 69 | parser.error("--l must be at least 2") 70 | if args.l > args.word_size: 71 | parser.error("--word_size must be >= than --l") 72 | elif args.word_size < 1: 73 | parser.error("--word_size must be >= 1") 74 | return args 75 | 76 | 77 | def main(): 78 | parser = get_parser() 79 | args = validate_args(parser) 80 | 81 | if args.teiresias: 82 | args.fasta.close() 83 | p = word_pattern.run_teiresias(args.fasta.name, 84 | w=args.word_size, 85 | l=args.l, 86 | k=args.k, 87 | output_filename=args.out) 88 | else: 89 | seq_records = seqrecords.read_fasta(args.fasta) 90 | args.fasta.close() 91 | p = word_pattern.create(seq_records.seq_list, 92 | args.word_size, 93 | args.word_position) 94 | 95 | if args.out: 96 | oh = open(args.out, 'w') 97 | oh.write(p.format()) 98 | oh.close() 99 | else: 100 | print(p.format()) 101 | # or 
sys.stdout.write(p.format()+'\n') 102 | 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /example_data/input/aminoacid.freqs.swissprot.txt: -------------------------------------------------------------------------------- 1 | # UniProtKB/Swiss-Prot protein knowledgebase release 2016_09 statistics 2 | # Release 2016_09 of 05-Oct-16 of UniProtKB/Swiss-Prot contains 552259 sequence entries, 3 | # comprising 197423140 amino acids abstracted from 247204 references. 4 | # http://web.expasy.org/docs/relnotes/relstat.html 5 | A 0.0826 6 | Q 0.0393 7 | L 0.0965 8 | S 0.0659 9 | R 0.0553 10 | E 0.0674 11 | K 0.0583 12 | T 0.0534 13 | N 0.0406 14 | G 0.0708 15 | M 0.0241 16 | W 0.0109 17 | D 0.0546 18 | H 0.0227 19 | F 0.0386 20 | Y 0.0292 21 | C 0.0137 22 | I 0.0594 23 | P 0.0471 24 | V 0.0687 -------------------------------------------------------------------------------- /example_data/input/aminoacid.weights.txt: -------------------------------------------------------------------------------- 1 | # Based on amino acid frequencies 2 | # Weight = 1 / amino acid freq / 10 3 | # should be greater than 1.
4 | A 1.21065375303 5 | C 7.29927007299 6 | E 1.48367952522 7 | D 1.8315018315 8 | G 1.41242937853 9 | F 2.59067357513 10 | I 1.6835016835 11 | H 4.40528634361 12 | K 1.71526586621 13 | M 4.14937759336 14 | L 1.03626943005 15 | N 2.46305418719 16 | Q 2.54452926209 17 | P 2.12314225053 18 | S 1.51745068285 19 | R 1.80831826401 20 | T 1.87265917603 21 | W 9.17431192661 22 | V 1.45560407569 23 | Y 3.42465753425 -------------------------------------------------------------------------------- /example_data/input/hiv.pep.fasta: -------------------------------------------------------------------------------- 1 | >DENTIST 2 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDIRRAHCNISREKWN 3 | NTLKQVVTKLREQFVNKTIIFTHPSGGDPEIVMHSVNCGGEFFY 4 | >PATIENT_A 5 | VIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNISREKWNNT 6 | LKQVVTKLREQFVNKTIIFNHSSGGDPEIVMHSFNCGGEFFY 7 | >PATIENT_B 8 | FTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKWNNTLEQVKT 9 | KLREQFGNTTIFFNHSSG 10 | >PATIENT_C 11 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAVYATDRIIGDIRQAHCNISREKWN 12 | NTLKQVVTKLREQFVNKTIIFTHPSGGDPEIVMHSVNCGGEFFY 13 | >PATIENT_D 14 | EVVIRSANFSDNAKTIIVQLNKSVKITCIRPSNNTRQSIPIGPGKAVYATGQIIGDIRQAHCNLSEAKWN 15 | NTLAQIVKKLKEQFRNRTIVFNQSSGGDPEIVMHSFNCGGEFFYC 16 | >PATIENT_E 17 | ASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGEKWNNTLKQVVTKLREQFGDKTIIF 18 | NHSSGGDPEIVM 19 | >PATIENT_F 20 | EVVIRSENFTDNVKTIIVQLNESVQINCTRPNNNTRKSIHIAPGRAFYATGEIIGDIRQAHCNLSSTKWN 21 | NTLRQIAKKLKEQFGNKTIVFNQSSGGDPEIVMHSFNCGGEFFYC 22 | >PATIENT_G 23 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRRGIHIGPGRAFYATDRIVGDIRQAYCNISREKWN 24 | NTLKQVVAKLREQFVNKTIIFNHSSGGDPEIVMHSVNCGGEFFYCNT 25 | >PATIENT_H 26 | LAEGEVIIRSENFTDNAKTIIVQLNATINITCERPHNNTRKSIHIGPGRAFFATGDITGDIRQAHCNLSK 27 | GDWDNALKQIVTKLGEQFGRNKTIVFKQSSGGDPEIIMHSFNCAGEFSYCN 28 | >DENTIST_WIFE 29 | NFTNNAKTIIVQLNTSVEINCTRPSNNTSKGIHIGPGRAFHATDRITGDIRQAHCNISKAKWNDTLQQVV 30 | KKLREQFGGNKTIVFNQSSGGDPEIVLHSFNCGGEFFYCNTT 31 | >Local_Control_1 32 | 
FTDNAKTIIVQLKNSVVINCTRPNNNTRRSVHIGPGSSLYTTDIIGDIRQAHCNLSRANWNKTLEQIVTK 33 | LGEQFGNNTTIVFNSSSGG 34 | >Local_Control_2 35 | SENFTDNTKTIIVQLNTSVTINCTRPGNNTRKSITMGPGKVFYAGEIIGDIRQAHCNLSRAAWNDTLKQI 36 | VGKLQEQFGNKTIVFNHSSGGDPEIVMHSF 37 | >Local_Control_3 38 | RSENFTNNAKIIIVHLNKTVNITCTRPNNNTRRSIPIGPGKAFYTTDIIGNIRQAHCNLSRAEWNNTLKQ 39 | IVKKLREQFKNKTIVFNHSSGGDPEIVMHSF 40 | >Local_Control_4 41 | LAEEEVVIRSENFTNNAKIIIVHLNKTVNITCTRPNNNTRRSIPMGPGKAFYTTEIIGNIRQAHCNLSKA 42 | EWNNTLRQIVKKLRDNLRIKQ 43 | >Local_Control_5 44 | LAEKEVVIRSENFTDNTKTIIIQLNTSVTINCTRPGNNTRKSITMGPGKVFYAGEIIGDIRQAHCNLSRT 45 | AWNDTLKQIVGKLQEQFGNKTIVFNHSSGGDPEIVMHSF 46 | -------------------------------------------------------------------------------- /example_data/input/sample.dna.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | AACGTACCATTGAACGTACCATTGAACGTACCATTG 3 | >seq2 4 | CTAGGGGACTTATCTAGGGGACTTATCTAGGGGACTTAT 5 | >seq3 6 | CTAGGGAAAATTCTAGGGAAAATTCTAGGGAAAATT 7 | -------------------------------------------------------------------------------- /example_data/input/sample.pep.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | MKSTGWHF 3 | >seq2 4 | MKSSSSTGWGWG 5 | >seq3 6 | MKSTLKNGTEQ -------------------------------------------------------------------------------- /example_data/output/bears.dna.fasta.1mer: -------------------------------------------------------------------------------- 1 | 2693 11 A 0:333 1:132 2:133 3:130 4:132 5:342 6:131 7:346 8:352 9:351 10:311 2 | 1717 11 T 0:226 1:86 2:83 3:81 4:83 5:236 6:87 7:232 8:225 9:216 10:162 3 | 1650 11 C 0:219 1:69 2:71 3:73 4:72 5:210 6:70 7:210 8:213 9:217 10:226 4 | 1337 11 G 0:188 1:60 2:59 3:63 4:59 5:178 6:61 7:166 8:172 9:172 10:159 -------------------------------------------------------------------------------- /example_data/output/bears.dna.fasta.2mer: -------------------------------------------------------------------------------- 1 | 
1096 11 AA 0:139 1:57 2:60 3:58 4:59 5:138 6:57 7:129 8:140 9:136 10:123 2 | 697 11 TA 0:83 1:31 2:36 3:30 4:35 5:102 6:36 7:99 8:94 9:95 10:56 3 | 559 11 AG 0:70 1:28 2:26 3:27 4:26 5:74 6:29 7:75 8:68 9:72 10:64 4 | 550 11 AC 0:66 1:23 2:22 3:22 4:24 5:67 6:21 7:74 8:69 9:75 10:87 5 | 516 11 CA 0:64 1:24 2:16 3:20 4:17 5:53 6:18 7:71 8:74 9:72 10:87 6 | 487 11 AT 0:58 1:24 2:25 3:23 4:23 5:63 6:24 7:68 8:75 9:68 10:36 7 | 470 11 CC 0:67 1:17 2:22 3:24 4:23 5:57 6:20 7:60 8:62 9:61 10:57 8 | 464 11 TT 0:65 1:26 2:23 3:24 4:24 5:61 6:24 7:68 8:54 9:50 10:45 9 | 456 11 CT 0:57 1:23 2:24 3:21 4:24 5:70 6:26 7:54 8:51 9:57 10:49 10 | 379 11 GA 0:47 1:19 2:20 3:21 4:20 5:49 6:19 7:47 8:44 9:48 10:45 11 | 342 11 GC 0:49 1:11 2:12 3:12 4:11 5:49 6:15 7:42 8:43 9:48 10:50 12 | 307 11 GT 0:46 1:13 2:11 3:13 4:12 5:41 6:13 7:41 8:44 9:41 10:32 13 | 305 11 GG 0:46 1:16 2:15 3:16 4:15 5:39 6:14 7:36 8:41 9:35 10:32 14 | 285 11 TC 0:36 1:18 2:15 3:15 4:14 5:37 6:14 7:34 8:39 9:32 10:31 15 | 267 11 TG 0:42 1:11 2:9 3:12 4:10 5:36 6:12 7:30 8:37 9:38 10:30 16 | 206 11 CG 0:30 1:5 2:9 3:8 4:8 5:29 6:6 7:25 8:26 9:27 10:33 -------------------------------------------------------------------------------- /example_data/output/bears.dna.fasta.3mer: -------------------------------------------------------------------------------- 1 | 462 11 AAA 0:56 1:24 2:29 3:26 4:29 5:60 6:25 7:47 8:56 9:57 10:53 2 | 322 11 TAA 0:43 1:14 2:16 3:13 4:15 5:46 6:14 7:49 8:48 9:44 10:20 3 | 243 11 AAG 0:38 1:11 2:10 3:11 4:9 5:35 6:12 7:33 8:28 9:28 10:28 4 | 215 11 AAC 0:26 1:13 2:12 3:13 4:14 5:25 6:10 7:24 8:22 9:26 10:30 5 | 210 11 CTA 0:24 1:10 2:13 3:10 4:13 5:33 6:13 7:24 8:23 9:27 10:20 6 | 196 11 TTA 0:25 1:9 2:9 3:8 4:10 5:30 6:8 7:28 8:24 9:28 10:17 7 | 195 11 ATA 0:22 1:8 2:10 3:7 4:9 5:23 6:11 7:32 8:31 9:28 10:14 8 | 176 11 AAT 0:19 1:9 2:9 3:8 4:7 5:18 6:10 7:25 8:34 9:25 10:12 9 | 171 11 ACA 0:21 1:6 2:3 3:4 4:4 5:19 6:4 7:25 8:23 9:24 10:38 10 | 169 11 CAA 0:23 1:10 2:6 3:9 4:6 5:15 6:7 
7:19 8:22 9:20 10:32 11 | 168 11 AGC 0:24 1:5 2:4 3:4 4:4 5:27 6:8 7:23 8:22 9:25 10:22 12 | 165 11 ACC 0:20 1:9 2:10 3:10 4:11 5:16 6:8 7:20 8:18 9:23 10:20 13 | 165 11 CAC 0:22 1:4 2:3 3:3 4:3 5:17 6:3 7:25 8:27 9:23 10:35 14 | 148 11 CCC 0:25 1:5 2:8 3:10 4:8 5:18 6:7 7:18 8:17 9:20 10:12 15 | 143 11 ACT 0:15 1:7 2:8 3:6 4:8 5:21 6:8 7:20 8:16 9:15 10:19 16 | 143 11 AGA 0:14 1:10 2:9 3:10 4:9 5:16 6:8 7:17 8:13 9:15 10:22 17 | 140 11 ATT 0:16 1:8 2:7 3:6 4:7 5:21 6:7 7:18 8:22 9:19 10:9 18 | 140 11 GAA 0:17 1:8 2:9 3:9 4:9 5:17 6:10 7:14 8:14 9:15 10:18 19 | 137 11 CCA 0:18 1:4 2:5 3:6 4:5 5:13 6:4 7:20 8:22 9:20 10:20 20 | 134 11 TAG 0:13 1:7 2:7 3:7 4:7 5:19 6:9 7:20 8:16 9:18 10:11 21 | 133 11 CCT 0:16 1:6 2:6 3:6 4:6 5:21 6:7 7:15 8:17 9:15 10:18 22 | 129 11 AGT 0:19 1:6 2:6 3:6 4:6 5:17 6:7 7:17 8:18 9:16 10:11 23 | 126 11 TTT 0:18 1:7 2:8 3:8 4:8 5:14 6:7 7:23 8:12 9:5 10:16 24 | 122 11 CTT 0:17 1:8 2:6 3:7 4:6 5:17 6:7 7:17 8:11 9:14 10:12 25 | 122 11 TAT 0:16 1:6 2:7 3:6 4:7 5:19 6:6 7:14 8:16 9:16 10:9 26 | 119 11 TAC 0:11 1:4 2:6 3:4 4:6 5:18 6:7 7:16 8:14 9:17 10:16 27 | 115 11 AGG 0:13 1:6 2:6 3:6 4:6 5:14 6:6 7:18 8:15 9:16 10:9 28 | 115 11 TCA 0:12 1:10 2:6 3:7 4:6 5:10 6:5 7:16 8:16 9:11 10:16 29 | 108 11 GCC 0:13 1:2 2:3 3:3 4:3 5:14 6:4 7:17 8:17 9:14 10:18 30 | 99 11 GGA 0:14 1:5 2:6 3:6 4:6 5:11 6:5 7:12 8:13 9:13 10:8 31 | 98 11 GCT 0:17 1:4 2:4 3:4 4:4 5:17 6:5 7:11 8:9 9:14 10:9 32 | 97 11 CAT 0:10 1:6 2:4 3:5 4:4 5:12 6:5 7:16 8:12 9:16 10:7 33 | 96 11 GAG 0:10 1:6 2:5 3:6 4:5 5:11 6:5 7:11 8:11 9:13 10:13 34 | 93 11 GTA 0:12 1:4 2:4 3:5 4:3 5:15 6:4 7:14 8:15 9:12 10:5 35 | 92 11 GAT 0:13 1:3 2:5 3:4 4:5 5:14 6:3 7:13 8:13 9:11 10:8 36 | 91 11 GCA 0:12 1:4 2:2 3:3 4:2 5:11 6:5 7:10 8:13 9:16 10:13 37 | 90 11 GGT 0:15 1:5 2:3 3:4 4:4 5:11 6:4 7:13 8:13 9:11 10:7 38 | 84 11 CAG 0:9 1:4 2:3 3:3 4:4 5:9 6:3 7:11 8:13 9:13 10:12 39 | 82 11 TCT 0:9 1:6 2:6 3:5 4:6 5:11 6:6 7:8 8:9 9:13 10:3 40 | 82 11 TGG 0:15 1:5 2:4 3:5 4:5 5:10 6:5 7:7 8:12 
9:9 10:5 41 | 80 11 CTC 0:10 1:4 2:4 3:2 4:4 5:13 6:3 7:9 8:12 9:9 10:10 42 | 80 11 GTG 0:13 1:4 2:2 3:3 4:3 5:10 6:4 7:11 8:11 9:11 10:8 43 | 78 11 ATG 0:10 1:3 2:4 3:4 4:4 5:12 6:3 7:8 8:11 9:12 10:7 44 | 76 11 GTT 0:14 1:3 2:2 3:3 4:3 5:9 6:3 7:10 8:9 9:12 10:8 45 | 76 11 TTC 0:9 1:7 2:4 3:5 4:4 5:10 6:6 7:10 8:8 9:9 10:4 46 | 71 11 ATC 0:10 1:5 2:4 3:6 4:3 5:7 6:3 7:9 8:10 9:8 10:6 47 | 71 11 TGA 0:11 1:1 2:1 3:1 4:1 5:11 6:3 7:11 8:10 9:12 10:9 48 | 70 11 ACG 0:9 1:1 2:1 3:2 4:1 5:11 6:1 7:9 8:12 9:13 10:10 49 | 66 11 CGA 0:8 1:3 2:4 3:4 4:4 5:11 6:3 7:7 8:8 9:8 10:6 50 | 66 11 TTG 0:13 1:3 2:2 3:3 4:2 5:7 6:3 7:7 8:10 9:8 10:8 51 | 65 11 TGC 0:9 1:3 2:2 3:3 4:2 5:7 6:2 7:7 8:9 9:10 10:11 52 | 60 11 GGG 0:9 1:4 2:3 3:3 4:3 5:9 6:2 7:6 8:7 9:6 10:8 53 | 58 11 GTC 0:7 1:2 2:3 3:2 4:3 5:7 6:2 7:6 8:9 9:6 10:11 54 | 56 11 GGC 0:8 1:2 2:3 3:3 4:2 5:8 6:3 7:5 8:8 9:5 10:9 55 | 53 11 CGC 0:8 1:1 2:3 3:2 4:3 5:7 6:2 7:7 8:4 9:8 10:8 56 | 52 11 CCG 0:8 1:2 2:3 3:2 4:4 5:5 6:2 7:7 8:6 9:6 10:7 57 | 51 11 GAC 0:7 1:2 2:1 3:2 4:1 5:7 6:1 7:9 8:6 9:9 10:6 58 | 49 11 TGT 0:7 1:2 2:2 3:3 4:2 5:8 6:2 7:5 8:6 9:7 10:5 59 | 48 11 CGG 0:9 1:1 2:2 3:2 4:1 5:6 6:1 7:5 8:7 9:4 10:10 60 | 48 11 TCC 0:9 1:1 2:1 3:1 4:1 5:9 6:1 7:5 8:10 9:4 10:6 61 | 44 11 GCG 0:7 1:1 2:3 3:2 4:2 5:6 6:1 7:4 8:4 9:4 10:10 62 | 43 11 CTG 0:6 1:1 2:1 3:2 4:1 5:7 6:2 7:4 8:5 9:7 10:7 63 | 40 11 TCG 0:6 1:1 2:2 3:2 4:1 5:7 6:2 7:5 8:4 9:4 10:6 64 | 39 6 CGT 0:5 5:5 7:6 8:7 9:7 10:9 -------------------------------------------------------------------------------- /example_data/output/bears.dna.fasta.pairwise: -------------------------------------------------------------------------------- 1 | American_Black_Bear American_Brown_Bear 0.7106017 2 | American_Black_Bear Spectacled_Bear 0.7765043 3 | American_Black_Bear Asiatic_Black_Bear 0.7020057 4 | American_Black_Bear Polar_Bear 0.7736390 5 | American_Black_Bear Giant_Panda 0.5702006 6 | American_Black_Bear Red_Panda 0.8080229 7 | American_Black_Bear Dog 
0.6131805 8 | American_Black_Bear Raccoon 0.5873926 9 | American_Black_Bear Cow 0.6704871 10 | American_Black_Bear Crocodilian_skink 0.7822350 11 | American_Brown_Bear Spectacled_Bear 0.4545455 12 | American_Brown_Bear Asiatic_Black_Bear 0.3034483 13 | American_Brown_Bear Polar_Bear 0.4405594 14 | American_Brown_Bear Giant_Panda 0.7761628 15 | American_Brown_Bear Red_Panda 0.5174825 16 | American_Brown_Bear Dog 0.7953216 17 | American_Brown_Bear Raccoon 0.7777778 18 | American_Brown_Bear Cow 0.8171091 19 | American_Brown_Bear Crocodilian_skink 0.8705502 20 | Spectacled_Bear Asiatic_Black_Bear 0.3655172 21 | Spectacled_Bear Polar_Bear 0.1478873 22 | Spectacled_Bear Giant_Panda 0.7877907 23 | Spectacled_Bear Red_Panda 0.5352113 24 | Spectacled_Bear Dog 0.7982456 25 | Spectacled_Bear Raccoon 0.7836257 26 | Spectacled_Bear Cow 0.8289086 27 | Spectacled_Bear Crocodilian_skink 0.8705502 28 | Asiatic_Black_Bear Polar_Bear 0.3655172 29 | Asiatic_Black_Bear Giant_Panda 0.7906977 30 | Asiatic_Black_Bear Red_Panda 0.5448276 31 | Asiatic_Black_Bear Dog 0.8157895 32 | Asiatic_Black_Bear Raccoon 0.7923977 33 | Asiatic_Black_Bear Cow 0.8436578 34 | Asiatic_Black_Bear Crocodilian_skink 0.8673139 35 | Polar_Bear Giant_Panda 0.8052326 36 | Polar_Bear Red_Panda 0.5177305 37 | Polar_Bear Dog 0.8070175 38 | Polar_Bear Raccoon 0.7894737 39 | Polar_Bear Cow 0.8289086 40 | Polar_Bear Crocodilian_skink 0.8770227 41 | Giant_Panda Red_Panda 0.7994186 42 | Giant_Panda Dog 0.5930233 43 | Giant_Panda Raccoon 0.5755814 44 | Giant_Panda Cow 0.6424419 45 | Giant_Panda Crocodilian_skink 0.8081395 46 | Red_Panda Dog 0.7807018 47 | Red_Panda Raccoon 0.7690058 48 | Red_Panda Cow 0.8318584 49 | Red_Panda Crocodilian_skink 0.8705502 50 | Dog Raccoon 0.5497076 51 | Dog Cow 0.6228070 52 | Dog Crocodilian_skink 0.7982456 53 | Raccoon Cow 0.6608187 54 | Raccoon Crocodilian_skink 0.8070175 55 | Cow Crocodilian_skink 0.7994100 56 | 
-------------------------------------------------------------------------------- /example_data/output/bears.dna.fasta.phylip: -------------------------------------------------------------------------------- 1 | 11 2 | American_B 0.0000000 0.6865672 0.7423168 0.6650000 0.7290168 0.6136784 0.7734554 0.6832740 0.6509946 0.7013889 0.8239203 3 | American_B 0.6865672 0.0000000 0.5071770 0.3027027 0.4878049 0.7410926 0.5636364 0.7677725 0.7476190 0.7806005 0.8517647 4 | Spectacled 0.7423168 0.5071770 0.0000000 0.3939394 0.1197605 0.7345972 0.5777778 0.7813953 0.7558685 0.7656613 0.8504673 5 | Asiatic_Bl 0.6650000 0.3027027 0.3939394 0.0000000 0.3877551 0.7328605 0.6000000 0.7908046 0.7570093 0.7752294 0.8403756 6 | Polar_Bear 0.7290168 0.4878049 0.1197605 0.3877551 0.0000000 0.7393365 0.5739910 0.7832168 0.7488152 0.7731481 0.8524590 7 | Giant_Pand 0.6136784 0.7410926 0.7345972 0.7328605 0.7393365 0.0000000 0.7660550 0.6410256 0.6275229 0.6772487 0.8283828 8 | Red_Panda 0.7734554 0.5636364 0.5777778 0.6000000 0.5739910 0.7660550 0.0000000 0.7458432 0.7405660 0.7790433 0.8465116 9 | Dog 0.6832740 0.7677725 0.7813953 0.7908046 0.7832168 0.6410256 0.7458432 0.0000000 0.6022727 0.6642599 0.8195616 10 | Raccoon 0.6509946 0.7476190 0.7558685 0.7570093 0.7488152 0.6275229 0.7405660 0.6022727 0.0000000 0.6725979 0.8252912 11 | Cow 0.7013889 0.7806005 0.7656613 0.7752294 0.7731481 0.6772487 0.7790433 0.6642599 0.6725979 0.0000000 0.8219634 12 | Crocodilia 0.8239203 0.8517647 0.8504673 0.8403756 0.8524590 0.8283828 0.8465116 0.8195616 0.8252912 0.8219634 0.0000000 13 | -------------------------------------------------------------------------------- /example_data/output/gp120.pep.fasta.1mer: -------------------------------------------------------------------------------- 1 | 1331 27 T 0:46 1:44 2:49 3:52 4:40 5:47 6:39 7:45 8:45 9:44 10:49 11:42 12:46 13:43 14:44 15:50 16:51 17:44 18:57 19:57 20:53 21:56 22:50 23:55 24:62 25:60 26:61 2 | 1152 27 N 0:47 1:46 2:49 3:48 4:44 5:49 6:41 
7:45 8:43 9:43 10:47 11:43 12:49 13:49 14:43 15:37 16:40 17:38 18:38 19:39 20:42 21:42 22:39 23:39 24:35 25:37 26:40 3 | 841 27 V 0:28 1:30 2:28 3:39 4:37 5:33 6:37 7:33 8:34 9:33 10:39 11:35 12:32 13:39 14:39 15:23 16:26 17:32 18:29 19:31 20:28 21:22 22:22 23:23 24:29 25:29 26:31 4 | 835 27 I 0:36 1:40 2:36 3:32 4:38 5:39 6:35 7:34 8:35 9:36 10:31 11:36 12:36 13:32 14:31 15:29 16:28 17:24 18:25 19:23 20:25 21:25 22:25 23:30 24:25 25:25 26:24 5 | 820 27 K 0:28 1:28 2:34 3:28 4:31 5:30 6:36 7:31 8:33 9:33 10:33 11:36 12:35 13:30 14:27 15:25 16:30 17:28 18:29 19:28 20:29 21:28 22:27 23:29 24:30 25:32 26:32 6 | 758 27 S 0:33 1:31 2:23 3:24 4:30 5:24 6:29 7:38 8:35 9:37 10:31 11:27 12:32 13:26 14:31 15:23 16:22 17:25 18:22 19:21 20:27 21:28 22:23 23:27 24:30 25:30 26:29 7 | 751 27 E 0:26 1:29 2:30 3:27 4:31 5:26 6:29 7:26 8:25 9:25 10:27 11:26 12:26 13:28 14:21 15:28 16:27 17:33 18:31 19:31 20:27 21:30 22:27 23:28 24:28 25:29 26:30 8 | 731 27 L 0:34 1:31 2:32 3:33 4:26 5:28 6:26 7:26 8:26 9:26 10:31 11:27 12:28 13:29 14:28 15:25 16:22 17:26 18:26 19:26 20:22 21:23 22:23 23:20 24:29 25:29 26:29 9 | 715 27 G 0:26 1:25 2:29 3:29 4:31 5:29 6:27 7:31 8:30 9:29 10:32 11:29 12:27 13:28 14:26 15:25 16:29 17:20 18:23 19:23 20:24 21:22 22:23 23:19 24:28 25:26 26:25 10 | 664 27 R 0:25 1:24 2:20 3:26 4:25 5:25 6:21 7:21 8:22 9:22 10:21 11:20 12:21 13:24 14:22 15:31 16:25 17:25 18:25 19:25 20:24 21:24 22:29 23:27 24:30 25:29 26:31 11 | 613 27 A 0:27 1:23 2:27 3:22 4:21 5:22 6:20 7:24 8:22 9:21 10:21 11:22 12:21 13:20 14:26 15:26 16:24 17:21 18:23 19:23 20:19 21:25 22:28 23:24 24:20 25:20 26:21 12 | 604 27 P 0:22 1:22 2:25 3:21 4:21 5:22 6:21 7:22 8:22 9:22 10:22 11:23 12:20 13:23 14:25 15:23 16:25 17:24 18:21 19:21 20:22 21:21 22:20 23:25 24:23 25:23 26:23 13 | 544 27 C 0:18 1:18 2:18 3:18 4:18 5:18 6:18 7:18 8:18 9:18 10:18 11:18 12:18 13:19 14:18 15:23 16:23 17:23 18:23 19:23 20:22 21:23 22:22 23:23 24:22 25:22 26:24 14 | 520 27 D 0:15 1:18 2:19 3:16 4:15 5:17 6:19 7:16 8:17 9:18 
10:16 11:17 12:20 13:17 14:18 15:21 16:22 17:26 18:19 19:19 20:19 21:20 22:21 23:23 24:25 25:25 26:22 15 | 450 27 Q 0:18 1:17 2:16 3:21 4:15 5:18 6:19 7:19 8:18 9:19 10:19 11:18 12:18 13:19 14:16 15:16 16:12 17:14 18:13 19:13 20:16 21:18 22:17 23:15 24:16 25:16 26:14 16 | 436 27 Y 0:13 1:13 2:13 3:12 4:13 5:11 6:12 7:11 8:11 9:11 10:13 11:12 12:13 13:13 14:12 15:21 16:23 17:21 18:20 19:20 20:24 21:23 22:18 23:21 24:21 25:21 26:20 17 | 432 27 F 0:13 1:13 2:13 3:15 4:16 5:16 6:16 7:19 8:19 9:18 10:15 11:16 12:15 13:15 14:15 15:18 16:17 17:16 18:18 19:18 20:16 21:16 22:20 23:18 24:14 25:14 26:13 18 | 333 27 W 0:9 1:10 2:10 3:12 4:10 5:10 6:9 7:10 8:10 9:10 10:9 11:10 12:10 13:10 14:9 15:14 16:15 17:14 18:15 19:15 20:15 21:16 22:16 23:15 24:17 25:17 26:16 19 | 266 27 M 0:7 1:9 2:10 3:7 4:6 5:7 6:9 7:10 8:9 9:9 10:8 11:8 12:9 13:9 14:8 15:13 16:11 17:12 18:11 19:11 20:10 21:10 22:13 23:12 24:13 25:13 26:12 20 | 233 27 H 0:8 1:7 2:8 3:8 4:13 5:9 6:8 7:7 8:7 9:7 10:8 11:8 12:8 13:9 14:11 15:9 16:9 17:8 18:8 19:9 20:9 21:12 22:9 23:9 24:8 25:8 26:9 -------------------------------------------------------------------------------- /example_data/output/hiv.pep.fasta.1mer: -------------------------------------------------------------------------------- 1 | 179 15 I 0:14 1:14 2:11 3:14 4:13 5:10 6:12 7:13 8:14 9:10 10:9 11:10 12:12 13:11 14:12 2 | 176 15 N 0:11 1:13 2:11 3:12 4:11 5:8 6:13 7:14 8:11 9:13 10:12 11:10 12:14 13:13 14:10 3 | 124 15 T 0:7 1:6 2:8 3:8 4:6 5:6 6:8 7:7 8:9 9:11 10:10 11:10 12:9 13:8 14:11 4 | 119 15 G 0:8 1:8 2:8 3:8 4:8 5:9 6:8 7:8 8:11 9:10 10:7 11:9 12:5 13:3 14:9 5 | 99 15 V 0:10 1:8 2:3 3:10 4:8 5:4 6:7 7:10 8:4 9:6 10:6 11:6 12:5 13:5 14:7 6 | 97 15 R 0:9 1:9 2:5 3:8 4:6 5:5 6:6 7:9 8:6 9:5 10:5 11:4 12:7 13:8 14:5 7 | 94 15 K 0:6 1:6 2:5 3:6 4:8 5:5 6:7 7:5 8:7 9:7 10:4 11:6 12:8 13:7 14:7 8 | 92 15 S 0:5 1:6 2:4 3:5 4:9 5:4 6:8 7:6 8:7 9:7 10:8 11:7 12:6 13:3 14:7 9 | 82 15 A 0:7 1:7 2:6 3:6 4:7 5:4 6:5 7:7 8:8 9:5 10:3 11:4 12:4 13:5 14:4 10 | 
81 15 E 0:8 1:7 2:4 3:6 4:5 5:5 6:7 7:6 8:7 9:4 10:2 11:4 12:4 13:6 14:6 11 | 77 15 F 0:5 1:6 2:5 3:5 4:6 5:3 6:7 7:6 8:7 9:7 10:3 11:5 12:5 13:2 14:5 12 | 65 15 Q 0:3 1:3 2:4 3:4 4:7 5:3 6:6 7:4 8:5 9:6 10:4 11:5 12:3 13:3 14:5 13 | 58 15 L 0:3 1:3 2:3 3:3 4:4 5:2 6:4 7:3 8:5 9:4 10:5 11:4 12:4 13:6 14:5 14 | 49 15 D 0:3 1:3 2:2 3:4 4:3 5:3 6:3 7:4 8:6 9:4 10:3 11:4 12:2 13:1 14:4 15 | 47 15 P 0:4 1:3 2:2 3:4 4:4 5:3 6:3 7:3 8:3 9:3 10:2 11:3 12:4 13:3 14:3 16 | 46 15 H 0:3 1:3 2:3 3:4 4:2 5:3 6:3 7:3 8:4 9:4 10:2 11:3 12:4 13:2 14:3 17 | 43 15 C 0:3 1:3 2:2 3:3 4:4 5:2 6:4 7:4 8:4 9:4 10:2 11:2 12:2 13:2 14:2 18 | 23 15 Y 0:3 1:2 2:1 3:2 4:2 5:1 6:2 7:3 8:1 9:1 10:1 11:1 12:1 13:1 14:1 19 | 15 15 W 0:1 1:1 2:1 3:1 4:1 5:1 6:1 7:1 8:1 9:1 10:1 11:1 12:1 13:1 14:1 20 | 14 12 M 0:1 1:1 3:1 4:1 5:1 6:1 7:1 8:1 11:2 12:1 13:1 14:2 -------------------------------------------------------------------------------- /example_data/output/hiv.pep.fasta.pairwise: -------------------------------------------------------------------------------- 1 | DENTIST PATIENT_A 0.1910112 2 | DENTIST PATIENT_B 0.4886364 3 | DENTIST PATIENT_C 0.2111111 4 | DENTIST PATIENT_D 0.5384615 5 | DENTIST PATIENT_E 0.4886364 6 | DENTIST PATIENT_F 0.5326087 7 | DENTIST PATIENT_G 0.3516484 8 | DENTIST PATIENT_H 0.6210526 9 | DENTIST DENTIST_WIFE 0.5280899 10 | DENTIST Local_Control_1 0.6363636 11 | DENTIST Local_Control_2 0.6022727 12 | DENTIST Local_Control_3 0.5909091 13 | DENTIST Local_Control_4 0.6477273 14 | DENTIST Local_Control_5 0.6022727 15 | PATIENT_A PATIENT_B 0.4382022 16 | PATIENT_A PATIENT_C 0.2444444 17 | PATIENT_A PATIENT_D 0.4945055 18 | PATIENT_A PATIENT_E 0.3932584 19 | PATIENT_A PATIENT_F 0.4782609 20 | PATIENT_A PATIENT_G 0.3076923 21 | PATIENT_A PATIENT_H 0.5789474 22 | PATIENT_A DENTIST_WIFE 0.4943820 23 | PATIENT_A Local_Control_1 0.6067416 24 | PATIENT_A Local_Control_2 0.5280899 25 | PATIENT_A Local_Control_3 0.5280899 26 | PATIENT_A Local_Control_4 0.6292135 27 | PATIENT_A 
Local_Control_5 0.5280899 28 | PATIENT_B PATIENT_C 0.4444444 29 | PATIENT_B PATIENT_D 0.6043956 30 | PATIENT_B PATIENT_E 0.3200000 31 | PATIENT_B PATIENT_F 0.5108696 32 | PATIENT_B PATIENT_G 0.4725275 33 | PATIENT_B PATIENT_H 0.6210526 34 | PATIENT_B DENTIST_WIFE 0.5505618 35 | PATIENT_B Local_Control_1 0.4605263 36 | PATIENT_B Local_Control_2 0.5421687 37 | PATIENT_B Local_Control_3 0.5487805 38 | PATIENT_B Local_Control_4 0.5584416 39 | PATIENT_B Local_Control_5 0.5909091 40 | PATIENT_C PATIENT_D 0.5054945 41 | PATIENT_C PATIENT_E 0.4222222 42 | PATIENT_C PATIENT_F 0.4673913 43 | PATIENT_C PATIENT_G 0.2747253 44 | PATIENT_C PATIENT_H 0.5578947 45 | PATIENT_C DENTIST_WIFE 0.5000000 46 | PATIENT_C Local_Control_1 0.6000000 47 | PATIENT_C Local_Control_2 0.5444444 48 | PATIENT_C Local_Control_3 0.5444444 49 | PATIENT_C Local_Control_4 0.6111111 50 | PATIENT_C Local_Control_5 0.5444444 51 | PATIENT_D PATIENT_E 0.6043956 52 | PATIENT_D PATIENT_F 0.4347826 53 | PATIENT_D PATIENT_G 0.5384615 54 | PATIENT_D PATIENT_H 0.5578947 55 | PATIENT_D DENTIST_WIFE 0.4835165 56 | PATIENT_D Local_Control_1 0.6153846 57 | PATIENT_D Local_Control_2 0.5494505 58 | PATIENT_D Local_Control_3 0.5494505 59 | PATIENT_D Local_Control_4 0.6153846 60 | PATIENT_D Local_Control_5 0.5384615 61 | PATIENT_E PATIENT_F 0.5217391 62 | PATIENT_E PATIENT_G 0.4615385 63 | PATIENT_E PATIENT_H 0.6000000 64 | PATIENT_E DENTIST_WIFE 0.5393258 65 | PATIENT_E Local_Control_1 0.5263158 66 | PATIENT_E Local_Control_2 0.5301205 67 | PATIENT_E Local_Control_3 0.5121951 68 | PATIENT_E Local_Control_4 0.5584416 69 | PATIENT_E Local_Control_5 0.5340909 70 | PATIENT_F PATIENT_G 0.4782609 71 | PATIENT_F PATIENT_H 0.5052632 72 | PATIENT_F DENTIST_WIFE 0.4782609 73 | PATIENT_F Local_Control_1 0.5869565 74 | PATIENT_F Local_Control_2 0.4673913 75 | PATIENT_F Local_Control_3 0.5434783 76 | PATIENT_F Local_Control_4 0.5760870 77 | PATIENT_F Local_Control_5 0.4673913 78 | PATIENT_G PATIENT_H 0.5684211 79 | PATIENT_G 
DENTIST_WIFE 0.5054945 80 | PATIENT_G Local_Control_1 0.6373626 81 | PATIENT_G Local_Control_2 0.5824176 82 | PATIENT_G Local_Control_3 0.5494505 83 | PATIENT_G Local_Control_4 0.6373626 84 | PATIENT_G Local_Control_5 0.5714286 85 | PATIENT_H DENTIST_WIFE 0.5473684 86 | PATIENT_H Local_Control_1 0.5684211 87 | PATIENT_H Local_Control_2 0.5578947 88 | PATIENT_H Local_Control_3 0.6105263 89 | PATIENT_H Local_Control_4 0.6526316 90 | PATIENT_H Local_Control_5 0.5473684 91 | DENTIST_WIFE Local_Control_1 0.6067416 92 | DENTIST_WIFE Local_Control_2 0.5505618 93 | DENTIST_WIFE Local_Control_3 0.5955056 94 | DENTIST_WIFE Local_Control_4 0.6629213 95 | DENTIST_WIFE Local_Control_5 0.5505618 96 | Local_Control_1 Local_Control_2 0.5421687 97 | Local_Control_1 Local_Control_3 0.5487805 98 | Local_Control_1 Local_Control_4 0.5584416 99 | Local_Control_1 Local_Control_5 0.5909091 100 | Local_Control_2 Local_Control_3 0.5060241 101 | Local_Control_2 Local_Control_4 0.6144578 102 | Local_Control_2 Local_Control_5 0.1818182 103 | Local_Control_3 Local_Control_4 0.3902439 104 | Local_Control_3 Local_Control_5 0.5340909 105 | Local_Control_4 Local_Control_5 0.6136364 106 | -------------------------------------------------------------------------------- /example_data/output/hiv.pep.fasta.phylip: -------------------------------------------------------------------------------- 1 | 15 2 | DENTIST 0.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 3 | PATIENT_A 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000 4 | PATIENT_B 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000 5 | PATIENT_C 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 
1.0000000 0.6400000 0.0270270 1.0000000 6 | PATIENT_D 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000 7 | PATIENT_E 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000 8 | PATIENT_F 1.0000000 0.6636364 0.6636364 0.6636364 0.6636364 0.6636364 0.0000000 0.6636364 0.4822695 0.3363636 0.6636364 0.3454545 0.6727273 0.6727273 0.3454545 9 | PATIENT_G 1.0000000 0.5888889 0.5888889 0.5888889 0.5888889 0.5888889 0.6636364 0.0000000 1.0000000 1.0000000 0.5888889 1.0000000 0.6400000 0.6000000 1.0000000 10 | PATIENT_H 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.4822695 1.0000000 0.0000000 0.4751773 1.0000000 0.4893617 1.0000000 1.0000000 0.4893617 11 | DENTIST_WI 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3363636 1.0000000 0.4751773 0.0000000 1.0000000 0.1724138 1.0000000 1.0000000 0.1724138 12 | Local_Cont 1.0000000 0.2884615 0.2884615 0.2884615 0.2884615 0.2884615 0.6636364 0.5888889 1.0000000 1.0000000 0.0000000 1.0000000 0.6400000 0.3076923 1.0000000 13 | Local_Cont 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3454545 1.0000000 0.4893617 0.1724138 1.0000000 0.0000000 1.0000000 1.0000000 0.0000000 14 | Local_Cont 1.0000000 0.6400000 0.6400000 0.6400000 0.6400000 0.6400000 0.6727273 0.6400000 1.0000000 1.0000000 0.6400000 1.0000000 0.0000000 0.6400000 1.0000000 15 | Local_Cont 1.0000000 0.0270270 0.0270270 0.0270270 0.0270270 0.0270270 0.6727273 0.6000000 1.0000000 1.0000000 0.3076923 1.0000000 0.6400000 0.0000000 1.0000000 16 | Local_Cont 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3454545 1.0000000 0.4893617 0.1724138 1.0000000 0.0000000 1.0000000 1.0000000 0.0000000 17 | -------------------------------------------------------------------------------- /setup.py: 
# --------------------------------------------------------------------------------
from setuptools import setup

# Execute alfpy/version.py so that __version__ is defined in this namespace.
# The file is read through a context manager so the handle is always closed.
with open('alfpy/version.py') as fh:
    exec(fh.read())

# Long description shown on the package index is the project README.
with open('README.rst') as fh:
    long_description = fh.read()

setup(
    name='alfpy',
    version=__version__,
    description="Alignment-free package to compare DNA/RNA/protein sequences (bioinformatics).",
    long_description=long_description,
    author='Andrzej Zielezinski',
    keywords='alignment-free bioinformatics sequence DNA protein homology phylogeny',
    license="MIT",
    author_email='andrzejz@amu.edu.pl',
    url="http://www.combio.pl/alfree",
    packages=['alfpy', 'alfpy.utils', 'alfpy.utils.data'],
    #setup_requires=["numpy"],
    install_requires=["numpy"],
    # Command-line entry scripts installed alongside the package.
    scripts=[
        'bin/calc_bbc.py',
        'bin/calc_graphdna.py',
        'bin/calc_fcgr.py',
        'bin/calc_lempelziv.py',
        'bin/calc_ncd.py',
        'bin/calc_wmetric.py',
        'bin/calc_word.py',
        'bin/calc_word_bool.py',
        'bin/calc_word_sets.py',
        'bin/calc_word_cv.py',
        'bin/calc_word_d2.py',
        'bin/calc_word_ffp.py',
        'bin/calc_word_rtd.py',
        'bin/create_wordpattern.py'
    ],
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Environment :: Console',
        'Operating System :: MacOS',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
    ],

)
# --------------------------------------------------------------------------------
# /tests/__init__.py:
# --------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/char_freqs.txt: -------------------------------------------------------------------------------- 1 | # UniProtKB/Swiss-Prot protein knowledgebase release 2016_09 statistics 2 | # Release 2016_09 of 05-Oct-16 of UniProtKB/Swiss-Prot contains 552259 sequence entries, 3 | # comprising 197423140 amino acids abstracted from 247204 references. 4 | # http://web.expasy.org/docs/relnotes/relstat.html 5 | A 0.0826 6 | Q 0.0393 7 | L 0.0965 8 | S 0.0659 9 | R 0.0553 10 | E 0.0674 11 | K 0.0583 12 | T 0.0534 13 | N 0.0406 14 | G 0.0708 15 | M 0.0241 16 | W 0.0109 17 | D 0.0546 18 | H 0.0227 19 | F 0.0386 20 | Y 0.0292 21 | C 0.0137 22 | I 0.0594 23 | P 0.0471 24 | V 0.0687 -------------------------------------------------------------------------------- /tests/data/char_weights.txt: -------------------------------------------------------------------------------- 1 | # Based on amino acid frequencies 2 | # Weight = 1 / amino acid freq / 10 3 | # should be greater than 1. 
4 | A 1.21065375303 5 | C 7.29927007299 6 | E 1.48367952522 7 | D 1.8315018315 8 | G 1.41242937853 9 | F 2.59067357513 10 | I 1.6835016835 11 | H 4.40528634361 12 | K 1.71526586621 13 | M 4.14937759336 14 | L 1.03626943005 15 | N 2.46305418719 16 | Q 2.54452926209 17 | P 2.12314225053 18 | S 1.51745068285 19 | R 1.80831826401 20 | T 1.87265917603 21 | W 9.17431192661 22 | V 1.45560407569 23 | Y 3.42465753425 -------------------------------------------------------------------------------- /tests/data/dna.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | AACGTACCATTGAACGTACCGTAGG 3 | >seq2 4 | CTAGGGGACTTATCTAGG 5 | >seq3 6 | CTAGGGAACATACCA -------------------------------------------------------------------------------- /tests/data/dna.fa.1mer.txt: -------------------------------------------------------------------------------- 1 | 18 3 A 0:8 1:4 2:6 2 | 15 3 G 0:6 1:6 2:3 3 | 13 3 C 0:6 1:3 2:4 4 | 12 3 T 0:5 1:5 2:2 -------------------------------------------------------------------------------- /tests/data/dna.fa.1mer.wordpos.txt: -------------------------------------------------------------------------------- 1 | 18 3 A 0 0 0 1 0 5 0 8 0 12 0 13 0 17 0 22 1 2 1 7 1 11 1 15 2 2 2 6 2 7 2 9 2 11 2 14 2 | 15 3 G 0 3 0 11 0 15 0 20 0 23 0 24 1 3 1 4 1 5 1 6 1 16 1 17 2 3 2 4 2 5 3 | 13 3 C 0 2 0 6 0 7 0 14 0 18 0 19 1 0 1 8 1 13 2 0 2 8 2 12 2 13 4 | 12 3 T 0 4 0 9 0 10 0 16 0 21 1 1 1 9 1 10 1 12 1 14 2 1 2 10 -------------------------------------------------------------------------------- /tests/data/dna.fa.2mer.txt: -------------------------------------------------------------------------------- 1 | 8 3 TA 0:3 1:3 2:2 2 | 7 3 AC 0:4 1:1 2:2 3 | 7 3 GG 0:1 1:4 2:2 4 | 4 3 AG 0:1 1:2 2:1 5 | 4 2 CT 1:3 2:1 6 | 3 3 AT 0:1 1:1 2:1 7 | 3 3 GA 0:1 1:1 2:1 8 | 3 2 AA 0:2 2:1 9 | 3 2 CA 0:1 2:2 10 | 3 2 CC 0:2 2:1 11 | 3 1 CG 0:3 12 | 3 1 GT 0:3 13 | 2 2 TT 0:1 1:1 14 | 1 1 TC 1:1 15 | 1 1 TG 0:1 
-------------------------------------------------------------------------------- /tests/data/dna.fa.2mer.wordpos.txt: -------------------------------------------------------------------------------- 1 | 8 3 TA 0 4 0 16 0 21 1 1 1 10 1 14 2 1 2 10 2 | 7 3 AC 0 1 0 5 0 13 0 17 1 7 2 7 2 11 3 | 7 3 GG 0 23 1 3 1 4 1 5 1 16 2 3 2 4 4 | 4 3 AG 0 22 1 2 1 15 2 2 5 | 4 2 CT 1 0 1 8 1 13 2 0 6 | 3 3 AT 0 8 1 11 2 9 7 | 3 3 GA 0 11 1 6 2 5 8 | 3 2 AA 0 0 0 12 2 6 9 | 3 2 CA 0 7 2 8 2 13 10 | 3 2 CC 0 6 0 18 2 12 11 | 3 1 CG 0 2 0 14 0 19 12 | 3 1 GT 0 3 0 15 0 20 13 | 2 2 TT 0 9 1 9 14 | 1 1 TC 1 12 15 | 1 1 TG 0 10 -------------------------------------------------------------------------------- /tests/data/pep.fa: -------------------------------------------------------------------------------- 1 | >seq1 seq1 desc 2 | MEVVIRSANFTDNAKIIIVQLNASVEINC 3 | TRPNNYTRKGIRIGPGRAVYAAEEIIGDN 4 | TLKQVVTKLRE 5 | >seq2 seq2 desc 6 | MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIR 7 | IGPGRAVYAAEEIIGDIRRAHCNIS 8 | >seq3 seq3 desc 9 | MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW 10 | >seq4 11 | MFTDNAKIIIVQLNASVEINCTRPNNNTR 12 | -------------------------------------------------------------------------------- /tests/data/pep.fa.1mer.txt: -------------------------------------------------------------------------------- 1 | 34 4 I 0:9 1:11 2:10 3:4 2 | 28 4 N 0:7 1:8 2:7 3:6 3 | 21 4 A 0:6 1:7 2:6 3:2 4 | 19 4 R 0:6 1:7 2:4 3:2 5 | 15 4 T 0:5 1:3 2:4 3:3 6 | 15 4 V 0:7 1:4 2:2 3:2 7 | 14 3 G 0:4 1:4 2:6 8 | 11 4 E 0:5 1:3 2:2 3:1 9 | 10 4 K 0:4 1:2 2:3 3:1 10 | 8 4 S 0:2 1:3 2:2 3:1 11 | 7 4 D 0:2 1:2 2:2 3:1 12 | 7 4 P 0:2 1:2 2:2 3:1 13 | 6 4 C 0:1 1:2 2:2 3:1 14 | 6 4 L 0:3 1:1 2:1 3:1 15 | 6 4 Q 0:2 1:1 2:2 3:1 16 | 5 4 F 0:1 1:1 2:2 3:1 17 | 4 4 M 0:1 1:1 2:1 3:1 18 | 4 3 Y 0:2 1:1 2:1 19 | 3 2 H 1:1 2:2 20 | 1 1 W 2:1 -------------------------------------------------------------------------------- /tests/data/pep.fa.1mer.wordpos.txt: 
-------------------------------------------------------------------------------- 1 | 34 4 I 0 4 0 15 0 16 0 17 0 26 0 39 0 41 0 53 0 54 1 2 1 13 1 14 1 15 1 24 1 37 1 39 1 51 1 52 1 55 1 62 2 7 2 8 2 9 2 18 2 31 2 33 2 45 2 46 2 49 2 56 3 7 3 8 3 9 3 18 2 | 28 4 N 0 8 0 12 0 21 0 27 0 32 0 33 0 57 1 6 1 10 1 19 1 25 1 30 1 31 1 32 1 61 2 4 2 13 2 19 2 24 2 25 2 26 2 55 3 4 3 13 3 19 3 24 3 25 3 26 3 | 21 4 A 0 7 0 13 0 22 0 46 0 49 0 50 1 5 1 11 1 20 1 44 1 47 1 48 1 58 2 5 2 14 2 38 2 41 2 52 2 59 3 5 3 14 4 | 19 4 R 0 5 0 30 0 36 0 40 0 45 0 67 1 3 1 28 1 34 1 38 1 43 1 56 1 57 2 22 2 28 2 37 2 50 3 22 3 28 5 | 15 4 T 0 10 0 29 0 35 0 58 0 64 1 8 1 27 1 33 2 2 2 21 2 27 2 42 3 2 3 21 3 27 6 | 15 4 V 0 2 0 3 0 18 0 24 0 47 0 62 0 63 1 1 1 16 1 22 1 45 2 10 2 16 3 10 3 16 7 | 14 3 G 0 38 0 42 0 44 0 55 1 36 1 40 1 42 1 53 2 30 2 34 2 36 2 43 2 47 2 58 8 | 11 4 E 0 1 0 25 0 51 0 52 0 68 1 23 1 49 1 50 2 17 2 44 3 17 9 | 10 4 K 0 14 0 37 0 60 0 65 1 12 1 35 2 6 2 29 2 60 3 6 10 | 8 4 S 0 6 0 23 1 4 1 21 1 63 2 15 2 57 3 15 11 | 7 4 D 0 11 0 56 1 9 1 54 2 3 2 48 3 3 12 | 7 4 P 0 31 0 43 1 29 1 41 2 23 2 35 3 23 13 | 6 4 C 0 28 1 26 1 60 2 20 2 54 3 20 14 | 6 4 L 0 20 0 59 0 66 1 18 2 12 3 12 15 | 6 4 Q 0 19 0 61 1 17 2 11 2 51 3 11 16 | 5 4 F 0 9 1 7 2 1 2 39 3 1 17 | 4 4 M 0 0 1 0 2 0 3 0 18 | 4 3 Y 0 34 0 48 1 46 2 40 19 | 3 2 H 1 59 2 32 2 53 20 | 1 1 W 2 61 -------------------------------------------------------------------------------- /tests/data/pep.fa.2mer.txt: -------------------------------------------------------------------------------- 1 | 11 4 II 0:3 1:3 2:3 3:2 2 | 8 4 NA 0:2 1:2 2:2 3:2 3 | 8 4 TR 0:2 1:2 2:2 3:2 4 | 7 4 EI 0:2 1:2 2:2 3:1 5 | 7 4 NN 0:1 1:2 2:2 3:2 6 | 6 3 IG 0:2 1:2 2:2 7 | 6 3 IR 0:2 1:3 2:1 8 | 5 4 AK 0:1 1:1 2:2 3:1 9 | 5 4 DN 0:2 1:1 2:1 3:1 10 | 4 4 AS 0:1 1:1 2:1 3:1 11 | 4 4 CT 0:1 1:1 2:1 3:1 12 | 4 4 FT 0:1 1:1 2:1 3:1 13 | 4 4 IN 0:1 1:1 2:1 3:1 14 | 4 4 IV 0:1 1:1 2:1 3:1 15 | 4 4 KI 0:1 1:1 2:1 3:1 16 | 4 4 LN 0:1 1:1 2:1 
3:1 17 | 4 4 NC 0:1 1:1 2:1 3:1 18 | 4 4 NT 0:1 1:1 2:1 3:1 19 | 4 4 PN 0:1 1:1 2:1 3:1 20 | 4 4 QL 0:1 1:1 2:1 3:1 21 | 4 4 RP 0:1 1:1 2:1 3:1 22 | 4 4 SV 0:1 1:1 2:1 3:1 23 | 4 4 TD 0:1 1:1 2:1 3:1 24 | 4 4 VE 0:1 1:1 2:1 3:1 25 | 4 4 VQ 0:1 1:1 2:1 3:1 26 | 4 3 RA 0:1 1:2 2:1 27 | 3 3 GD 0:1 1:1 2:1 28 | 3 3 GI 0:1 1:1 2:1 29 | 3 3 GP 0:1 1:1 2:1 30 | 3 3 GR 0:1 1:1 2:1 31 | 3 3 KG 0:1 1:1 2:1 32 | 3 3 PG 0:1 1:1 2:1 33 | 3 3 RK 0:1 1:1 2:1 34 | 3 3 YA 0:1 1:1 2:1 35 | 2 2 AA 0:1 1:1 36 | 2 2 AE 0:1 1:1 37 | 2 2 AH 1:1 2:1 38 | 2 2 AN 0:1 1:1 39 | 2 2 AV 0:1 1:1 40 | 2 2 CN 1:1 2:1 41 | 2 2 DI 1:1 2:1 42 | 2 2 EE 0:1 1:1 43 | 2 2 HC 1:1 2:1 44 | 2 2 IS 1:1 2:1 45 | 2 2 MF 2:1 3:1 46 | 2 2 NF 0:1 1:1 47 | 2 2 NI 1:1 2:1 48 | 2 2 RI 0:1 1:1 49 | 2 2 RS 0:1 1:1 50 | 2 2 SA 0:1 1:1 51 | 2 2 VI 0:1 1:1 52 | 2 2 VY 0:1 1:1 53 | 2 1 VV 0:2 54 | 1 1 AF 2:1 55 | 1 1 AT 2:1 56 | 1 1 EV 0:1 57 | 1 1 FY 2:1 58 | 1 1 GA 2:1 59 | 1 1 GE 2:1 60 | 1 1 HI 2:1 61 | 1 1 IH 2:1 62 | 1 1 KL 0:1 63 | 1 1 KQ 0:1 64 | 1 1 KW 2:1 65 | 1 1 LK 0:1 66 | 1 1 LR 0:1 67 | 1 1 ME 0:1 68 | 1 1 MV 1:1 69 | 1 1 NY 0:1 70 | 1 1 QA 2:1 71 | 1 1 QV 0:1 72 | 1 1 RE 0:1 73 | 1 1 RQ 2:1 74 | 1 1 RR 1:1 75 | 1 1 SG 2:1 76 | 1 1 TG 2:1 77 | 1 1 TK 0:1 78 | 1 1 TL 0:1 79 | 1 1 VT 0:1 80 | 1 1 YT 0:1 -------------------------------------------------------------------------------- /tests/data/pep.fa.2mer.wordpos.txt: -------------------------------------------------------------------------------- 1 | 11 4 II 0 15 0 16 0 53 1 13 1 14 1 51 2 7 2 8 2 45 3 7 3 8 2 | 8 4 NA 0 12 0 21 1 10 1 19 2 4 2 13 3 4 3 13 3 | 8 4 TR 0 29 0 35 1 27 1 33 2 21 2 27 3 21 3 27 4 | 7 4 EI 0 25 0 52 1 23 1 50 2 17 2 44 3 17 5 | 7 4 NN 0 32 1 30 1 31 2 24 2 25 3 24 3 25 6 | 6 3 IG 0 41 0 54 1 39 1 52 2 33 2 46 7 | 6 3 IR 0 4 0 39 1 2 1 37 1 55 2 49 8 | 5 4 AK 0 13 1 11 2 5 2 59 3 5 9 | 5 4 DN 0 11 0 56 1 9 2 3 3 3 10 | 4 4 AS 0 22 1 20 2 14 3 14 11 | 4 4 CT 0 28 1 26 2 20 3 20 12 | 4 4 FT 0 9 1 7 2 1 3 1 13 | 4 4 IN 0 26 1 24 2 18 
3 18 14 | 4 4 IV 0 17 1 15 2 9 3 9 15 | 4 4 KI 0 14 1 12 2 6 3 6 16 | 4 4 LN 0 20 1 18 2 12 3 12 17 | 4 4 NC 0 27 1 25 2 19 3 19 18 | 4 4 NT 0 57 1 32 2 26 3 26 19 | 4 4 PN 0 31 1 29 2 23 3 23 20 | 4 4 QL 0 19 1 17 2 11 3 11 21 | 4 4 RP 0 30 1 28 2 22 3 22 22 | 4 4 SV 0 23 1 21 2 15 3 15 23 | 4 4 TD 0 10 1 8 2 2 3 2 24 | 4 4 VE 0 24 1 22 2 16 3 16 25 | 4 4 VQ 0 18 1 16 2 10 3 10 26 | 4 3 RA 0 45 1 43 1 57 2 37 27 | 3 3 GD 0 55 1 53 2 47 28 | 3 3 GI 0 38 1 36 2 30 29 | 3 3 GP 0 42 1 40 2 34 30 | 3 3 GR 0 44 1 42 2 36 31 | 3 3 KG 0 37 1 35 2 29 32 | 3 3 PG 0 43 1 41 2 35 33 | 3 3 RK 0 36 1 34 2 28 34 | 3 3 YA 0 48 1 46 2 40 35 | 2 2 AA 0 49 1 47 36 | 2 2 AE 0 50 1 48 37 | 2 2 AH 1 58 2 52 38 | 2 2 AN 0 7 1 5 39 | 2 2 AV 0 46 1 44 40 | 2 2 CN 1 60 2 54 41 | 2 2 DI 1 54 2 48 42 | 2 2 EE 0 51 1 49 43 | 2 2 HC 1 59 2 53 44 | 2 2 IS 1 62 2 56 45 | 2 2 MF 2 0 3 0 46 | 2 2 NF 0 8 1 6 47 | 2 2 NI 1 61 2 55 48 | 2 2 RI 0 40 1 38 49 | 2 2 RS 0 5 1 3 50 | 2 2 SA 0 6 1 4 51 | 2 2 VI 0 3 1 1 52 | 2 2 VY 0 47 1 45 53 | 2 1 VV 0 2 0 62 54 | 1 1 AF 2 38 55 | 1 1 AT 2 41 56 | 1 1 EV 0 1 57 | 1 1 FY 2 39 58 | 1 1 GA 2 58 59 | 1 1 GE 2 43 60 | 1 1 HI 2 32 61 | 1 1 IH 2 31 62 | 1 1 KL 0 65 63 | 1 1 KQ 0 60 64 | 1 1 KW 2 60 65 | 1 1 LK 0 59 66 | 1 1 LR 0 66 67 | 1 1 ME 0 0 68 | 1 1 MV 1 0 69 | 1 1 NY 0 33 70 | 1 1 QA 2 51 71 | 1 1 QV 0 61 72 | 1 1 RE 0 67 73 | 1 1 RQ 2 50 74 | 1 1 RR 1 56 75 | 1 1 SG 2 57 76 | 1 1 TG 2 42 77 | 1 1 TK 0 64 78 | 1 1 TL 0 58 79 | 1 1 VT 0 63 80 | 1 1 YT 0 34 -------------------------------------------------------------------------------- /tests/data/pep.fa.3mer.txt: -------------------------------------------------------------------------------- 1 | 4 4 AKI 0:1 1:1 2:1 3:1 2 | 4 4 ASV 0:1 1:1 2:1 3:1 3 | 4 4 CTR 0:1 1:1 2:1 3:1 4 | 4 4 DNA 0:1 1:1 2:1 3:1 5 | 4 4 EIN 0:1 1:1 2:1 3:1 6 | 4 4 FTD 0:1 1:1 2:1 3:1 7 | 4 4 III 0:1 1:1 2:1 3:1 8 | 4 4 IIV 0:1 1:1 2:1 3:1 9 | 4 4 INC 0:1 1:1 2:1 3:1 10 | 4 4 IVQ 0:1 1:1 2:1 3:1 11 | 4 4 KII 0:1 1:1 2:1 3:1 12 | 4 
4 LNA 0:1 1:1 2:1 3:1 13 | 4 4 NAK 0:1 1:1 2:1 3:1 14 | 4 4 NAS 0:1 1:1 2:1 3:1 15 | 4 4 NCT 0:1 1:1 2:1 3:1 16 | 4 4 PNN 0:1 1:1 2:1 3:1 17 | 4 4 QLN 0:1 1:1 2:1 3:1 18 | 4 4 RPN 0:1 1:1 2:1 3:1 19 | 4 4 SVE 0:1 1:1 2:1 3:1 20 | 4 4 TDN 0:1 1:1 2:1 3:1 21 | 4 4 TRP 0:1 1:1 2:1 3:1 22 | 4 4 VEI 0:1 1:1 2:1 3:1 23 | 4 4 VQL 0:1 1:1 2:1 3:1 24 | 3 3 EII 0:1 1:1 2:1 25 | 3 3 GPG 0:1 1:1 2:1 26 | 3 3 GRA 0:1 1:1 2:1 27 | 3 3 IGD 0:1 1:1 2:1 28 | 3 3 IGP 0:1 1:1 2:1 29 | 3 3 IIG 0:1 1:1 2:1 30 | 3 3 KGI 0:1 1:1 2:1 31 | 3 3 NNN 1:1 2:1 3:1 32 | 3 3 NNT 1:1 2:1 3:1 33 | 3 3 NTR 1:1 2:1 3:1 34 | 3 3 PGR 0:1 1:1 2:1 35 | 3 3 RKG 0:1 1:1 2:1 36 | 3 3 TRK 0:1 1:1 2:1 37 | 2 2 AAE 0:1 1:1 38 | 2 2 AEE 0:1 1:1 39 | 2 2 AHC 1:1 2:1 40 | 2 2 ANF 0:1 1:1 41 | 2 2 AVY 0:1 1:1 42 | 2 2 CNI 1:1 2:1 43 | 2 2 DIR 1:1 2:1 44 | 2 2 EEI 0:1 1:1 45 | 2 2 GDI 1:1 2:1 46 | 2 2 GIR 0:1 1:1 47 | 2 2 HCN 1:1 2:1 48 | 2 2 IRI 0:1 1:1 49 | 2 2 IRS 0:1 1:1 50 | 2 2 MFT 2:1 3:1 51 | 2 2 NFT 0:1 1:1 52 | 2 2 NIS 1:1 2:1 53 | 2 2 RAV 0:1 1:1 54 | 2 2 RIG 0:1 1:1 55 | 2 2 RSA 0:1 1:1 56 | 2 2 SAN 0:1 1:1 57 | 2 2 VIR 0:1 1:1 58 | 2 2 VYA 0:1 1:1 59 | 2 2 YAA 0:1 1:1 60 | 1 1 AFY 2:1 61 | 1 1 AKW 2:1 62 | 1 1 ATG 2:1 63 | 1 1 DNT 0:1 64 | 1 1 EVV 0:1 65 | 1 1 FYA 2:1 66 | 1 1 GAK 2:1 67 | 1 1 GDN 0:1 68 | 1 1 GEI 2:1 69 | 1 1 GIH 2:1 70 | 1 1 HIG 2:1 71 | 1 1 IHI 2:1 72 | 1 1 IRQ 2:1 73 | 1 1 IRR 1:1 74 | 1 1 ISG 2:1 75 | 1 1 KLR 0:1 76 | 1 1 KQV 0:1 77 | 1 1 LKQ 0:1 78 | 1 1 LRE 0:1 79 | 1 1 MEV 0:1 80 | 1 1 MVI 1:1 81 | 1 1 NNY 0:1 82 | 1 1 NTL 0:1 83 | 1 1 NYT 0:1 84 | 1 1 QAH 2:1 85 | 1 1 QVV 0:1 86 | 1 1 RAF 2:1 87 | 1 1 RAH 1:1 88 | 1 1 RQA 2:1 89 | 1 1 RRA 1:1 90 | 1 1 SGA 2:1 91 | 1 1 TGE 2:1 92 | 1 1 TKL 0:1 93 | 1 1 TLK 0:1 94 | 1 1 VTK 0:1 95 | 1 1 VVI 0:1 96 | 1 1 VVT 0:1 97 | 1 1 YAT 2:1 98 | 1 1 YTR 0:1 -------------------------------------------------------------------------------- /tests/data/pep.fa.3mer.wordpos.txt: 
-------------------------------------------------------------------------------- 1 | 4 4 AKI 0 13 1 11 2 5 3 5 2 | 4 4 ASV 0 22 1 20 2 14 3 14 3 | 4 4 CTR 0 28 1 26 2 20 3 20 4 | 4 4 DNA 0 11 1 9 2 3 3 3 5 | 4 4 EIN 0 25 1 23 2 17 3 17 6 | 4 4 FTD 0 9 1 7 2 1 3 1 7 | 4 4 III 0 15 1 13 2 7 3 7 8 | 4 4 IIV 0 16 1 14 2 8 3 8 9 | 4 4 INC 0 26 1 24 2 18 3 18 10 | 4 4 IVQ 0 17 1 15 2 9 3 9 11 | 4 4 KII 0 14 1 12 2 6 3 6 12 | 4 4 LNA 0 20 1 18 2 12 3 12 13 | 4 4 NAK 0 12 1 10 2 4 3 4 14 | 4 4 NAS 0 21 1 19 2 13 3 13 15 | 4 4 NCT 0 27 1 25 2 19 3 19 16 | 4 4 PNN 0 31 1 29 2 23 3 23 17 | 4 4 QLN 0 19 1 17 2 11 3 11 18 | 4 4 RPN 0 30 1 28 2 22 3 22 19 | 4 4 SVE 0 23 1 21 2 15 3 15 20 | 4 4 TDN 0 10 1 8 2 2 3 2 21 | 4 4 TRP 0 29 1 27 2 21 3 21 22 | 4 4 VEI 0 24 1 22 2 16 3 16 23 | 4 4 VQL 0 18 1 16 2 10 3 10 24 | 3 3 EII 0 52 1 50 2 44 25 | 3 3 GPG 0 42 1 40 2 34 26 | 3 3 GRA 0 44 1 42 2 36 27 | 3 3 IGD 0 54 1 52 2 46 28 | 3 3 IGP 0 41 1 39 2 33 29 | 3 3 IIG 0 53 1 51 2 45 30 | 3 3 KGI 0 37 1 35 2 29 31 | 3 3 NNN 1 30 2 24 3 24 32 | 3 3 NNT 1 31 2 25 3 25 33 | 3 3 NTR 1 32 2 26 3 26 34 | 3 3 PGR 0 43 1 41 2 35 35 | 3 3 RKG 0 36 1 34 2 28 36 | 3 3 TRK 0 35 1 33 2 27 37 | 2 2 AAE 0 49 1 47 38 | 2 2 AEE 0 50 1 48 39 | 2 2 AHC 1 58 2 52 40 | 2 2 ANF 0 7 1 5 41 | 2 2 AVY 0 46 1 44 42 | 2 2 CNI 1 60 2 54 43 | 2 2 DIR 1 54 2 48 44 | 2 2 EEI 0 51 1 49 45 | 2 2 GDI 1 53 2 47 46 | 2 2 GIR 0 38 1 36 47 | 2 2 HCN 1 59 2 53 48 | 2 2 IRI 0 39 1 37 49 | 2 2 IRS 0 4 1 2 50 | 2 2 MFT 2 0 3 0 51 | 2 2 NFT 0 8 1 6 52 | 2 2 NIS 1 61 2 55 53 | 2 2 RAV 0 45 1 43 54 | 2 2 RIG 0 40 1 38 55 | 2 2 RSA 0 5 1 3 56 | 2 2 SAN 0 6 1 4 57 | 2 2 VIR 0 3 1 1 58 | 2 2 VYA 0 47 1 45 59 | 2 2 YAA 0 48 1 46 60 | 1 1 AFY 2 38 61 | 1 1 AKW 2 59 62 | 1 1 ATG 2 41 63 | 1 1 DNT 0 56 64 | 1 1 EVV 0 1 65 | 1 1 FYA 2 39 66 | 1 1 GAK 2 58 67 | 1 1 GDN 0 55 68 | 1 1 GEI 2 43 69 | 1 1 GIH 2 30 70 | 1 1 HIG 2 32 71 | 1 1 IHI 2 31 72 | 1 1 IRQ 2 49 73 | 1 1 IRR 1 55 74 | 1 1 ISG 2 56 75 | 1 1 KLR 0 65 76 | 1 1 KQV 0 60 77 | 1 
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line checks for the ``calc_bbc.py`` script.

    Argument-error cases expect exit status 2 plus a fragment of the
    message; output cases compare the checksum handed back by the
    inherited ``_test_output`` helper with a recorded value.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates the filename_* attributes used below --
        # confirm in tests/utils.py
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_bbc.py'

    def test_arg_molecule_when_no_fasta(self):
        # Only the molecule is given; the message must mention --fasta/-f.
        cli_args = ['--molecule', 'dna']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_molecule_invalid_choice(self):
        # An unknown molecule name must be rejected.
        cli_args = [
            '--fasta', self.filename_dna,
            '--molecule', 'nonexistent_mol',
        ]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--molecule/-m', output)

    def test_output_on_dna1(self):
        cli_args = ['--fasta', self.filename_dna, '--m', 'dna']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '6cfc27479ca5fb3d5d2d468544005d8b')

    def test_output_on_dna_k2(self):
        cli_args = ['--fasta', self.filename_dna, '--m', 'dna', '--k', '2']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '1ea7e82d6bb7b8648e0dcca9e089361c')

    def test_output_on_dna_k2_pairwise(self):
        cli_args = [
            '--fasta', self.filename_dna,
            '--m', 'dna',
            '--k', '2',
            '--outfmt', 'pairwise',
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '74de6627e68cfb609701c13637ba4090')

    def test_output_on_protein(self):
        cli_args = ['--fasta', self.filename_pep, '--m', 'protein']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '154f2788be2ec349092f22ce359acf80')

    def test_output_on_protein_no_outfile(self):
        cli_args = ['--fasta', self.filename_pep, '--m', 'protein']
        # NOTE(review): the trailing False presumably disables the output
        # file in _test_output -- confirm in tests/utils.py.
        status, _, digest = self._test_output(
            self.script_name, cli_args, False)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '154f2788be2ec349092f22ce359acf80')
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line checks for the ``calc_fcgr.py`` script.

    Covers missing or invalid arguments (exit status 2) and the checksum
    of the output for word sizes 1 and 2 on the DNA test file.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates self.filename_dna -- confirm in tests/utils.py
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_fcgr.py'

    def test_arg_word_size_2_when_no_fasta(self):
        cli_args = ['--word_size', '2']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_fasta_when_no_word_size(self):
        cli_args = ['--fasta', self.filename_dna]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--word_size/-w', output)

    def test_arg_word_size_too_small(self):
        # A word size of zero must be rejected with an explicit message.
        cli_args = ['--fasta', self.filename_dna, '--word_size', '0']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--word_size must be >= 1', output)

    def test_output_word_size_1(self):
        cli_args = ['--fasta', self.filename_dna, '--word_size', '1']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'bee51f3214f06f4e4265aa05bf9d6a7e')

    def test_output_word_size_2(self):
        cli_args = ['--fasta', self.filename_dna, '--word_size', '2']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '7175a91fb9fc31661ce07aea28743605')
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line checks for the ``calc_graphdna.py`` script.

    Covers argument errors (exit status 2) and the checksum of the output
    for the default run and for the 2DSV, 2DMV and 2DNV vectors.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates self.filename_dna -- confirm in tests/utils.py
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_graphdna.py'

    def test_arg_vector_when_no_fasta(self):
        cli_args = ['--vector', '2DSV']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_vector_invalid_choice(self):
        cli_args = ['--fasta', self.filename_dna, '--vector', 'nonexistent']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('invalid choice', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_dna]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '496832ba4841a988a46c81770ee54668')

    def test_output_vector_2DSV(self):
        cli_args = ['--fasta', self.filename_dna, '--vector', '2DSV']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'e35a44622d4f0411b26e12e8eedcdb64')

    def test_output_vector_2DMV(self):
        cli_args = ['--fasta', self.filename_dna, '--vector', '2DMV']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '7638015e1c25657cd572071f3b9ae7c4')

    def test_script_output_vector_2DNV_pairwise(self):
        cli_args = [
            '--fasta', self.filename_dna,
            '--vector', '2DNV',
            '--outfmt', 'pairwise',
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '2921e374b468b6de81a1c9140681a3b4')
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line checks for the ``calc_lempelziv.py`` script.

    Covers argument errors (exit status 2) and the checksum of the output
    for the default run and for the ``d`` and ``d_star`` distances.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates the filename_* attributes used below --
        # confirm in tests/utils.py
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_lempelziv.py'

    def test_agr_fasta_when_invalid_distance(self):
        cli_args = [
            '--fasta', self.filename_dna,
            '--distance', 'nonexistent',
        ]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('invalid choice', output)

    def test_agr_distance_when_no_fasta(self):
        cli_args = ['--distance', 'd1']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '89d18a9ac1e573743fa0214c48dde40c')

    def test_output_distance_d(self):
        cli_args = ['--fasta', self.filename_pep, '--distance', 'd']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'c71cb1521d0fc9084eee21c8599785ef')

    def test_output_distance_d_star_pairwise(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '--distance', 'd_star',
            '--outfmt', 'pairwise',
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '3ed3ca10d198fe4f44ea85134dbcb481')
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line checks for the ``calc_ncd.py`` script.

    Both cases run the script on the peptide test file and compare the
    checksum returned by the inherited ``_test_output`` helper.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates self.filename_pep -- confirm in tests/utils.py
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_ncd.py'

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'e5491c3e4197bf1abb92e7f76bdefeaf')

    def test_output_pairwise(self):
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'cb69bbabd9a4286a9596f8af3b2b82d5')
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line checks for the ``calc_wmetric.py`` script.

    Covers argument errors (exit status 2) and the checksum of the output
    for the default, phylip, pairwise and pam250 runs.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates self.filename_pep -- confirm in tests/utils.py
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_wmetric.py'

    def test_arg_matrix_when_no_fasta(self):
        cli_args = ['--matrix', 'blosum62']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_matrix_invalid_choice(self):
        cli_args = ['--matrix', 'nonexistent']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--matrix/-m', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '27ad675a7a2e5c2872a8ab495f2d4494')

    def test_output_phylip(self):
        # Expected checksum is the same value as in test_output_default.
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'phylip']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '27ad675a7a2e5c2872a8ab495f2d4494')

    def test_output_pairwise(self):
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '195fb45ed46a80473e1d004b9ce40e94')

    def test_output_pam250(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '--outfmt', 'phylip',
            '--matrix', 'pam250',
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '217ed91de43b091205add32a673cf8fe')
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Command-line checks for the ``calc_word_bool.py`` script.

    Argument-error cases expect exit status 2 plus a fragment of the
    message; the output case compares the checksum returned by the
    inherited ``_test_output`` helper with a recorded value.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates the filename_* attributes used below --
        # confirm in tests/utils.py
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_bool.py'

    def test_arg_word_size_when_no_fasta(self):
        args = ['--word_size', '2']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('--fasta/-f', out)

    def test_arg_word_pattern_when_no_fasta(self):
        args = ['--word_pattern', self.filename_pep_2mer]
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('--fasta/-f', out)

    # This test used to be defined twice with identical bodies; the second
    # definition simply rebound the first, so a single copy is kept.
    def test_arg_fasta_when_no_wordsize_or_wordpattern(self):
        args = ['--fasta', self.filename_pep]
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('Specify either: --word_size or --word', out)

    def test_arg_word_size_too_small(self):
        args = ['--fasta', self.filename_pep, '--word_size', '-1']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('Word size must be >= 1.', out)

    def test_output_word_size1(self):
        args = ['--fasta', self.filename_pep, '--word_size', '1']
        returncode, out, md5 = self._test_output(self.script_name, args)
        self.assertEqual(returncode, 0)
        self.assertEqual(md5, '4caed60c7590f45e9a6de19482839e9c')
# ---- tests/test_calc_word_cv.py ----
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Command-line checks for the ``calc_word_cv.py`` script.

    Covers argument errors (exit status 2) and checks that the word-size
    run and the three-pattern-file run give the same recorded checksum.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates the filename_* attributes used below --
        # confirm in tests/utils.py
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_cv.py'

    def test_word_size_smaller_than_3(self):
        cli_args = ['--fasta', self.filename_pep, '--word_size', '2']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('error: Word size must be >= 3', output)

    def test_word_pattern_only_one_file(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '--word_pattern', self.filename_pep_2mer,
        ]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('expected 3 argument', output)

    def test_word_pattern_not_follow_rule(self):
        # The same 2-mer pattern file is passed three times.
        cli_args = [
            '--fasta', self.filename_pep,
            '--word_pattern',
            self.filename_pep_2mer,
            self.filename_pep_2mer,
            self.filename_pep_2mer,
        ]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn(' do not follow k, k-1, k-2', output)

    def test_fasta_when_no_word_size_or_pattern(self):
        cli_args = ['--fasta', self.filename_pep]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('Specify either: --word_size or --word_pattern', output)

    def test_output_word_size(self):
        cli_args = ['--fasta', self.filename_pep, '--word_size', '3']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '4fbba77e4f7a64601e7d0cb3b0b6878d')

    def test_output_word_pattern(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '--word_patterns',
            self.filename_pep_3mer,
            self.filename_pep_2mer,
            self.filename_pep_1mer,
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '4fbba77e4f7a64601e7d0cb3b0b6878d')


# ---- tests/test_calc_word_d2.py ----
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Command-line checks for the ``calc_word_d2.py`` script.

    Covers argument errors (exit status 2) and the checksum of the output
    for several combinations of -l/-u, --vector and --char_weights.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # presumably populates the filename_* attributes used below --
        # confirm in tests/utils.py
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_d2.py'

    def test_arg_when_u_smaller_than_l(self):
        cli_args = ['--fasta', self.filename_pep, '-l', '3', '-u', '2']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('error: max_word_size must be greater than ', output)

    def test_arg_char_weights_invalid_format(self):
        # The FASTA file itself is passed as the weights file.
        cli_args = [
            '--fasta', self.filename_pep,
            '-l', '1', '-u', '4',
            '--char_weights', self.filename_pep,
            '--vector', 'freqs',
        ]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('Invalid format for --char_weights', output)

    def test_arg_word_size_0(self):
        cli_args = ['--fasta', self.filename_pep, '-l', '0']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('min_word_size must be greater than 0', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'f651314b77dcd4fe9b3143de28000ca8')

    def test_output_l1_u4(self):
        cli_args = ['--fasta', self.filename_pep, '-l', '1', '-u', '4']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '164ef1a902f74517e6b7cff7798c595f')

    def test_output_l1_u4_freqs(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '-l', '1', '-u', '4',
            '--vector', 'freqs',
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '8340c1687a0e6ae50c5f6bcc24196247')

    def test_output_l1_u4_char_weights(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '-l', '1', '-u', '4',
            '--char_weights', self.filename_char_weights,
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '81873a0cb36f7e05698fa664311f38ee')

    def test_script_l1_u4_char_weights_freqs(self):
        cli_args = [
            '--fasta', self.filename_pep,
            '-l', '1', '-u', '4',
            '--vector', 'freqs',
            '--char_weights', self.filename_char_weights,
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '96c944f9e8e4d2b8ca67bc2620f47d3a')
import utils 4 | 5 | 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest): 7 | 8 | def __init__(self, *args, **kwargs): 9 | super(ScriptTest, self).__init__(*args, **kwargs) 10 | utils.ScriptsWordCommonTest.set_test_data() 11 | self.script_name = 'calc_word_ffp.py' 12 | 13 | def test_arg_word_size_when_no_fasta(self): 14 | args = ['--word_size', '2'] 15 | returncode, out = utils.runscript(self.script_name, args) 16 | self.assertEqual(returncode, 2) 17 | self.assertIn('--fasta/-f', out) 18 | 19 | def test_arg_no_molecule(self): 20 | args = ['--fasta', self.filename_pep] 21 | returncode, out = utils.runscript(self.script_name, args) 22 | self.assertEqual(returncode, 2) 23 | self.assertIn('--molecule/-m', out) 24 | 25 | def test_arg_no_word_size(self): 26 | args = ['--fasta', self.filename_pep, '--molecule', 'protein'] 27 | returncode, out = utils.runscript(self.script_name, args) 28 | self.assertEqual(returncode, 2) 29 | self.assertIn('--word_size', out) 30 | 31 | def test_arg_incompatible_args_protein_merge_revcomp(self): 32 | args = ['--fasta', self.filename_pep, '--word_size', '2', 33 | '--molecule', 'protein', '--merge_revcomp'] 34 | returncode, out = utils.runscript(self.script_name, args) 35 | self.assertEqual(returncode, 2) 36 | self.assertIn('Incompatible arguments', out) 37 | 38 | def test_arg_distance_invalid_choice(self): 39 | args = ['--fasta', self.filename_pep, '--word_size', '2', 40 | '--molecule', 'protein', '--distance', 'nonexistent'] 41 | returncode, out = utils.runscript(self.script_name, args) 42 | self.assertEqual(returncode, 2) 43 | self.assertIn('invalid choice', out) 44 | 45 | def test_output_pep_word_size2(self): 46 | args = ['--fasta', self.filename_pep, '--word_size', '2', 47 | '--molecule', 'protein'] 48 | returncode, out, md5 = self._test_output(self.script_name, args) 49 | self.assertEqual(returncode, 0) 50 | self.assertEqual(md5, '79caa37b67848c52b41a8cb074d810e1') 51 | 52 | def 
test_output_pep_word_size2_reduce_alphabet(self): 53 | args = ['--fasta', self.filename_pep, '--word_size', '2', 54 | '--molecule', 'protein', '--reduce_alphabet'] 55 | returncode, out, md5 = self._test_output(self.script_name, args) 56 | self.assertEqual(returncode, 0) 57 | self.assertEqual(md5, '2e03fddfa6a10d810c3481fd53ada4a3') 58 | 59 | def test_output_pep_word_pattern2_reduce_alphabet(self): 60 | args = ['--fasta', self.filename_pep, '--molecule', 'protein', 61 | '--word_pattern', self.filename_pep_2mer, '--reduce_alphabet'] 62 | returncode, out, md5 = self._test_output(self.script_name, args) 63 | self.assertEqual(returncode, 0) 64 | self.assertEqual(md5, '2e03fddfa6a10d810c3481fd53ada4a3') 65 | 66 | def test_output_dna_word_size2(self): 67 | args = ['--fasta', self.filename_dna, '--molecule', 'dna', 68 | '--word_size', '2'] 69 | returncode, out, md5 = self._test_output(self.script_name, args) 70 | self.assertEqual(returncode, 0) 71 | self.assertEqual(md5, '69d68abfe5cb8e855f77f9f8fff20178') 72 | 73 | def test_output_dna_word_size2_mergerevcomp(self): 74 | args = ['--fasta', self.filename_dna, '--molecule', 'dna', 75 | '--word_size', '2', '--merge_revcomp'] 76 | returncode, out, md5 = self._test_output(self.script_name, args) 77 | self.assertEqual(returncode, 0) 78 | self.assertEqual(md5, 'd3fd336b21aac9922ed7831b8d9f5f83') 79 | 80 | def test_output_dna_word_size2_mergerevcomp_reduce(self): 81 | args = ['--fasta', self.filename_dna, '--molecule', 'dna', 82 | '--word_size', '2', '--merge_revcomp', '--reduce_alphabet'] 83 | returncode, out, md5 = self._test_output(self.script_name, args) 84 | self.assertEqual(returncode, 0) 85 | self.assertEqual(md5, '83fd63884c64c88ee3ff6e4eb2183e8b') 86 | 87 | if __name__ == '__main__': 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /tests/test_calc_word_rtd.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 
3 | from . import utils 4 | 5 | 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest): 7 | 8 | def __init__(self, *args, **kwargs): 9 | super(ScriptTest, self).__init__(*args, **kwargs) 10 | utils.ScriptsWordCommonTest.set_test_data() 11 | self.script_name = 'calc_word_rtd.py' 12 | 13 | def test_arg_word_size_when_no_fasta(self): 14 | args = ['--word_size', '2'] 15 | returncode, out = utils.runscript(self.script_name, args) 16 | self.assertEqual(returncode, 2) 17 | self.assertIn('--fasta/-f', out) 18 | 19 | def test_arg_fasta_when_no_word_size(self): 20 | args = ['--fasta', self.filename_pep] 21 | returncode, out = utils.runscript(self.script_name, args) 22 | self.assertEqual(returncode, 2) 23 | self.assertIn('Specify either: --word_size or --word_pattern.', out) 24 | 25 | def test_arg_word_pattern_invalid_format(self): 26 | args = ['--fasta', self.filename_pep, 27 | '--word_pattern', self.filename_pep_2mer] 28 | returncode, out = utils.runscript(self.script_name, args) 29 | self.assertEqual(returncode, 2) 30 | self.assertIn('does not contain info on word positions', out) 31 | 32 | def test_arg_distance_invalid_choice(self): 33 | args = ['--fasta', self.filename_pep, '--word_size', '2', 34 | '--distance', 'nonexistent'] 35 | returncode, out = utils.runscript(self.script_name, args) 36 | self.assertEqual(returncode, 2) 37 | self.assertIn('invalid choice', out) 38 | 39 | def test_output_word_size_2(self): 40 | args = ['--fasta', self.filename_pep, '--word_size', '2'] 41 | returncode, out, md5 = self._test_output(self.script_name, args) 42 | self.assertEqual(returncode, 0) 43 | self.assertEqual(md5, '1e1a089908495d60275c039272e8e45f') 44 | 45 | def test_output_wordpattern(self): 46 | args = ['--fasta', self.filename_pep, 47 | '--word_pattern', self.filename_pep_2mer_wordpos] 48 | returncode, out, md5 = self._test_output(self.script_name, args) 49 | self.assertEqual(returncode, 0) 50 | self.assertEqual(md5, '1e1a089908495d60275c039272e8e45f') 51 | 52 | 
def test_output_word_size_1(self): 53 | args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise', 54 | '--word_size', '1'] 55 | returncode, out, md5 = self._test_output(self.script_name, args) 56 | self.assertEqual(returncode, 0) 57 | self.assertEqual(md5, 'b4f581dabfa83b2f1ff4f5d367865711') 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /tests/test_calc_word_sets.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . import utils 4 | 5 | 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest): 7 | 8 | def __init__(self, *args, **kwargs): 9 | super(ScriptTest, self).__init__(*args, **kwargs) 10 | utils.ScriptsWordCommonTest.set_test_data() 11 | self.script_name = 'calc_word_sets.py' 12 | 13 | def test_arg_word_size_when_no_fasta(self): 14 | args = ['--word_size', '2'] 15 | returncode, out = utils.runscript(self.script_name, args) 16 | self.assertEqual(returncode, 2) 17 | self.assertIn('--fasta/-f', out) 18 | 19 | def test_arg_fasta_when_no_wordsize(self): 20 | args = ['--fasta', self.filename_pep] 21 | returncode, out = utils.runscript(self.script_name, args) 22 | self.assertEqual(returncode, 2) 23 | self.assertIn('--word_size', out) 24 | 25 | def test_arg_word_size_too_small(self): 26 | args = ['--fasta', self.filename_pep, '--word_size', '-1'] 27 | returncode, out = utils.runscript(self.script_name, args) 28 | self.assertEqual(returncode, 2) 29 | self.assertIn('Word size must be >= 1.', out) 30 | 31 | def test_arg_distance_invalid_choice(self): 32 | args = ['--fasta', self.filename_pep, '--word_size', '-1', 33 | '--distance', 'nonexistent'] 34 | returncode, out = utils.runscript(self.script_name, args) 35 | self.assertEqual(returncode, 2) 36 | self.assertIn('invalid choice', out) 37 | 38 | def test_output_word_size2(self): 39 | args = ['--fasta', self.filename_pep, '--word_size', 
'2'] 40 | returncode, out, md5 = self._test_output(self.script_name, args) 41 | print(out) 42 | self.assertEqual(returncode, 0) 43 | self.assertEqual(md5, 'f1b4cf9538d2d2a2a4f1e81ac1b1251d') 44 | 45 | def test_output_word_size2_jaccard(self): 46 | args = ['--fasta', self.filename_pep, '--word_size', '2', 47 | '--distance', 'jaccard'] 48 | returncode, out, md5 = self._test_output(self.script_name, args) 49 | self.assertEqual(returncode, 0) 50 | self.assertEqual(md5, '7a744c4665ac06483c5eb36ee03d4fa8') 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /tests/test_create_wordpattern.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . import utils 4 | 5 | 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest): 7 | 8 | def __init__(self, *args, **kwargs): 9 | super(ScriptTest, self).__init__(*args, **kwargs) 10 | utils.ScriptsCommonTest.set_test_data() 11 | self.script_name = 'create_wordpattern.py' 12 | 13 | def test_arg_word_size_when_no_fasta(self): 14 | args = ['--word_size', '2'] 15 | returncode, out = utils.runscript(self.script_name, args) 16 | self.assertEqual(returncode, 2) 17 | self.assertIn('--fasta/-f', out) 18 | 19 | def test_arg_word_size_0(self): 20 | args = ['--fasta', self.filename_pep, '--word_size', '0'] 21 | returncode, out = utils.runscript(self.script_name, args) 22 | self.assertEqual(returncode, 2) 23 | self.assertIn('--word_size must be >= 1', out) 24 | 25 | def test_arg_teiresias_when_no_l(self): 26 | args = ['--fasta', self.filename_pep, '--word_size', '2', 27 | '--teiresias'] 28 | returncode, out = utils.runscript(self.script_name, args) 29 | self.assertEqual(returncode, 2) 30 | self.assertIn('Teiresias requires --l', out) 31 | 32 | def test_arg_teiresias_when_no_k(self): 33 | args = ['--fasta', self.filename_pep, '--word_size', '2', 34 | '--teiresias', '--l', '2'] 35 | 
returncode, out = utils.runscript(self.script_name, args) 36 | self.assertEqual(returncode, 2) 37 | self.assertIn('Teiresias requires --k', out) 38 | 39 | def test_arg_teiresias_when_k_and_not_l(self): 40 | args = ['--fasta', self.filename_pep, '--word_size', '2', 41 | '--teiresias', '--k', '2'] 42 | returncode, out = utils.runscript(self.script_name, args) 43 | self.assertEqual(returncode, 2) 44 | self.assertIn('Teiresias requires --l', out) 45 | 46 | def test_teiresias_when_l_too_small(self): 47 | args = ['--fasta', self.filename_pep, '--word_size', '2', 48 | '--teiresias', '--k', '2', '--l', '1'] 49 | returncode, out = utils.runscript(self.script_name, args) 50 | self.assertEqual(returncode, 2) 51 | self.assertIn('--l must be at least 2', out) 52 | 53 | def test_output_word_size_2(self): 54 | args = ['--fasta', self.filename_pep, '--word_size', '2'] 55 | returncode, out, md5 = self._test_output(self.script_name, args) 56 | self.assertEqual(returncode, 0) 57 | self.assertEqual(md5, '2aea23ad3e883708dc2f95111f7f04ec') 58 | 59 | def test_output_word_size_2_wordpos(self): 60 | args = ['--fasta', self.filename_pep, '--word_size', '2', 61 | '--word_position'] 62 | returncode, out, md5 = self._test_output(self.script_name, args) 63 | self.assertEqual(returncode, 0) 64 | self.assertEqual(md5, '040e121be77617191c7d7c847edafc8e') 65 | 66 | def test_output_word_size_1(self): 67 | args = ['--fasta', self.filename_pep, '--word_size', '1'] 68 | returncode, out, md5 = self._test_output(self.script_name, args) 69 | self.assertEqual(returncode, 0) 70 | self.assertEqual(md5, '2d4dd98798cb6320975f6919fe43b777') 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /tests/test_distance.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from alfpy import word_pattern 4 | from alfpy import word_vector 5 | from alfpy.utils import 
distance 6 | from alfpy.utils import distmatrix 7 | 8 | from . import utils 9 | 10 | 11 | class DistanceTest(unittest.TestCase, utils.ModulesCommonTest): 12 | 13 | def __init__(self, *args, **kwargs): 14 | super(DistanceTest, self).__init__(*args, **kwargs) 15 | utils.ModulesCommonTest.set_test_data() 16 | self.pattern = word_pattern.create(self.dna_records.seq_list, 2) 17 | self.counts = word_vector.Counts(self.dna_records.length_list, 18 | self.pattern) 19 | self.freqs = word_vector.Freqs(self.dna_records.length_list, 20 | self.pattern) 21 | 22 | def test_euclid_squared_counts(self): 23 | # The result of this method is identical to that from decaf+py. 24 | dist = distance.Distance(self.counts, 'euclid_squared') 25 | matrix = distmatrix.create(self.dna_records.id_list, dist) 26 | data = [' 3', 27 | 'seq1 0.0000000 57.0000000 30.0000000', 28 | 'seq2 57.0000000 0.0000000 19.0000000', 29 | 'seq3 30.0000000 19.0000000 0.0000000'] 30 | self.assertEqual(matrix.format(), "\n".join(data)) 31 | 32 | def test_euclid_squared_freqs(self): 33 | # The result of this method is identical to that from decaf+py. 34 | dist = distance.Distance(self.freqs, 'euclid_squared') 35 | matrix = distmatrix.create(self.dna_records.id_list, dist) 36 | data = [' 3', 37 | 'seq1 0.0000000 0.1416402 0.0641298', 38 | 'seq2 0.1416402 0.0000000 0.0677565', 39 | 'seq3 0.0641298 0.0677565 0.0000000'] 40 | self.assertEqual(matrix.format(), "\n".join(data)) 41 | 42 | def test_euclid_norm_counts(self): 43 | # The result of this method is identical to that from decaf+py. 44 | dist = distance.Distance(self.counts, 'euclid_norm') 45 | matrix = distmatrix.create(self.dna_records.id_list, dist) 46 | data = [' 3', 47 | 'seq1 0.0000000 7.5498344 5.4772256', 48 | 'seq2 7.5498344 0.0000000 4.3588989', 49 | 'seq3 5.4772256 4.3588989 0.0000000'] 50 | self.assertEqual(matrix.format(), "\n".join(data)) 51 | 52 | def test_euclid_norm_freqs(self): 53 | # The result of this method is identical to that from decaf+py. 
54 | dist = distance.Distance(self.freqs, 'euclid_norm') 55 | matrix = distmatrix.create(self.dna_records.id_list, dist) 56 | data = [' 3', 57 | 'seq1 0.0000000 0.3763512 0.2532387', 58 | 'seq2 0.3763512 0.0000000 0.2603008', 59 | 'seq3 0.2532387 0.2603008 0.0000000'] 60 | self.assertEqual(matrix.format(), "\n".join(data)) 61 | 62 | def test_google_freqs(self): 63 | dist = distance.Distance(self.freqs, 'google') 64 | matrix = distmatrix.create(self.dna_records.id_list, dist) 65 | data = [' 3', 66 | 'seq1 0.0000000 0.6078431 0.3809524', 67 | 'seq2 0.6078431 0.0000000 0.3949580', 68 | 'seq3 0.3809524 0.3949580 0.0000000'] 69 | self.assertEqual(matrix.format(), "\n".join(data)) 70 | 71 | def test_get_disttypes(self): 72 | distlist = distance.Distance.get_disttypes() 73 | exp = ['euclid_norm', 'euclid_squared', 'google'] 74 | self.assertListEqual(distlist, exp) 75 | 76 | def test_set_disttypes_throws_exception(self): 77 | dist = distance.Distance(self.freqs, 'google') 78 | with self.assertRaises(Exception) as context: 79 | dist.set_disttype('nonexistent') 80 | self.assertIn('unknown disttype', str(context.exception)) 81 | 82 | if __name__ == '__main__': 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /tests/test_distmatrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import unittest 4 | 5 | from alfpy import word_distance 6 | from alfpy.utils import distmatrix 7 | 8 | from . import utils 9 | 10 | 11 | class TestDistMatrix(unittest.TestCase): 12 | 13 | def setUp(self): 14 | id_list = ['seq1', 'seq2', 'seq3'] 15 | data = np.array([[0, 0.3531587, 0.35509333], 16 | [0.3531587, 0, 0.295394], 17 | [0.35509333, 0.295394, 0.] 
18 | ]) 19 | self.matrix = distmatrix.Matrix(id_list, data) 20 | self.output_filename = utils.get_test_data('distmatrix.txt') 21 | 22 | def test_format(self): 23 | exp = [ 24 | ' 3', 25 | 'seq1 0.0000000 0.3531587 0.3550933', 26 | 'seq2 0.3531587 0.0000000 0.2953940', 27 | 'seq3 0.3550933 0.2953940 0.0000000' 28 | ] 29 | self.assertEqual(self.matrix.format(), "\n".join(exp)) 30 | 31 | def test_format_decimal3(self): 32 | exp = [ 33 | ' 3', 34 | 'seq1 0.000 0.353 0.355', 35 | 'seq2 0.353 0.000 0.295', 36 | 'seq3 0.355 0.295 0.000' 37 | ] 38 | self.assertEqual(self.matrix.format(3), "\n".join(exp)) 39 | 40 | def test_min(self): 41 | self.assertEqual(self.matrix.min(), 0) 42 | 43 | def test_max(self): 44 | self.assertEqual(self.matrix.max(), 0.35509332999999998) 45 | 46 | def test_is_zero(self): 47 | self.assertFalse(self.matrix.is_zero()) 48 | 49 | def test_normalize(self): 50 | self.matrix.normalize() 51 | exp = [ 52 | " 3", 53 | "seq1 0.0000000 0.9945518 1.0000000", 54 | "seq2 0.9945518 0.0000000 0.8318771", 55 | "seq3 1.0000000 0.8318771 0.0000000", 56 | ] 57 | self.assertEqual(self.matrix.format(), "\n".join(exp)) 58 | 59 | def test_write_to_file_phylip(self): 60 | oh = open(self.output_filename, 'w') 61 | self.matrix.write_to_file(oh) 62 | oh.close() 63 | fh = open(self.output_filename) 64 | result = fh.read() 65 | fh.close() 66 | os.remove(self.output_filename) 67 | exp = [ 68 | ' 3', 69 | 'seq1 0.0000000 0.3531587 0.3550933', 70 | 'seq2 0.3531587 0.0000000 0.2953940', 71 | 'seq3 0.3550933 0.2953940 0.0000000\n' 72 | ] 73 | self.assertEqual(result, "\n".join(exp)) 74 | 75 | def test_write_to_file_pairwise(self): 76 | oh = open(self.output_filename, 'w') 77 | self.matrix.write_to_file(oh, 'pairwise') 78 | oh.close() 79 | fh = open(self.output_filename) 80 | result = fh.read() 81 | fh.close() 82 | os.remove(self.output_filename) 83 | exp = [ 84 | "seq1\tseq2\t0.3531587", 85 | "seq1\tseq3\t0.3550933", 86 | "seq2\tseq3\t0.2953940\n" 87 | ] 88 | 
self.assertEqual(result, "\n".join(exp)) 89 | 90 | def test_write_to_file_pairwise_decimal3(self): 91 | oh = open(self.output_filename, 'w') 92 | self.matrix.write_to_file(oh, 'pairwise', 3) 93 | oh.close() 94 | fh = open(self.output_filename) 95 | result = fh.read() 96 | fh.close() 97 | os.remove(self.output_filename) 98 | exp = [ 99 | "seq1\tseq2\t0.353", 100 | "seq1\tseq3\t0.355", 101 | "seq2\tseq3\t0.295\n" 102 | ] 103 | self.assertEqual(result, "\n".join(exp)) 104 | 105 | def test_iter(self): 106 | exp = [(0, 1, 'seq1', 'seq2', 0.35315869999999999), 107 | (0, 2, 'seq1', 'seq3', 0.35509332999999998), 108 | (1, 2, 'seq2', 'seq3', 0.29539399999999999)] 109 | self.assertEqual(list(self.matrix), exp) 110 | 111 | def test_create_matrix(self): 112 | l = [[3, 6, 4, 1, 3, 4, 3, 0, 1, 1, 6, 4, 5, 0, 3, 4], 113 | [0, 3, 0, 3, 0, 0, 0, 2, 9, 0, 3, 3, 0, 6, 3, 6], 114 | [9, 0, 0, 3, 0, 0, 0, 2, 6, 0, 3, 3, 0, 3, 3, 3]] 115 | vector = np.array(l) 116 | dist = word_distance.Distance(vector, 'minkowski') 117 | id_list = ['seq1', 'seq2', 'seq3'] 118 | matrix = distmatrix.create(id_list, dist) 119 | exp = [ 120 | ' 3', 121 | 'seq1 0.0000000 14.6969385 14.1774469', 122 | 'seq2 14.6969385 0.0000000 10.8166538', 123 | 'seq3 14.1774469 10.8166538 0.0000000' 124 | ] 125 | self.assertEqual(matrix.format(), "\n".join(exp)) 126 | 127 | def test_highcharts(self): 128 | self.assertEqual(len(self.matrix.highcharts()), 3) 129 | 130 | def test_read_highcharts_matrix(self): 131 | id_list = ['seq1', 'seq2', 'seq3'] 132 | data = [[0, 1, 0.35, 0.19], [0, 2, 1.0, 0.55], [1, 2, 0.88, 0.48]] 133 | matrix = distmatrix.read_highcharts_matrix(id_list, data) 134 | md5 = utils.calc_md5(matrix.format()) 135 | self.assertEqual(md5, "476c8f5d284a84ee3c7c419bde2d7658") 136 | 137 | 138 | if __name__ == '__main__': 139 | unittest.main() 140 | -------------------------------------------------------------------------------- /tests/test_fasta.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from alfpy.utils import fasta 5 | 6 | from . import utils 7 | 8 | 9 | class FastaTest(unittest.TestCase): 10 | 11 | def __init__(self, *args, **kwargs): 12 | super(FastaTest, self).__init__(*args, **kwargs) 13 | self.ID_LIST = ['seq1', 'seq2', 'seq3', 'seq4'] 14 | self.DESC_LIST = ['seq1 desc', 'seq2 desc', 'seq3 desc', ''] 15 | self.SEQ_LIST = [ 16 | 'MEVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDNTLKQVVTKLRE', 17 | 'MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNIS', 18 | 'MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW', 19 | 'MFTDNAKIIIVQLNASVEINCTRPNNNTR' 20 | ] 21 | 22 | def _validate_FastaRecord_init(self, fasta_record, seqidx): 23 | self.assertIsInstance(fasta_record, fasta.FastaRecord) 24 | self.assertEqual(fasta_record.seq, self.SEQ_LIST[seqidx]) 25 | self.assertEqual(fasta_record.id, self.ID_LIST[seqidx]) 26 | self.assertEqual(fasta_record.description, self.DESC_LIST[seqidx]) 27 | self.assertEqual(len(fasta_record), len(self.SEQ_LIST[seqidx])) 28 | 29 | def test_single_FastaRecord_init(self): 30 | r = fasta.FastaRecord(self.SEQ_LIST[0], 31 | self.ID_LIST[0], 32 | self.DESC_LIST[0]) 33 | self._validate_FastaRecord_init(r, seqidx=0) 34 | 35 | def test_single_FastaRecord_iter(self): 36 | r = fasta.FastaRecord(self.SEQ_LIST[3], 37 | self.ID_LIST[3], 38 | self.DESC_LIST[3]) 39 | i = iter(r) 40 | self.assertEqual(next(i), 'M') 41 | self.assertEqual(next(i), 'F') 42 | 43 | def test_single_FastaRecord_contains(self): 44 | r = fasta.FastaRecord(self.SEQ_LIST[3], 45 | self.ID_LIST[3], 46 | self.DESC_LIST[3]) 47 | self.assertTrue('MFT' in r) 48 | 49 | def test_multiple_FastaRecord_init(self): 50 | for i in range(len(self.ID_LIST)): 51 | r = fasta.FastaRecord(self.SEQ_LIST[i], 52 | self.ID_LIST[i], 53 | self.DESC_LIST[i]) 54 | self._validate_FastaRecord_init(r, seqidx=i) 55 | 56 | def 
test_read_fasta(self): 57 | fh = open(utils.get_test_data('pep.fa')) 58 | r = fasta.read(fh) 59 | fh.close() 60 | self._validate_FastaRecord_init(r, seqidx=0) 61 | 62 | def test_parse_fasta(self): 63 | fh = open(utils.get_test_data('pep.fa')) 64 | for i, rec in enumerate(fasta.parse(fh)): 65 | self._validate_FastaRecord_init(rec, seqidx=i) 66 | fh.close() 67 | 68 | def test_to_dict(self): 69 | fh = open(utils.get_test_data('pep.fa')) 70 | d = fasta.to_dict(fasta.parse(fh)) 71 | fh.close() 72 | self.assertEqual(len(d), 4) 73 | 74 | def test_to_dict_value_error(self): 75 | h = ['>seq1\n', 'ATG\n', '>seq1\n', 'ATGC'] 76 | with self.assertRaises(ValueError) as context: 77 | d = fasta.to_dict(fasta.parse(h)) 78 | self.assertIn('Duplicate key', str(context.exception)) 79 | 80 | 81 | def test_parse_fasta_missing_sequences(self): 82 | ids = ['seq1', 'seq2'] 83 | seqs = ['ATGC', ''] 84 | l = ['>{}\n'.format(ids[0]), 85 | '{}\n\n\n'.format(seqs[0]), 86 | '>{}\n'.format(ids[1]), 87 | '{}\n'.format(seqs[1]) 88 | ] 89 | for i, fasta_record in enumerate(fasta.parse(l)): 90 | self.assertIsInstance(fasta_record, fasta.FastaRecord) 91 | self.assertEqual(fasta_record.seq, seqs[i]) 92 | 93 | def test_fasta_format(self, wrap=70): 94 | l = ['>seq1 seq1 desc\n', 95 | 'A' * wrap + '\n', 96 | 'B' * wrap] 97 | r = fasta.read(l) 98 | self.assertEqual(''.join(l), r.format(wrap=wrap)) 99 | 100 | def test_input_output_file_fasta(self): 101 | filename = 'temp.fa' 102 | oh = open(utils.get_test_data(filename), 'w') 103 | l1 = [] 104 | fh = open(utils.get_test_data('pep.fa')) 105 | for seq_record in fasta.parse(fh): 106 | l1.append(seq_record.format()) 107 | oh.write(seq_record.format()) 108 | oh.write('\n') 109 | fh.close() 110 | oh.close() 111 | fh = open(utils.get_test_data(filename)) 112 | l2 = [seq_record.format() for seq_record in fasta.parse(fh)] 113 | fh.close() 114 | os.remove(utils.get_test_data(filename)) 115 | self.assertEqual(l1, l2) 116 | 117 | 118 | if __name__ == '__main__': 119 | 
unittest.main() 120 | -------------------------------------------------------------------------------- /tests/test_fcgr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from alfpy import fcgr 5 | from alfpy.utils import distmatrix 6 | 7 | from . import utils 8 | 9 | 10 | class VectorTest(unittest.TestCase, utils.ModulesCommonTest): 11 | 12 | def __init__(self, *args, **kwargs): 13 | super(VectorTest, self).__init__(*args, **kwargs) 14 | utils.ModulesCommonTest.set_test_data() 15 | 16 | def test_fcgr_vector1(self): 17 | vec = fcgr.fcgr_vector('CTAGGGAACATACCA', 1) 18 | self.assertEqual(vec, [3.0, 6.0, 3.0]) 19 | 20 | def test_fcgr_vector2(self): 21 | vec = fcgr.fcgr_vector('CTAGGGAACATACCA', 2) 22 | exp = [0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 0.0, 2.0, 23 | 1.0, 2.0, 0.0, 1.0, 2.0, 1.0, 0.0] 24 | self.assertEqual(vec, exp) 25 | 26 | def test_fcgr_vector3(self): 27 | vec = fcgr.fcgr_vector('CTAGGGAACATACCXXA', 1) 28 | self.assertEqual(vec, [3.0, 6.0, 3.0]) 29 | 30 | def test_create_vectors(self): 31 | vecs = fcgr.create_vectors(self.dna_records, 2) 32 | exp = [[0, 3, 1, 4, 0, 1, 1, 1, 1, 1, 3, 2, 4, 1, 1], 33 | [0, 0, 4, 1, 2, 2, 0, 0, 1, 4, 0, 0, 3, 1, 1], 34 | [0, 0, 2, 2, 1, 1, 0, 2, 1, 2, 0, 1, 2, 1, 0]] 35 | self.assertEqual(vecs.tolist(), exp) 36 | 37 | 38 | class DistanceTest(unittest.TestCase, utils.ModulesCommonTest): 39 | 40 | def __init__(self, *args, **kwargs): 41 | super(DistanceTest, self).__init__(*args, **kwargs) 42 | utils.ModulesCommonTest.set_test_data() 43 | 44 | def test_distance1(self): 45 | vecs = fcgr.create_vectors(self.dna_records, 2) 46 | dist = fcgr.Distance(vecs) 47 | matrix = distmatrix.create(self.dna_records.id_list, dist) 48 | exp = [ 49 | " 3", 50 | "seq1 0.0000000 7.5498344 5.7445626", 51 | "seq2 7.5498344 0.0000000 4.2426407", 52 | "seq3 5.7445626 4.2426407 0.0000000" 53 | ] 54 | self.assertEqual(matrix.format(), "\n".join(exp)) 55 | 56 | def 
test_distance2(self): 57 | vecs = fcgr.create_vectors(self.dna_records, 2) 58 | dist = fcgr.Distance(vecs, 'google') 59 | matrix = distmatrix.create(self.dna_records.id_list, dist) 60 | exp = [ 61 | " 3", 62 | "seq1 0.0000000 0.5833333 0.5416667", 63 | "seq2 0.5833333 0.0000000 0.4210526", 64 | "seq3 0.5416667 0.4210526 0.0000000" 65 | ] 66 | self.assertEqual(matrix.format(), "\n".join(exp)) 67 | 68 | 69 | if __name__ == '__main__': 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /tests/test_graphdna.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | 4 | from alfpy import graphdna 5 | from alfpy.utils import distmatrix 6 | 7 | from . import utils 8 | 9 | 10 | class VectorTest(unittest.TestCase, utils.ModulesCommonTest): 11 | 12 | def __init__(self, *args, **kwargs): 13 | super(VectorTest, self).__init__(*args, **kwargs) 14 | utils.ModulesCommonTest.set_test_data() 15 | 16 | def test_2DSGraphVector(self): 17 | seq = 'CTAGGGAACATACCA' 18 | vec = graphdna._2DSGraphVector(seq) 19 | 20 | exp = [2.99197183, -8.04298066, 9.16666667, -5.78272208, 21 | 6.5, -1.75064326, 5, -2.92241364, 9.25, -3.81343559] 22 | self.assertTrue(np.allclose(vec, np.array(exp))) 23 | 24 | def test_2DSGraphVector_ambiguousDNA(self): 25 | seq = 'CTAGGGAANNNXXXCATACCA' 26 | vec = graphdna._2DSGraphVector(seq) 27 | 28 | exp = [2.99197183, -8.04298066, 9.16666667, -5.78272208, 29 | 6.5, -1.75064326, 5, -2.92241364, 9.25, -3.81343559] 30 | self.assertTrue(np.allclose(vec, np.array(exp))) 31 | 32 | def test_2DMGraphVector_ndim10(self): 33 | seq = 'CTAGGGAACATACCA' 34 | vec = graphdna._2DMGraphVector(seq, 10) 35 | exp = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756, 36 | 23.55763468, 29.38627489, 37.08035601, 47.23633868, 37 | 60.66394053] 38 | self.assertEqual(vec.shape, (10,)) 39 | self.assertTrue(np.allclose(vec, np.array(exp))) 40 | 41 | def 
test_2DMGraphVector_ndim10_ambiguousDNA(self): 42 | seq = 'CTAGGGAACATACCA' 43 | vec = graphdna._2DMGraphVector(seq, 10) 44 | exp = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756, 45 | 23.55763468, 29.38627489, 37.08035601, 47.23633868, 46 | 60.66394053] 47 | self.assertEqual(vec.shape, (10,)) 48 | self.assertTrue(np.allclose(vec, np.array(exp))) 49 | 50 | def test_2DMGraphVector_ndim5(self): 51 | seq = 'CTAGGGAACATACCA' 52 | vec = graphdna._2DMGraphVector(seq, 5) 53 | exp = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756] 54 | self.assertEqual(vec.shape, (5,)) 55 | self.assertTrue(np.allclose(vec, np.array(exp))) 56 | 57 | def test_2DNGraphVector(self): 58 | seq = 'CTAGGGAACATACCA' 59 | vec = graphdna._2DNGraphVector(seq) 60 | md5 = utils.calc_md5(vec) 61 | self.assertEqual(len(vec), 48) 62 | self.assertEqual(md5, '44829cc0277531646d656cdaacd3ae94') 63 | 64 | def test_2DNGraphVector_ambiguousDNA(self): 65 | seq = 'CTAGGGAACATACCA' 66 | vec = graphdna._2DNGraphVector(seq) 67 | md5 = utils.calc_md5(vec) 68 | self.assertEqual(len(vec), 48) 69 | self.assertEqual(md5, '44829cc0277531646d656cdaacd3ae94') 70 | 71 | def test_create_2DSGraphVectors(self): 72 | data = graphdna.create_2DSGraphVectors(self.dna_records) 73 | md5 = utils.calc_md5(data) 74 | self.assertEqual(md5, 'e2399897bb7eaa5ca3a81c84e2eeac84') 75 | 76 | def test_create_2DMGraphVectors(self): 77 | data = graphdna.create_2DMGraphVectors(self.dna_records, 10) 78 | md5 = utils.calc_md5(data) 79 | self.assertEqual(md5, '8c7d4dca912aeaf7c88d325799dadf00') 80 | 81 | def test_create_2DNGraphVectors(self): 82 | data = graphdna.create_2DNGraphVectors(self.dna_records) 83 | md5 = utils.calc_md5(data) 84 | self.assertEqual(md5, '3211fc3837b876521a6ab8b6a22b411c') 85 | 86 | 87 | class DistanceTest(unittest.TestCase, utils.ModulesCommonTest): 88 | 89 | def __init__(self, *args, **kwargs): 90 | super(DistanceTest, self).__init__(*args, **kwargs) 91 | utils.ModulesCommonTest.set_test_data() 92 | 93 | def 
test_distance_2DSG(self): 94 | data = graphdna.create_2DSGraphVectors(self.dna_records) 95 | dist = graphdna.Distance(data) 96 | matrix = distmatrix.create(self.dna_records.id_list, dist) 97 | exp = [ 98 | ' 3', 99 | 'seq1 0.0000000 9.4762599 14.6585286', 100 | 'seq2 9.4762599 0.0000000 6.7199568', 101 | 'seq3 14.6585286 6.7199568 0.0000000', 102 | ] 103 | self.assertEqual(matrix.format(), "\n".join(exp)) 104 | 105 | def test_distance_2DMG(self): 106 | data = graphdna.create_2DMGraphVectors(self.dna_records, 10) 107 | dist = graphdna.Distance(data) 108 | matrix = distmatrix.create(self.dna_records.id_list, dist) 109 | exp = [ 110 | ' 3', 111 | 'seq1 0.0000000 22.2449494 55.9753388', 112 | 'seq2 22.2449494 0.0000000 34.2064423', 113 | 'seq3 55.9753388 34.2064423 0.0000000' 114 | ] 115 | self.assertEqual(matrix.format(), "\n".join(exp)) 116 | 117 | def test_distance_2DNG(self): 118 | data = graphdna.create_2DNGraphVectors(self.dna_records) 119 | dist = graphdna.Distance(data) 120 | matrix = distmatrix.create(self.dna_records.id_list, dist) 121 | exp = [ 122 | ' 3', 123 | 'seq1 0.0000000 10.3711467 15.1355787', 124 | 'seq2 10.3711467 0.0000000 7.8973545', 125 | 'seq3 15.1355787 7.8973545 0.0000000' 126 | ] 127 | self.assertEqual(matrix.format(), "\n".join(exp)) 128 | 129 | 130 | if __name__ == '__main__': 131 | unittest.main() 132 | -------------------------------------------------------------------------------- /tests/test_lempelziv.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from alfpy import lempelziv 4 | from alfpy.utils import distmatrix 5 | 6 | from . 
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Check the formatted matrix for each lempelziv distance type."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        # Created with disttype 'd'; individual tests switch it as needed.
        self.dist = lempelziv.Distance(self.pep_records, 'd')

    def _matrix(self, disttype=None):
        """Return the distance matrix, switching `disttype` first if given."""
        if disttype is not None:
            self.dist.set_disttype(disttype)
        return distmatrix.create(self.pep_records.id_list, self.dist)

    def test_distance_d(self):
        matrix = self._matrix()
        expected = "\n".join((
            " 4",
            "seq1 0 9 15 20",
            "seq2 9 0 10 18",
            "seq3 15 10 0 17",
            "seq4 20 18 17 0",
        ))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_distance_d_star(self):
        matrix = self._matrix('d_star')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.2250000 0.3750000 0.5000000",
            "seq2 0.2250000 0.0000000 0.2631579 0.4736842",
            "seq3 0.3750000 0.2631579 0.0000000 0.4857143",
            "seq4 0.5000000 0.4736842 0.4857143 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)

    def test_distance_d1(self):
        matrix = self._matrix('d1')
        expected = "\n".join((
            " 4",
            "seq1 0 16 28 23",
            "seq2 16 0 19 21",
            "seq3 28 19 0 19",
            "seq4 23 21 19 0",
        ))
        self.assertEqual(matrix.format(0), expected)

    def test_distance_d1_star(self):
        matrix = self._matrix('d1_star')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.3404255 0.5283019 0.5348837",
            "seq2 0.3404255 0.0000000 0.4042553 0.5121951",
            "seq3 0.5283019 0.4042553 0.0000000 0.5135135",
            "seq4 0.5348837 0.5121951 0.5135135 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)

    def test_distance_d1_star2(self):
        matrix = self._matrix('d1_star2')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.3404255 0.5436893 0.5609756",
            "seq2 0.3404255 0.0000000 0.4130435 0.5384615",
            "seq3 0.5436893 0.4130435 0.0000000 0.5205479",
            "seq4 0.5609756 0.5384615 0.5205479 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)

    def test_set_disttype_throws_exception(self):
        with self.assertRaises(Exception) as caught:
            self.dist.set_disttype('nonexitent')
        # The exception is only available once the with-block has exited.
        self.assertIn('unknown disttype', str(caught.exception))
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ncd.complexity and ncd.Distance."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        # Loads the shared test records; pep_records is used below.
        utils.ModulesCommonTest.set_test_data()

    def test_complexity1(self):
        value = ncd.complexity('AACGTACCATTGAACGTACCGTAGG')
        self.assertEqual(value, 26)

    def test_complexity2(self):
        value = ncd.complexity('MFTDNAKIIIVQLNASVEINCTRPNNNTR')
        self.assertEqual(value, 37)

    def test_complexities(self):
        distance = ncd.Distance(self.pep_records)
        expected = [
            ((0,), 63.0), ((0, 1), 77.0), ((0, 2), 85.0),
            ((0, 3), 70.0), ((1,), 60.0), ((1, 2), 78.0),
            ((1, 3), 65.0), ((2,), 61.0), ((2, 3), 66.0),
            ((3,), 37.0),
        ]
        observed = sorted(distance._complexity.items())
        self.assertEqual(expected, observed)

    def test_distance(self):
        distance = ncd.Distance(self.pep_records)
        matrix = distmatrix.create(self.pep_records.id_list, distance)
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.2698413 0.3809524 0.5238095",
            "seq2 0.2698413 0.0000000 0.2950820 0.4666667",
            "seq3 0.3809524 0.2950820 0.0000000 0.4754098",
            "seq4 0.5238095 0.4666667 0.4754098 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)
class SeqRecordsTest(unittest.TestCase):
    """Tests for seqrecords.SeqRecords and seqrecords.read_fasta."""

    def __init__(self, *args, **kwargs):
        super(SeqRecordsTest, self).__init__(*args, **kwargs)
        # Reference data; _validate_seqrecords compares records against it.
        self.ID_LIST = ['seq1', 'seq2', 'seq3', 'seq4']
        self.DESC_LIST = ['seq1 desc', 'seq2 desc', 'seq3 desc', '']
        self.SEQ_LIST = [
            'MEVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDNTLKQVVTKLRE',
            'MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNIS',
            'MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW',
            'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
        ]

    def _validate_seqrecords(self, rec):
        """Assert that `rec` holds exactly the reference ids and sequences."""
        self.assertEqual(rec.id_list, self.ID_LIST)
        self.assertEqual(rec.seq_list, self.SEQ_LIST)
        self.assertEqual(rec.length_list, [len(s) for s in self.SEQ_LIST])
        self.assertEqual(rec.count, len(self.SEQ_LIST))

    def test_SeqRecords_init(self):
        rec = seqrecords.SeqRecords(
            id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
        self._validate_seqrecords(rec)

    def test_SeqRecords_add(self):
        rec = seqrecords.SeqRecords()
        for seq_id, seq in zip(self.ID_LIST, self.SEQ_LIST):
            rec.add(seq_id, seq)
        self._validate_seqrecords(rec)

    def test_SeqRecords_len(self):
        rec = seqrecords.SeqRecords(
            id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
        self.assertEqual(len(rec), 4)

    def test_read_fasta(self):
        # The context manager closes the handle even if read_fasta raises.
        with open(utils.get_test_data('pep.fa')) as fh:
            rec = seqrecords.read_fasta(fh)
        self._validate_seqrecords(rec)

    def test_fasta(self):
        rec = seqrecords.SeqRecords(
            id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
        exp = [
            ">seq1",
            "MEVVIRSANFTDNAKIIIVQLNASVEINCT",
            "RPNNYTRKGIRIGPGRAVYAAEEIIGDNTL",
            "KQVVTKLRE",
            ">seq2",
            "MVIRSANFTDNAKIIIVQLNASVEINCTRP",
            "NNNTRKGIRIGPGRAVYAAEEIIGDIRRAH",
            "CNIS",
            ">seq3",
            "MFTDNAKIIIVQLNASVEINCTRPNNNTRK",
            "GIHIGPGRAFYATGEIIGDIRQAHCNISGA",
            "KW",
            ">seq4",
            "MFTDNAKIIIVQLNASVEINCTRPNNNTR"
        ]
        self.assertEqual(rec.fasta(wrap=30), "\n".join(exp))
def test_wmetric_pam250(self):
    """Check the W-metric distance matrix computed with the pam250 matrix."""
    # Separate names for the substitution matrix and the distance matrix.
    subst_matrix = subsmat.get('pam250')
    distance = wmetric.Distance(self.pep_records, subst_matrix)
    dist_matrix = distmatrix.create(self.pep_records.id_list, distance)
    expected = "\n".join((
        ' 4',
        'seq1 0.0000000 0.0289700 0.0467580 0.0353781',
        'seq2 0.0289700 0.0000000 0.0227122 0.0372699',
        'seq3 0.0467580 0.0227122 0.0000000 0.0578383',
        'seq4 0.0353781 0.0372699 0.0578383 0.0000000',
    ))
    self.assertEqual(dist_matrix.format(), expected)
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Check word_d2 distances over count and frequency vectors."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        self.patterns, self.counts, self.freqs = [], [], []
        # One pattern plus its Counts/Freqs vectors for each value 1..4.
        for k in range(1, 5):
            pattern = word_pattern.create(self.pep_records.seq_list, k)
            self.patterns.append(pattern)
            self.counts.append(
                word_vector.Counts(self.pep_records.length_list, pattern))
            self.freqs.append(
                word_vector.Freqs(self.pep_records.length_list, pattern))

    def _matrix(self, distance):
        """Return the distance matrix of the peptide records for `distance`."""
        return distmatrix.create(self.pep_records.id_list, distance)

    def test_counts_from1_to4(self):
        matrix = self._matrix(word_d2.Distance(self.counts))
        expected = "\n".join((
            ' 4',
            'seq1 0 130 236 286',
            'seq2 130 0 142 258',
            'seq3 236 142 0 212',
            'seq4 286 258 212 0',
        ))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_freqs_from1_to4(self):
        matrix = self._matrix(word_d2.Distance(self.freqs))
        expected = "\n".join((
            ' 4',
            'seq1 0.0000000 0.0313590 0.0573154 0.1020235',
            'seq2 0.0313590 0.0000000 0.0373677 0.0907196',
            'seq3 0.0573154 0.0373677 0.0000000 0.0870581',
            'seq4 0.1020235 0.0907196 0.0870581 0.0000000',
        ))
        self.assertEqual(matrix.format(), expected)

    def test_counts_from1_to1(self):
        # Only the first Counts vector is passed here.
        matrix = self._matrix(word_d2.Distance([self.counts[0]]))
        expected = "\n".join((
            ' 4',
            'seq1 0 37 57 140',
            'seq2 37 0 28 137',
            'seq3 57 28 0 111',
            'seq4 140 137 111 0',
        ))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_freqs_from1_to4_d2_squareroot(self):
        distance = word_d2.Distance(self.freqs)
        distance.set_disttype('d2_squareroot')
        matrix = self._matrix(distance)
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.1770847 0.2394063 0.3194113",
            "seq2 0.1770847 0.0000000 0.1933073 0.3011969",
            "seq3 0.2394063 0.1933073 0.0000000 0.2950560",
            "seq4 0.3194113 0.3011969 0.2950560 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)

    def test_set_disttype_throws_exception(self):
        distance = word_d2.Distance(self.freqs)
        with self.assertRaises(Exception) as caught:
            distance.set_disttype('nonexistent')
        # The exception is only available once the with-block has exited.
        self.assertIn('unknown disttype', str(caught.exception))
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for word_sets_distance on the shared peptide records."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        self.p = word_pattern.create(self.pep_records.seq_list, 2)

    def _matrix(self, disttype):
        """Return the distance matrix of the peptide records for `disttype`."""
        distance = word_sets_distance.Distance(self.pep_records, 2, disttype)
        return distmatrix.create(self.pep_records.id_list, distance)

    def test_getwords(self):
        observed = word_sets_distance._getwords('ATGCGTA', 2)
        self.assertSetEqual(observed, {'GT', 'CG', 'GC', 'AT', 'TG', 'TA'})

    def test_distance_dice(self):
        # The result of this function is identical
        # to the Dice distance implemented in word_bool_distance.
        matrix = self._matrix('dice')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.1964286 0.3928571 0.4457831",
            "seq2 0.1964286 0.0000000 0.2452830 0.4025974",
            "seq3 0.3928571 0.2452830 0.0000000 0.3766234",
            "seq4 0.4457831 0.4025974 0.3766234 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)

    def test_distance_hamming(self):
        matrix = self._matrix('hamming')
        expected = "\n".join((
            " 4",
            "seq1 0 22 44 37",
            "seq2 22 0 26 31",
            "seq3 44 26 0 29",
            "seq4 37 31 29 0",
        ))
        self.assertEqual(matrix.format(0), expected)

    def test_distance_jaccard(self):
        # The result of this function is identical
        # to the Jaccard distance implemented in word_bool_distance.
        matrix = self._matrix('jaccard')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.3283582 0.5641026 0.6166667",
            "seq2 0.3283582 0.0000000 0.3939394 0.5740741",
            "seq3 0.5641026 0.3939394 0.0000000 0.5471698",
            "seq4 0.6166667 0.5740741 0.5471698 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)
class ScriptsCommonTest:
    """Methods testing arguments that are common to all scripts.

    The test methods read `self.script_name`, which this class does not
    define; it has to be provided by the class that mixes this one in.
    """

    @classmethod
    def set_test_data(cls):
        """Store the paths of the DNA and peptide test FASTA files."""
        cls.filename_dna = get_test_data('dna.fa')
        cls.filename_pep = get_test_data('pep.fa')

    def test_arg_version(self):
        cmd = ['--version']
        return_code, out = runscript(self.script_name, cmd)
        self.assertEqual(return_code, 0)
        self.assertIn(__version__, out)

    def test_arg_help(self):
        cmd = ['--help']
        return_code, _ = runscript(self.script_name, cmd)
        self.assertEqual(return_code, 0)

    def test_arg_out_when_no_fasta(self):
        cmd = ['--out', 'out.txt']
        return_code, out = runscript(self.script_name, cmd)
        self.assertEqual(return_code, 2)
        self.assertIn('--fasta/-f', out)

    def test_arg_outfmt_when_no_fasta(self):
        cmd = ['--outfmt', 'pairwise']
        return_code, out = runscript(self.script_name, cmd)
        self.assertEqual(return_code, 2)
        self.assertIn('--fasta/-f', out)

    def _test_output(self, script_name, args, outfile=True):
        """Run a script and return ``(returncode, result, md5)``.

        `args` must contain '--fasta' followed by the input filename.
        When `outfile` is true, '--out <input>.out' is added to the
        command, `result` is read from that file and the file is removed
        afterwards; otherwise `result` is the captured script output.
        """
        # Work on a copy so the caller's argument list is left untouched.
        args = list(args)
        input_filename = args[args.index('--fasta') + 1]
        output_filename = None
        if outfile:
            output_filename = '{}.out'.format(input_filename)
            args.extend(['--out', output_filename])
        returncode, result = runscript(script_name, args)
        if outfile:
            try:
                with open(output_filename) as fh:
                    result = fh.read()
            finally:
                # Clean up even if reading failed; the file may be missing
                # when the script itself did not produce any output.
                if os.path.exists(output_filename):
                    os.remove(output_filename)
        md5 = calc_md5(result)
        return returncode, result, md5
class ModulesCommonTest:
    """Mixin providing the DNA and peptide test records to module tests."""

    @classmethod
    def set_test_data(cls):
        """Read 'dna.fa' and 'pep.fa' and store the records and their paths.

        Sets the class attributes `dna_records`, `pep_records`,
        `dna_filename` and `pep_filename`.
        """
        dna_path = get_test_data('dna.fa')
        pep_path = get_test_data('pep.fa')
        # Context managers close the handles even if read_fasta raises.
        with open(dna_path) as fh:
            cls.dna_records = seqrecords.read_fasta(fh)
        with open(pep_path) as fh:
            cls.pep_records = seqrecords.read_fasta(fh)
        cls.dna_filename = dna_path
        cls.pep_filename = pep_path