├── .gitignore
├── LICENSE.txt
├── README.rst
├── alfpy
├── __init__.py
├── bbc.py
├── fcgr.py
├── graphdna.py
├── lempelziv.py
├── ncd.py
├── utils
│ ├── __init__.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── seqcontent.py
│ │ └── subsmat.py
│ ├── distance.py
│ ├── distmatrix.py
│ ├── fasta.py
│ └── seqrecords.py
├── version.py
├── wmetric.py
├── word_bool_distance.py
├── word_d2.py
├── word_distance.py
├── word_pattern.py
├── word_rtd.py
├── word_sets_distance.py
└── word_vector.py
├── bin
├── calc_bbc.py
├── calc_fcgr.py
├── calc_graphdna.py
├── calc_lempelziv.py
├── calc_ncd.py
├── calc_wmetric.py
├── calc_word.py
├── calc_word_bool.py
├── calc_word_cv.py
├── calc_word_d2.py
├── calc_word_ffp.py
├── calc_word_rtd.py
├── calc_word_sets.py
└── create_wordpattern.py
├── example_data
├── input
│ ├── aminoacid.freqs.swissprot.txt
│ ├── aminoacid.weights.txt
│ ├── bears.dna.fasta
│ ├── gp120.pep.fasta
│ ├── hiv.pep.fasta
│ ├── sample.dna.fasta
│ └── sample.pep.fasta
└── output
│ ├── bears.dna.fasta.1mer
│ ├── bears.dna.fasta.2mer
│ ├── bears.dna.fasta.3mer
│ ├── bears.dna.fasta.pairwise
│ ├── bears.dna.fasta.phylip
│ ├── bears.dna.fasta.teiresias.2mer
│ ├── bears.dna.fasta.teiresias.3mer
│ ├── gp120.pep.fasta.1mer
│ ├── gp120.pep.fasta.2mer
│ ├── gp120.pep.fasta.3mer
│ ├── gp120.pep.fasta.pairwise
│ ├── gp120.pep.fasta.phylip
│ ├── gp120.pep.fasta.teiresias.2mer
│ ├── gp120.pep.fasta.teiresias.3mer
│ ├── hiv.pep.fasta.1mer
│ ├── hiv.pep.fasta.2mer
│ ├── hiv.pep.fasta.3mer
│ ├── hiv.pep.fasta.pairwise
│ ├── hiv.pep.fasta.phylip
│ ├── hiv.pep.fasta.teiresias.2mer
│ └── hiv.pep.fasta.teiresias.3mer
├── setup.py
└── tests
├── __init__.py
├── data
├── char_freqs.txt
├── char_weights.txt
├── dna.fa
├── dna.fa.1mer.txt
├── dna.fa.1mer.wordpos.txt
├── dna.fa.2mer.txt
├── dna.fa.2mer.wordpos.txt
├── pep.fa
├── pep.fa.1mer.txt
├── pep.fa.1mer.wordpos.txt
├── pep.fa.2mer.txt
├── pep.fa.2mer.wordpos.txt
├── pep.fa.3mer.txt
└── pep.fa.3mer.wordpos.txt
├── test_bbc.py
├── test_calc_bbc.py
├── test_calc_fcgr.py
├── test_calc_graphdna.py
├── test_calc_lempelziv.py
├── test_calc_ncd.py
├── test_calc_wmetric.py
├── test_calc_word.py
├── test_calc_word_bool.py
├── test_calc_word_cv.py
├── test_calc_word_d2.py
├── test_calc_word_ffp.py
├── test_calc_word_rtd.py
├── test_calc_word_sets.py
├── test_create_wordpattern.py
├── test_distance.py
├── test_distmatrix.py
├── test_fasta.py
├── test_fcgr.py
├── test_graphdna.py
├── test_lempelziv.py
├── test_ncd.py
├── test_seqrecords.py
├── test_wmetric.py
├── test_word_bool_distance.py
├── test_word_d2.py
├── test_word_distance.py
├── test_word_pattern.py
├── test_word_rtd.py
├── test_word_sets_distance.py
├── test_word_vector.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | env/
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *,cover
49 | .hypothesis/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # IPython Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # dotenv
82 | .env
83 |
84 | # virtualenv
85 | venv/
86 | ENV/
87 |
88 | # Spyder project settings
89 | .spyderproject
90 |
91 | # Rope project settings
92 | .ropeproject
93 |
94 | # My
95 | test.py
96 |
97 |
98 | # cache files for sublime text
99 | *.tmlanguage.cache
100 | *.tmPreferences.cache
101 | *.stTheme.cache
102 |
103 | # workspace files are user-specific
104 | *.sublime-workspace
105 |
106 | # project files should be checked into the repository, unless a significant
107 | # proportion of contributors will probably not be using SublimeText
108 | # *.sublime-project
109 |
110 | # sftp configuration file
111 | sftp-config.json
112 |
113 | # Package control specific files
114 | Package Control.last-run
115 | Package Control.ca-list
116 | Package Control.ca-bundle
117 | Package Control.system-ca-bundle
118 | Package Control.cache/
119 | Package Control.ca-certs/
120 | bh_unicode_properties.cache
121 |
122 | # Sublime-github package stores a github token in this file
123 | # https://packagecontrol.io/packages/sublime-github
124 | GitHub.sublime-settings
125 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2016 Andrzej Zielezinski, combio.pl, http://combio.pl/alfree
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | alfpy
2 | =====
3 |
4 | alfpy is a bioinformatics Python package that provides an alignment-free framework
5 | to compare biological sequences (DNA/RNA/protein) and infer their
6 | phylogenetic relationships.
7 |
8 | alfpy also contains Python scripts with user-friendly command-line interfaces
9 | that let you compare unaligned FASTA sequences with more than 40 distance methods.
10 |
11 |
12 | Latest source code
13 | ------------------
14 | The official source code repository is at: https://github.com/aziele/alfpy
15 |
16 |
17 | Web sites
18 | ---------
19 | alfpy is also available as a web app: http://www.combio.pl/alfree
20 |
21 |
22 | Requirements
23 | ============
24 |
25 | 1. Python (https://www.python.org/) version 2.7 or >= 3.3
26 | 2. NumPy (http://www.numpy.org/).
27 |
28 |
29 | Installation
30 | ============
31 |
32 | Option 1: Get the latest official version
33 | -----------------------------------------
34 |
35 | Install the latest official version with `pip `_
36 | ::
37 |
38 | sudo pip install alfpy
39 |
40 | If you are not allowed to use `sudo`, install alfpy as user::
41 |
42 | pip install --user alfpy
43 |
44 |
45 |
46 | Option 2: Get the latest development version
47 | --------------------------------------------
48 |
49 | Get it using this shell command, which requires Git::
50 |
51 | git clone https://github.com/aziele/alfpy.git
52 |
53 | If you don't feel like using git, just download the package manually as a `gzipped tarball `_.
54 |
55 | Unpack the downloaded package, go to the directory and run the installation::
56 |
57 | cd alfpy
58 | python setup.py install
59 |
60 | or::
61 |
62 | python setup.py install --user
63 |
64 | Alfpy usage
65 | ===========
66 |
67 | The examples of using Alfpy are available at: http://www.combio.pl/alfree/download/.
68 |
69 |
70 | Testing
71 | =======
72 |
73 | To run tests, go to the alfpy source code directory and type::
74 |
75 | python -m unittest discover
76 |
77 |
78 | If you want to test a specific file (e.g. ``test_word_distance.py``), type::
79 |
80 | python -m unittest tests.test_word_distance
81 |
82 |
83 | Contact
84 | =======
85 |
86 | Drop us any feedback at: bioinfo@amu.edu.pl or on twitter `@a_zielezinski `_.
87 |
88 | License
89 | =======
90 |
91 | alfpy is under the MIT license; see ``LICENSE.txt``. Distribution,
92 | modification and redistribution, incorporation into other software,
93 | and pretty much everything else is allowed.
94 |
95 |
96 | .. |Travis| image:: https://travis-ci.org/aziele/alfpy.svg?branch=master
97 | :target: https://travis-ci.org/aziele/alfpy
98 |
99 |
100 | .. |PyPI| image:: https://img.shields.io/pypi/v/alfpy.svg?branch=master
101 | :target: https://pypi.python.org/pypi/alfpy
102 |
103 | .. |Landscape| image:: https://landscape.io/github/aziele/alfpy/master/landscape.svg?style=flat
104 | :target: https://landscape.io/github/aziele/alfpy/master
105 | :alt: Code Health
106 |
107 | .. |Codecov| image:: https://codecov.io/gh/aziele/alfpy/branch/master/graph/badge.svg
108 | :target: https://codecov.io/gh/aziele/alfpy
109 |
--------------------------------------------------------------------------------
/alfpy/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import __version__
2 |
3 | version = __version__
--------------------------------------------------------------------------------
/alfpy/bbc.py:
--------------------------------------------------------------------------------
1 | """This module computes distances between DNA/protein sequences based on the
2 | sequence feature, named Base-Base Correlation (BBC).
3 |
4 | References:
5 | 1. Liu, Zhi-Hua, et al. (2007) Bioinformatics and Biomedical Engineering,
6 | ICBBE. The 1st International Conference on. IEEE, 2007.
7 | doi: 10.1109/ICBBE.2007.98
8 |
9 | 2. Liu Z, Meng J, Sun X. (2008) Biochem Biophys Res Commun. 368(2):223-30.
10 | doi: 10.1016/j.bbrc.2008.01.070.
11 |
12 | Todo:
13 | * handle sequence symbols not included in molecule's alphabet
14 |
15 | """
16 |
17 | import numpy as np
18 |
19 | from .utils import distance
20 |
21 |
def base_base_correlation(seq, k, alphabet=None):
    """Compute the base-base correlation (BBC) vector for a sequence.

    Args:
        seq (str) : sequence
        k (int) : parameter of the BBC. Intuitively, it represents
            the maximum distance to observe correlation between bases.
            Symbol pairs separated by 1 .. k + 1 positions are accumulated.
        alphabet (str/list) : List of possible characters. This can be used to
            avoid autodetection of the alphabet in the case where
            sequences with missing letters are to be compared.
            When given, symbols of `seq` that are not in `alphabet` are
            removed before anything is counted.

    Returns:
        numpy.ndarray: shape (1, L * L) where L is the alphabet size, i.e.
            (1, 16) for DNA and (1, 400) for protein. Element i * L + j
            refers to the i-th and j-th symbols of the sorted alphabet.

    Raises:
        Exception: if fewer than k + 2 symbols remain once symbols outside
            `alphabet` have been removed.

    Examples:
        >>> print(base_base_correlation('ATGCATGC', 1, 'ATGC'))
        [[
        -0.12547302 -0.12547302 0.2281059 0.17169665 0.01815213
        -0.12547302 -0.12547302 0.04258163 0.04258163 0.17169665
        -0.12547302 -0.12547302 -0.12547302 0.2281059 0.17169665
        -0.12547302
        ]]

    Note:
        A description of the method can be found here:
        http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=4272582

        This implementation is generalized for any sequence type.
    """
    if alphabet is None:
        s = seq
        alphabet = set(s)
    else:
        s = "".join([c for c in seq if c in alphabet])

    # The length check must see the *filtered* sequence: checking the raw
    # sequence lets an input made mostly of foreign symbols through, and the
    # normalisations below then divide by zero and yield NaN.
    if k > len(s) - 2:
        raise Exception("Sequence too short to compute BBC with "
                        "k={}".format(k))

    # Map every symbol to its row/column index (alphabetical order).
    alphabet = sorted(list(alphabet))
    alphabet = dict(zip(alphabet, range(len(alphabet))))
    L = len(alphabet)

    # Compute the base probabilities for every character.
    p = np.zeros(L)
    for c in s:
        p[alphabet[c]] += 1
    p /= np.sum(p)
    p.shape = (1, L)

    # p_i * p_j for every pair of symbols; it does not depend on the gap.
    expected = np.dot(p.T, p)
    expected_sq = np.dot(p.T ** 2, p ** 2)

    bbc = np.zeros((L, L))
    for gap in range(1, k + 2):
        # Compute $p_{ij}(l)$ representing the probability of
        # observing the bases i and j separated by l "gaps".
        pair_freqs = np.zeros((L, L))
        for pos in range(len(s) - gap):
            first = alphabet[s[pos]]
            second = alphabet[s[pos + gap]]
            pair_freqs[first][second] += 1
        pair_freqs /= np.sum(pair_freqs)

        # Compute the D_{ij}(l) which is the deviation from
        # statistical independence.
        # $D_{ij}(l) = p_{ij}(l) - p_i p_j$
        D = pair_freqs - expected

        # NOTE(review): this evaluates as (D**2 / 2) * (p_i**2 * p_j**2).
        # The published formula may have that product in the denominator
        # instead - TODO confirm against the reference. Kept as is because
        # changing it alters every output value.
        bbc += D + (D ** 2 / 2 * expected_sq) + D ** 3

    # Flatten the L x L matrix into a single feature row.
    bbc.shape = (1, L * L)
    return bbc
96 |
97 |
def create_vectors(seq_records, k=10, alphabet="ATGC"):
    """Build a matrix holding one BBC vector per sequence record.

    Args:
        seq_records (obj SeqRecords): object exposing `count` and `seq_list`
        k (int): BBC parameter passed to `base_base_correlation`
        alphabet (str/list): symbols passed to `base_base_correlation`

    Returns:
        numpy.ndarray: shape (seq_records.count, len(alphabet) ** 2)
    """
    width = len(alphabet) ** 2
    matrix = np.zeros(shape=(seq_records.count, width))
    row = 0
    for sequence in seq_records.seq_list:
        matrix[row] = base_base_correlation(sequence, k=k, alphabet=alphabet)
        row += 1
    return matrix
109 |
110 |
class Distance(distance.Distance):
    """Distance over BBC vectors; `disttype` defaults to 'euclid_norm'."""

    def __init__(self, vector, disttype='euclid_norm'):
        # Only the default value of `disttype` is added here; everything
        # else is delegated to the parent constructor.
        super(Distance, self).__init__(vector, disttype)
115 |
116 |
def main():
    """Print the BBC distance matrix for the records from seqrecords.main().

    Uses k=10 and the "ATGC" alphabet; the matrix is written to stdout
    through `Matrix.display()`.
    """
    from .utils.seqrecords import main as load_seq_records
    from .utils import distmatrix

    records = load_seq_records()
    bbc_vectors = create_vectors(records, 10, alphabet="ATGC")
    matrix = distmatrix.create(records.id_list, Distance(bbc_vectors))
    matrix.display()


if __name__ == '__main__':
    main()
129 |
--------------------------------------------------------------------------------
/alfpy/fcgr.py:
--------------------------------------------------------------------------------
1 | """This module computes distances between DNA sequences based on the Frequency
2 | Chaos Game Representation (FCGR)
3 |
4 | References:
5 | 1. Hatje K, Kollmar M (2012) Front Plant Sci 3: 192.
6 | doi: 10.3389/fpls.2012.00192
7 |
8 |
9 | Functions for creating DNA-representing vectors were built upon:
10 | Cheng J, Cao F, Liu Z. (2013) Mol Biol Evol. 2013 30(5):1032-7.
11 | doi: 10.1093/molbev/mst021.
12 |
13 | """
14 |
15 | import numpy as np
16 |
17 | from .utils import distance
18 |
19 |
def _fcgr_cell_index(x, y, cell, grid):
    """Return the flat index of the grid cell that contains point (x, y)."""
    # A coordinate can round to exactly 1.0 after a long run of the same
    # base; clamp both axes so such a point lands in the last cell instead
    # of spilling into the next row or past the end of the list.
    col = min(int(x / cell), grid - 1)
    row = min(int(y / cell), grid - 1)
    return row * grid + col


def fcgr_vector(dnaseq, word_size):
    """Create a FCGR vector representing a DNA sequence.

    The chaos game starts at (0.5, 0.5) and, for every base, moves half way
    towards that base's corner: A=(0, 1), T=(1, 1), G=(1, 0), C=(0, 0).
    Any other symbol resets the current point to (0, 0). The unit square
    is divided into a 2^word_size x 2^word_size grid and the points
    (including the starting point) are counted per cell.

    Args:
        dnaseq (str/list): dna sequence
        word_size (int): word size (>= 1)

    Returns:
        list of floats with 4^word_size - 1 elements: the per-cell counts
        in row-major order, with the first cell (the one containing the
        C corner) dropped.

    Examples:
        >>> s = 'ATGCTGATGGATG'
        >>> print(fcgr_vector(s, 1))
        [5.0, 3.0, 5.0]

        >>> print(fcgr_vector(s, 2))
        [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 4.0, 0.0, 2.0, 2.0, 0.0, 0.0, 1.0, 3.0, 0.0]

    """
    grid = pow(2, word_size)
    cell = 1.0 / grid
    corners = {
        'A': (0.0, 1.0),
        'T': (1.0, 1.0),
        'G': (1.0, 0.0),
        'C': (0.0, 0.0),
    }

    counts = [0.0] * (grid * grid)

    x = y = 0.5
    counts[_fcgr_cell_index(x, y, cell, grid)] += 1
    for base in dnaseq:
        corner = corners.get(base)
        if corner is None:
            # Unknown symbol: the point falls back to the origin.
            x = y = 0.0
        else:
            x = 0.5 * (x + corner[0])
            y = 0.5 * (y + corner[1])
        counts[_fcgr_cell_index(x, y, cell, grid)] += 1

    # The first cell is dropped from the returned vector.
    counts.pop(0)
    return counts
71 |
72 |
def create_vectors(seq_records, word_size):
    """Stack the FCGR vectors of all sequence records into one matrix.

    Args:
        seq_records (obj: SeqRecords): object exposing `count` and `seq_list`
        word_size (int): word size (>= 1)

    Returns:
        numpy.ndarray: shape (seq_records.count, 4^word_size - 1)

    """
    n_features = pow(4, word_size) - 1
    matrix = np.zeros(shape=(seq_records.count, n_features))
    row = 0
    for sequence in seq_records.seq_list:
        matrix[row] = fcgr_vector(sequence, word_size)
        row += 1
    return matrix
89 |
90 |
class Distance(distance.Distance):
    """Distance over FCGR vectors; `disttype` defaults to 'euclid_norm'."""

    def __init__(self, vector, disttype='euclid_norm'):
        # Only the default value of `disttype` is added here; everything
        # else is delegated to the parent constructor.
        super(Distance, self).__init__(vector, disttype)
95 |
96 |
def main():
    """Print the FCGR distance matrix for the records from seqrecords.main().

    Uses word_size=1; the matrix is written to stdout through
    `Matrix.display()`.
    """
    from .utils.seqrecords import main as load_seq_records
    from .utils import distmatrix

    records = load_seq_records()
    fcgr_vectors = create_vectors(records, 1)
    matrix = distmatrix.create(records.id_list, Distance(fcgr_vectors))
    matrix.display()


if __name__ == '__main__':
    main()
110 |
--------------------------------------------------------------------------------
/alfpy/ncd.py:
--------------------------------------------------------------------------------
1 | """Normalized compression distance (NCD)
2 |
3 | The NCD is a family of distances parametrized with the compressor Z.
4 | The better Z is, the closer the NCD approaches the NID, and the better
5 | the results are.
6 |
7 | As described in:
8 | 1. Bennett, Gacs, Ming, Vitanyi, Zurek
9 | IEEE Transactions on Information Theory 1998. 44(4):1407-1423
10 | doi: 10.1109/18.681318
11 |
12 | 2. Li, Chen, Li, Ma, Vitanyi
13 | IEEE Transactions on Information Theory 2004. 50(12):3250-3264
14 | doi: 10.1109/TIT.2004.838101
15 |
16 | 3. https://en.wikipedia.org/wiki/Normalized_compression_distance
17 |
18 | """
19 | import itertools
20 | import zlib
21 |
22 |
def complexity(s):
    """Return the zlib-compressed size of `s`, in bytes, as a float."""
    # zlib operates on bytes, so the text is UTF-8 encoded first
    # (required on Python 3).
    raw = s.encode("utf-8")
    return float(len(zlib.compress(raw)))
29 |
30 |
class Distance():
    """Normalized compression distance (NCD) between sequence records.

    The compressed sizes of all single sequences and of all pairwise
    concatenations seq[i] + seq[j] (i < j) are computed once, when the
    object is created.

    Attributes:
        seq_records: object exposing `count` and `seq_list`
        numseqs (int): number of sequences (`seq_records.count`)
        _complexity (dict): compressed sizes keyed by `(i,)` for single
            sequences and `(i, j)` with i <= j for concatenations
    """

    def __init__(self, seq_records):
        self.seq_records = seq_records
        self.numseqs = seq_records.count
        # Precomputed complexity for input sequences
        # as well as all pairwise concatenated sequences.
        self._complexity = self.__precompute_complexity()

    def __precompute_complexity(self):
        """Return the dict of compressed sizes described on the class."""
        d = {}
        seqs = self.seq_records.seq_list
        # Complexity for single input sequences.
        for seqidx, seq in enumerate(seqs):
            d[(seqidx,)] = complexity(seq)
        # Complexity for pairwise concatenated sequences.
        for i, j in itertools.combinations(range(self.numseqs), 2):
            d[(i, j)] = complexity(seqs[i] + seqs[j])
        return d

    def _pair_complexity(self, seq1idx, seq2idx):
        """Return Z(xy) for a pair of indices given in either order."""
        # Only (i, j) with i < j is precomputed; normalise the key so that
        # (j, i) reuses that value, and compute (i, i) on first use.
        if seq1idx <= seq2idx:
            key = (seq1idx, seq2idx)
        else:
            key = (seq2idx, seq1idx)
        try:
            return self._complexity[key]
        except KeyError:
            seqs = self.seq_records.seq_list
            value = complexity(seqs[key[0]] + seqs[key[1]])
            self._complexity[key] = value
            return value

    def pairwise_distance(self, seq1idx, seq2idx):
        r"""Compute NCD between two sequences.

        Formula:
            NCD_Z(x,y) = \frac{Z(xy) - \min \{Z(x),Z(y)\}}{\max \{Z(x),Z(y)\}}.

        where:
            Z(x) is the binary length of the sequence `x` compressed
            with compressor Z

        The indices may be given in either order; Z(xy) is always taken
        from the concatenation with the lower index first, so the result
        is symmetric.

        Raises:
            KeyError: if an index does not refer to a known sequence.
        """
        # Single-sequence lookups come first so that an unknown index
        # raises KeyError, as before.
        zx = self._complexity[(seq1idx,)]
        zy = self._complexity[(seq2idx,)]
        zxy = self._pair_complexity(seq1idx, seq2idx)
        return (zxy - min([zx, zy])) / max([zx, zy])
69 |
70 |
if __name__ == '__main__':
    # Demo: print the NCD of every pair of records returned by
    # utils.seqrecords.main().
    # NOTE(review): the relative imports below only resolve when the module
    # is run as part of the package (e.g. `python -m alfpy.ncd`).
    from .utils import distmatrix
    from .utils.seqrecords import main
    seq_records = main()

    dist = Distance(seq_records)
    matrix = distmatrix.create(seq_records.id_list, dist)
    # 'pairwise' selects the one-line-per-pair output format.
    matrix.display('pairwise')
79 |
--------------------------------------------------------------------------------
/alfpy/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/alfpy/utils/__init__.py
--------------------------------------------------------------------------------
/alfpy/utils/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/alfpy/utils/data/__init__.py
--------------------------------------------------------------------------------
/alfpy/utils/data/seqcontent.py:
--------------------------------------------------------------------------------
1 | """Collections of various bits of useful sequence data."""
2 |
# Background frequency of every symbol, per molecule type.
# NOTE(review): the protein values are presumably SwissProt amino acid
# frequencies - confirm the source.
FREQS = {
    'protein': {
        'A': 0.0826,
        'Q': 0.0393,
        'L': 0.0965,
        'S': 0.0659,
        'R': 0.0553,
        'E': 0.0674,
        'K': 0.0583,
        'T': 0.0534,
        'N': 0.0406,
        'G': 0.0708,
        'M': 0.0241,
        'W': 0.0109,
        'D': 0.0546,
        'H': 0.0227,
        'F': 0.0386,
        'Y': 0.0292,
        'C': 0.0137,
        'I': 0.0594,
        'P': 0.0471,
        'V': 0.0687,
        # Ambiguity codes: X gets 1, B is the sum of the N and D
        # frequencies, Z is the sum of the Q and E frequencies.
        'X': 1,
        'B': 0.0406 + 0.0546,
        'Z': 0.0393 + 0.0674
    },
    'dna': {
        'A': 0.25,
        'C': 0.25,
        'G': 0.25,
        'T': 0.25
    },
    'rna': {
        'A': 0.25,
        'C': 0.25,
        'G': 0.25,
        'U': 0.25
    }
}

# Weight of every symbol, per molecule type.
# Each protein weight equals 0.1 / FREQS['protein'][symbol]; the ambiguity
# codes X, B and Z have no weight. All nucleotide weights are 1.
WEIGHTS = {
    'protein': {
        'A': 1.2106537530266344,
        'C': 7.299270072992702,
        'E': 1.4836795252225519,
        'D': 1.8315018315018312,
        'G': 1.4124293785310733,
        'F': 2.590673575129534,
        'I': 1.6835016835016834,
        'H': 4.405286343612334,
        'K': 1.7152658662092626,
        'M': 4.149377593360996,
        'L': 1.0362694300518134,
        'N': 2.4630541871921183,
        'Q': 2.5445292620865136,
        'P': 2.123142250530785,
        'S': 1.5174506828528072,
        'R': 1.8083182640144662,
        'T': 1.8726591760299625,
        'W': 9.174311926605505,
        'V': 1.4556040756914121,
        'Y': 3.4246575342465753
    },
    'dna': {
        'A': 1,
        'C': 1,
        'G': 1,
        'T': 1
    },
    'rna': {
        'A': 1,
        'C': 1,
        'G': 1,
        'U': 1
    }

}


# Symbols of each molecule type, as a string.
# NOTE(review): there is no 'rna' entry here although FREQS and WEIGHTS
# have one - confirm whether that is intended.
ALPHABET = {
    'dna': 'ATGC',
    'protein': 'ACDEFGHIKLMNPQRSTVWY'
}

# Mapping from a symbol to the symbol that replaces it in the reduced
# alphabet. For DNA all four bases are mapped (A/G -> R, T/C -> Y); for
# protein only the listed residues have an entry.
REDUCED_ALPHABET = {
    'dna': {
        'A': 'R',
        'G': 'R',
        'T': 'Y',
        'C': 'Y'
    },
    'protein': {
        'T': 'S',
        'E': 'D',
        'Q': 'K',
        'R': 'K',
        'V': 'I',
        'L': 'I',
        'M': 'I',
        'W': 'F',
        'Y': 'F'
    }
}
106 |
107 |
def get_alphabet(mol):
    """Return the alphabet string for molecule type `mol`.

    Raises:
        KeyError: if `mol` is not a key of ALPHABET.
    """
    return ALPHABET[mol]
110 |
111 |
def get_freqs(mol):
    """Return the symbol -> frequency dict for molecule type `mol`.

    The dict stored in FREQS is returned itself, not a copy.

    Raises:
        KeyError: if `mol` is not a key of FREQS.
    """
    return FREQS[mol]
114 |
115 |
def get_weights(mol):
    """Return the symbol -> weight dict for molecule type `mol`.

    The dict stored in WEIGHTS is returned itself, not a copy.

    Raises:
        KeyError: if `mol` is not a key of WEIGHTS.
    """
    return WEIGHTS[mol]
118 |
119 |
def get_reduced_alphabet(mol):
    """Return the symbol -> reduced symbol dict for molecule type `mol`.

    The dict stored in REDUCED_ALPHABET is returned itself, not a copy.

    Raises:
        KeyError: if `mol` is not a key of REDUCED_ALPHABET.
    """
    return REDUCED_ALPHABET[mol]
122 |
--------------------------------------------------------------------------------
/alfpy/utils/distance.py:
--------------------------------------------------------------------------------
1 | """This module contains a `Distance` class that combines vector
2 | with distance function.
3 |
4 | """
5 |
6 | import math
7 | import numpy as np
8 |
9 |
class Distance(object):
    """Combine sequences-representing 2-D array of vectors
    with a distance function.

    Distance functions are the methods named `pwdist_<name>`; they take
    two sequence indices and return the distance between the two
    corresponding rows of the array.

    Attributes:
        _vector (ndarray)
        _disttype (str): distance method name
        pairwise_distance (func): distance method

    """

    def __getitem__(self, seqnum):
        """Return the vector stored for sequence index `seqnum`."""
        return self._vector[seqnum]

    @classmethod
    def get_disttypes(cls):
        """Return a list of available distance function names.

        Only the `pwdist_*` methods defined directly on `cls` are listed;
        methods inherited from a parent class are not included.

        Returns:
            list of strings (sorted)
        """
        prefix = 'pwdist_'
        names = [name[len(prefix):] for name in cls.__dict__
                 if name.startswith(prefix)]
        names.sort()
        return names

    def set_disttype(self, disttype):
        """Select the distance function used by `pairwise_distance`.

        Args:
            disttype (str): name of a distance function, without the
                `pwdist_` prefix

        Raises:
            ValueError: if no method `pwdist_<disttype>` exists.
        """
        try:
            pwdist_func = getattr(self, 'pwdist_{}'.format(disttype))
        # Method does not exist.
        except AttributeError:
            msg = 'unknown disttype "{}"'.format(disttype)
            raise ValueError(msg)
        self.pairwise_distance = pwdist_func
        # Keep the recorded name in step with the selected function, also
        # when the type is switched after construction.
        self._disttype = disttype

    def __init__(self, vector, disttype):
        """Create instance of Distance.

        Args:
            vector (ndarray)
            disttype (str)

        Raises:
            ValueError: if `disttype` is not a known distance function.

        Examples:
            >>> vector
            [[ 3. 6. 4. 1. 3. 4. 3. 0. 1. 1. 6. 4. 5. 0. 3. 4.]
            [ 0. 3. 0. 3. 0. 0. 0. 2. 9. 0. 3. 3. 0. 6. 3. 6.]
            [ 9. 0. 0. 3. 0. 0. 0. 2. 6. 0. 3. 3. 0. 3. 3. 3.]]
            >>> disttype = 'euclid_norm'
            >>> dist = Distance(vector, disttype)

        """
        self.set_disttype(disttype)
        self._vector = vector
        self._disttype = disttype

    def pwdist_euclid_squared(self, seq1idx, seq2idx):
        """Squared Euclidean distance

        References:
            1. Blaisdell BE (1986) Proc Natl Acad Sci U S A 83: 5155-5159.
               doi: 10.1073/pnas.83.14.5155

        """
        value = np.sum((self[seq1idx] - self[seq2idx])**2)
        return value

    def pwdist_euclid_norm(self, seq1idx, seq2idx):
        """Euclidean distance

        References:
            1. Vinga & Almeida (2003) Bioinformatics 19(4): 513-523.
               doi: 10.1093/bioinformatics/btg005
            2. http://web.ist.utl.pt/susanavinga/NASC/

        """
        value = math.sqrt(self.pwdist_euclid_squared(seq1idx, seq2idx))
        return value

    def pwdist_google(self, seq1idx, seq2idx):
        """Normalized Google Distance (NGD).

        The maximum values for NGD is 1.0, which means two sequences are
        totally not similar to each other, and the minimum values for
        NGD is 0.0. Therefore, the similarity of the two sequences can be
        obtained by NGS = 1 - NGD. Two sequences are treated as two different
        web pages and the each word frequency represents terms found in each
        webpage.

        Raises:
            ZeroDivisionError: if both vectors sum to zero.

        References:
            1. Lee & Rashid (2008) Information Technology, ITSim 2008.
               doi:10.1109/ITSIM.2008.4631601

        """
        v1 = self[seq1idx]
        v2 = self[seq2idx]

        sumwx = float(np.sum(v1))
        sumwy = float(np.sum(v2))

        # Total of the element-wise minima of the two vectors.
        summin = float(np.sum(np.minimum(v1, v2)))

        ngd = (max([sumwx, sumwy]) - summin) / \
            ((sumwx + sumwy) - min([sumwx, sumwy]))
        return ngd
113 |
--------------------------------------------------------------------------------
/alfpy/utils/distmatrix.py:
--------------------------------------------------------------------------------
1 | """This module creates and handles distance matrices"""
2 |
3 | import itertools
4 | import numpy as np
5 | import sys
6 |
7 |
def create(id_list, distance):
    """Build the full pairwise distance matrix as a Matrix object.

    `distance.pairwise_distance(i, j)` is called once for every pair of
    sequence indices with i < j and the value is stored symmetrically.

    Args:
        id_list (list): list of sequence identifiers
        distance (obj): instance of distance.Distance

    Returns:
        Matrix object

    Examples:
        >>> vector
        [[ 3. 6. 4. 1. 3. 4. 3. 0. 1. 1. 6. 4. 5. 0. 3. 4.]
        [ 0. 3. 0. 3. 0. 0. 0. 2. 9. 0. 3. 3. 0. 6. 3. 6.]
        [ 9. 0. 0. 3. 0. 0. 0. 2. 6. 0. 3. 3. 0. 3. 3. 3.]]
        >>> disttype = 'euclid_norm'
        >>> dist = Distance(vector, disttype)
        >>> id_list = ['seq1', 'seq2', 'seq3']
        >>> matrix = create(id_list, dist)

    """
    size = len(id_list)
    rows = np.zeros([size, size])
    # The diagonal is left at zero: the distance of a sequence to itself
    # is not computed.
    for i in range(size):
        for j in range(i + 1, size):
            rows[i][j] = rows[j][i] = distance.pairwise_distance(i, j)
    return Matrix(id_list, rows)
43 |
44 |
def read_highcharts_matrix(id_list, data):
    """Rebuild a distance matrix from Highcharts-style records.

    Args:
        id_list (list): list of sequence identifiers
        data (list of 4-element tuples): each item is
            (row index, column index, ignored value, distance),
            e.g. [[0, 1, 0.35, 0.19], [0, 2, 1.0, 0.55], [1, 2, 0.88, 0.48]]

    Returns:
        Matrix object
    """
    n = len(id_list)
    grid = np.zeros([n, n])
    for record in data:
        # The third field is unpacked but not used; only the fourth is stored.
        i, j, _, value = record
        grid[i][j] = value
        grid[j][i] = value
    return Matrix(id_list, grid)
62 |
63 |
class Matrix():
    """Distance matrix

    Attributes:
        id_list (list): list of sequence identifiers
        data (ndarray): 2-D array of distance values between pairs of seqs

    """

    def __init__(self, id_list, data):
        """
        Example:
            >>> id_list = ['seq1', 'seq2', 'seq3']
            >>> data
            [[ 0. 0.3531587 0.35509333]
            [ 0.3531587 0. 0.295394 ]
            [ 0.35509333 0.295394 0. ]]
            >>> matrix = Matrix(id_list, data)

        """
        self.id_list = id_list
        self.data = data

    def normalize(self):
        """Normalize distance values to 0-1 range (in place).

        A matrix whose maximum is zero is left unchanged, because dividing
        by zero would turn every value into NaN.
        """
        maxval = self.max()
        if maxval:
            self.data /= maxval

    def __iter__(self):
        """Iterate over a distance matrix.

        Yields one tuple (i, j, id_i, id_j, distance) for every pair of
        row indices with i < j.
        """
        size = self.data.shape[0]
        for i, j in itertools.combinations(range(size), 2):
            yield i, j, self.id_list[i], self.id_list[j], self.data[i][j]

    def _format_row(self, rowidx, decimal_places):
        """Return one matrix row as a PHYLIP-style text line (no newline)."""
        # PHYLIP requires that each sequence identifier
        # is maximum 10 characters long.
        seqid = self.id_list[rowidx][:10]
        cells = ['{0:.{1}f}'.format(value, decimal_places)
                 for value in self.data[rowidx]]
        cells.insert(0, '{0: <10}'.format(seqid))
        return " ".join(cells)

    def writer(self, handle, f, decimal_places):
        """Write a distance matrix to `handle` in `phylip` or `pairwise`
        format.

        Args:
            handle : output file / sys.stdout
            f (str): phylip / pairwise; any other value writes nothing
            decimal_places (int): round distance value to decimal places

        """
        if f == 'phylip':
            handle.write(" {0}\n".format(len(self.id_list)))
            for rowidx in range(len(self.data)):
                handle.write(self._format_row(rowidx, decimal_places) + "\n")
        elif f == 'pairwise':
            for _, _, seqid1, seqid2, distval in self:
                handle.write("{0}\t{1}\t{2:.{3}f}\n".format(seqid1, seqid2,
                                                            distval,
                                                            decimal_places))

    def display(self, f="phylip", decimal_places=7):
        """Write a distance matrix to the screen."""
        return self.writer(sys.stdout, f, decimal_places)

    def write_to_file(self, handle, f="phylip", decimal_places=7):
        """Write a distance matrix to a file."""
        return self.writer(handle, f, decimal_places)

    def highcharts(self):
        """Return a distance matrix as a list in the Highcharts format.

        Each item is [i, j, distance / maximum distance, distance] for a
        pair of row indices with i < j. If the maximum distance is zero the
        normalized value is reported as 0.0 instead of NaN.
        """
        data = []
        maxval = self.max()
        for i, j, _, _, distval in self:
            normalized = distval / maxval if maxval else 0.0
            data.append([i, j, normalized, distval])
        return data

    def format(self, decimal_places=7):
        """Return the matrix as PHYLIP-style text without a final newline."""
        lines = [" {0}".format(len(self.id_list))]
        for rowidx in range(len(self.data)):
            lines.append(self._format_row(rowidx, decimal_places))
        return "\n".join(lines)

    def min(self):
        """Return minimum distance value in matrix"""
        return np.amin(self.data)

    def max(self):
        """Return maximum distance value in matrix"""
        return np.amax(self.data)

    def is_zero(self):
        """Return True if matrix contains only zeros"""
        return not np.count_nonzero(self.data)

    def __repr__(self):
        return str(self.data)
163 |
164 |
165 |
if __name__ == '__main__':
    # Small self-contained demo: build a 3 x 3 matrix and print it both as
    # PHYLIP-style text and in the Highcharts list format.
    id_list = ['seq1', 'seq2', 'seq3']
    l = [[0, 0.3531587, 0.35509333],
         [0.3531587, 0, 0.295394],
         [0.35509333, 0.295394, 0.]
         ]
    data = np.array(l)
    matrix = Matrix(id_list, data)
    print(matrix.format())
    print(matrix.highcharts())
176 |
--------------------------------------------------------------------------------
/alfpy/utils/fasta.py:
--------------------------------------------------------------------------------
1 | """Reading and writing FASTA format files"""
2 |
3 | from itertools import groupby
4 |
5 |
class FastaRecord():
    """Object representing a Fasta (aka Pearson) record.

    Attributes:
        seq (str)         : Sequence
        id (str)          : Sequence identifier
        description (str) : Sequence description (falsy when absent)
    """

    def __init__(self, seq, seqid, description=False):
        """Create a FastaRecord.

        Args:
            seq (str): sequence
            seqid (str): sequence identifier (stored as ``self.id``)
            description (str): optional description; any falsy value
                means "no description"

        Example:
            >>> record = FastaRecord(seq='MRELEAKAT',
            ...                      seqid='NP_055309.2',
            ...                      description='TNRC6A')
            >>> print(record)
            >NP_055309.2 TNRC6A
            MRELEAKAT
        """
        self.seq = seq
        self.id = seqid
        self.description = description

    def __iter__(self):
        """Iterate over the letters in the sequence.

        Example:
            >>> record = FastaRecord(seq='MRELE', seqid='seq1')
            >>> for amino_acid in record:
            ...     print(amino_acid)
            M
            R
            E
            L
            E

        This is equivalent to iterating over ``record.seq`` directly.
        """
        return iter(self.seq)

    def __contains__(self, char):
        """Implement the 'in' keyword by searching the sequence.

        Example:
            >>> record = FastaRecord(seq='MRELE', seqid='seq1')
            >>> print('M' in record)
            True
        """
        return char in self.seq

    def __str__(self):
        """Return the record as a string in the fasta format.

        The sequence is wrapped at 70 characters per line.

        Example:
            >>> record = FastaRecord(seq='MRELEAKAT',
            ...                      seqid='NP_055309.2',
            ...                      description='TNRC6A')
            >>> print(record)
            >NP_055309.2 TNRC6A
            MRELEAKAT
        """
        return self.format(wrap=70)

    def __len__(self):
        """Return the length of the sequence.

        Example:
            >>> record = FastaRecord(seq='MRELEAKAT', seqid='seq1')
            >>> len(record)
            9
        """
        return len(self.seq)

    def format(self, wrap=70):
        """Return a formatted Fasta record.

        Args:
            wrap (int): maximum number of sequence characters per line.
                ``None``, zero or a negative value disables wrapping and
                puts the whole sequence on a single line.

        Returns:
            str: header line (``>id`` plus the description, if any)
            followed by the sequence lines; no trailing newline after
            the last sequence line.

        Example:
            >>> record = FastaRecord(seq='MRELEAKAT',
            ...                      seqid='NP_055309.2',
            ...                      description='TNRC6A')
            >>> print(record.format())
            >NP_055309.2 TNRC6A
            MRELEAKAT
        """
        header = ">{0}".format(self.id)
        if self.description:
            header += " " + self.description
        seq = self.seq
        if wrap is None or wrap <= 0:
            # A non-positive step would make range() raise (0) or yield
            # nothing (negative), silently dropping the sequence.
            chunks = [seq]
        else:
            chunks = [seq[i:i + wrap] for i in range(0, len(seq), wrap)]
        return header + "\n" + "\n".join(chunks)
112 |
113 |
def parse(handle):
    """Generator function to iterate over Fasta records.

    Args:
        handle: iterable of text lines (an open file or a list of
            strings) containing fasta sequences.

    Yields:
        FastaRecord: one record per header line (a line starting
        with ">"). The identifier is the first whitespace-delimited
        token of the header, the description is the remainder, and the
        sequence is the concatenation of the stripped lines that follow.

    Notes:
        Lines appearing before the first header are ignored, a header
        that is not followed by sequence lines yields a record with an
        empty sequence, and an empty header (">" alone) yields an empty
        identifier.
    """
    seqid = None   # None until the first header line has been seen
    desc = ""
    chunks = []
    for line in handle:
        if line.startswith(">"):
            if seqid is not None:
                yield FastaRecord("".join(chunks), seqid, description=desc)
            fields = line[1:].split(None, 1)
            seqid = fields[0] if fields else ""
            desc = fields[1].strip() if len(fields) > 1 else ""
            chunks = []
        elif seqid is not None:
            chunks.append(line.strip())
    # Flush the record that was still open when the input ended.
    if seqid is not None:
        yield FastaRecord("".join(chunks), seqid, description=desc)
127 |
128 |
def read(handle):
    """Return the first FastaRecord found in a sequence file.

    Example:
        >>> from alfpy.utils import fasta
        >>> record = fasta.read(open('sequence.fasta'))
        >>> print(record.id)
        NP_055309.2
        >>> print(record.seq)
        MRELEAKAT

    If the handle contains no records, None is returned.
    If the handle contains more than one record, only the very first
    one is read.

    Use the parse(handle) function if you want to read multiple
    records from the handle.

    """
    # next() with a default swallows the StopIteration of an empty parser.
    return next(parse(handle), None)
153 |
154 |
def to_dict(sequences):
    """Build a dictionary of records keyed by their identifier.

    Args:
        sequences: an iterator or list of objects exposing an ``id``
            attribute (e.g. FastaRecord objects).

    Returns:
        dict: mapping of ``record.id`` to the record itself.

    Raises:
        ValueError: if two records share the same identifier.

    Example:
        >>> from alfpy.utils import fasta
        >>> pdict = fasta.to_dict(fasta.parse(open('test.fa')))
        >>> print(sorted(pdict.keys()))
        ['gi|195354411|', 'tr|Q8SY33|']
        >>> len(pdict)
        2

    Note:
        All records are held in memory at once, so this is not suitable
        for very large sets of sequences.

    """
    records_by_id = {}
    for rec in sequences:
        seqid = rec.id
        if seqid in records_by_id:
            raise ValueError("Duplicate key '{}'".format(seqid))
        records_by_id[seqid] = rec
    return records_by_id
187 |
188 |
if __name__ == '__main__':
    # Parse a tiny in-memory example and print each record.
    demo_lines = ['>seq1 desc1', 'ATGCTGATGATAGATG', 'ATGTAGA',
                  '>seq2 desc2', 'ATGCTGCT']
    for demo_record in parse(demo_lines):
        print(demo_record)
194 |
--------------------------------------------------------------------------------
/alfpy/utils/seqrecords.py:
--------------------------------------------------------------------------------
1 | from . import fasta
2 |
3 |
class SeqRecords:
    """Object representing an ordered collection of sequence records.

    Attributes:
        id_list (list)  : List of sequence record identifiers
        seq_list (list) : List of sequence strings (uppercased)
        count (int)     : Number of sequence records

    """

    def __init__(self, id_list=None, seq_list=None):
        """Create a collection (may be empty) of sequence records.

        Args:
            id_list (list): sequence identifiers (copied, so later calls
                to ``add`` do not modify the caller's list)
            seq_list (list): sequence strings, stored uppercased

        Raises:
            ValueError: if identifiers are given and their number differs
                from the number of sequences.

        Example:
            >>> ids = ['seq1', 'seq2']
            >>> seqs = ['ATGCTG', 'TGCTGATAGTA']
            >>> seq_records = SeqRecords(id_list=ids, seq_list=seqs)
            >>> print(seq_records)
            SeqRecords (noseqs: 2)

        """
        ids = list(id_list) if id_list else []
        # Make all sequences uppercased.
        seqs = [s.upper() for s in seq_list] if seq_list else []
        if ids and len(ids) != len(seqs):
            raise ValueError(
                "id_list and seq_list differ in length "
                "({0} vs {1})".format(len(ids), len(seqs)))
        self.id_list = ids
        self.seq_list = seqs
        # As before, the count stays 0 when no identifiers are supplied.
        self.count = len(seqs) if ids else 0

    def add(self, seqid, seq):
        """Add a sequence record to the existing collection.

        Args:
            seqid (str) : sequence identifier
            seq (str)   : sequence string (stored uppercased)

        Example:
            >>> seq_records.add("seq3", "TGCTGA")
        """
        self.id_list.append(seqid)
        self.seq_list.append(seq.upper())
        self.count += 1

    def fasta(self, wrap=70):
        """Return sequence records as a multi-FASTA string.

        Args:
            wrap (int): value passed to ``FastaRecord.format`` as the
                line width for every record

        Example:
            >>> ids = ['seq1', 'seq2']
            >>> seqs = ['ATGCTG', 'TGCTGATAGTA']
            >>> seq_records = SeqRecords(id_list=ids, seq_list=seqs)
            >>> print(seq_records.fasta())
            >seq1
            ATGCTG
            >seq2
            TGCTGATAGTA
        """
        return "\n".join(
            fasta.FastaRecord(seq=seq, seqid=seqid).format(wrap=wrap)
            for seqid, seq in self
        )

    @property
    def length_list(self):
        """Return a list with the length of every sequence."""
        return [len(seq) for seq in self.seq_list]

    def __iter__(self):
        """
        Iterate over sequence records in the collection.

        Yields:
            tuple: (sequence identifier, sequence string)

        Example:
            >>> for seqid, seq in seq_records:
            ...     print(seqid)
            ...     print(seq)
            seq1
            ATGCTG
            seq2
            TGCTGATAGTA
        """
        for i in range(self.count):
            yield self.id_list[i], self.seq_list[i]

    def __len__(self):
        """
        Return the number of sequence records in the collection.

        Example:
            >>> len(seq_records)
            3
        """
        return len(self.seq_list)

    def __repr__(self):
        return "{0} (noseqs: {1})".format(self.__class__.__name__,
                                          self.count)
98 |
99 |
def read_fasta(handle):
    """Create a SeqRecords object from a Fasta file.

    Args:
        handle: a file handle containing Fasta sequences.

    Returns:
        SeqRecords: identifiers and sequences in file order.

    """
    parsed = list(fasta.parse(handle))
    return SeqRecords(id_list=[rec.id for rec in parsed],
                      seq_list=[rec.seq for rec in parsed])
113 |
114 |
def main():
    """Round-trip three example sequences through a temporary Fasta file.

    The records are written to a uniquely named file in the current
    directory, read back with ``read_fasta`` and the file is removed
    again, even when writing or reading fails.

    Returns:
        SeqRecords: the collection read back from the file.
    """
    import os
    import uuid

    seq_records = SeqRecords()
    seq_records.add(
        'seq1', 'AACGTACCATTGAACGTACCATTGAACGTACCATTGATGCATGGTAGAT')
    seq_records.add('seq2', 'CTAGGGGACTTATCTAGGGGACTTATCTAGGGGACTTAT')
    seq_records.add('seq3', 'CTAGGGAAAATTCTAGGGAAAATTCTAGGGAAAATT')

    outfilename = uuid.uuid4().hex
    try:
        with open(outfilename, 'w') as oh:
            oh.write(seq_records.fasta())
        with open(outfilename) as fh:
            seq_records = read_fasta(fh)
    finally:
        # The file may not exist if opening it for writing failed.
        if os.path.exists(outfilename):
            os.remove(outfilename)

    return seq_records
135 |
136 |
# When executed as a script, print the example records in Fasta format.
if __name__ == '__main__':
    seq_records = main()
    print(seq_records.fasta())
140 |
--------------------------------------------------------------------------------
/alfpy/version.py:
--------------------------------------------------------------------------------
# The version is stored in this dedicated module so that:
# 1) it is not kept in __init__.py, which avoids loading dependencies;
# 2) setup.py can import it for the same reason;
# 3) any module can import it.
__version__ = '1.0.6'
--------------------------------------------------------------------------------
/alfpy/wmetric.py:
--------------------------------------------------------------------------------
1 | """Calculate distances between protein sequences based on the W-metric (Wm).
2 |
3 | Reference:
4 | 1. Vinga, Gouveia-Oliveira, Almeida. (2004) Bioinformatics. 20(2):206-215
5 | doi: 10.1093/bioinformatics/btg392
6 |
7 | W-metric includes one-tuple composition information (the difference
8 | in amino acid frequencies between two proteins) and weights from
9 | the scoring matrices used in alignment methods.
10 |
11 | """
12 | import numpy as np
13 |
14 |
def count_seq_chars(seq, alphabet):
    """Count how often each alphabet character occurs in a sequence.

    Args:
        seq (str): sequence
        alphabet (str/list): allowed characters

    Returns:
        list: occurrence counts, in the order of `alphabet`.

    Examples:
        >>> alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        >>> seq = 'MKSTGWHFSG'
        >>> print(count_seq_chars(seq, alphabet))
        [0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 0]

    """
    return [seq.count(char) for char in alphabet]
36 |
37 |
def freq_seq_chars(counts):
    """Calculate frequencies of characters (symbols) in a sequence based on
    characters' counts.

    Args:
        counts (list): result of the `count_seq_chars` function

    Returns:
        list: frequencies (count divided by the sum of all counts), in
        the order of `counts`. When all counts are zero, every frequency
        is 0.0.

    Examples:
        >>> l = [0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 0]
        >>> print(freq_seq_chars(l))
        [0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.1, 0.0, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.2, 0.1, 0.0, 0.1, 0.0]

    """
    total = float(sum(counts))
    if not total:
        # No counted character at all: avoid dividing by zero.
        return [0.0 for _ in counts]
    return [c / total for c in counts]
59 |
60 |
def freq_seqs_chars(seq_records, alphabet):
    """Compute alphabet-character frequencies for every sequence.

    Args:
        seq_records (obj): instance of SeqRecords()
        alphabet (list): list of allowed characters

    Returns:
        numpy.ndarray: one row of frequencies per sequence, for the
        first ``seq_records.count`` entries of ``seq_records.seq_list``.
    """
    rows = [
        freq_seq_chars(count_seq_chars(seq_records.seq_list[idx], alphabet))
        for idx in range(seq_records.count)
    ]
    return np.array(rows)
79 |
80 |
class Distance:
    """Combine vector with a distance function.

    Attributes:
        freqs (ndarray): matrix of sequence-representing vectors
        matrix (obj): substitution matrix object; its ``data`` attribute
            supplies the weights and its ``alphabet_list`` the characters

    """

    def __init__(self, seq_records, matrix):
        """Create an instance of Distance.

        Args:
            seq_records (obj: seqrecords.SeqRecords)
            matrix (obj: utils.data.subsmat.SubsMat)

        Examples:
            >>> from .utils.data import subsmat
            >>> from .utils.seqrecords import SeqRecords
            >>> matrix = subsmat.get('blosum62')
            >>> seq_records = SeqRecords()
            >>> seq_records.add('seq1', 'MKSTGWHF')
            >>> seq_records.add('seq2', 'MKSSSSTGWGWG')
            >>> seq_records.add('seq3', 'MKSTLKNGTEQ')

            >>> dist = Distance(seq_records, matrix)

        """

        self.freqs = freq_seqs_chars(seq_records, matrix.alphabet_list)
        self.matrix = matrix

    def pairwise_distance(self, seqnum1, seqnum2):
        r"""Compute W-metric between two proteins.

        The distance is defined by one-tuple frequencies
        fx and fy of two proteins, weighted by matrix W.

        Formula:
            d^{w} = \sum_{i\in A}\sum_{j\in A}(f_{i}^{X}-f_{i}^{y})
                    \cdot (f_{j}^{X}-f_{j}^{y})\cdot w_{ij}

        Args:
            seqnum1 (int): row index of the first sequence in ``freqs``
            seqnum2 (int): row index of the second sequence in ``freqs``

        Returns:
            Sum over all entries of ``outer(f, f) * matrix.data`` where
            ``f`` is the difference of the two frequency rows.

        """
        # Raw docstring above: the LaTeX backslashes are not valid
        # Python escape sequences.
        f = self.freqs[seqnum1] - self.freqs[seqnum2]
        weighted = np.outer(f, f) * self.matrix.data
        return np.sum(weighted)
129 |
130 |
def main():
    """Print the W-metric distance matrix of three example proteins."""
    from .utils import distmatrix
    from .utils.data import subsmat
    from .utils.seqrecords import SeqRecords

    subs_matrix = subsmat.get('blosum62')

    records = SeqRecords()
    examples = (('seq1', 'MKSTGWHF'),
                ('seq2', 'MKSSSSTGWGWG'),
                ('seq3', 'MKSTLKNGTEQ'))
    for seqid, seq in examples:
        records.add(seqid, seq)

    dist = Distance(records, subs_matrix)
    dist_matrix = distmatrix.create(records.id_list, dist)
    dist_matrix.display()
148 |
149 |
# Call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
152 |
--------------------------------------------------------------------------------
/alfpy/word_bool_distance.py:
--------------------------------------------------------------------------------
1 | """Distance methods between two boolean vectors (representing word
2 | occurrences).
3 |
4 | References:
5 | 1. SciPy, https://www.scipy.org
6 |
7 | """
8 |
9 | import numpy as np
10 |
11 | from .utils import distance
12 |
13 |
def _nbool_correspond_ft_tf(u, v):
    """Count the positions where exactly one of two boolean vectors is True.

    Helper for some distance methods (in Distance class).
    Based on: https://github.com/scipy/scipy

    Args:
        u (numpy.ndarray) : boolean vector
        v (numpy.ndarray) : as above

    Returns:
        tuple: (number of positions False in u and True in v,
                number of positions True in u and False in v)

    Examples:
        >>> u = np.array([True, False, True])
        >>> v = np.array([True, True, False])
        >>> print(_nbool_correspond_ft_tf(u, v))
        (1, 1)

    """
    false_true = (~u & v).sum()
    true_false = (u & ~v).sum()
    return (false_true, true_false)
37 |
38 |
def _nbool_correspond_all(u, v):
    """Count all four True/False combinations of two boolean vectors.

    Helper for some distance methods (in Distance class).
    Based on: https://github.com/scipy/scipy

    Args:
        u (numpy.ndarray) : bool, shape: (N, )
        v (numpy.ndarray) : as above

    Returns:
        tuple: (nff, nft, ntf, ntt) where the first letter refers to u
        and the second to v (f = False, t = True).

    Examples:
        >>> u = np.array([True, False, True])
        >>> v = np.array([True, True, False])
        >>> print(_nbool_correspond_all(u, v))
        (0, 1, 1, 1)

    """
    u_false, v_false = ~u, ~v
    return (
        (u_false & v_false).sum(),
        (u_false & v).sum(),
        (u & v_false).sum(),
        (u & v).sum(),
    )
64 |
65 |
class Distance(distance.Distance):
    """Combine vector boolean data (numpy.ndarray) with distance method.

    Every ``pwdist_*`` method takes two indices, fetches the two boolean
    vectors with ``self[index]`` (indexing is inherited from
    ``distance.Distance``) and returns a dissimilarity value.

    In the comments below ``ntt``, ``ntf``, ``nft`` and ``nff`` denote
    the number of positions that are True/True, True/False, False/True
    and False/False in the first/second vector.

    """

    def pwdist_dice(self, seq1idx, seq2idx):
        """Compute the Dice dissimilarity (Sorensen-Dice coefficient)
        between two boolean 1-D arrays.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]
        ntt = (u & v).sum()
        (nft, ntf) = _nbool_correspond_ft_tf(u, v)
        return float(ntf + nft) / float(2.0 * ntt + ntf + nft)

    def pwdist_yule(self, seq1idx, seq2idx):
        """Compute the Yule dissimilarity between two boolean 1-D arrays.

        Returns:
            distance value (double); 0.0 when ``ntf * nft`` is zero.

        """
        u = self[seq1idx]
        v = self[seq2idx]
        (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
        half_r = ntf * nft
        if half_r == 0:
            # The numerator is zero, so the value is 0 whenever it is
            # defined; returning early also avoids 0/0 when
            # ntt * nff is zero as well (e.g. two all-True vectors).
            return 0.0
        return float(2.0 * half_r) / float(ntt * nff + half_r)

    def pwdist_rogerstanimoto(self, seq1idx, seq2idx):
        """Compute the Rogers-Tanimoto dissimilarity between two boolean
        1-D arrays.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]
        (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
        r = float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft)))
        return r

    def pwdist_russellrao(self, seq1idx, seq2idx):
        """Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]

        ntt = (u & v).sum()
        return float(len(u) - ntt) / float(len(u))

    def pwdist_sokalmichener(self, seq1idx, seq2idx):
        """Compute the Sokal-Michener dissimilarity
        between two boolean 1-D arrays.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]
        # Same four counts as the other methods, taken from the helper.
        (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
        return float(2.0 * (ntf + nft)) / float(ntt + nff + 2.0 * (ntf + nft))

    def pwdist_sokalsneath(self, seq1idx, seq2idx):
        """Compute the Sokal-Sneath dissimilarity
        between two boolean 1-D arrays.

        Returns:
            distance value (double)

        Raises:
            ValueError: if the denominator ``ntt + 2 * (ntf + nft)`` is
                zero, i.e. both vectors are entirely False.

        """
        u = self[seq1idx]
        v = self[seq2idx]
        ntt = (u & v).sum()

        (nft, ntf) = _nbool_correspond_ft_tf(u, v)
        denom = ntt + 2.0 * (ntf + nft)
        if denom == 0:
            raise ValueError('Sokal-Sneath dissimilarity is not defined for '
                             'vectors that are entirely false.')
        return float(2.0 * (ntf + nft)) / denom

    def pwdist_jaccard(self, seq1idx, seq2idx):
        """Compute the Jaccard-Needham dissimilarity
        between two boolean 1-D arrays.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]
        # Positions where at least one of the vectors is non-zero.
        either = np.bitwise_or(u != 0, v != 0)
        dist = (np.double(np.bitwise_and((u != v), either).sum()) /
                np.double(either.sum()))
        return dist

    def pwdist_hamming(self, seq1idx, seq2idx):
        """Compute the Hamming distance between two 1-D arrays.

        The Hamming distance between 1-D arrays `u` and `v`, is simply the
        proportion of disagreeing components in `u` and `v`.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]
        return (u != v).mean()

    def pwdist_kulsinski(self, seq1idx, seq2idx):
        """Compute the Kulsinski dissimilarity between two boolean 1-D arrays.

        Returns:
            distance value (double)

        """
        u = self[seq1idx]
        v = self[seq2idx]
        n = float(len(u))
        (_nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
        return (ntf + nft - ntt + n) / (ntf + nft + n)
199 |
200 |
def main():
    """Print the Jaccard distance matrix of three example proteins."""
    from .utils.seqrecords import SeqRecords
    from . import word_vector
    from . import word_pattern
    from .utils import distmatrix

    records = SeqRecords()
    examples = (('seq1', 'MKSTGWHF'),
                ('seq2', 'MKSSSSTGWGWG'),
                ('seq3', 'MKSTLKNGTEQ'))
    for seqid, seq in examples:
        records.add(seqid, seq)

    # Words of size 2, turned into presence/absence vectors.
    pattern = word_pattern.create(records.seq_list, 2)
    presence = word_vector.Bools(records.length_list, pattern)
    dist = Distance(presence, 'jaccard')
    dist_matrix = distmatrix.create(records.id_list, dist)
    dist_matrix.display()
217 |
218 |
# Call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
221 |
--------------------------------------------------------------------------------
/alfpy/word_d2.py:
--------------------------------------------------------------------------------
1 | """This module computes distance between DNA/protein sequences based on
2 | the d2 metric.
3 |
4 | References:
5 | 1. Hide, Burke, Davison (1994) J Comput Biol 1:199-215.
6 | doi: 10.1089/cmb.1994.1.199
7 | 2. Vinga S, Almeida J (2003) Bioinformatics 19:513-523.
8 | doi: 10.1093/bioinformatics/btg005
9 |
10 | """
11 |
12 | import math
13 | import numpy as np
14 |
15 |
class Distance:

    """Pair a list of vectors with the d2 family of distance functions."""

    def __init__(self, vector_list):
        """Store the vectors and select plain d2 as the default metric.

        Args:
            vector_list (list): objects indexable by sequence index.
        """
        self.vector_list = vector_list
        self.pairwise_distance = self.pwdist_d2

    def pwdist_d2(self, seqidx1, seqidx2):
        """Return the sum, over all vectors, of the squared differences
        between the entries of the two sequences."""
        return sum(
            np.sum((vec[seqidx1] - vec[seqidx2]) ** 2)
            for vec in self.vector_list
        )

    def pwdist_d2_squareroot(self, seqidx1, seqidx2):
        """Return the square root of ``pwdist_d2`` for the same pair."""
        squared = self.pwdist_d2(seqidx1, seqidx2)
        return math.sqrt(squared)

    def set_disttype(self, disttype):
        """Select the ``pwdist_<disttype>`` method as pairwise_distance.

        Raises:
            ValueError: if no such method exists.
        """
        method_name = 'pwdist_{}'.format(disttype)
        try:
            chosen = getattr(self, method_name)
        except AttributeError:
            # Method does not exist.
            raise ValueError('unknown disttype "{}"'.format(disttype))
        self.pairwise_distance = chosen
42 |
43 |
def main():
    """Print the d2 distance matrix of the example sequences, using
    weighted word counts for word sizes 1 to 5."""
    from .utils.seqrecords import main
    from .utils.data import seqcontent
    from .utils import distmatrix
    from . import word_pattern
    from . import word_vector

    seq_records = main()

    # One word pattern per word size 1..5.
    patterns = [word_pattern.create(seq_records.seq_list, size)
                for size in range(1, 5 + 1)]

    # Plain counts are built as well, although only the weighted
    # counts are handed to Distance below.
    counts = [word_vector.Counts(seq_records.length_list, pattern)
              for pattern in patterns]

    weights = seqcontent.get_weights('dna')
    weightmodel = word_vector.WeightModel(weights)
    countsweight = [
        word_vector.CountsWeight(seq_records, pattern, weightmodel)
        for pattern in patterns
    ]
    dist = Distance(countsweight)
    dist_matrix = distmatrix.create(seq_records.id_list, dist)
    dist_matrix.display()
72 |
73 |
# Call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
76 |
--------------------------------------------------------------------------------
/alfpy/word_rtd.py:
--------------------------------------------------------------------------------
1 | """Return Time Distribution distance (RTD)
2 |
3 | In contrast to other word-based measures, RTD accounts for the words'
4 | relative orders. Although originally presented for DNA sequences, the
5 | implementation handles proteins as well.
6 |
7 | Return time can be defined as the time required for the reappearance of a
8 | particular state without its appearance within the epoch. The `return time`
9 | in the context of nucleotide sequence can be defined as the number of
10 | nucleotides between the successive appearances of a particular nucleotide(s)
11 | or k-mer. The frequency distribution of those RTs for a particular k-mer is
12 | referred to as the return time distribution (RTD) of that k-mer.
13 |
14 | References:
15 | 1. Kolekar, Kale, Kulkarni-Kale (2012) Mol Phylogenet Evol 65 510-522
16 | doi: http://dx.doi.org/10.1016/j.ympev.2012.07.003.
17 |
18 | """
19 |
20 | import numpy as np
21 | from .utils import distance
22 |
23 |
def calc_rtd(word_positions):
    """Compute return time distribution (RTD) of a given word.

    The return times are the differences between consecutive entries of
    `word_positions`.

    Args:
        word_positions (list) : list of sequence positions of a given word

    Returns:
        tuple: (mean, standard deviation) of the return times, or
        (0.0, 0.0) when fewer than two positions are given.

    Examples:
        >>> seq = 'CTACACAACTTTGCGGGTAGCCGGAAACATTGTGAATGCGGTGAACA'
        >>> apos = [i for i, nt in enumerate(seq) if nt == 'A']
        >>> print(apos)
        [2, 4, 6, 7, 18, 24, 25, 26, 28, 34, 35, 43, 44, 46]
        >>> mean, std = calc_rtd(apos)
        >>> print(round(mean, 4), round(std, 4))
        3.3846 3.151

    """
    if len(word_positions) < 2:
        # A single occurrence (or none) has no return time.
        return 0.0, 0.0
    gaps = [nxt - prev
            for prev, nxt in zip(word_positions, word_positions[1:])]
    return np.mean(gaps), np.std(gaps)
52 |
53 |
def create_vector(seqcount, pattern):
    """Build the matrix of sequence-representing RTD vectors.

    For every word of the pattern two columns are filled: the mean and
    the standard deviation returned by `calc_rtd` for the word's
    positions in each sequence. Cells of sequences without an entry for
    a word stay 0.

    Args:
        seqcount (int): number of sequences
        pattern (obj: word_pattern.Pattern)

    Returns:
        ndarray: matrix of RTD vectors
            (shape: number of seqs, doubled number of words)

    """
    word_total = len(pattern.pat_list)
    data = np.zeros(shape=(seqcount, word_total * 2))
    for wordidx in range(word_total):
        positions_by_seq = pattern.pos_list[wordidx]
        mean_col = wordidx * 2
        std_col = mean_col + 1
        for seqidx in positions_by_seq:
            mean, std = calc_rtd(positions_by_seq[seqidx])
            data[seqidx, mean_col] = mean
            data[seqidx, std_col] = std
    return data
75 |
76 |
class Distance(distance.Distance):
    """RTD distance: adds nothing to the inherited distance.Distance."""
79 |
80 |
def main():
    """Print the RTD-based 'google' distance matrix of the example
    sequences for words of size 2."""
    from .utils.seqrecords import main
    from . import word_pattern
    from .utils import distmatrix

    records = main()
    pattern = word_pattern.create(records.seq_list, 2, True)
    rtd_vectors = create_vector(records.count, pattern)
    dist = Distance(rtd_vectors, 'google')
    dist_matrix = distmatrix.create(records.id_list, dist)
    dist_matrix.display()
92 |
93 |
# Call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
96 |
--------------------------------------------------------------------------------
/alfpy/word_sets_distance.py:
--------------------------------------------------------------------------------
1 | """Distance methods measuring dissimilarity between sets of words.
2 |
3 | These methods are also implemented in numpy and provided in the
4 | `word_bool_distance` module. This module provides faster
5 | implementations based on Python sets.
6 | """
7 |
8 | from .utils import distance
9 |
10 |
def _getwords(seq, word_size):
    """Collect the distinct words of a given size found in a sequence.

    Args:
        seq (str)
        word_size (int): >= 1

    Returns:
        set: every substring of length `word_size` that occurs in `seq`
        (empty when the sequence is shorter than `word_size`).

    Example:
        >>> seq = 'ATGCGTA'
        >>> print(sorted(_getwords(seq, 2)))
        ['AT', 'CG', 'GC', 'GT', 'TA', 'TG']

    """
    last_start = len(seq) - word_size
    return {seq[start:start + word_size] for start in range(0, last_start + 1)}
30 |
31 |
class Distance(distance.Distance):
    """Combine vector data with pairwise distance methods that measures
    dissimilarity between sets."""

    def __init__(self, seq_records, word_size, disttype='jaccard'):
        """Create an instance of Distance

        One set of words is built per sequence and stored in
        ``self._vector``; the distance method is then selected with the
        inherited ``set_disttype``.

        Args:
            seq_records (SeqRecords obj)
            word_size (int): >= 1
            disttype (str): name handed to ``set_disttype``

        """
        self._vector = [_getwords(s, word_size) for s in seq_records.seq_list]
        self.set_disttype(disttype)

    def pwdist_jaccard(self, seq1idx, seq2idx):
        """Jaccard distance is complementary to the Jaccard coefficient
        and is obtained by subtracting the Jaccard coefficient from 1.

        Two empty word sets are treated as identical (distance 0.0).
        """
        s1 = self[seq1idx]
        s2 = self[seq2idx]
        union_size = len(s1 | s2)
        if not union_size:
            # Both sets are empty (e.g. sequences shorter than the word
            # size): avoid dividing by zero.
            return 0.0
        return 1 - len(s1 & s2) / float(union_size)

    def pwdist_dice(self, seq1idx, seq2idx):
        """Sorensen-Dice coefficient (Czekanowski's binary index)

        Two empty word sets are treated as identical (distance 0.0).
        """
        s1 = self[seq1idx]
        s2 = self[seq2idx]
        total = len(s1) + len(s2)
        if not total:
            # Both sets are empty: avoid dividing by zero.
            return 0.0
        return 1 - (2 * len(s1 & s2) / float(total))

    def pwdist_hamming(self, seq1idx, seq2idx):
        """Hamming distance measures the number of words which are in either
        of the sets and not in their intersection.

        """
        s1 = self[seq1idx]
        s2 = self[seq2idx]
        return len(s1.symmetric_difference(s2))
68 |
69 |
def main():
    """Print the set-based Jaccard distance matrix of three example
    proteins for words of size 2."""
    from .utils.seqrecords import SeqRecords
    from .utils import distmatrix

    records = SeqRecords()
    examples = (('seq1', 'MKSTGWHF'),
                ('seq2', 'MKSSSSTGWGWG'),
                ('seq3', 'MKSTLKNGTEQ'))
    for seqid, seq in examples:
        records.add(seqid, seq)
    dist = Distance(records, 2, 'jaccard')
    dist_matrix = distmatrix.create(records.id_list, dist)
    dist_matrix.display()
81 |
# Call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
84 |
--------------------------------------------------------------------------------
/bin/calc_bbc.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import bbc
9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.utils.data.seqcontent import get_alphabet
12 | from alfpy.version import __version__
13 |
14 |
def get_parser():
    """Build the command-line parser of ``calc_bbc.py``.

    Required arguments are ``--fasta/-f`` (opened for reading) and
    ``--molecule/-m``; optional ones are ``--k/-k`` (default 10),
    ``--out/-o`` and ``--outfmt`` (default "phylip").

    When the script is started without any argument, the usage line is
    printed and the program exits.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distance between DNA/protein sequences
        based on Base-Base Correlation (BBC).''',
        add_help=False, prog='calc_bbc.py'
    )
    group = parser.add_argument_group('REQUIRED ARGUMENTS')
    group.add_argument('--fasta', '-f',
                       help='input FASTA sequence filename', required=True,
                       type=argparse.FileType('r'), metavar="FILE")
    group.add_argument('--molecule', '-m', choices=['dna', 'rna', 'protein'],
                       help='choose sequence alphabet', required=True)

    group = parser.add_argument_group('OPTIONAL ARGUMENTS')
    group.add_argument('--k', '-k', help='''maximum distance to observe
                       correlation between bases [default: %(default)s]''',
                       type=int, default=10, metavar="INT")
    group.add_argument('--out', '-o', help="output filename",
                       metavar="FILE")
    group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                       default='phylip',
                       help='distances output format [default: %(default)s]')

    group = parser.add_argument_group("OTHER OPTIONS")
    # add_help=False above: the help option is declared here so that it
    # is listed in this group.
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    if not sys.argv[1:]:
        # No argument at all: show the short usage and stop.
        parser.print_usage()
        parser.exit()
    return parser
49 |
50 |
def validate_args(parser):
    """Parse the command line and attach the alphabet of the molecule.

    Args:
        parser (argparse.ArgumentParser)

    Returns:
        The parsed arguments, with ``alphabet`` set to the result of
        ``get_alphabet(args.molecule)``. If that lookup raises KeyError,
        ``parser.error`` is called with an "Unknown alphabet" message.
    """
    args = parser.parse_args()
    try:
        alphabet = get_alphabet(args.molecule)
    except KeyError:
        parser.error("Unknown alphabet {}".format(args.molecule))
    else:
        args.alphabet = alphabet
    return args
58 |
59 |
def main():
    """Compute the BBC distance matrix for the given FASTA file.

    The matrix is written to the file named by ``--out`` when given,
    otherwise it is displayed on the screen, in the ``--outfmt`` format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    vector = bbc.create_vectors(seq_records, args.k, alphabet=args.alphabet)
    dist = bbc.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
76 |
# Call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
79 |
--------------------------------------------------------------------------------
/bin/calc_fcgr.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import fcgr
9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 |
13 |
def get_parser():
    """Build the command-line parser of ``calc_fcgr.py``.

    Required arguments are ``--fasta/-f`` (opened for reading) and
    ``--word_size/-w``; output is controlled by ``--out/-o`` and
    ``--outfmt`` (default "phylip").

    When the script is started without any argument, the usage line is
    printed and the program exits.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distances between DNA sequences based on
        Frequency Chaos Game Representation (FCGR) patterns of
        word occurrences.''',
        add_help=False, prog='calc_fcgr.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f',
                          help='input FASTA sequence filename', required=True,
                          type=argparse.FileType('r'), metavar="FILE")
    required.add_argument('--word_size', '-w', required=True,
                          help='word size', type=int)

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', help="output filename",
                        metavar="FILE")
    output.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                        default='phylip',
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    no_arguments = len(sys.argv[1:]) == 0
    if no_arguments:
        # Only the short usage is shown, not the full help.
        parser.print_usage()
        parser.exit()
    return parser
46 |
47 |
def validate_args(parser):
    """Parse the command line and reject a word size smaller than 1.

    Returns the parsed namespace. An invalid ``--word_size`` is reported
    through ``parser.error``.
    """
    args = parser.parse_args()
    size_is_valid = args.word_size >= 1
    if not size_is_valid:
        parser.error('--word_size must be >= 1')
    return args
53 |
54 |
def main():
    """Run the script: parse arguments, build FCGR vectors, output distances.

    The distance matrix is written to ``args.out`` when given, otherwise it
    is displayed in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    vector = fcgr.create_vectors(seq_records, args.word_size)
    dist = fcgr.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
75 |
--------------------------------------------------------------------------------
/bin/calc_graphdna.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import graphdna
9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 |
13 |
def get_parser():
    """Build the command-line parser for calc_graphdna.py.

    If the script was started without any command-line arguments, the usage
    line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_graphdna.py', add_help=False,
        description='''Calculate distance between DNA sequences based on
        the two-dimensional (2D) graphical DNA curve''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional.add_argument('--vector', '-v', default='2DNV',
                          choices=['2DSV', '2DNV', '2DMV'],
                          help='vector type [default: %(default)s]')
    optional.add_argument('--ndim', '-n', metavar='N', type=int, default=10,
                          help='''number of dimensions representing a sequence.
                          (required if --vector 2DMV) [default: %(default)s]''')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE", help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [default: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
51 |
52 |
def validate_args(parser):
    """Parse the command line and validate ``--ndim`` for 2DMV vectors.

    ``--ndim`` is only checked when ``--vector 2DMV`` is selected, because
    ``main`` passes it to the vector builder only in that case. It must then
    be present and at least 1; violations are reported through
    ``parser.error``.

    Returns the parsed namespace.
    """
    args = parser.parse_args()
    if args.vector == '2DMV':
        if args.ndim is None:
            parser.error("--vector 2DMV requires the --ndim")
        # The parser supplies a default, so the real risk is a non-positive
        # value given explicitly on the command line.
        if args.ndim < 1:
            parser.error("--ndim must be >= 1")
    return args
60 |
61 |
def main():
    """Run the script: parse arguments, build graph vectors, output distances.

    The vector builder is chosen from ``args.vector`` ('2DSV', '2DNV' or,
    otherwise, the 2DM variant, which also receives ``args.ndim``). The
    distance matrix is written to ``args.out`` when given, otherwise it is
    displayed in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    if args.vector == '2DSV':
        vector = graphdna.create_2DSGraphVectors(seq_records)
    elif args.vector == '2DNV':
        vector = graphdna.create_2DNGraphVectors(seq_records)
    else:
        vector = graphdna.create_2DMGraphVectors(seq_records, args.ndim)
    dist = graphdna.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
86 |
--------------------------------------------------------------------------------
/bin/calc_lempelziv.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import lempelziv
9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 |
13 |
def get_parser():
    """Build the command-line parser for calc_lempelziv.py.

    If the script was started without any command-line arguments, the usage
    line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_lempelziv.py', add_help=False,
        description='''Calculate distance between DNA/protein sequences based
        on Lempel-Ziv complexity.''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    distlist = ['d', 'd_star', 'd1', 'd1_star', 'd1_star2']
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distlist))
    optional.add_argument('--distance', '-d', choices=distlist,
                          default="d1_star2", metavar='',
                          help=distance_help)

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
50 |
51 |
def validate_args(parser):
    """Parse the command line and return the namespace unchanged.

    No extra validation is performed beyond what the parser itself does.
    """
    return parser.parse_args()
55 |
56 |
def main():
    """Run the script: parse arguments, compute distances, output the matrix.

    Distances come from ``lempelziv.Distance`` using the measure named by
    ``args.distance``. The matrix is written to ``args.out`` when given,
    otherwise it is displayed in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = lempelziv.Distance(seq_records, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
75 |
--------------------------------------------------------------------------------
/bin/calc_ncd.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import ncd
9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 |
13 |
def get_parser():
    """Build the command-line parser for calc_ncd.py.

    If the script was started without any command-line arguments, the usage
    line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_ncd.py', add_help=False,
        description='''Calculate distances between DNA/protein sequences based
        on Normalized Compression Distance (NCD).''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
44 |
45 |
def validate_args(parser):
    """Parse the command line and return the namespace unchanged.

    No extra validation is performed beyond what the parser itself does.
    """
    return parser.parse_args()
49 |
50 |
def main():
    """Run the script: parse arguments, compute distances, output the matrix.

    Distances come from ``ncd.Distance``. The matrix is written to
    ``args.out`` when given, otherwise it is displayed in the requested
    output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = ncd.Distance(seq_records)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
69 |
--------------------------------------------------------------------------------
/bin/calc_wmetric.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import wmetric
9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.utils.data import subsmat
12 | from alfpy.version import __version__
13 |
14 |
def get_parser():
    """Build the command-line parser for calc_wmetric.py.

    The choices for ``--matrix`` are taken from ``subsmat.list_subsmats()``.
    If the script was started without any command-line arguments, the usage
    line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_wmetric.py', add_help=False,
        description='''Calculate distances between protein sequences based
        on W-metric (Wm).''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    matrix_names = subsmat.list_subsmats()
    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    matrix_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(matrix_names))
    optional.add_argument('--matrix', '-m', choices=matrix_names,
                          default="blosum62", metavar='',
                          help=matrix_help)

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
51 |
52 |
def validate_args(parser):
    """Parse the command line and replace the matrix name by the matrix.

    ``args.matrix`` is looked up with ``subsmat.get`` and overwritten with the
    result. A ``KeyError`` raised by the lookup is reported through
    ``parser.error``.
    """
    args = parser.parse_args()
    try:
        matrix = subsmat.get(args.matrix)
    except KeyError:
        parser.error("Unknown matrix {}".format(args.matrix))
    else:
        args.matrix = matrix
    return args
60 |
61 |
def main():
    """Run the script: parse arguments, compute distances, output the matrix.

    Distances come from ``wmetric.Distance`` using the matrix stored on
    ``args.matrix`` by ``validate_args``. The result is written to
    ``args.out`` when given, otherwise it is displayed in the requested
    output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = wmetric.Distance(seq_records, args.matrix)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
80 |
--------------------------------------------------------------------------------
/bin/calc_word_bool.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import word_bool_distance
9 | from alfpy import word_pattern
10 | from alfpy import word_vector
11 | from alfpy.utils import distmatrix
12 | from alfpy.utils import seqrecords
13 | from alfpy.version import __version__
14 |
15 |
def get_parser():
    """Build the command-line parser for calc_word_bool.py.

    ``--word_size`` and ``--word_pattern`` are registered as mutually
    exclusive. The choices for ``--distance`` come from
    ``word_bool_distance.Distance.get_disttypes()``. If the script was started
    without any command-line arguments, the usage line is printed and
    ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_word_bool.py', add_help=False,
        description='''Calculate distances between DNA/protein sequences based
        on boolean 1-D vectors of word counting occurrences.''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    choice = parser.add_argument_group(' Choose between the two options')
    exclusive = choice.add_mutually_exclusive_group()
    exclusive.add_argument('--word_size', '-s', metavar="N", type=int,
                           help='word size for creating word patterns')
    exclusive.add_argument('--word_pattern', '-w', metavar="FILE",
                           type=argparse.FileType('r'),
                           help='input filename w/ pre-computed word patterns')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    distlist = word_bool_distance.Distance.get_disttypes()
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distlist))
    optional.add_argument('--distance', '-d', choices=distlist,
                          default="jaccard", metavar='',
                          help=distance_help)

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
62 |
63 |
def validate_args(parser):
    """Parse the command line and check the word-pattern source.

    Exactly one source of word patterns is needed: ``--word_size`` (which must
    be >= 1) or ``--word_pattern``. Violations are reported through
    ``parser.error``.

    Returns the parsed namespace.
    """
    args = parser.parse_args()
    # Compare against None rather than truthiness so that an explicit
    # ``--word_size 0`` gets the size error instead of "Specify either...".
    if args.word_size is not None:
        if args.word_size < 1:
            parser.error('Word size must be >= 1.')
    elif not args.word_pattern:
        parser.error("Specify either: --word_size or --word_pattern.")
    return args
74 |
75 |
def main():
    """Run the script: parse arguments, build bool vectors, output distances.

    Word patterns are created from the sequences when ``--word_size`` is
    given, otherwise they are read from ``--word_pattern``. The distance
    matrix is written to ``args.out`` when given, otherwise it is displayed
    in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    bools = word_vector.Bools(seq_records.length_list, p)
    dist = word_bool_distance.Distance(bools, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
100 |
--------------------------------------------------------------------------------
/bin/calc_word_cv.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import word_vector
9 | from alfpy import word_distance
10 | from alfpy.utils import distmatrix
11 | from alfpy.utils import seqrecords
12 | from alfpy import word_pattern
13 | from alfpy.version import __version__
14 |
15 |
def get_parser():
    """Build the command-line parser for calc_word_cv.py.

    ``--word_size`` and ``--word_patterns`` (three files) are registered as
    mutually exclusive. If the script was started without any command-line
    arguments, the usage line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_word_cv.py', add_help=False,
        description='''Calculate compositional distances between DNA/protein
        sequences based on word (of length k) occurrences using a Markov model
        of k-2.''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    choice = parser.add_argument_group(' Choose between the two options')
    exclusive = choice.add_mutually_exclusive_group()
    exclusive.add_argument('--word_size', '-s', type=int, metavar="k",
                           help='''word size (k-mer) for creating word patterns
                           (must be >= 3)''')
    exclusive.add_argument('--word_patterns', '-w', nargs=3, metavar="FILE",
                           type=argparse.FileType('r'),
                           help='''3 input word pattern files (k-, [k-1]-,
                           [k-2]-mers)''')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
58 |
59 |
def validate_args(parser):
    """Parse the command line and validate the word-pattern source.

    Either ``--word_size`` (must be >= 3) or ``--word_patterns`` is needed.
    When pattern files are given, each one is read with ``word_pattern.read``
    and the word lengths must follow k, k-1, k-2; the parsed patterns then
    replace the file handles on ``args.word_patterns``. Violations are
    reported through ``parser.error``.

    Returns the parsed namespace.
    """
    args = parser.parse_args()
    # Compare against None rather than truthiness so that an explicit
    # ``--word_size 0`` gets the size error instead of "Specify either...".
    if args.word_size is not None:
        if args.word_size < 3:
            parser.error('Word size must be >= 3')

    elif args.word_patterns:
        patterns = []
        for handle in args.word_patterns:
            try:
                patterns.append(word_pattern.read(handle))
            except Exception:
                parser.error('Invalid format for word pattern: {0}'.format(
                    handle.name))

        if len(patterns) == 3:
            # The three files must hold k-, (k-1)- and (k-2)-mers, in order.
            k, k1, k2 = [len(p.pat_list[0]) for p in patterns]
            if not (k == k1 + 1 == k2 + 2):
                parser.error(
                    '''Word pattern lengths do not follow k, k-1, k-2''')

        args.word_patterns = patterns
    else:
        # The option is spelled --word_patterns (plural) in the parser.
        parser.error("Specify either: --word_size or --word_patterns.")
    return args
87 |
88 |
def main():
    """Run the script: parse arguments, build compositions, output distances.

    Uses the three word patterns from ``--word_patterns`` when given;
    otherwise creates them from the sequences for word sizes k, k-1 and k-2.
    The distance matrix is written to ``args.out`` when given, otherwise it
    is displayed in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_patterns:
        patterns = args.word_patterns
    else:
        # Word sizes in descending order: k, k-1, k-2.
        patterns = [word_pattern.create(seq_records.seq_list, size)
                    for size in range(args.word_size, args.word_size - 3, -1)]

    compos = word_vector.Composition(seq_records.length_list, *patterns)
    dist = word_distance.Distance(compos, 'angle_cos_diss')
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
117 |
--------------------------------------------------------------------------------
/bin/calc_word_d2.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import word_d2
9 | from alfpy import word_pattern
10 | from alfpy import word_vector
11 | from alfpy.utils import distmatrix
12 | from alfpy.utils import seqrecords
13 | from alfpy.version import __version__
14 |
15 |
def get_parser():
    """Build the command-line parser for calc_word_d2.py.

    If the script was started without any command-line arguments, the usage
    line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_word_d2.py', add_help=False,
        description='''Calculate d2 distance between DNA/protein sequences based
        on subsequence (words) occurrences.''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional.add_argument('--min_word_size', '-l', metavar="WORD_SIZE",
                          type=int, default=1,
                          help='minimum word size [default: %(default)s]')
    optional.add_argument('--max_word_size', '-u', metavar="WORD_SIZE",
                          type=int, default=3,
                          help='maximum word size [default: %(default)s]')
    veclist = ['counts', 'freqs']
    vector_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(veclist))
    optional.add_argument('--vector', '-v', choices=veclist,
                          default="counts", metavar='',
                          help=vector_help)
    optional.add_argument('--char_weights', '-W', metavar="FILE",
                          type=argparse.FileType('r'),
                          help='''file w/ weights of background sequence characters
                          (nt/aa)''')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
65 |
66 |
def validate_args(parser):
    """Parse the command line and validate word sizes and character weights.

    ``--min_word_size`` must be at least 1 and strictly smaller than
    ``--max_word_size``. When ``--char_weights`` is given, the file is parsed
    with ``word_vector.read_weightfile`` and the result replaces the file
    handle on ``args.char_weights``. Violations are reported through
    ``parser.error``.

    Returns the parsed namespace.
    """
    args = parser.parse_args()
    # An explicit ``< 1`` also rejects negative sizes, which a plain
    # truthiness test would let through.
    if args.min_word_size < 1:
        parser.error("min_word_size must be greater than 0")
    elif args.min_word_size >= args.max_word_size:
        parser.error("max_word_size must be greater than min_word_size")
    if args.char_weights:
        try:
            weights = word_vector.read_weightfile(args.char_weights)
            args.char_weights = weights
        except Exception:
            # args.char_weights is still the file handle here.
            e = 'Invalid format for --char_weights {0}'.format(
                args.char_weights.name)
            parser.error(e)
    return args
82 |
83 |
def main():
    """Run the script: parse arguments, build word vectors, output distances.

    One word pattern and one vector are created per word size from
    ``--min_word_size`` to ``--max_word_size`` inclusive. Weighted vector
    classes are used when character weights were supplied. The distance
    matrix is written to ``args.out`` when given, otherwise it is displayed
    in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    patterns = [word_pattern.create(seq_records.seq_list, size)
                for size in range(args.min_word_size,
                                  args.max_word_size + 1)]

    if args.char_weights is not None:
        weightmodel = word_vector.WeightModel(char_weights=args.char_weights)
        vecklas = {'counts': word_vector.CountsWeight,
                   'freqs': word_vector.FreqsWeight}[args.vector]
        kwargs = {'seq_lengths': seq_records.length_list,
                  'weightmodel': weightmodel}
    else:
        vecklas = {'counts': word_vector.Counts,
                   'freqs': word_vector.Freqs}[args.vector]
        kwargs = {'seq_lengths': seq_records.length_list}
    vecs = [vecklas(patterns=p, **kwargs) for p in patterns]

    dist = word_d2.Distance(vecs)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
123 |
--------------------------------------------------------------------------------
/bin/calc_word_ffp.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import word_vector
9 | from alfpy import word_distance
10 | from alfpy.utils import distmatrix
11 | from alfpy.utils import seqrecords
12 | from alfpy import word_pattern
13 | from alfpy.utils.data import seqcontent
14 | from alfpy.version import __version__
15 |
16 |
def get_parser():
    """Build the command-line parser for calc_word_ffp.py.

    ``--word_size`` and ``--word_pattern`` are registered as mutually
    exclusive. The choices for ``--distance`` come from
    ``word_distance.Distance.get_disttypes()``. If the script was started
    without any command-line arguments, the usage line is printed and
    ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_word_ffp.py', add_help=False,
        description='''Calculate distance between DNA/protein sequences based
        on feature frequency profiles (FFPs) of words.''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')
    required.add_argument('--molecule', '-m', required=True,
                          choices=['dna', 'rna', 'protein'],
                          help='choose sequence alphabet')

    choice = parser.add_argument_group(' Choose between the two options')
    exclusive = choice.add_mutually_exclusive_group()
    exclusive.add_argument('--word_size', '-s', metavar="N", type=int,
                           help='word size for creating word patterns')
    exclusive.add_argument('--word_pattern', '-w', metavar="FILE",
                           type=argparse.FileType('r'),
                           help='input filename w/ pre-computed word patterns')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    distlist = word_distance.Distance.get_disttypes()
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distlist))
    optional.add_argument('--distance', '-d', choices=distlist,
                          default="jsd", metavar='',
                          help=distance_help)
    optional.add_argument('--reduce_alphabet', '-r', action="store_true",
                          help='''reduce the words' nt/aa alphabet to smaller
                          number of symbols''')
    optional.add_argument('--merge_revcomp', '-M', action="store_true",
                          help='''merge together DNA words with their reverse
                          complement words''')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
71 |
72 |
def validate_args(parser):
    """Parse the command line and check pattern source and molecule options.

    Exactly one source of word patterns is needed: ``--word_size`` (which must
    be >= 1) or ``--word_pattern``. ``--merge_revcomp`` cannot be combined
    with ``-m protein``. Violations are reported through ``parser.error``.

    Returns the parsed namespace.
    """
    args = parser.parse_args()
    # Compare against None rather than truthiness so that an explicit
    # ``--word_size 0`` gets the size error instead of "Specify either...".
    if args.word_size is not None:
        if args.word_size < 1:
            parser.error('word size must be >= 1')
    elif not args.word_pattern:
        parser.error("Specify either: --word_size or --word_pattern.")

    if args.molecule == 'protein' and args.merge_revcomp:
        parser.error("Incompatible arguments: -m protein --merge_revcomp")

    return args
87 |
88 |
def main():
    """Run the script: parse arguments, build frequencies, output distances.

    Word patterns are created from the sequences when ``--word_size`` is
    given, otherwise they are read from ``--word_pattern``. They are then
    optionally alphabet-reduced and reverse-complement merged. The distance
    matrix is written to ``args.out`` when given, otherwise it is displayed
    in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    if args.reduce_alphabet:
        p = p.reduce_alphabet(seqcontent.get_reduced_alphabet(args.molecule))
    if args.merge_revcomp:
        p = p.merge_revcomp()

    freqs = word_vector.Freqs(seq_records.length_list, p)

    dist = word_distance.Distance(freqs, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
119 |
--------------------------------------------------------------------------------
/bin/calc_word_rtd.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import word_distance
9 | from alfpy import word_pattern
10 | from alfpy import word_rtd
11 | from alfpy.utils import distmatrix
12 | from alfpy.utils import seqrecords
13 | from alfpy.version import __version__
14 |
15 |
def get_parser():
    """Build the command-line parser for calc_word_rtd.py.

    ``--word_size`` and ``--word_pattern`` are registered as mutually
    exclusive. The choices for ``--distance`` come from
    ``word_distance.Distance.get_disttypes()``. If the script was started
    without any command-line arguments, the usage line is printed and
    ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_word_rtd.py', add_help=False,
        description='''Calculate distances between protein/DNA sequences based
        on Return Time Distribution (RTD) of words\' occurrences and their
        relative orders''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')

    choice = parser.add_argument_group(' Choose between the two options')
    exclusive = choice.add_mutually_exclusive_group()
    exclusive.add_argument('--word_size', '-s', metavar="N", type=int,
                           help='word size for creating word patterns')
    exclusive.add_argument('--word_pattern', '-w', metavar="FILE",
                           type=argparse.FileType('r'),
                           help='input filename w/ pre-computed word patterns')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    distlist = word_distance.Distance.get_disttypes()
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distlist))
    optional.add_argument('--distance', '-d', choices=distlist,
                          default="google", metavar='',
                          help=distance_help)

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
63 |
64 |
def validate_args(parser):
    """Parse the command line and validate the word-pattern source.

    Either ``--word_size`` (must be >= 1) or ``--word_pattern`` is needed.
    A supplied pattern file is read with ``word_pattern.read`` and must have
    a non-empty ``pos_list``; the parsed pattern then replaces the file
    handle on ``args.word_pattern``. Violations are reported through
    ``parser.error``.

    Returns the parsed namespace.
    """
    args = parser.parse_args()
    # Compare against None rather than truthiness so that an explicit
    # ``--word_size 0`` gets the size error instead of "Specify either...".
    if args.word_size is not None:
        if args.word_size < 1:
            parser.error('word size must be >= 1')
    elif args.word_pattern:
        p = word_pattern.read(args.word_pattern)
        if not p.pos_list:
            e = "{0} does not contain info on word positions.\n"
            e += "Please use: create_wordpattern.py with"
            e += " --word_position option."
            parser.error(e.format(args.word_pattern.name))
        else:
            args.word_pattern = p
    else:
        parser.error("Specify either: --word_size or --word_pattern.")
    return args
82 |
83 |
def main():
    """Run the script: parse arguments, build RTD vectors, output distances.

    Word patterns are created from the sequences (with the extra ``True``
    argument to ``word_pattern.create``) when ``--word_size`` is given;
    otherwise the pattern already parsed by ``validate_args`` is used. The
    distance matrix is written to ``args.out`` when given, otherwise it is
    displayed in the requested output format.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size, True)
    else:
        p = args.word_pattern

    vector = word_rtd.create_vector(seq_records.count, p)
    dist = word_rtd.Distance(vector, args.distance)

    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
109 |
--------------------------------------------------------------------------------
/bin/calc_word_sets.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 | from alfpy import word_sets_distance
8 | from alfpy.utils import distmatrix
9 | from alfpy.utils import seqrecords
10 | from alfpy.version import __version__
11 |
12 |
def get_parser():
    """Build the command-line parser for calc_word_sets.py.

    If the script was started without any command-line arguments, the usage
    line is printed and ``parser.exit()`` is called.
    """
    parser = argparse.ArgumentParser(
        prog='calc_word_sets.py', add_help=False,
        description='''Calculate distances between DNA/protein sequences based
        on boolean 1-D vectors of word counting occurrences.''',
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument('--fasta', '-f', metavar="FILE", required=True,
                          type=argparse.FileType('r'),
                          help='input FASTA sequence filename')
    required.add_argument('--word_size', '-s', metavar="N", type=int,
                          required=True,
                          help='word size for creating word patterns')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    distlist = ['dice', 'hamming', 'jaccard']
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distlist))
    optional.add_argument('--distance', '-d', choices=distlist,
                          default="dice", metavar='',
                          help=distance_help)

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # No arguments given: show only the short usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
54 |
55 |
def validate_args(parser):
    """Parse the command line with *parser* and check ``--word_size``.

    Calls ``parser.error`` when the word size is smaller than 1;
    otherwise returns the parsed argument namespace.
    """
    parsed = parser.parse_args()
    if not parsed.word_size >= 1:
        parser.error('Word size must be >= 1.')
    return parsed
61 |
62 |
def main():
    """Run the calc_word_sets command-line workflow.

    Parses and validates the command-line arguments, reads the FASTA
    records, builds the word-sets distance object and the distance
    matrix, and either writes the matrix to ``--out`` or displays it.
    """
    parser = get_parser()
    args = validate_args(parser)

    try:
        seq_records = seqrecords.read_fasta(args.fasta)
    finally:
        # argparse.FileType opened this handle; release it as soon as the
        # records have been read (create_wordpattern.py does the same).
        args.fasta.close()

    dist = word_sets_distance.Distance(seq_records, args.word_size,
                                       args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the output file even when writing
        # the matrix raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
82 |
--------------------------------------------------------------------------------
/bin/create_wordpattern.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # Copyright (c) 2016 Zielezinski A, combio.pl
4 |
5 | import argparse
6 | import sys
7 |
8 | from alfpy import word_pattern
9 | from alfpy.utils import seqrecords
10 | from alfpy.version import __version__
11 |
12 |
def get_parser():
    """Build the command-line parser for create_wordpattern.py.

    Options are registered in four help groups: required arguments,
    optional arguments, Teiresias options and other options.  When the
    script is started without any command-line arguments, the usage line
    is printed and the program exits.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(
        prog='create_wordpattern.py',
        add_help=False,
        description=('Count subsequences (words) of a given length (size)\n'
                     'for each sequence in input FASTA-formatted file.')
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', required=True, metavar="FILE",
        type=argparse.FileType('r'),
        help='input FASTA sequence filename')
    required.add_argument(
        '--word_size', '-w', required=True, metavar="k", type=int,
        help='word size (>=1)')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional.add_argument('--word_position', '-p', action="store_true",
                          help='report word positions in output')
    optional.add_argument('--out', '-o', metavar="FILE",
                          help="output pattern filename")

    teiresias_title = ' Teiresias options'
    teiresias_info = (' more info @ '
                      'https://cm.jefferson.edu/data-tools-downloads/'
                      'teiresias-code/\n')
    teiresias = parser.add_argument_group(teiresias_title, teiresias_info)
    teiresias.add_argument(
        '--teiresias', '-t', action="store_true",
        help=('Teiresias program creates word patterns.\n'
              '[by default: disabled]'))
    teiresias.add_argument(
        '--l', '-l', type=int,
        help='minimum number of literals and/or brackets')
    teiresias.add_argument(
        '--k', '-k', type=int,
        help='minimum support that any word can have')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # With no arguments at all, show only the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
57 |
58 |
def validate_args(parser):
    """Parse the command line with *parser* and validate the options.

    Without ``--teiresias`` only ``--word_size >= 1`` is required.  With
    ``--teiresias`` both ``--l`` and ``--k`` must be given, ``--word_size``
    and ``--l`` must each be at least 2, and ``--l`` may not exceed
    ``--word_size``.  Every violation is reported through
    ``parser.error``.

    Returns:
        The parsed argument namespace.
    """
    opts = parser.parse_args()

    # Plain word counting: a single constraint, then we are done.
    if not opts.teiresias:
        if opts.word_size < 1:
            parser.error("--word_size must be >= 1")
        return opts

    # Teiresias mode: checks run in this fixed order.
    if opts.l is None:
        parser.error("Teiresias requires --l")
    if opts.k is None:
        parser.error("Teiresias requires --k")
    if opts.word_size < 2:
        parser.error("Teiresias requires --word_size to be >= 2")
    if opts.l < 2:
        parser.error("--l must be at least 2")
    if opts.l > opts.word_size:
        parser.error("--word_size must be >= than --l")
    return opts
75 |
76 |
def main():
    """Run the create_wordpattern command-line workflow.

    Parses and validates the command-line arguments, then obtains a word
    pattern either from ``word_pattern.run_teiresias`` (with
    ``--teiresias``) or from ``word_pattern.create``.  The formatted
    pattern is written to ``--out`` when given, otherwise printed.
    """
    parser = get_parser()
    args = validate_args(parser)

    if args.teiresias:
        # Only the file name is handed over, so the handle opened by
        # argparse is closed first.
        args.fasta.close()
        p = word_pattern.run_teiresias(args.fasta.name,
                                       w=args.word_size,
                                       l=args.l,
                                       k=args.k,
                                       output_filename=args.out)
    else:
        try:
            seq_records = seqrecords.read_fasta(args.fasta)
        finally:
            # Close the input handle even when reading fails.
            args.fasta.close()
        p = word_pattern.create(seq_records.seq_list,
                                args.word_size,
                                args.word_position)

    if args.out:
        # The context manager closes the output file even when
        # formatting or writing raises.
        with open(args.out, 'w') as oh:
            oh.write(p.format())
    else:
        print(p.format())


if __name__ == '__main__':
    main()
106 |
--------------------------------------------------------------------------------
/example_data/input/aminoacid.freqs.swissprot.txt:
--------------------------------------------------------------------------------
1 | # UniProtKB/Swiss-Prot protein knowledgebase release 2016_09 statistics
2 | # Release 2016_09 of 05-Oct-16 of UniProtKB/Swiss-Prot contains 552259 sequence entries,
3 | # comprising 197423140 amino acids abstracted from 247204 references.
4 | # http://web.expasy.org/docs/relnotes/relstat.html
5 | A 0.0826
6 | Q 0.0393
7 | L 0.0965
8 | S 0.0659
9 | R 0.0553
10 | E 0.0674
11 | K 0.0583
12 | T 0.0534
13 | N 0.0406
14 | G 0.0708
15 | M 0.0241
16 | W 0.0109
17 | D 0.0546
18 | H 0.0227
19 | F 0.0386
20 | Y 0.0292
21 | C 0.0137
22 | I 0.0594
23 | P 0.0471
24 | V 0.0687
--------------------------------------------------------------------------------
/example_data/input/aminoacid.weights.txt:
--------------------------------------------------------------------------------
1 | # Based on amino acid frequencies
2 | # Weight = 1 / (amino acid frequency * 10)
3 | # Each weight should be greater than 1.
4 | A 1.21065375303
5 | C 7.29927007299
6 | E 1.48367952522
7 | D 1.8315018315
8 | G 1.41242937853
9 | F 2.59067357513
10 | I 1.6835016835
11 | H 4.40528634361
12 | K 1.71526586621
13 | M 4.14937759336
14 | L 1.03626943005
15 | N 2.46305418719
16 | Q 2.54452926209
17 | P 2.12314225053
18 | S 1.51745068285
19 | R 1.80831826401
20 | T 1.87265917603
21 | W 9.17431192661
22 | V 1.45560407569
23 | Y 3.42465753425
--------------------------------------------------------------------------------
/example_data/input/hiv.pep.fasta:
--------------------------------------------------------------------------------
1 | >DENTIST
2 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDIRRAHCNISREKWN
3 | NTLKQVVTKLREQFVNKTIIFTHPSGGDPEIVMHSVNCGGEFFY
4 | >PATIENT_A
5 | VIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNISREKWNNT
6 | LKQVVTKLREQFVNKTIIFNHSSGGDPEIVMHSFNCGGEFFY
7 | >PATIENT_B
8 | FTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKWNNTLEQVKT
9 | KLREQFGNTTIFFNHSSG
10 | >PATIENT_C
11 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAVYATDRIIGDIRQAHCNISREKWN
12 | NTLKQVVTKLREQFVNKTIIFTHPSGGDPEIVMHSVNCGGEFFY
13 | >PATIENT_D
14 | EVVIRSANFSDNAKTIIVQLNKSVKITCIRPSNNTRQSIPIGPGKAVYATGQIIGDIRQAHCNLSEAKWN
15 | NTLAQIVKKLKEQFRNRTIVFNQSSGGDPEIVMHSFNCGGEFFYC
16 | >PATIENT_E
17 | ASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGEKWNNTLKQVVTKLREQFGDKTIIF
18 | NHSSGGDPEIVM
19 | >PATIENT_F
20 | EVVIRSENFTDNVKTIIVQLNESVQINCTRPNNNTRKSIHIAPGRAFYATGEIIGDIRQAHCNLSSTKWN
21 | NTLRQIAKKLKEQFGNKTIVFNQSSGGDPEIVMHSFNCGGEFFYC
22 | >PATIENT_G
23 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRRGIHIGPGRAFYATDRIVGDIRQAYCNISREKWN
24 | NTLKQVVAKLREQFVNKTIIFNHSSGGDPEIVMHSVNCGGEFFYCNT
25 | >PATIENT_H
26 | LAEGEVIIRSENFTDNAKTIIVQLNATINITCERPHNNTRKSIHIGPGRAFFATGDITGDIRQAHCNLSK
27 | GDWDNALKQIVTKLGEQFGRNKTIVFKQSSGGDPEIIMHSFNCAGEFSYCN
28 | >DENTIST_WIFE
29 | NFTNNAKTIIVQLNTSVEINCTRPSNNTSKGIHIGPGRAFHATDRITGDIRQAHCNISKAKWNDTLQQVV
30 | KKLREQFGGNKTIVFNQSSGGDPEIVLHSFNCGGEFFYCNTT
31 | >Local_Control_1
32 | FTDNAKTIIVQLKNSVVINCTRPNNNTRRSVHIGPGSSLYTTDIIGDIRQAHCNLSRANWNKTLEQIVTK
33 | LGEQFGNNTTIVFNSSSGG
34 | >Local_Control_2
35 | SENFTDNTKTIIVQLNTSVTINCTRPGNNTRKSITMGPGKVFYAGEIIGDIRQAHCNLSRAAWNDTLKQI
36 | VGKLQEQFGNKTIVFNHSSGGDPEIVMHSF
37 | >Local_Control_3
38 | RSENFTNNAKIIIVHLNKTVNITCTRPNNNTRRSIPIGPGKAFYTTDIIGNIRQAHCNLSRAEWNNTLKQ
39 | IVKKLREQFKNKTIVFNHSSGGDPEIVMHSF
40 | >Local_Control_4
41 | LAEEEVVIRSENFTNNAKIIIVHLNKTVNITCTRPNNNTRRSIPMGPGKAFYTTEIIGNIRQAHCNLSKA
42 | EWNNTLRQIVKKLRDNLRIKQ
43 | >Local_Control_5
44 | LAEKEVVIRSENFTDNTKTIIIQLNTSVTINCTRPGNNTRKSITMGPGKVFYAGEIIGDIRQAHCNLSRT
45 | AWNDTLKQIVGKLQEQFGNKTIVFNHSSGGDPEIVMHSF
46 |
--------------------------------------------------------------------------------
/example_data/input/sample.dna.fasta:
--------------------------------------------------------------------------------
1 | >seq1
2 | AACGTACCATTGAACGTACCATTGAACGTACCATTG
3 | >seq2
4 | CTAGGGGACTTATCTAGGGGACTTATCTAGGGGACTTAT
5 | >seq3
6 | CTAGGGAAAATTCTAGGGAAAATTCTAGGGAAAATT
7 |
--------------------------------------------------------------------------------
/example_data/input/sample.pep.fasta:
--------------------------------------------------------------------------------
1 | >seq1
2 | MKSTGWHF
3 | >seq2
4 | MKSSSSTGWGWG
5 | >seq3
6 | MKSTLKNGTEQ
--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.1mer:
--------------------------------------------------------------------------------
1 | 2693 11 A 0:333 1:132 2:133 3:130 4:132 5:342 6:131 7:346 8:352 9:351 10:311
2 | 1717 11 T 0:226 1:86 2:83 3:81 4:83 5:236 6:87 7:232 8:225 9:216 10:162
3 | 1650 11 C 0:219 1:69 2:71 3:73 4:72 5:210 6:70 7:210 8:213 9:217 10:226
4 | 1337 11 G 0:188 1:60 2:59 3:63 4:59 5:178 6:61 7:166 8:172 9:172 10:159
--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.2mer:
--------------------------------------------------------------------------------
1 | 1096 11 AA 0:139 1:57 2:60 3:58 4:59 5:138 6:57 7:129 8:140 9:136 10:123
2 | 697 11 TA 0:83 1:31 2:36 3:30 4:35 5:102 6:36 7:99 8:94 9:95 10:56
3 | 559 11 AG 0:70 1:28 2:26 3:27 4:26 5:74 6:29 7:75 8:68 9:72 10:64
4 | 550 11 AC 0:66 1:23 2:22 3:22 4:24 5:67 6:21 7:74 8:69 9:75 10:87
5 | 516 11 CA 0:64 1:24 2:16 3:20 4:17 5:53 6:18 7:71 8:74 9:72 10:87
6 | 487 11 AT 0:58 1:24 2:25 3:23 4:23 5:63 6:24 7:68 8:75 9:68 10:36
7 | 470 11 CC 0:67 1:17 2:22 3:24 4:23 5:57 6:20 7:60 8:62 9:61 10:57
8 | 464 11 TT 0:65 1:26 2:23 3:24 4:24 5:61 6:24 7:68 8:54 9:50 10:45
9 | 456 11 CT 0:57 1:23 2:24 3:21 4:24 5:70 6:26 7:54 8:51 9:57 10:49
10 | 379 11 GA 0:47 1:19 2:20 3:21 4:20 5:49 6:19 7:47 8:44 9:48 10:45
11 | 342 11 GC 0:49 1:11 2:12 3:12 4:11 5:49 6:15 7:42 8:43 9:48 10:50
12 | 307 11 GT 0:46 1:13 2:11 3:13 4:12 5:41 6:13 7:41 8:44 9:41 10:32
13 | 305 11 GG 0:46 1:16 2:15 3:16 4:15 5:39 6:14 7:36 8:41 9:35 10:32
14 | 285 11 TC 0:36 1:18 2:15 3:15 4:14 5:37 6:14 7:34 8:39 9:32 10:31
15 | 267 11 TG 0:42 1:11 2:9 3:12 4:10 5:36 6:12 7:30 8:37 9:38 10:30
16 | 206 11 CG 0:30 1:5 2:9 3:8 4:8 5:29 6:6 7:25 8:26 9:27 10:33
--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.3mer:
--------------------------------------------------------------------------------
1 | 462 11 AAA 0:56 1:24 2:29 3:26 4:29 5:60 6:25 7:47 8:56 9:57 10:53
2 | 322 11 TAA 0:43 1:14 2:16 3:13 4:15 5:46 6:14 7:49 8:48 9:44 10:20
3 | 243 11 AAG 0:38 1:11 2:10 3:11 4:9 5:35 6:12 7:33 8:28 9:28 10:28
4 | 215 11 AAC 0:26 1:13 2:12 3:13 4:14 5:25 6:10 7:24 8:22 9:26 10:30
5 | 210 11 CTA 0:24 1:10 2:13 3:10 4:13 5:33 6:13 7:24 8:23 9:27 10:20
6 | 196 11 TTA 0:25 1:9 2:9 3:8 4:10 5:30 6:8 7:28 8:24 9:28 10:17
7 | 195 11 ATA 0:22 1:8 2:10 3:7 4:9 5:23 6:11 7:32 8:31 9:28 10:14
8 | 176 11 AAT 0:19 1:9 2:9 3:8 4:7 5:18 6:10 7:25 8:34 9:25 10:12
9 | 171 11 ACA 0:21 1:6 2:3 3:4 4:4 5:19 6:4 7:25 8:23 9:24 10:38
10 | 169 11 CAA 0:23 1:10 2:6 3:9 4:6 5:15 6:7 7:19 8:22 9:20 10:32
11 | 168 11 AGC 0:24 1:5 2:4 3:4 4:4 5:27 6:8 7:23 8:22 9:25 10:22
12 | 165 11 ACC 0:20 1:9 2:10 3:10 4:11 5:16 6:8 7:20 8:18 9:23 10:20
13 | 165 11 CAC 0:22 1:4 2:3 3:3 4:3 5:17 6:3 7:25 8:27 9:23 10:35
14 | 148 11 CCC 0:25 1:5 2:8 3:10 4:8 5:18 6:7 7:18 8:17 9:20 10:12
15 | 143 11 ACT 0:15 1:7 2:8 3:6 4:8 5:21 6:8 7:20 8:16 9:15 10:19
16 | 143 11 AGA 0:14 1:10 2:9 3:10 4:9 5:16 6:8 7:17 8:13 9:15 10:22
17 | 140 11 ATT 0:16 1:8 2:7 3:6 4:7 5:21 6:7 7:18 8:22 9:19 10:9
18 | 140 11 GAA 0:17 1:8 2:9 3:9 4:9 5:17 6:10 7:14 8:14 9:15 10:18
19 | 137 11 CCA 0:18 1:4 2:5 3:6 4:5 5:13 6:4 7:20 8:22 9:20 10:20
20 | 134 11 TAG 0:13 1:7 2:7 3:7 4:7 5:19 6:9 7:20 8:16 9:18 10:11
21 | 133 11 CCT 0:16 1:6 2:6 3:6 4:6 5:21 6:7 7:15 8:17 9:15 10:18
22 | 129 11 AGT 0:19 1:6 2:6 3:6 4:6 5:17 6:7 7:17 8:18 9:16 10:11
23 | 126 11 TTT 0:18 1:7 2:8 3:8 4:8 5:14 6:7 7:23 8:12 9:5 10:16
24 | 122 11 CTT 0:17 1:8 2:6 3:7 4:6 5:17 6:7 7:17 8:11 9:14 10:12
25 | 122 11 TAT 0:16 1:6 2:7 3:6 4:7 5:19 6:6 7:14 8:16 9:16 10:9
26 | 119 11 TAC 0:11 1:4 2:6 3:4 4:6 5:18 6:7 7:16 8:14 9:17 10:16
27 | 115 11 AGG 0:13 1:6 2:6 3:6 4:6 5:14 6:6 7:18 8:15 9:16 10:9
28 | 115 11 TCA 0:12 1:10 2:6 3:7 4:6 5:10 6:5 7:16 8:16 9:11 10:16
29 | 108 11 GCC 0:13 1:2 2:3 3:3 4:3 5:14 6:4 7:17 8:17 9:14 10:18
30 | 99 11 GGA 0:14 1:5 2:6 3:6 4:6 5:11 6:5 7:12 8:13 9:13 10:8
31 | 98 11 GCT 0:17 1:4 2:4 3:4 4:4 5:17 6:5 7:11 8:9 9:14 10:9
32 | 97 11 CAT 0:10 1:6 2:4 3:5 4:4 5:12 6:5 7:16 8:12 9:16 10:7
33 | 96 11 GAG 0:10 1:6 2:5 3:6 4:5 5:11 6:5 7:11 8:11 9:13 10:13
34 | 93 11 GTA 0:12 1:4 2:4 3:5 4:3 5:15 6:4 7:14 8:15 9:12 10:5
35 | 92 11 GAT 0:13 1:3 2:5 3:4 4:5 5:14 6:3 7:13 8:13 9:11 10:8
36 | 91 11 GCA 0:12 1:4 2:2 3:3 4:2 5:11 6:5 7:10 8:13 9:16 10:13
37 | 90 11 GGT 0:15 1:5 2:3 3:4 4:4 5:11 6:4 7:13 8:13 9:11 10:7
38 | 84 11 CAG 0:9 1:4 2:3 3:3 4:4 5:9 6:3 7:11 8:13 9:13 10:12
39 | 82 11 TCT 0:9 1:6 2:6 3:5 4:6 5:11 6:6 7:8 8:9 9:13 10:3
40 | 82 11 TGG 0:15 1:5 2:4 3:5 4:5 5:10 6:5 7:7 8:12 9:9 10:5
41 | 80 11 CTC 0:10 1:4 2:4 3:2 4:4 5:13 6:3 7:9 8:12 9:9 10:10
42 | 80 11 GTG 0:13 1:4 2:2 3:3 4:3 5:10 6:4 7:11 8:11 9:11 10:8
43 | 78 11 ATG 0:10 1:3 2:4 3:4 4:4 5:12 6:3 7:8 8:11 9:12 10:7
44 | 76 11 GTT 0:14 1:3 2:2 3:3 4:3 5:9 6:3 7:10 8:9 9:12 10:8
45 | 76 11 TTC 0:9 1:7 2:4 3:5 4:4 5:10 6:6 7:10 8:8 9:9 10:4
46 | 71 11 ATC 0:10 1:5 2:4 3:6 4:3 5:7 6:3 7:9 8:10 9:8 10:6
47 | 71 11 TGA 0:11 1:1 2:1 3:1 4:1 5:11 6:3 7:11 8:10 9:12 10:9
48 | 70 11 ACG 0:9 1:1 2:1 3:2 4:1 5:11 6:1 7:9 8:12 9:13 10:10
49 | 66 11 CGA 0:8 1:3 2:4 3:4 4:4 5:11 6:3 7:7 8:8 9:8 10:6
50 | 66 11 TTG 0:13 1:3 2:2 3:3 4:2 5:7 6:3 7:7 8:10 9:8 10:8
51 | 65 11 TGC 0:9 1:3 2:2 3:3 4:2 5:7 6:2 7:7 8:9 9:10 10:11
52 | 60 11 GGG 0:9 1:4 2:3 3:3 4:3 5:9 6:2 7:6 8:7 9:6 10:8
53 | 58 11 GTC 0:7 1:2 2:3 3:2 4:3 5:7 6:2 7:6 8:9 9:6 10:11
54 | 56 11 GGC 0:8 1:2 2:3 3:3 4:2 5:8 6:3 7:5 8:8 9:5 10:9
55 | 53 11 CGC 0:8 1:1 2:3 3:2 4:3 5:7 6:2 7:7 8:4 9:8 10:8
56 | 52 11 CCG 0:8 1:2 2:3 3:2 4:4 5:5 6:2 7:7 8:6 9:6 10:7
57 | 51 11 GAC 0:7 1:2 2:1 3:2 4:1 5:7 6:1 7:9 8:6 9:9 10:6
58 | 49 11 TGT 0:7 1:2 2:2 3:3 4:2 5:8 6:2 7:5 8:6 9:7 10:5
59 | 48 11 CGG 0:9 1:1 2:2 3:2 4:1 5:6 6:1 7:5 8:7 9:4 10:10
60 | 48 11 TCC 0:9 1:1 2:1 3:1 4:1 5:9 6:1 7:5 8:10 9:4 10:6
61 | 44 11 GCG 0:7 1:1 2:3 3:2 4:2 5:6 6:1 7:4 8:4 9:4 10:10
62 | 43 11 CTG 0:6 1:1 2:1 3:2 4:1 5:7 6:2 7:4 8:5 9:7 10:7
63 | 40 11 TCG 0:6 1:1 2:2 3:2 4:1 5:7 6:2 7:5 8:4 9:4 10:6
64 | 39 6 CGT 0:5 5:5 7:6 8:7 9:7 10:9
--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.pairwise:
--------------------------------------------------------------------------------
1 | American_Black_Bear American_Brown_Bear 0.7106017
2 | American_Black_Bear Spectacled_Bear 0.7765043
3 | American_Black_Bear Asiatic_Black_Bear 0.7020057
4 | American_Black_Bear Polar_Bear 0.7736390
5 | American_Black_Bear Giant_Panda 0.5702006
6 | American_Black_Bear Red_Panda 0.8080229
7 | American_Black_Bear Dog 0.6131805
8 | American_Black_Bear Raccoon 0.5873926
9 | American_Black_Bear Cow 0.6704871
10 | American_Black_Bear Crocodilian_skink 0.7822350
11 | American_Brown_Bear Spectacled_Bear 0.4545455
12 | American_Brown_Bear Asiatic_Black_Bear 0.3034483
13 | American_Brown_Bear Polar_Bear 0.4405594
14 | American_Brown_Bear Giant_Panda 0.7761628
15 | American_Brown_Bear Red_Panda 0.5174825
16 | American_Brown_Bear Dog 0.7953216
17 | American_Brown_Bear Raccoon 0.7777778
18 | American_Brown_Bear Cow 0.8171091
19 | American_Brown_Bear Crocodilian_skink 0.8705502
20 | Spectacled_Bear Asiatic_Black_Bear 0.3655172
21 | Spectacled_Bear Polar_Bear 0.1478873
22 | Spectacled_Bear Giant_Panda 0.7877907
23 | Spectacled_Bear Red_Panda 0.5352113
24 | Spectacled_Bear Dog 0.7982456
25 | Spectacled_Bear Raccoon 0.7836257
26 | Spectacled_Bear Cow 0.8289086
27 | Spectacled_Bear Crocodilian_skink 0.8705502
28 | Asiatic_Black_Bear Polar_Bear 0.3655172
29 | Asiatic_Black_Bear Giant_Panda 0.7906977
30 | Asiatic_Black_Bear Red_Panda 0.5448276
31 | Asiatic_Black_Bear Dog 0.8157895
32 | Asiatic_Black_Bear Raccoon 0.7923977
33 | Asiatic_Black_Bear Cow 0.8436578
34 | Asiatic_Black_Bear Crocodilian_skink 0.8673139
35 | Polar_Bear Giant_Panda 0.8052326
36 | Polar_Bear Red_Panda 0.5177305
37 | Polar_Bear Dog 0.8070175
38 | Polar_Bear Raccoon 0.7894737
39 | Polar_Bear Cow 0.8289086
40 | Polar_Bear Crocodilian_skink 0.8770227
41 | Giant_Panda Red_Panda 0.7994186
42 | Giant_Panda Dog 0.5930233
43 | Giant_Panda Raccoon 0.5755814
44 | Giant_Panda Cow 0.6424419
45 | Giant_Panda Crocodilian_skink 0.8081395
46 | Red_Panda Dog 0.7807018
47 | Red_Panda Raccoon 0.7690058
48 | Red_Panda Cow 0.8318584
49 | Red_Panda Crocodilian_skink 0.8705502
50 | Dog Raccoon 0.5497076
51 | Dog Cow 0.6228070
52 | Dog Crocodilian_skink 0.7982456
53 | Raccoon Cow 0.6608187
54 | Raccoon Crocodilian_skink 0.8070175
55 | Cow Crocodilian_skink 0.7994100
56 |
--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.phylip:
--------------------------------------------------------------------------------
1 | 11
2 | American_B 0.0000000 0.6865672 0.7423168 0.6650000 0.7290168 0.6136784 0.7734554 0.6832740 0.6509946 0.7013889 0.8239203
3 | American_B 0.6865672 0.0000000 0.5071770 0.3027027 0.4878049 0.7410926 0.5636364 0.7677725 0.7476190 0.7806005 0.8517647
4 | Spectacled 0.7423168 0.5071770 0.0000000 0.3939394 0.1197605 0.7345972 0.5777778 0.7813953 0.7558685 0.7656613 0.8504673
5 | Asiatic_Bl 0.6650000 0.3027027 0.3939394 0.0000000 0.3877551 0.7328605 0.6000000 0.7908046 0.7570093 0.7752294 0.8403756
6 | Polar_Bear 0.7290168 0.4878049 0.1197605 0.3877551 0.0000000 0.7393365 0.5739910 0.7832168 0.7488152 0.7731481 0.8524590
7 | Giant_Pand 0.6136784 0.7410926 0.7345972 0.7328605 0.7393365 0.0000000 0.7660550 0.6410256 0.6275229 0.6772487 0.8283828
8 | Red_Panda 0.7734554 0.5636364 0.5777778 0.6000000 0.5739910 0.7660550 0.0000000 0.7458432 0.7405660 0.7790433 0.8465116
9 | Dog 0.6832740 0.7677725 0.7813953 0.7908046 0.7832168 0.6410256 0.7458432 0.0000000 0.6022727 0.6642599 0.8195616
10 | Raccoon 0.6509946 0.7476190 0.7558685 0.7570093 0.7488152 0.6275229 0.7405660 0.6022727 0.0000000 0.6725979 0.8252912
11 | Cow 0.7013889 0.7806005 0.7656613 0.7752294 0.7731481 0.6772487 0.7790433 0.6642599 0.6725979 0.0000000 0.8219634
12 | Crocodilia 0.8239203 0.8517647 0.8504673 0.8403756 0.8524590 0.8283828 0.8465116 0.8195616 0.8252912 0.8219634 0.0000000
13 |
--------------------------------------------------------------------------------
/example_data/output/gp120.pep.fasta.1mer:
--------------------------------------------------------------------------------
1 | 1331 27 T 0:46 1:44 2:49 3:52 4:40 5:47 6:39 7:45 8:45 9:44 10:49 11:42 12:46 13:43 14:44 15:50 16:51 17:44 18:57 19:57 20:53 21:56 22:50 23:55 24:62 25:60 26:61
2 | 1152 27 N 0:47 1:46 2:49 3:48 4:44 5:49 6:41 7:45 8:43 9:43 10:47 11:43 12:49 13:49 14:43 15:37 16:40 17:38 18:38 19:39 20:42 21:42 22:39 23:39 24:35 25:37 26:40
3 | 841 27 V 0:28 1:30 2:28 3:39 4:37 5:33 6:37 7:33 8:34 9:33 10:39 11:35 12:32 13:39 14:39 15:23 16:26 17:32 18:29 19:31 20:28 21:22 22:22 23:23 24:29 25:29 26:31
4 | 835 27 I 0:36 1:40 2:36 3:32 4:38 5:39 6:35 7:34 8:35 9:36 10:31 11:36 12:36 13:32 14:31 15:29 16:28 17:24 18:25 19:23 20:25 21:25 22:25 23:30 24:25 25:25 26:24
5 | 820 27 K 0:28 1:28 2:34 3:28 4:31 5:30 6:36 7:31 8:33 9:33 10:33 11:36 12:35 13:30 14:27 15:25 16:30 17:28 18:29 19:28 20:29 21:28 22:27 23:29 24:30 25:32 26:32
6 | 758 27 S 0:33 1:31 2:23 3:24 4:30 5:24 6:29 7:38 8:35 9:37 10:31 11:27 12:32 13:26 14:31 15:23 16:22 17:25 18:22 19:21 20:27 21:28 22:23 23:27 24:30 25:30 26:29
7 | 751 27 E 0:26 1:29 2:30 3:27 4:31 5:26 6:29 7:26 8:25 9:25 10:27 11:26 12:26 13:28 14:21 15:28 16:27 17:33 18:31 19:31 20:27 21:30 22:27 23:28 24:28 25:29 26:30
8 | 731 27 L 0:34 1:31 2:32 3:33 4:26 5:28 6:26 7:26 8:26 9:26 10:31 11:27 12:28 13:29 14:28 15:25 16:22 17:26 18:26 19:26 20:22 21:23 22:23 23:20 24:29 25:29 26:29
9 | 715 27 G 0:26 1:25 2:29 3:29 4:31 5:29 6:27 7:31 8:30 9:29 10:32 11:29 12:27 13:28 14:26 15:25 16:29 17:20 18:23 19:23 20:24 21:22 22:23 23:19 24:28 25:26 26:25
10 | 664 27 R 0:25 1:24 2:20 3:26 4:25 5:25 6:21 7:21 8:22 9:22 10:21 11:20 12:21 13:24 14:22 15:31 16:25 17:25 18:25 19:25 20:24 21:24 22:29 23:27 24:30 25:29 26:31
11 | 613 27 A 0:27 1:23 2:27 3:22 4:21 5:22 6:20 7:24 8:22 9:21 10:21 11:22 12:21 13:20 14:26 15:26 16:24 17:21 18:23 19:23 20:19 21:25 22:28 23:24 24:20 25:20 26:21
12 | 604 27 P 0:22 1:22 2:25 3:21 4:21 5:22 6:21 7:22 8:22 9:22 10:22 11:23 12:20 13:23 14:25 15:23 16:25 17:24 18:21 19:21 20:22 21:21 22:20 23:25 24:23 25:23 26:23
13 | 544 27 C 0:18 1:18 2:18 3:18 4:18 5:18 6:18 7:18 8:18 9:18 10:18 11:18 12:18 13:19 14:18 15:23 16:23 17:23 18:23 19:23 20:22 21:23 22:22 23:23 24:22 25:22 26:24
14 | 520 27 D 0:15 1:18 2:19 3:16 4:15 5:17 6:19 7:16 8:17 9:18 10:16 11:17 12:20 13:17 14:18 15:21 16:22 17:26 18:19 19:19 20:19 21:20 22:21 23:23 24:25 25:25 26:22
15 | 450 27 Q 0:18 1:17 2:16 3:21 4:15 5:18 6:19 7:19 8:18 9:19 10:19 11:18 12:18 13:19 14:16 15:16 16:12 17:14 18:13 19:13 20:16 21:18 22:17 23:15 24:16 25:16 26:14
16 | 436 27 Y 0:13 1:13 2:13 3:12 4:13 5:11 6:12 7:11 8:11 9:11 10:13 11:12 12:13 13:13 14:12 15:21 16:23 17:21 18:20 19:20 20:24 21:23 22:18 23:21 24:21 25:21 26:20
17 | 432 27 F 0:13 1:13 2:13 3:15 4:16 5:16 6:16 7:19 8:19 9:18 10:15 11:16 12:15 13:15 14:15 15:18 16:17 17:16 18:18 19:18 20:16 21:16 22:20 23:18 24:14 25:14 26:13
18 | 333 27 W 0:9 1:10 2:10 3:12 4:10 5:10 6:9 7:10 8:10 9:10 10:9 11:10 12:10 13:10 14:9 15:14 16:15 17:14 18:15 19:15 20:15 21:16 22:16 23:15 24:17 25:17 26:16
19 | 266 27 M 0:7 1:9 2:10 3:7 4:6 5:7 6:9 7:10 8:9 9:9 10:8 11:8 12:9 13:9 14:8 15:13 16:11 17:12 18:11 19:11 20:10 21:10 22:13 23:12 24:13 25:13 26:12
20 | 233 27 H 0:8 1:7 2:8 3:8 4:13 5:9 6:8 7:7 8:7 9:7 10:8 11:8 12:8 13:9 14:11 15:9 16:9 17:8 18:8 19:9 20:9 21:12 22:9 23:9 24:8 25:8 26:9
--------------------------------------------------------------------------------
/example_data/output/hiv.pep.fasta.1mer:
--------------------------------------------------------------------------------
1 | 179 15 I 0:14 1:14 2:11 3:14 4:13 5:10 6:12 7:13 8:14 9:10 10:9 11:10 12:12 13:11 14:12
2 | 176 15 N 0:11 1:13 2:11 3:12 4:11 5:8 6:13 7:14 8:11 9:13 10:12 11:10 12:14 13:13 14:10
3 | 124 15 T 0:7 1:6 2:8 3:8 4:6 5:6 6:8 7:7 8:9 9:11 10:10 11:10 12:9 13:8 14:11
4 | 119 15 G 0:8 1:8 2:8 3:8 4:8 5:9 6:8 7:8 8:11 9:10 10:7 11:9 12:5 13:3 14:9
5 | 99 15 V 0:10 1:8 2:3 3:10 4:8 5:4 6:7 7:10 8:4 9:6 10:6 11:6 12:5 13:5 14:7
6 | 97 15 R 0:9 1:9 2:5 3:8 4:6 5:5 6:6 7:9 8:6 9:5 10:5 11:4 12:7 13:8 14:5
7 | 94 15 K 0:6 1:6 2:5 3:6 4:8 5:5 6:7 7:5 8:7 9:7 10:4 11:6 12:8 13:7 14:7
8 | 92 15 S 0:5 1:6 2:4 3:5 4:9 5:4 6:8 7:6 8:7 9:7 10:8 11:7 12:6 13:3 14:7
9 | 82 15 A 0:7 1:7 2:6 3:6 4:7 5:4 6:5 7:7 8:8 9:5 10:3 11:4 12:4 13:5 14:4
10 | 81 15 E 0:8 1:7 2:4 3:6 4:5 5:5 6:7 7:6 8:7 9:4 10:2 11:4 12:4 13:6 14:6
11 | 77 15 F 0:5 1:6 2:5 3:5 4:6 5:3 6:7 7:6 8:7 9:7 10:3 11:5 12:5 13:2 14:5
12 | 65 15 Q 0:3 1:3 2:4 3:4 4:7 5:3 6:6 7:4 8:5 9:6 10:4 11:5 12:3 13:3 14:5
13 | 58 15 L 0:3 1:3 2:3 3:3 4:4 5:2 6:4 7:3 8:5 9:4 10:5 11:4 12:4 13:6 14:5
14 | 49 15 D 0:3 1:3 2:2 3:4 4:3 5:3 6:3 7:4 8:6 9:4 10:3 11:4 12:2 13:1 14:4
15 | 47 15 P 0:4 1:3 2:2 3:4 4:4 5:3 6:3 7:3 8:3 9:3 10:2 11:3 12:4 13:3 14:3
16 | 46 15 H 0:3 1:3 2:3 3:4 4:2 5:3 6:3 7:3 8:4 9:4 10:2 11:3 12:4 13:2 14:3
17 | 43 15 C 0:3 1:3 2:2 3:3 4:4 5:2 6:4 7:4 8:4 9:4 10:2 11:2 12:2 13:2 14:2
18 | 23 15 Y 0:3 1:2 2:1 3:2 4:2 5:1 6:2 7:3 8:1 9:1 10:1 11:1 12:1 13:1 14:1
19 | 15 15 W 0:1 1:1 2:1 3:1 4:1 5:1 6:1 7:1 8:1 9:1 10:1 11:1 12:1 13:1 14:1
20 | 14 12 M 0:1 1:1 3:1 4:1 5:1 6:1 7:1 8:1 11:2 12:1 13:1 14:2
--------------------------------------------------------------------------------
/example_data/output/hiv.pep.fasta.pairwise:
--------------------------------------------------------------------------------
1 | DENTIST PATIENT_A 0.1910112
2 | DENTIST PATIENT_B 0.4886364
3 | DENTIST PATIENT_C 0.2111111
4 | DENTIST PATIENT_D 0.5384615
5 | DENTIST PATIENT_E 0.4886364
6 | DENTIST PATIENT_F 0.5326087
7 | DENTIST PATIENT_G 0.3516484
8 | DENTIST PATIENT_H 0.6210526
9 | DENTIST DENTIST_WIFE 0.5280899
10 | DENTIST Local_Control_1 0.6363636
11 | DENTIST Local_Control_2 0.6022727
12 | DENTIST Local_Control_3 0.5909091
13 | DENTIST Local_Control_4 0.6477273
14 | DENTIST Local_Control_5 0.6022727
15 | PATIENT_A PATIENT_B 0.4382022
16 | PATIENT_A PATIENT_C 0.2444444
17 | PATIENT_A PATIENT_D 0.4945055
18 | PATIENT_A PATIENT_E 0.3932584
19 | PATIENT_A PATIENT_F 0.4782609
20 | PATIENT_A PATIENT_G 0.3076923
21 | PATIENT_A PATIENT_H 0.5789474
22 | PATIENT_A DENTIST_WIFE 0.4943820
23 | PATIENT_A Local_Control_1 0.6067416
24 | PATIENT_A Local_Control_2 0.5280899
25 | PATIENT_A Local_Control_3 0.5280899
26 | PATIENT_A Local_Control_4 0.6292135
27 | PATIENT_A Local_Control_5 0.5280899
28 | PATIENT_B PATIENT_C 0.4444444
29 | PATIENT_B PATIENT_D 0.6043956
30 | PATIENT_B PATIENT_E 0.3200000
31 | PATIENT_B PATIENT_F 0.5108696
32 | PATIENT_B PATIENT_G 0.4725275
33 | PATIENT_B PATIENT_H 0.6210526
34 | PATIENT_B DENTIST_WIFE 0.5505618
35 | PATIENT_B Local_Control_1 0.4605263
36 | PATIENT_B Local_Control_2 0.5421687
37 | PATIENT_B Local_Control_3 0.5487805
38 | PATIENT_B Local_Control_4 0.5584416
39 | PATIENT_B Local_Control_5 0.5909091
40 | PATIENT_C PATIENT_D 0.5054945
41 | PATIENT_C PATIENT_E 0.4222222
42 | PATIENT_C PATIENT_F 0.4673913
43 | PATIENT_C PATIENT_G 0.2747253
44 | PATIENT_C PATIENT_H 0.5578947
45 | PATIENT_C DENTIST_WIFE 0.5000000
46 | PATIENT_C Local_Control_1 0.6000000
47 | PATIENT_C Local_Control_2 0.5444444
48 | PATIENT_C Local_Control_3 0.5444444
49 | PATIENT_C Local_Control_4 0.6111111
50 | PATIENT_C Local_Control_5 0.5444444
51 | PATIENT_D PATIENT_E 0.6043956
52 | PATIENT_D PATIENT_F 0.4347826
53 | PATIENT_D PATIENT_G 0.5384615
54 | PATIENT_D PATIENT_H 0.5578947
55 | PATIENT_D DENTIST_WIFE 0.4835165
56 | PATIENT_D Local_Control_1 0.6153846
57 | PATIENT_D Local_Control_2 0.5494505
58 | PATIENT_D Local_Control_3 0.5494505
59 | PATIENT_D Local_Control_4 0.6153846
60 | PATIENT_D Local_Control_5 0.5384615
61 | PATIENT_E PATIENT_F 0.5217391
62 | PATIENT_E PATIENT_G 0.4615385
63 | PATIENT_E PATIENT_H 0.6000000
64 | PATIENT_E DENTIST_WIFE 0.5393258
65 | PATIENT_E Local_Control_1 0.5263158
66 | PATIENT_E Local_Control_2 0.5301205
67 | PATIENT_E Local_Control_3 0.5121951
68 | PATIENT_E Local_Control_4 0.5584416
69 | PATIENT_E Local_Control_5 0.5340909
70 | PATIENT_F PATIENT_G 0.4782609
71 | PATIENT_F PATIENT_H 0.5052632
72 | PATIENT_F DENTIST_WIFE 0.4782609
73 | PATIENT_F Local_Control_1 0.5869565
74 | PATIENT_F Local_Control_2 0.4673913
75 | PATIENT_F Local_Control_3 0.5434783
76 | PATIENT_F Local_Control_4 0.5760870
77 | PATIENT_F Local_Control_5 0.4673913
78 | PATIENT_G PATIENT_H 0.5684211
79 | PATIENT_G DENTIST_WIFE 0.5054945
80 | PATIENT_G Local_Control_1 0.6373626
81 | PATIENT_G Local_Control_2 0.5824176
82 | PATIENT_G Local_Control_3 0.5494505
83 | PATIENT_G Local_Control_4 0.6373626
84 | PATIENT_G Local_Control_5 0.5714286
85 | PATIENT_H DENTIST_WIFE 0.5473684
86 | PATIENT_H Local_Control_1 0.5684211
87 | PATIENT_H Local_Control_2 0.5578947
88 | PATIENT_H Local_Control_3 0.6105263
89 | PATIENT_H Local_Control_4 0.6526316
90 | PATIENT_H Local_Control_5 0.5473684
91 | DENTIST_WIFE Local_Control_1 0.6067416
92 | DENTIST_WIFE Local_Control_2 0.5505618
93 | DENTIST_WIFE Local_Control_3 0.5955056
94 | DENTIST_WIFE Local_Control_4 0.6629213
95 | DENTIST_WIFE Local_Control_5 0.5505618
96 | Local_Control_1 Local_Control_2 0.5421687
97 | Local_Control_1 Local_Control_3 0.5487805
98 | Local_Control_1 Local_Control_4 0.5584416
99 | Local_Control_1 Local_Control_5 0.5909091
100 | Local_Control_2 Local_Control_3 0.5060241
101 | Local_Control_2 Local_Control_4 0.6144578
102 | Local_Control_2 Local_Control_5 0.1818182
103 | Local_Control_3 Local_Control_4 0.3902439
104 | Local_Control_3 Local_Control_5 0.5340909
105 | Local_Control_4 Local_Control_5 0.6136364
106 |
--------------------------------------------------------------------------------
/example_data/output/hiv.pep.fasta.phylip:
--------------------------------------------------------------------------------
1 | 15
2 | DENTIST 0.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
3 | PATIENT_A 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
4 | PATIENT_B 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
5 | PATIENT_C 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
6 | PATIENT_D 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
7 | PATIENT_E 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
8 | PATIENT_F 1.0000000 0.6636364 0.6636364 0.6636364 0.6636364 0.6636364 0.0000000 0.6636364 0.4822695 0.3363636 0.6636364 0.3454545 0.6727273 0.6727273 0.3454545
9 | PATIENT_G 1.0000000 0.5888889 0.5888889 0.5888889 0.5888889 0.5888889 0.6636364 0.0000000 1.0000000 1.0000000 0.5888889 1.0000000 0.6400000 0.6000000 1.0000000
10 | PATIENT_H 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.4822695 1.0000000 0.0000000 0.4751773 1.0000000 0.4893617 1.0000000 1.0000000 0.4893617
11 | DENTIST_WI 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3363636 1.0000000 0.4751773 0.0000000 1.0000000 0.1724138 1.0000000 1.0000000 0.1724138
12 | Local_Cont 1.0000000 0.2884615 0.2884615 0.2884615 0.2884615 0.2884615 0.6636364 0.5888889 1.0000000 1.0000000 0.0000000 1.0000000 0.6400000 0.3076923 1.0000000
13 | Local_Cont 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3454545 1.0000000 0.4893617 0.1724138 1.0000000 0.0000000 1.0000000 1.0000000 0.0000000
14 | Local_Cont 1.0000000 0.6400000 0.6400000 0.6400000 0.6400000 0.6400000 0.6727273 0.6400000 1.0000000 1.0000000 0.6400000 1.0000000 0.0000000 0.6400000 1.0000000
15 | Local_Cont 1.0000000 0.0270270 0.0270270 0.0270270 0.0270270 0.0270270 0.6727273 0.6000000 1.0000000 1.0000000 0.3076923 1.0000000 0.6400000 0.0000000 1.0000000
16 | Local_Cont 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3454545 1.0000000 0.4893617 0.1724138 1.0000000 0.0000000 1.0000000 1.0000000 0.0000000
17 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Packaging script for alfpy."""
from setuptools import setup

# Read __version__ by executing alfpy/version.py instead of importing the
# package. The code runs in a private namespace so nothing else leaks into
# this module, and the file handle is closed deterministically.
version_namespace = {}
with open('alfpy/version.py') as version_file:
    exec(version_file.read(), version_namespace)
__version__ = version_namespace['__version__']

# The README text is passed to setup() as the long description.
with open('README.rst') as readme_file:
    long_description = readme_file.read()

setup(
    name='alfpy',
    version=__version__,
    description="Alignment-free package to compare DNA/RNA/protein sequences (bioinformatics).",
    long_description=long_description,
    author='Andrzej Zielezinski',
    keywords='alignment-free bioinformatics sequence DNA protein homology phylogeny',
    license="MIT",
    author_email='andrzejz@amu.edu.pl',
    url="http://www.combio.pl/alfree",
    packages=['alfpy', 'alfpy.utils', 'alfpy.utils.data'],
    # setup_requires=["numpy"],
    install_requires=["numpy"],
    # Command-line entry scripts installed alongside the package.
    scripts=[
        'bin/calc_bbc.py',
        'bin/calc_graphdna.py',
        'bin/calc_fcgr.py',
        'bin/calc_lempelziv.py',
        'bin/calc_ncd.py',
        'bin/calc_wmetric.py',
        'bin/calc_word.py',
        'bin/calc_word_bool.py',
        'bin/calc_word_sets.py',
        'bin/calc_word_cv.py',
        'bin/calc_word_d2.py',
        'bin/calc_word_ffp.py',
        'bin/calc_word_rtd.py',
        'bin/create_wordpattern.py',
    ],
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Environment :: Console',
        'Operating System :: MacOS',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
    ],
)
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/char_freqs.txt:
--------------------------------------------------------------------------------
1 | # UniProtKB/Swiss-Prot protein knowledgebase release 2016_09 statistics
2 | # Release 2016_09 of 05-Oct-16 of UniProtKB/Swiss-Prot contains 552259 sequence entries,
3 | # comprising 197423140 amino acids abstracted from 247204 references.
4 | # http://web.expasy.org/docs/relnotes/relstat.html
5 | A 0.0826
6 | Q 0.0393
7 | L 0.0965
8 | S 0.0659
9 | R 0.0553
10 | E 0.0674
11 | K 0.0583
12 | T 0.0534
13 | N 0.0406
14 | G 0.0708
15 | M 0.0241
16 | W 0.0109
17 | D 0.0546
18 | H 0.0227
19 | F 0.0386
20 | Y 0.0292
21 | C 0.0137
22 | I 0.0594
23 | P 0.0471
24 | V 0.0687
--------------------------------------------------------------------------------
/tests/data/char_weights.txt:
--------------------------------------------------------------------------------
1 | # Based on amino acid frequencies
2 | # Weight = 1 / amino acid freq / 10
3 | # should be greater than 1.
4 | A 1.21065375303
5 | C 7.29927007299
6 | E 1.48367952522
7 | D 1.8315018315
8 | G 1.41242937853
9 | F 2.59067357513
10 | I 1.6835016835
11 | H 4.40528634361
12 | K 1.71526586621
13 | M 4.14937759336
14 | L 1.03626943005
15 | N 2.46305418719
16 | Q 2.54452926209
17 | P 2.12314225053
18 | S 1.51745068285
19 | R 1.80831826401
20 | T 1.87265917603
21 | W 9.17431192661
22 | V 1.45560407569
23 | Y 3.42465753425
--------------------------------------------------------------------------------
/tests/data/dna.fa:
--------------------------------------------------------------------------------
1 | >seq1
2 | AACGTACCATTGAACGTACCGTAGG
3 | >seq2
4 | CTAGGGGACTTATCTAGG
5 | >seq3
6 | CTAGGGAACATACCA
--------------------------------------------------------------------------------
/tests/data/dna.fa.1mer.txt:
--------------------------------------------------------------------------------
1 | 18 3 A 0:8 1:4 2:6
2 | 15 3 G 0:6 1:6 2:3
3 | 13 3 C 0:6 1:3 2:4
4 | 12 3 T 0:5 1:5 2:2
--------------------------------------------------------------------------------
/tests/data/dna.fa.1mer.wordpos.txt:
--------------------------------------------------------------------------------
1 | 18 3 A 0 0 0 1 0 5 0 8 0 12 0 13 0 17 0 22 1 2 1 7 1 11 1 15 2 2 2 6 2 7 2 9 2 11 2 14
2 | 15 3 G 0 3 0 11 0 15 0 20 0 23 0 24 1 3 1 4 1 5 1 6 1 16 1 17 2 3 2 4 2 5
3 | 13 3 C 0 2 0 6 0 7 0 14 0 18 0 19 1 0 1 8 1 13 2 0 2 8 2 12 2 13
4 | 12 3 T 0 4 0 9 0 10 0 16 0 21 1 1 1 9 1 10 1 12 1 14 2 1 2 10
--------------------------------------------------------------------------------
/tests/data/dna.fa.2mer.txt:
--------------------------------------------------------------------------------
1 | 8 3 TA 0:3 1:3 2:2
2 | 7 3 AC 0:4 1:1 2:2
3 | 7 3 GG 0:1 1:4 2:2
4 | 4 3 AG 0:1 1:2 2:1
5 | 4 2 CT 1:3 2:1
6 | 3 3 AT 0:1 1:1 2:1
7 | 3 3 GA 0:1 1:1 2:1
8 | 3 2 AA 0:2 2:1
9 | 3 2 CA 0:1 2:2
10 | 3 2 CC 0:2 2:1
11 | 3 1 CG 0:3
12 | 3 1 GT 0:3
13 | 2 2 TT 0:1 1:1
14 | 1 1 TC 1:1
15 | 1 1 TG 0:1
--------------------------------------------------------------------------------
/tests/data/dna.fa.2mer.wordpos.txt:
--------------------------------------------------------------------------------
1 | 8 3 TA 0 4 0 16 0 21 1 1 1 10 1 14 2 1 2 10
2 | 7 3 AC 0 1 0 5 0 13 0 17 1 7 2 7 2 11
3 | 7 3 GG 0 23 1 3 1 4 1 5 1 16 2 3 2 4
4 | 4 3 AG 0 22 1 2 1 15 2 2
5 | 4 2 CT 1 0 1 8 1 13 2 0
6 | 3 3 AT 0 8 1 11 2 9
7 | 3 3 GA 0 11 1 6 2 5
8 | 3 2 AA 0 0 0 12 2 6
9 | 3 2 CA 0 7 2 8 2 13
10 | 3 2 CC 0 6 0 18 2 12
11 | 3 1 CG 0 2 0 14 0 19
12 | 3 1 GT 0 3 0 15 0 20
13 | 2 2 TT 0 9 1 9
14 | 1 1 TC 1 12
15 | 1 1 TG 0 10
--------------------------------------------------------------------------------
/tests/data/pep.fa:
--------------------------------------------------------------------------------
1 | >seq1 seq1 desc
2 | MEVVIRSANFTDNAKIIIVQLNASVEINC
3 | TRPNNYTRKGIRIGPGRAVYAAEEIIGDN
4 | TLKQVVTKLRE
5 | >seq2 seq2 desc
6 | MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIR
7 | IGPGRAVYAAEEIIGDIRRAHCNIS
8 | >seq3 seq3 desc
9 | MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW
10 | >seq4
11 | MFTDNAKIIIVQLNASVEINCTRPNNNTR
12 |
--------------------------------------------------------------------------------
/tests/data/pep.fa.1mer.txt:
--------------------------------------------------------------------------------
1 | 34 4 I 0:9 1:11 2:10 3:4
2 | 28 4 N 0:7 1:8 2:7 3:6
3 | 21 4 A 0:6 1:7 2:6 3:2
4 | 19 4 R 0:6 1:7 2:4 3:2
5 | 15 4 T 0:5 1:3 2:4 3:3
6 | 15 4 V 0:7 1:4 2:2 3:2
7 | 14 3 G 0:4 1:4 2:6
8 | 11 4 E 0:5 1:3 2:2 3:1
9 | 10 4 K 0:4 1:2 2:3 3:1
10 | 8 4 S 0:2 1:3 2:2 3:1
11 | 7 4 D 0:2 1:2 2:2 3:1
12 | 7 4 P 0:2 1:2 2:2 3:1
13 | 6 4 C 0:1 1:2 2:2 3:1
14 | 6 4 L 0:3 1:1 2:1 3:1
15 | 6 4 Q 0:2 1:1 2:2 3:1
16 | 5 4 F 0:1 1:1 2:2 3:1
17 | 4 4 M 0:1 1:1 2:1 3:1
18 | 4 3 Y 0:2 1:1 2:1
19 | 3 2 H 1:1 2:2
20 | 1 1 W 2:1
--------------------------------------------------------------------------------
/tests/data/pep.fa.1mer.wordpos.txt:
--------------------------------------------------------------------------------
1 | 34 4 I 0 4 0 15 0 16 0 17 0 26 0 39 0 41 0 53 0 54 1 2 1 13 1 14 1 15 1 24 1 37 1 39 1 51 1 52 1 55 1 62 2 7 2 8 2 9 2 18 2 31 2 33 2 45 2 46 2 49 2 56 3 7 3 8 3 9 3 18
2 | 28 4 N 0 8 0 12 0 21 0 27 0 32 0 33 0 57 1 6 1 10 1 19 1 25 1 30 1 31 1 32 1 61 2 4 2 13 2 19 2 24 2 25 2 26 2 55 3 4 3 13 3 19 3 24 3 25 3 26
3 | 21 4 A 0 7 0 13 0 22 0 46 0 49 0 50 1 5 1 11 1 20 1 44 1 47 1 48 1 58 2 5 2 14 2 38 2 41 2 52 2 59 3 5 3 14
4 | 19 4 R 0 5 0 30 0 36 0 40 0 45 0 67 1 3 1 28 1 34 1 38 1 43 1 56 1 57 2 22 2 28 2 37 2 50 3 22 3 28
5 | 15 4 T 0 10 0 29 0 35 0 58 0 64 1 8 1 27 1 33 2 2 2 21 2 27 2 42 3 2 3 21 3 27
6 | 15 4 V 0 2 0 3 0 18 0 24 0 47 0 62 0 63 1 1 1 16 1 22 1 45 2 10 2 16 3 10 3 16
7 | 14 3 G 0 38 0 42 0 44 0 55 1 36 1 40 1 42 1 53 2 30 2 34 2 36 2 43 2 47 2 58
8 | 11 4 E 0 1 0 25 0 51 0 52 0 68 1 23 1 49 1 50 2 17 2 44 3 17
9 | 10 4 K 0 14 0 37 0 60 0 65 1 12 1 35 2 6 2 29 2 60 3 6
10 | 8 4 S 0 6 0 23 1 4 1 21 1 63 2 15 2 57 3 15
11 | 7 4 D 0 11 0 56 1 9 1 54 2 3 2 48 3 3
12 | 7 4 P 0 31 0 43 1 29 1 41 2 23 2 35 3 23
13 | 6 4 C 0 28 1 26 1 60 2 20 2 54 3 20
14 | 6 4 L 0 20 0 59 0 66 1 18 2 12 3 12
15 | 6 4 Q 0 19 0 61 1 17 2 11 2 51 3 11
16 | 5 4 F 0 9 1 7 2 1 2 39 3 1
17 | 4 4 M 0 0 1 0 2 0 3 0
18 | 4 3 Y 0 34 0 48 1 46 2 40
19 | 3 2 H 1 59 2 32 2 53
20 | 1 1 W 2 61
--------------------------------------------------------------------------------
/tests/data/pep.fa.2mer.txt:
--------------------------------------------------------------------------------
1 | 11 4 II 0:3 1:3 2:3 3:2
2 | 8 4 NA 0:2 1:2 2:2 3:2
3 | 8 4 TR 0:2 1:2 2:2 3:2
4 | 7 4 EI 0:2 1:2 2:2 3:1
5 | 7 4 NN 0:1 1:2 2:2 3:2
6 | 6 3 IG 0:2 1:2 2:2
7 | 6 3 IR 0:2 1:3 2:1
8 | 5 4 AK 0:1 1:1 2:2 3:1
9 | 5 4 DN 0:2 1:1 2:1 3:1
10 | 4 4 AS 0:1 1:1 2:1 3:1
11 | 4 4 CT 0:1 1:1 2:1 3:1
12 | 4 4 FT 0:1 1:1 2:1 3:1
13 | 4 4 IN 0:1 1:1 2:1 3:1
14 | 4 4 IV 0:1 1:1 2:1 3:1
15 | 4 4 KI 0:1 1:1 2:1 3:1
16 | 4 4 LN 0:1 1:1 2:1 3:1
17 | 4 4 NC 0:1 1:1 2:1 3:1
18 | 4 4 NT 0:1 1:1 2:1 3:1
19 | 4 4 PN 0:1 1:1 2:1 3:1
20 | 4 4 QL 0:1 1:1 2:1 3:1
21 | 4 4 RP 0:1 1:1 2:1 3:1
22 | 4 4 SV 0:1 1:1 2:1 3:1
23 | 4 4 TD 0:1 1:1 2:1 3:1
24 | 4 4 VE 0:1 1:1 2:1 3:1
25 | 4 4 VQ 0:1 1:1 2:1 3:1
26 | 4 3 RA 0:1 1:2 2:1
27 | 3 3 GD 0:1 1:1 2:1
28 | 3 3 GI 0:1 1:1 2:1
29 | 3 3 GP 0:1 1:1 2:1
30 | 3 3 GR 0:1 1:1 2:1
31 | 3 3 KG 0:1 1:1 2:1
32 | 3 3 PG 0:1 1:1 2:1
33 | 3 3 RK 0:1 1:1 2:1
34 | 3 3 YA 0:1 1:1 2:1
35 | 2 2 AA 0:1 1:1
36 | 2 2 AE 0:1 1:1
37 | 2 2 AH 1:1 2:1
38 | 2 2 AN 0:1 1:1
39 | 2 2 AV 0:1 1:1
40 | 2 2 CN 1:1 2:1
41 | 2 2 DI 1:1 2:1
42 | 2 2 EE 0:1 1:1
43 | 2 2 HC 1:1 2:1
44 | 2 2 IS 1:1 2:1
45 | 2 2 MF 2:1 3:1
46 | 2 2 NF 0:1 1:1
47 | 2 2 NI 1:1 2:1
48 | 2 2 RI 0:1 1:1
49 | 2 2 RS 0:1 1:1
50 | 2 2 SA 0:1 1:1
51 | 2 2 VI 0:1 1:1
52 | 2 2 VY 0:1 1:1
53 | 2 1 VV 0:2
54 | 1 1 AF 2:1
55 | 1 1 AT 2:1
56 | 1 1 EV 0:1
57 | 1 1 FY 2:1
58 | 1 1 GA 2:1
59 | 1 1 GE 2:1
60 | 1 1 HI 2:1
61 | 1 1 IH 2:1
62 | 1 1 KL 0:1
63 | 1 1 KQ 0:1
64 | 1 1 KW 2:1
65 | 1 1 LK 0:1
66 | 1 1 LR 0:1
67 | 1 1 ME 0:1
68 | 1 1 MV 1:1
69 | 1 1 NY 0:1
70 | 1 1 QA 2:1
71 | 1 1 QV 0:1
72 | 1 1 RE 0:1
73 | 1 1 RQ 2:1
74 | 1 1 RR 1:1
75 | 1 1 SG 2:1
76 | 1 1 TG 2:1
77 | 1 1 TK 0:1
78 | 1 1 TL 0:1
79 | 1 1 VT 0:1
80 | 1 1 YT 0:1
--------------------------------------------------------------------------------
/tests/data/pep.fa.2mer.wordpos.txt:
--------------------------------------------------------------------------------
1 | 11 4 II 0 15 0 16 0 53 1 13 1 14 1 51 2 7 2 8 2 45 3 7 3 8
2 | 8 4 NA 0 12 0 21 1 10 1 19 2 4 2 13 3 4 3 13
3 | 8 4 TR 0 29 0 35 1 27 1 33 2 21 2 27 3 21 3 27
4 | 7 4 EI 0 25 0 52 1 23 1 50 2 17 2 44 3 17
5 | 7 4 NN 0 32 1 30 1 31 2 24 2 25 3 24 3 25
6 | 6 3 IG 0 41 0 54 1 39 1 52 2 33 2 46
7 | 6 3 IR 0 4 0 39 1 2 1 37 1 55 2 49
8 | 5 4 AK 0 13 1 11 2 5 2 59 3 5
9 | 5 4 DN 0 11 0 56 1 9 2 3 3 3
10 | 4 4 AS 0 22 1 20 2 14 3 14
11 | 4 4 CT 0 28 1 26 2 20 3 20
12 | 4 4 FT 0 9 1 7 2 1 3 1
13 | 4 4 IN 0 26 1 24 2 18 3 18
14 | 4 4 IV 0 17 1 15 2 9 3 9
15 | 4 4 KI 0 14 1 12 2 6 3 6
16 | 4 4 LN 0 20 1 18 2 12 3 12
17 | 4 4 NC 0 27 1 25 2 19 3 19
18 | 4 4 NT 0 57 1 32 2 26 3 26
19 | 4 4 PN 0 31 1 29 2 23 3 23
20 | 4 4 QL 0 19 1 17 2 11 3 11
21 | 4 4 RP 0 30 1 28 2 22 3 22
22 | 4 4 SV 0 23 1 21 2 15 3 15
23 | 4 4 TD 0 10 1 8 2 2 3 2
24 | 4 4 VE 0 24 1 22 2 16 3 16
25 | 4 4 VQ 0 18 1 16 2 10 3 10
26 | 4 3 RA 0 45 1 43 1 57 2 37
27 | 3 3 GD 0 55 1 53 2 47
28 | 3 3 GI 0 38 1 36 2 30
29 | 3 3 GP 0 42 1 40 2 34
30 | 3 3 GR 0 44 1 42 2 36
31 | 3 3 KG 0 37 1 35 2 29
32 | 3 3 PG 0 43 1 41 2 35
33 | 3 3 RK 0 36 1 34 2 28
34 | 3 3 YA 0 48 1 46 2 40
35 | 2 2 AA 0 49 1 47
36 | 2 2 AE 0 50 1 48
37 | 2 2 AH 1 58 2 52
38 | 2 2 AN 0 7 1 5
39 | 2 2 AV 0 46 1 44
40 | 2 2 CN 1 60 2 54
41 | 2 2 DI 1 54 2 48
42 | 2 2 EE 0 51 1 49
43 | 2 2 HC 1 59 2 53
44 | 2 2 IS 1 62 2 56
45 | 2 2 MF 2 0 3 0
46 | 2 2 NF 0 8 1 6
47 | 2 2 NI 1 61 2 55
48 | 2 2 RI 0 40 1 38
49 | 2 2 RS 0 5 1 3
50 | 2 2 SA 0 6 1 4
51 | 2 2 VI 0 3 1 1
52 | 2 2 VY 0 47 1 45
53 | 2 1 VV 0 2 0 62
54 | 1 1 AF 2 38
55 | 1 1 AT 2 41
56 | 1 1 EV 0 1
57 | 1 1 FY 2 39
58 | 1 1 GA 2 58
59 | 1 1 GE 2 43
60 | 1 1 HI 2 32
61 | 1 1 IH 2 31
62 | 1 1 KL 0 65
63 | 1 1 KQ 0 60
64 | 1 1 KW 2 60
65 | 1 1 LK 0 59
66 | 1 1 LR 0 66
67 | 1 1 ME 0 0
68 | 1 1 MV 1 0
69 | 1 1 NY 0 33
70 | 1 1 QA 2 51
71 | 1 1 QV 0 61
72 | 1 1 RE 0 67
73 | 1 1 RQ 2 50
74 | 1 1 RR 1 56
75 | 1 1 SG 2 57
76 | 1 1 TG 2 42
77 | 1 1 TK 0 64
78 | 1 1 TL 0 58
79 | 1 1 VT 0 63
80 | 1 1 YT 0 34
--------------------------------------------------------------------------------
/tests/data/pep.fa.3mer.txt:
--------------------------------------------------------------------------------
1 | 4 4 AKI 0:1 1:1 2:1 3:1
2 | 4 4 ASV 0:1 1:1 2:1 3:1
3 | 4 4 CTR 0:1 1:1 2:1 3:1
4 | 4 4 DNA 0:1 1:1 2:1 3:1
5 | 4 4 EIN 0:1 1:1 2:1 3:1
6 | 4 4 FTD 0:1 1:1 2:1 3:1
7 | 4 4 III 0:1 1:1 2:1 3:1
8 | 4 4 IIV 0:1 1:1 2:1 3:1
9 | 4 4 INC 0:1 1:1 2:1 3:1
10 | 4 4 IVQ 0:1 1:1 2:1 3:1
11 | 4 4 KII 0:1 1:1 2:1 3:1
12 | 4 4 LNA 0:1 1:1 2:1 3:1
13 | 4 4 NAK 0:1 1:1 2:1 3:1
14 | 4 4 NAS 0:1 1:1 2:1 3:1
15 | 4 4 NCT 0:1 1:1 2:1 3:1
16 | 4 4 PNN 0:1 1:1 2:1 3:1
17 | 4 4 QLN 0:1 1:1 2:1 3:1
18 | 4 4 RPN 0:1 1:1 2:1 3:1
19 | 4 4 SVE 0:1 1:1 2:1 3:1
20 | 4 4 TDN 0:1 1:1 2:1 3:1
21 | 4 4 TRP 0:1 1:1 2:1 3:1
22 | 4 4 VEI 0:1 1:1 2:1 3:1
23 | 4 4 VQL 0:1 1:1 2:1 3:1
24 | 3 3 EII 0:1 1:1 2:1
25 | 3 3 GPG 0:1 1:1 2:1
26 | 3 3 GRA 0:1 1:1 2:1
27 | 3 3 IGD 0:1 1:1 2:1
28 | 3 3 IGP 0:1 1:1 2:1
29 | 3 3 IIG 0:1 1:1 2:1
30 | 3 3 KGI 0:1 1:1 2:1
31 | 3 3 NNN 1:1 2:1 3:1
32 | 3 3 NNT 1:1 2:1 3:1
33 | 3 3 NTR 1:1 2:1 3:1
34 | 3 3 PGR 0:1 1:1 2:1
35 | 3 3 RKG 0:1 1:1 2:1
36 | 3 3 TRK 0:1 1:1 2:1
37 | 2 2 AAE 0:1 1:1
38 | 2 2 AEE 0:1 1:1
39 | 2 2 AHC 1:1 2:1
40 | 2 2 ANF 0:1 1:1
41 | 2 2 AVY 0:1 1:1
42 | 2 2 CNI 1:1 2:1
43 | 2 2 DIR 1:1 2:1
44 | 2 2 EEI 0:1 1:1
45 | 2 2 GDI 1:1 2:1
46 | 2 2 GIR 0:1 1:1
47 | 2 2 HCN 1:1 2:1
48 | 2 2 IRI 0:1 1:1
49 | 2 2 IRS 0:1 1:1
50 | 2 2 MFT 2:1 3:1
51 | 2 2 NFT 0:1 1:1
52 | 2 2 NIS 1:1 2:1
53 | 2 2 RAV 0:1 1:1
54 | 2 2 RIG 0:1 1:1
55 | 2 2 RSA 0:1 1:1
56 | 2 2 SAN 0:1 1:1
57 | 2 2 VIR 0:1 1:1
58 | 2 2 VYA 0:1 1:1
59 | 2 2 YAA 0:1 1:1
60 | 1 1 AFY 2:1
61 | 1 1 AKW 2:1
62 | 1 1 ATG 2:1
63 | 1 1 DNT 0:1
64 | 1 1 EVV 0:1
65 | 1 1 FYA 2:1
66 | 1 1 GAK 2:1
67 | 1 1 GDN 0:1
68 | 1 1 GEI 2:1
69 | 1 1 GIH 2:1
70 | 1 1 HIG 2:1
71 | 1 1 IHI 2:1
72 | 1 1 IRQ 2:1
73 | 1 1 IRR 1:1
74 | 1 1 ISG 2:1
75 | 1 1 KLR 0:1
76 | 1 1 KQV 0:1
77 | 1 1 LKQ 0:1
78 | 1 1 LRE 0:1
79 | 1 1 MEV 0:1
80 | 1 1 MVI 1:1
81 | 1 1 NNY 0:1
82 | 1 1 NTL 0:1
83 | 1 1 NYT 0:1
84 | 1 1 QAH 2:1
85 | 1 1 QVV 0:1
86 | 1 1 RAF 2:1
87 | 1 1 RAH 1:1
88 | 1 1 RQA 2:1
89 | 1 1 RRA 1:1
90 | 1 1 SGA 2:1
91 | 1 1 TGE 2:1
92 | 1 1 TKL 0:1
93 | 1 1 TLK 0:1
94 | 1 1 VTK 0:1
95 | 1 1 VVI 0:1
96 | 1 1 VVT 0:1
97 | 1 1 YAT 2:1
98 | 1 1 YTR 0:1
--------------------------------------------------------------------------------
/tests/data/pep.fa.3mer.wordpos.txt:
--------------------------------------------------------------------------------
1 | 4 4 AKI 0 13 1 11 2 5 3 5
2 | 4 4 ASV 0 22 1 20 2 14 3 14
3 | 4 4 CTR 0 28 1 26 2 20 3 20
4 | 4 4 DNA 0 11 1 9 2 3 3 3
5 | 4 4 EIN 0 25 1 23 2 17 3 17
6 | 4 4 FTD 0 9 1 7 2 1 3 1
7 | 4 4 III 0 15 1 13 2 7 3 7
8 | 4 4 IIV 0 16 1 14 2 8 3 8
9 | 4 4 INC 0 26 1 24 2 18 3 18
10 | 4 4 IVQ 0 17 1 15 2 9 3 9
11 | 4 4 KII 0 14 1 12 2 6 3 6
12 | 4 4 LNA 0 20 1 18 2 12 3 12
13 | 4 4 NAK 0 12 1 10 2 4 3 4
14 | 4 4 NAS 0 21 1 19 2 13 3 13
15 | 4 4 NCT 0 27 1 25 2 19 3 19
16 | 4 4 PNN 0 31 1 29 2 23 3 23
17 | 4 4 QLN 0 19 1 17 2 11 3 11
18 | 4 4 RPN 0 30 1 28 2 22 3 22
19 | 4 4 SVE 0 23 1 21 2 15 3 15
20 | 4 4 TDN 0 10 1 8 2 2 3 2
21 | 4 4 TRP 0 29 1 27 2 21 3 21
22 | 4 4 VEI 0 24 1 22 2 16 3 16
23 | 4 4 VQL 0 18 1 16 2 10 3 10
24 | 3 3 EII 0 52 1 50 2 44
25 | 3 3 GPG 0 42 1 40 2 34
26 | 3 3 GRA 0 44 1 42 2 36
27 | 3 3 IGD 0 54 1 52 2 46
28 | 3 3 IGP 0 41 1 39 2 33
29 | 3 3 IIG 0 53 1 51 2 45
30 | 3 3 KGI 0 37 1 35 2 29
31 | 3 3 NNN 1 30 2 24 3 24
32 | 3 3 NNT 1 31 2 25 3 25
33 | 3 3 NTR 1 32 2 26 3 26
34 | 3 3 PGR 0 43 1 41 2 35
35 | 3 3 RKG 0 36 1 34 2 28
36 | 3 3 TRK 0 35 1 33 2 27
37 | 2 2 AAE 0 49 1 47
38 | 2 2 AEE 0 50 1 48
39 | 2 2 AHC 1 58 2 52
40 | 2 2 ANF 0 7 1 5
41 | 2 2 AVY 0 46 1 44
42 | 2 2 CNI 1 60 2 54
43 | 2 2 DIR 1 54 2 48
44 | 2 2 EEI 0 51 1 49
45 | 2 2 GDI 1 53 2 47
46 | 2 2 GIR 0 38 1 36
47 | 2 2 HCN 1 59 2 53
48 | 2 2 IRI 0 39 1 37
49 | 2 2 IRS 0 4 1 2
50 | 2 2 MFT 2 0 3 0
51 | 2 2 NFT 0 8 1 6
52 | 2 2 NIS 1 61 2 55
53 | 2 2 RAV 0 45 1 43
54 | 2 2 RIG 0 40 1 38
55 | 2 2 RSA 0 5 1 3
56 | 2 2 SAN 0 6 1 4
57 | 2 2 VIR 0 3 1 1
58 | 2 2 VYA 0 47 1 45
59 | 2 2 YAA 0 48 1 46
60 | 1 1 AFY 2 38
61 | 1 1 AKW 2 59
62 | 1 1 ATG 2 41
63 | 1 1 DNT 0 56
64 | 1 1 EVV 0 1
65 | 1 1 FYA 2 39
66 | 1 1 GAK 2 58
67 | 1 1 GDN 0 55
68 | 1 1 GEI 2 43
69 | 1 1 GIH 2 30
70 | 1 1 HIG 2 32
71 | 1 1 IHI 2 31
72 | 1 1 IRQ 2 49
73 | 1 1 IRR 1 55
74 | 1 1 ISG 2 56
75 | 1 1 KLR 0 65
76 | 1 1 KQV 0 60
77 | 1 1 LKQ 0 59
78 | 1 1 LRE 0 66
79 | 1 1 MEV 0 0
80 | 1 1 MVI 1 0
81 | 1 1 NNY 0 32
82 | 1 1 NTL 0 57
83 | 1 1 NYT 0 33
84 | 1 1 QAH 2 51
85 | 1 1 QVV 0 61
86 | 1 1 RAF 2 37
87 | 1 1 RAH 1 57
88 | 1 1 RQA 2 50
89 | 1 1 RRA 1 56
90 | 1 1 SGA 2 57
91 | 1 1 TGE 2 42
92 | 1 1 TKL 0 64
93 | 1 1 TLK 0 58
94 | 1 1 VTK 0 63
95 | 1 1 VVI 0 2
96 | 1 1 VVT 0 62
97 | 1 1 YAT 2 40
98 | 1 1 YTR 0 34
--------------------------------------------------------------------------------
/tests/test_calc_bbc.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line tests for the ``calc_bbc.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (filename_dna, filename_pep)
        # read by the tests below -- defined in tests/utils, not visible here.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_bbc.py'

    def test_arg_molecule_when_no_fasta(self):
        # Only --molecule is given; the message must mention --fasta/-f.
        status, output = utils.runscript(self.script_name,
                                         ['--molecule', 'dna'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_molecule_invalid_choice(self):
        # An unknown molecule name must be reported against --molecule/-m.
        cli_args = ['--fasta', self.filename_dna,
                    '--molecule', 'nonexistent_mol']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--molecule/-m', output)

    def test_output_on_dna1(self):
        # NOTE(review): '--m' looks like an abbreviation of --molecule;
        # confirm the script's parser accepts option prefixes.
        cli_args = ['--fasta', self.filename_dna, '--m', 'dna']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '6cfc27479ca5fb3d5d2d468544005d8b')

    def test_output_on_dna_k2(self):
        cli_args = ['--fasta', self.filename_dna, '--m', 'dna', '--k', '2']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '1ea7e82d6bb7b8648e0dcca9e089361c')

    def test_output_on_dna_k2_pairwise(self):
        cli_args = ['--fasta', self.filename_dna, '--m', 'dna',
                    '--k', '2', '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '74de6627e68cfb609701c13637ba4090')

    def test_output_on_protein(self):
        cli_args = ['--fasta', self.filename_pep, '--m', 'protein']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '154f2788be2ec349092f22ce359acf80')

    def test_output_on_protein_no_outfile(self):
        # Same arguments as test_output_on_protein, but False is passed as
        # the third argument of _test_output; the digest must not change.
        cli_args = ['--fasta', self.filename_pep, '--m', 'protein']
        status, _, digest = self._test_output(self.script_name, cli_args,
                                              False)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '154f2788be2ec349092f22ce359acf80')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
60 |
--------------------------------------------------------------------------------
/tests/test_calc_fcgr.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line tests for the ``calc_fcgr.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (e.g. filename_dna) read by
        # the tests below -- defined in tests/utils, not visible here.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_fcgr.py'

    def test_arg_word_size_2_when_no_fasta(self):
        # Only --word_size is given; the message must mention --fasta/-f.
        status, output = utils.runscript(self.script_name,
                                         ['--word_size', '2'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_fasta_when_no_word_size(self):
        # Only --fasta is given; the message must mention --word_size/-w.
        status, output = utils.runscript(self.script_name,
                                         ['--fasta', self.filename_dna])
        self.assertEqual(status, 2)
        self.assertIn('--word_size/-w', output)

    def test_arg_word_size_too_small(self):
        # A word size of 0 must be rejected with the ">= 1" message.
        cli_args = ['--fasta', self.filename_dna, '--word_size', '0']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('--word_size must be >= 1', output)

    def test_output_word_size_1(self):
        cli_args = ['--fasta', self.filename_dna, '--word_size', '1']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'bee51f3214f06f4e4265aa05bf9d6a7e')

    def test_output_word_size_2(self):
        cli_args = ['--fasta', self.filename_dna, '--word_size', '2']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '7175a91fb9fc31661ce07aea28743605')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
46 |
--------------------------------------------------------------------------------
/tests/test_calc_graphdna.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line tests for the ``calc_graphdna.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (e.g. filename_dna) read by
        # the tests below -- defined in tests/utils, not visible here.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_graphdna.py'

    def test_arg_vector_when_no_fasta(self):
        # Only --vector is given; the message must mention --fasta/-f.
        status, output = utils.runscript(self.script_name,
                                         ['--vector', '2DSV'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_vector_invalid_choice(self):
        # An unknown vector name must produce an "invalid choice" message.
        cli_args = ['--fasta', self.filename_dna, '--vector', 'nonexistent']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('invalid choice', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_dna]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '496832ba4841a988a46c81770ee54668')

    def test_output_vector_2DSV(self):
        cli_args = ['--fasta', self.filename_dna, '--vector', '2DSV']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'e35a44622d4f0411b26e12e8eedcdb64')

    def test_output_vector_2DMV(self):
        cli_args = ['--fasta', self.filename_dna, '--vector', '2DMV']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '7638015e1c25657cd572071f3b9ae7c4')

    def test_script_output_vector_2DNV_pairwise(self):
        cli_args = ['--fasta', self.filename_dna, '--vector', '2DNV',
                    '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '2921e374b468b6de81a1c9140681a3b4')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
53 |
--------------------------------------------------------------------------------
/tests/test_calc_lempelziv.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line tests for the ``calc_lempelziv.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (filename_dna, filename_pep)
        # read by the tests below -- defined in tests/utils, not visible here.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_lempelziv.py'

    def test_agr_fasta_when_invalid_distance(self):
        # An unknown distance name must produce an "invalid choice" message.
        cli_args = ['--fasta', self.filename_dna,
                    '--distance', 'nonexistent']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('invalid choice', output)

    def test_agr_distance_when_no_fasta(self):
        # Only --distance is given; the message must mention --fasta/-f.
        status, output = utils.runscript(self.script_name,
                                         ['--distance', 'd1'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '89d18a9ac1e573743fa0214c48dde40c')

    def test_output_distance_d(self):
        cli_args = ['--fasta', self.filename_pep, '--distance', 'd']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'c71cb1521d0fc9084eee21c8599785ef')

    def test_output_distance_d_star_pairwise(self):
        cli_args = ['--fasta', self.filename_pep, '--distance', 'd_star',
                    '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '3ed3ca10d198fe4f44ea85134dbcb481')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
48 |
--------------------------------------------------------------------------------
/tests/test_calc_ncd.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line tests for the ``calc_ncd.py`` script.

    Both tests run the script on the peptide fixture and compare the exit
    status and the digest returned by ``_test_output`` with known values.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (e.g. filename_pep) read by
        # the tests below -- defined in tests/utils, not visible here.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_ncd.py'

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'e5491c3e4197bf1abb92e7f76bdefeaf')

    def test_output_pairwise(self):
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'cb69bbabd9a4286a9596f8af3b2b82d5')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
28 |
--------------------------------------------------------------------------------
/tests/test_calc_wmetric.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Command-line tests for the ``calc_wmetric.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (e.g. filename_pep) read by
        # the tests below -- defined in tests/utils, not visible here.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'calc_wmetric.py'

    def test_arg_matrix_when_no_fasta(self):
        # Only --matrix is given; the message must mention --fasta/-f.
        status, output = utils.runscript(self.script_name,
                                         ['--matrix', 'blosum62'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_matrix_invalid_choice(self):
        # An unknown matrix name must be reported against --matrix/-m.
        status, output = utils.runscript(self.script_name,
                                         ['--matrix', 'nonexistent'])
        self.assertEqual(status, 2)
        self.assertIn('--matrix/-m', output)

    def test_output_default(self):
        cli_args = ['--fasta', self.filename_pep]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '27ad675a7a2e5c2872a8ab495f2d4494')

    def test_output_phylip(self):
        # The expected digest equals the one in test_output_default.
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'phylip']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '27ad675a7a2e5c2872a8ab495f2d4494')

    def test_output_pairwise(self):
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '195fb45ed46a80473e1d004b9ce40e94')

    def test_output_pam250(self):
        cli_args = ['--fasta', self.filename_pep, '--outfmt', 'phylip',
                    '--matrix', 'pam250']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '217ed91de43b091205add32a673cf8fe')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
53 |
--------------------------------------------------------------------------------
/tests/test_calc_word_bool.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Command-line tests for the ``calc_word_bool.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (filename_pep,
        # filename_pep_2mer) read by the tests below -- defined in
        # tests/utils, not visible here.
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_bool.py'

    def test_arg_word_size_when_no_fasta(self):
        # Only --word_size is given; the message must mention --fasta/-f.
        args = ['--word_size', '2']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('--fasta/-f', out)

    def test_arg_word_pattern_when_no_fasta(self):
        # Only --word_pattern is given; the message must mention --fasta/-f.
        args = ['--word_pattern', self.filename_pep_2mer]
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('--fasta/-f', out)

    # This test used to be defined twice with identical bodies; the second
    # definition silently replaced the first, so a single copy is kept.
    def test_arg_fasta_when_no_wordsize_or_wordpattern(self):
        # Only --fasta is given; one of the word options must be requested.
        args = ['--fasta', self.filename_pep]
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('Specify either: --word_size or --word', out)

    def test_arg_word_size_too_small(self):
        # A negative word size must be rejected with the ">= 1" message.
        args = ['--fasta', self.filename_pep, '--word_size', '-1']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('Word size must be >= 1.', out)

    def test_output_word_size1(self):
        args = ['--fasta', self.filename_pep, '--word_size', '1']
        returncode, out, md5 = self._test_output(self.script_name, args)
        self.assertEqual(returncode, 0)
        self.assertEqual(md5, '4caed60c7590f45e9a6de19482839e9c')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
52 |
--------------------------------------------------------------------------------
/tests/test_calc_word_cv.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Command-line tests for the ``calc_word_cv.py`` script.

    Every test runs the script with a list of arguments and checks the exit
    status plus either a fragment of the produced message or the digest
    returned by ``_test_output``.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # Presumably populates the fixture paths (filename_pep and the
        # filename_pep_{1,2,3}mer pattern files) read by the tests below --
        # defined in tests/utils, not visible here.
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_cv.py'

    def test_word_size_smaller_than_3(self):
        # A word size of 2 must be rejected with the ">= 3" message.
        cli_args = ['--fasta', self.filename_pep, '--word_size', '2']
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('error: Word size must be >= 3', output)

    def test_word_pattern_only_one_file(self):
        # A single pattern file is passed where three are expected.
        cli_args = ['--fasta', self.filename_pep, '--word_pattern',
                    self.filename_pep_2mer]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn('expected 3 argument', output)

    def test_word_pattern_not_follow_rule(self):
        # The same 2-mer file is passed three times, which breaks the
        # k, k-1, k-2 requirement quoted in the expected message.
        two_mer = self.filename_pep_2mer
        cli_args = ['--fasta', self.filename_pep, '--word_pattern',
                    two_mer, two_mer, two_mer]
        status, output = utils.runscript(self.script_name, cli_args)
        self.assertEqual(status, 2)
        self.assertIn(' do not follow k, k-1, k-2', output)

    def test_fasta_when_no_word_size_or_pattern(self):
        # Only --fasta is given; one of the word options must be requested.
        status, output = utils.runscript(self.script_name,
                                         ['--fasta', self.filename_pep])
        self.assertEqual(status, 2)
        self.assertIn('Specify either: --word_size or --word_pattern',
                      output)

    def test_output_word_size(self):
        cli_args = ['--fasta', self.filename_pep, '--word_size', '3']
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '4fbba77e4f7a64601e7d0cb3b0b6878d')

    def test_output_word_pattern(self):
        # Pattern files are given as 3-mer, 2-mer, 1-mer; the expected
        # digest equals the one in test_output_word_size.
        cli_args = [
            '--fasta', self.filename_pep,
            '--word_patterns',
            self.filename_pep_3mer,
            self.filename_pep_2mer,
            self.filename_pep_1mer,
        ]
        status, _, digest = self._test_output(self.script_name, cli_args)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '4fbba77e4f7a64601e7d0cb3b0b6878d')


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
58 |
--------------------------------------------------------------------------------
/tests/test_calc_word_d2.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Run ``calc_word_d2.py`` with various arguments and check the result."""

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # The filename_* attributes used below are presumably provided by
        # set_test_data() -- verify against tests/utils.
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_d2.py'

    def test_arg_when_u_smaller_than_l(self):
        # -u (2) below -l (3) is an argument error.
        cli = ['--fasta', self.filename_pep, '-l', '3', '-u', '2']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('error: max_word_size must be greater than ', output)

    def test_arg_char_weights_invalid_format(self):
        # The FASTA file itself is passed as --char_weights, which is not
        # a valid weights file.
        cli = ['--fasta', self.filename_pep]
        cli += ['-l', '1', '-u', '4']
        cli += ['--char_weights', self.filename_pep]
        cli += ['--vector', 'freqs']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('Invalid format for --char_weights', output)

    def test_arg_word_size_0(self):
        # A minimum word size of 0 is rejected.
        cli = ['--fasta', self.filename_pep, '-l', '0']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('min_word_size must be greater than 0', output)

    def test_output_default(self):
        # Only --fasta given: the script's defaults apply.
        status, _, digest = self._test_output(
            self.script_name, ['--fasta', self.filename_pep])
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'f651314b77dcd4fe9b3143de28000ca8')

    def test_output_l1_u4(self):
        cli = ['--fasta', self.filename_pep, '-l', '1', '-u', '4']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '164ef1a902f74517e6b7cff7798c595f')

    def test_output_l1_u4_freqs(self):
        cli = ['--fasta', self.filename_pep, '-l', '1', '-u', '4']
        cli += ['--vector', 'freqs']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '8340c1687a0e6ae50c5f6bcc24196247')

    def test_output_l1_u4_char_weights(self):
        cli = ['--fasta', self.filename_pep, '-l', '1', '-u', '4']
        cli += ['--char_weights', self.filename_char_weights]
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '81873a0cb36f7e05698fa664311f38ee')

    def test_script_l1_u4_char_weights_freqs(self):
        cli = ['--fasta', self.filename_pep, '-l', '1', '-u', '4']
        cli += ['--vector', 'freqs']
        cli += ['--char_weights', self.filename_char_weights]
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '96c944f9e8e4d2b8ca67bc2620f47d3a')
67 |
68 |
69 | if __name__ == '__main__':
70 | unittest.main()
71 |
--------------------------------------------------------------------------------
/tests/test_calc_word_ffp.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Run ``calc_word_ffp.py`` with various arguments and check the result."""

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # The filename_* attributes used below are presumably provided by
        # set_test_data() -- verify against tests/utils.
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_ffp.py'

    def test_arg_word_size_when_no_fasta(self):
        # Without --fasta the script must fail and mention the option.
        status, output = utils.runscript(
            self.script_name, ['--word_size', '2'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_no_molecule(self):
        # Without --molecule the script must fail and mention the option.
        status, output = utils.runscript(
            self.script_name, ['--fasta', self.filename_pep])
        self.assertEqual(status, 2)
        self.assertIn('--molecule/-m', output)

    def test_arg_no_word_size(self):
        cli = ['--fasta', self.filename_pep, '--molecule', 'protein']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('--word_size', output)

    def test_arg_incompatible_args_protein_merge_revcomp(self):
        # --merge_revcomp combined with a protein molecule is refused.
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--molecule', 'protein', '--merge_revcomp']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('Incompatible arguments', output)

    def test_arg_distance_invalid_choice(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--molecule', 'protein', '--distance', 'nonexistent']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('invalid choice', output)

    def test_output_pep_word_size2(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--molecule', 'protein']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '79caa37b67848c52b41a8cb074d810e1')

    def test_output_pep_word_size2_reduce_alphabet(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--molecule', 'protein', '--reduce_alphabet']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '2e03fddfa6a10d810c3481fd53ada4a3')

    def test_output_pep_word_pattern2_reduce_alphabet(self):
        # Same expected md5 as the --word_size 2 run with --reduce_alphabet.
        cli = ['--fasta', self.filename_pep, '--molecule', 'protein']
        cli += ['--word_pattern', self.filename_pep_2mer, '--reduce_alphabet']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '2e03fddfa6a10d810c3481fd53ada4a3')

    def test_output_dna_word_size2(self):
        cli = ['--fasta', self.filename_dna, '--molecule', 'dna']
        cli += ['--word_size', '2']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '69d68abfe5cb8e855f77f9f8fff20178')

    def test_output_dna_word_size2_mergerevcomp(self):
        cli = ['--fasta', self.filename_dna, '--molecule', 'dna']
        cli += ['--word_size', '2', '--merge_revcomp']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'd3fd336b21aac9922ed7831b8d9f5f83')

    def test_output_dna_word_size2_mergerevcomp_reduce(self):
        cli = ['--fasta', self.filename_dna, '--molecule', 'dna']
        cli += ['--word_size', '2', '--merge_revcomp', '--reduce_alphabet']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '83fd63884c64c88ee3ff6e4eb2183e8b')
86 |
87 | if __name__ == '__main__':
88 | unittest.main()
89 |
--------------------------------------------------------------------------------
/tests/test_calc_word_rtd.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Run ``calc_word_rtd.py`` with various arguments and check the result."""

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # The filename_* attributes used below are presumably provided by
        # set_test_data() -- verify against tests/utils.
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_rtd.py'

    def test_arg_word_size_when_no_fasta(self):
        # Without --fasta the script must fail and mention the option.
        status, output = utils.runscript(
            self.script_name, ['--word_size', '2'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_fasta_when_no_word_size(self):
        status, output = utils.runscript(
            self.script_name, ['--fasta', self.filename_pep])
        self.assertEqual(status, 2)
        self.assertIn('Specify either: --word_size or --word_pattern.', output)

    def test_arg_word_pattern_invalid_format(self):
        # The plain 2-mer pattern file is rejected for lacking word
        # positions.
        cli = ['--fasta', self.filename_pep]
        cli += ['--word_pattern', self.filename_pep_2mer]
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('does not contain info on word positions', output)

    def test_arg_distance_invalid_choice(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--distance', 'nonexistent']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('invalid choice', output)

    def test_output_word_size_2(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '1e1a089908495d60275c039272e8e45f')

    def test_output_wordpattern(self):
        # Same expected md5 as the --word_size 2 run above.
        cli = ['--fasta', self.filename_pep]
        cli += ['--word_pattern', self.filename_pep_2mer_wordpos]
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '1e1a089908495d60275c039272e8e45f')

    def test_output_word_size_1(self):
        cli = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
        cli += ['--word_size', '1']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, 'b4f581dabfa83b2f1ff4f5d367865711')
58 |
59 |
60 | if __name__ == '__main__':
61 | unittest.main()
62 |
--------------------------------------------------------------------------------
/tests/test_calc_word_sets.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
    """Run ``calc_word_sets.py`` with various arguments and check the result.

    Every test method has a unique name: two methods sharing one name would
    leave only the later definition in the class, so the earlier test would
    silently never run.
    """

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # The filename_* attributes used below are presumably provided by
        # set_test_data() -- verify against tests/utils.
        utils.ScriptsWordCommonTest.set_test_data()
        self.script_name = 'calc_word_sets.py'

    def test_arg_word_size_when_no_fasta(self):
        # Without --fasta the script must fail and mention the option.
        args = ['--word_size', '2']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('--fasta/-f', out)

    def test_arg_fasta_when_no_wordsize(self):
        # Without --word_size the script must fail and mention the option.
        args = ['--fasta', self.filename_pep]
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('--word_size', out)

    def test_arg_word_size_too_small(self):
        # A negative word size is rejected.
        args = ['--fasta', self.filename_pep, '--word_size', '-1']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('Word size must be >= 1.', out)

    def test_arg_distance_invalid_choice(self):
        # An unknown --distance value is reported as an invalid choice.
        args = ['--fasta', self.filename_pep, '--word_size', '-1',
                '--distance', 'nonexistent']
        returncode, out = utils.runscript(self.script_name, args)
        self.assertEqual(returncode, 2)
        self.assertIn('invalid choice', out)

    def test_output_word_size2(self):
        # Word size 2 with the script's default distance.
        args = ['--fasta', self.filename_pep, '--word_size', '2']
        returncode, out, md5 = self._test_output(self.script_name, args)
        self.assertEqual(returncode, 0)
        self.assertEqual(md5, 'f1b4cf9538d2d2a2a4f1e81ac1b1251d')

    def test_output_word_size2_jaccard(self):
        # Word size 2 with the distance explicitly set to 'jaccard'.
        args = ['--fasta', self.filename_pep, '--word_size', '2',
                '--distance', 'jaccard']
        returncode, out, md5 = self._test_output(self.script_name, args)
        self.assertEqual(returncode, 0)
        self.assertEqual(md5, '7a744c4665ac06483c5eb36ee03d4fa8')
51 |
52 |
53 | if __name__ == '__main__':
54 | unittest.main()
55 |
--------------------------------------------------------------------------------
/tests/test_create_wordpattern.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from . import utils
4 |
5 |
class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
    """Run ``create_wordpattern.py`` with various arguments and check it."""

    def __init__(self, *args, **kwargs):
        super(ScriptTest, self).__init__(*args, **kwargs)
        # The filename_* attributes used below are presumably provided by
        # set_test_data() -- verify against tests/utils.
        utils.ScriptsCommonTest.set_test_data()
        self.script_name = 'create_wordpattern.py'

    def test_arg_word_size_when_no_fasta(self):
        # Without --fasta the script must fail and mention the option.
        status, output = utils.runscript(
            self.script_name, ['--word_size', '2'])
        self.assertEqual(status, 2)
        self.assertIn('--fasta/-f', output)

    def test_arg_word_size_0(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '0']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('--word_size must be >= 1', output)

    def test_arg_teiresias_when_no_l(self):
        # --teiresias given with neither --l nor --k.
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--teiresias']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('Teiresias requires --l', output)

    def test_arg_teiresias_when_no_k(self):
        # --teiresias with --l but without --k.
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--teiresias', '--l', '2']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('Teiresias requires --k', output)

    def test_arg_teiresias_when_k_and_not_l(self):
        # --teiresias with --k but without --l.
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--teiresias', '--k', '2']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('Teiresias requires --l', output)

    def test_teiresias_when_l_too_small(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--teiresias', '--k', '2', '--l', '1']
        status, output = utils.runscript(self.script_name, cli)
        self.assertEqual(status, 2)
        self.assertIn('--l must be at least 2', output)

    def test_output_word_size_2(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '2aea23ad3e883708dc2f95111f7f04ec')

    def test_output_word_size_2_wordpos(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '2']
        cli += ['--word_position']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '040e121be77617191c7d7c847edafc8e')

    def test_output_word_size_1(self):
        cli = ['--fasta', self.filename_pep, '--word_size', '1']
        status, _, digest = self._test_output(self.script_name, cli)
        self.assertEqual(status, 0)
        self.assertEqual(digest, '2d4dd98798cb6320975f6919fe43b777')
71 |
72 |
73 | if __name__ == '__main__':
74 | unittest.main()
75 |
--------------------------------------------------------------------------------
/tests/test_distance.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy import word_pattern
4 | from alfpy import word_vector
5 | from alfpy.utils import distance
6 | from alfpy.utils import distmatrix
7 |
8 | from . import utils
9 |
10 |
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ``alfpy.utils.distance.Distance`` on 2-mer word vectors."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # self.dna_records is presumably provided by set_test_data() --
        # verify against tests/utils.
        utils.ModulesCommonTest.set_test_data()
        self.pattern = word_pattern.create(self.dna_records.seq_list, 2)
        self.counts = word_vector.Counts(self.dna_records.length_list,
                                         self.pattern)
        self.freqs = word_vector.Freqs(self.dna_records.length_list,
                                       self.pattern)

    def _format_matrix(self, vector, disttype):
        # Build the distance matrix of `vector` under `disttype` for the DNA
        # test records and return its formatted text.
        dist = distance.Distance(vector, disttype)
        matrix = distmatrix.create(self.dna_records.id_list, dist)
        return matrix.format()

    def test_euclid_squared_counts(self):
        # The result of this method is identical to that from decaf+py.
        data = [' 3',
                'seq1 0.0000000 57.0000000 30.0000000',
                'seq2 57.0000000 0.0000000 19.0000000',
                'seq3 30.0000000 19.0000000 0.0000000']
        self.assertEqual(self._format_matrix(self.counts, 'euclid_squared'),
                         "\n".join(data))

    def test_euclid_squared_freqs(self):
        # The result of this method is identical to that from decaf+py.
        data = [' 3',
                'seq1 0.0000000 0.1416402 0.0641298',
                'seq2 0.1416402 0.0000000 0.0677565',
                'seq3 0.0641298 0.0677565 0.0000000']
        self.assertEqual(self._format_matrix(self.freqs, 'euclid_squared'),
                         "\n".join(data))

    def test_euclid_norm_counts(self):
        # The result of this method is identical to that from decaf+py.
        data = [' 3',
                'seq1 0.0000000 7.5498344 5.4772256',
                'seq2 7.5498344 0.0000000 4.3588989',
                'seq3 5.4772256 4.3588989 0.0000000']
        self.assertEqual(self._format_matrix(self.counts, 'euclid_norm'),
                         "\n".join(data))

    def test_euclid_norm_freqs(self):
        # The result of this method is identical to that from decaf+py.
        data = [' 3',
                'seq1 0.0000000 0.3763512 0.2532387',
                'seq2 0.3763512 0.0000000 0.2603008',
                'seq3 0.2532387 0.2603008 0.0000000']
        self.assertEqual(self._format_matrix(self.freqs, 'euclid_norm'),
                         "\n".join(data))

    def test_google_freqs(self):
        data = [' 3',
                'seq1 0.0000000 0.6078431 0.3809524',
                'seq2 0.6078431 0.0000000 0.3949580',
                'seq3 0.3809524 0.3949580 0.0000000']
        self.assertEqual(self._format_matrix(self.freqs, 'google'),
                         "\n".join(data))

    def test_get_disttypes(self):
        distlist = distance.Distance.get_disttypes()
        exp = ['euclid_norm', 'euclid_squared', 'google']
        self.assertListEqual(distlist, exp)

    def test_set_disttypes_throws_exception(self):
        dist = distance.Distance(self.freqs, 'google')
        with self.assertRaises(Exception) as context:
            dist.set_disttype('nonexistent')
        # The message is checked after the with-block: any statement placed
        # after the raising call inside the block would never execute.
        self.assertIn('unknown disttype', str(context.exception))
81 |
82 | if __name__ == '__main__':
83 | unittest.main()
84 |
--------------------------------------------------------------------------------
/tests/test_distmatrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import unittest
4 |
5 | from alfpy import word_distance
6 | from alfpy.utils import distmatrix
7 |
8 | from . import utils
9 |
10 |
class TestDistMatrix(unittest.TestCase):
    """Tests for ``alfpy.utils.distmatrix`` using a fixed 3x3 matrix."""

    def setUp(self):
        id_list = ['seq1', 'seq2', 'seq3']
        data = np.array([[0, 0.3531587, 0.35509333],
                         [0.3531587, 0, 0.295394],
                         [0.35509333, 0.295394, 0.]
                         ])
        self.matrix = distmatrix.Matrix(id_list, data)
        # Scratch file used by the write_to_file tests.
        self.output_filename = utils.get_test_data('distmatrix.txt')

    def _write_and_read_back(self, *write_args):
        # Write self.matrix to the scratch file (forwarding `write_args` to
        # write_to_file), return the file's text, and always remove the
        # file -- even when writing or reading raises.
        try:
            with open(self.output_filename, 'w') as oh:
                self.matrix.write_to_file(oh, *write_args)
            with open(self.output_filename) as fh:
                return fh.read()
        finally:
            if os.path.exists(self.output_filename):
                os.remove(self.output_filename)

    def test_format(self):
        exp = [
            ' 3',
            'seq1 0.0000000 0.3531587 0.3550933',
            'seq2 0.3531587 0.0000000 0.2953940',
            'seq3 0.3550933 0.2953940 0.0000000'
        ]
        self.assertEqual(self.matrix.format(), "\n".join(exp))

    def test_format_decimal3(self):
        exp = [
            ' 3',
            'seq1 0.000 0.353 0.355',
            'seq2 0.353 0.000 0.295',
            'seq3 0.355 0.295 0.000'
        ]
        self.assertEqual(self.matrix.format(3), "\n".join(exp))

    def test_min(self):
        self.assertEqual(self.matrix.min(), 0)

    def test_max(self):
        self.assertEqual(self.matrix.max(), 0.35509332999999998)

    def test_is_zero(self):
        self.assertFalse(self.matrix.is_zero())

    def test_normalize(self):
        self.matrix.normalize()
        exp = [
            " 3",
            "seq1 0.0000000 0.9945518 1.0000000",
            "seq2 0.9945518 0.0000000 0.8318771",
            "seq3 1.0000000 0.8318771 0.0000000",
        ]
        self.assertEqual(self.matrix.format(), "\n".join(exp))

    def test_write_to_file_phylip(self):
        # No format argument: write_to_file's default output format.
        result = self._write_and_read_back()
        exp = [
            ' 3',
            'seq1 0.0000000 0.3531587 0.3550933',
            'seq2 0.3531587 0.0000000 0.2953940',
            'seq3 0.3550933 0.2953940 0.0000000\n'
        ]
        self.assertEqual(result, "\n".join(exp))

    def test_write_to_file_pairwise(self):
        result = self._write_and_read_back('pairwise')
        exp = [
            "seq1\tseq2\t0.3531587",
            "seq1\tseq3\t0.3550933",
            "seq2\tseq3\t0.2953940\n"
        ]
        self.assertEqual(result, "\n".join(exp))

    def test_write_to_file_pairwise_decimal3(self):
        result = self._write_and_read_back('pairwise', 3)
        exp = [
            "seq1\tseq2\t0.353",
            "seq1\tseq3\t0.355",
            "seq2\tseq3\t0.295\n"
        ]
        self.assertEqual(result, "\n".join(exp))

    def test_iter(self):
        # Iterating the matrix is expected to yield one tuple per pair:
        # (row index, column index, row id, column id, distance).
        exp = [(0, 1, 'seq1', 'seq2', 0.35315869999999999),
               (0, 2, 'seq1', 'seq3', 0.35509332999999998),
               (1, 2, 'seq2', 'seq3', 0.29539399999999999)]
        self.assertEqual(list(self.matrix), exp)

    def test_create_matrix(self):
        counts = [[3, 6, 4, 1, 3, 4, 3, 0, 1, 1, 6, 4, 5, 0, 3, 4],
                  [0, 3, 0, 3, 0, 0, 0, 2, 9, 0, 3, 3, 0, 6, 3, 6],
                  [9, 0, 0, 3, 0, 0, 0, 2, 6, 0, 3, 3, 0, 3, 3, 3]]
        vector = np.array(counts)
        dist = word_distance.Distance(vector, 'minkowski')
        id_list = ['seq1', 'seq2', 'seq3']
        matrix = distmatrix.create(id_list, dist)
        exp = [
            ' 3',
            'seq1 0.0000000 14.6969385 14.1774469',
            'seq2 14.6969385 0.0000000 10.8166538',
            'seq3 14.1774469 10.8166538 0.0000000'
        ]
        self.assertEqual(matrix.format(), "\n".join(exp))

    def test_highcharts(self):
        self.assertEqual(len(self.matrix.highcharts()), 3)

    def test_read_highcharts_matrix(self):
        id_list = ['seq1', 'seq2', 'seq3']
        data = [[0, 1, 0.35, 0.19], [0, 2, 1.0, 0.55], [1, 2, 0.88, 0.48]]
        matrix = distmatrix.read_highcharts_matrix(id_list, data)
        md5 = utils.calc_md5(matrix.format())
        self.assertEqual(md5, "476c8f5d284a84ee3c7c419bde2d7658")
136 |
137 |
138 | if __name__ == '__main__':
139 | unittest.main()
140 |
--------------------------------------------------------------------------------
/tests/test_fasta.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from alfpy.utils import fasta
5 |
6 | from . import utils
7 |
8 |
class FastaTest(unittest.TestCase):
    """Tests for ``alfpy.utils.fasta``: records, reading, parsing, output."""

    def __init__(self, *args, **kwargs):
        super(FastaTest, self).__init__(*args, **kwargs)
        # Expected ids, descriptions and sequences; the read/parse tests
        # compare the records of 'pep.fa' against these, index by index.
        self.ID_LIST = ['seq1', 'seq2', 'seq3', 'seq4']
        self.DESC_LIST = ['seq1 desc', 'seq2 desc', 'seq3 desc', '']
        self.SEQ_LIST = [
            'MEVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDNTLKQVVTKLRE',
            'MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNIS',
            'MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW',
            'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
        ]

    def _validate_FastaRecord_init(self, fasta_record, seqidx):
        # Check that `fasta_record` matches the expected data at `seqidx`.
        self.assertIsInstance(fasta_record, fasta.FastaRecord)
        self.assertEqual(fasta_record.seq, self.SEQ_LIST[seqidx])
        self.assertEqual(fasta_record.id, self.ID_LIST[seqidx])
        self.assertEqual(fasta_record.description, self.DESC_LIST[seqidx])
        self.assertEqual(len(fasta_record), len(self.SEQ_LIST[seqidx]))

    def test_single_FastaRecord_init(self):
        r = fasta.FastaRecord(self.SEQ_LIST[0],
                              self.ID_LIST[0],
                              self.DESC_LIST[0])
        self._validate_FastaRecord_init(r, seqidx=0)

    def test_single_FastaRecord_iter(self):
        r = fasta.FastaRecord(self.SEQ_LIST[3],
                              self.ID_LIST[3],
                              self.DESC_LIST[3])
        i = iter(r)
        self.assertEqual(next(i), 'M')
        self.assertEqual(next(i), 'F')

    def test_single_FastaRecord_contains(self):
        r = fasta.FastaRecord(self.SEQ_LIST[3],
                              self.ID_LIST[3],
                              self.DESC_LIST[3])
        self.assertTrue('MFT' in r)

    def test_multiple_FastaRecord_init(self):
        fields = zip(self.SEQ_LIST, self.ID_LIST, self.DESC_LIST)
        for i, (seq, seqid, desc) in enumerate(fields):
            r = fasta.FastaRecord(seq, seqid, desc)
            self._validate_FastaRecord_init(r, seqidx=i)

    def test_read_fasta(self):
        with open(utils.get_test_data('pep.fa')) as fh:
            r = fasta.read(fh)
        self._validate_FastaRecord_init(r, seqidx=0)

    def test_parse_fasta(self):
        # The handle is closed by the with-block even if a check fails.
        with open(utils.get_test_data('pep.fa')) as fh:
            for i, rec in enumerate(fasta.parse(fh)):
                self._validate_FastaRecord_init(rec, seqidx=i)

    def test_to_dict(self):
        with open(utils.get_test_data('pep.fa')) as fh:
            d = fasta.to_dict(fasta.parse(fh))
        self.assertEqual(len(d), 4)

    def test_to_dict_value_error(self):
        # Two records share the id 'seq1'.
        h = ['>seq1\n', 'ATG\n', '>seq1\n', 'ATGC']
        with self.assertRaises(ValueError) as context:
            fasta.to_dict(fasta.parse(h))
        # The message is checked after the with-block: any statement placed
        # after the raising call inside the block would never execute.
        self.assertIn('Duplicate key', str(context.exception))

    def test_parse_fasta_missing_sequences(self):
        # The second record has an empty sequence; the first is followed by
        # extra blank lines.
        ids = ['seq1', 'seq2']
        seqs = ['ATGC', '']
        lines = ['>{}\n'.format(ids[0]),
                 '{}\n\n\n'.format(seqs[0]),
                 '>{}\n'.format(ids[1]),
                 '{}\n'.format(seqs[1])
                 ]
        for i, fasta_record in enumerate(fasta.parse(lines)):
            self.assertIsInstance(fasta_record, fasta.FastaRecord)
            self.assertEqual(fasta_record.seq, seqs[i])

    def test_fasta_format(self, wrap=70):
        # Two sequence lines of exactly `wrap` characters: formatting with
        # the same wrap width must reproduce the input text.
        lines = ['>seq1 seq1 desc\n',
                 'A' * wrap + '\n',
                 'B' * wrap]
        r = fasta.read(lines)
        self.assertEqual(''.join(lines), r.format(wrap=wrap))

    def test_input_output_file_fasta(self):
        # Write every record of 'pep.fa' to a scratch file, parse that file
        # again and compare the formatted records with the originals.
        temp_path = utils.get_test_data('temp.fa')
        try:
            l1 = []
            with open(temp_path, 'w') as oh:
                with open(utils.get_test_data('pep.fa')) as fh:
                    for seq_record in fasta.parse(fh):
                        l1.append(seq_record.format())
                        oh.write(seq_record.format())
                        oh.write('\n')
            with open(temp_path) as fh:
                l2 = [seq_record.format() for seq_record in fasta.parse(fh)]
        finally:
            # Remove the scratch file even when writing or parsing fails.
            if os.path.exists(temp_path):
                os.remove(temp_path)
        self.assertEqual(l1, l2)
116 |
117 |
118 | if __name__ == '__main__':
119 | unittest.main()
120 |
--------------------------------------------------------------------------------
/tests/test_fcgr.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from alfpy import fcgr
5 | from alfpy.utils import distmatrix
6 |
7 | from . import utils
8 |
9 |
class VectorTest(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ``fcgr.fcgr_vector`` and ``fcgr.create_vectors``."""

    def __init__(self, *args, **kwargs):
        super(VectorTest, self).__init__(*args, **kwargs)
        # self.dna_records is presumably provided by set_test_data() --
        # verify against tests/utils.
        utils.ModulesCommonTest.set_test_data()

    def test_fcgr_vector1(self):
        expected = [3.0, 6.0, 3.0]
        self.assertEqual(fcgr.fcgr_vector('CTAGGGAACATACCA', 1), expected)

    def test_fcgr_vector2(self):
        expected = [
            0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 0.0, 2.0,
            1.0, 2.0, 0.0, 1.0, 2.0, 1.0, 0.0,
        ]
        result = fcgr.fcgr_vector('CTAGGGAACATACCA', 2)
        self.assertEqual(result, expected)

    def test_fcgr_vector3(self):
        # Same sequence as in test_fcgr_vector1 with 'XX' inserted; the
        # expected vector is unchanged.
        expected = [3.0, 6.0, 3.0]
        self.assertEqual(fcgr.fcgr_vector('CTAGGGAACATACCXXA', 1), expected)

    def test_create_vectors(self):
        expected = [
            [0, 3, 1, 4, 0, 1, 1, 1, 1, 1, 3, 2, 4, 1, 1],
            [0, 0, 4, 1, 2, 2, 0, 0, 1, 4, 0, 0, 3, 1, 1],
            [0, 0, 2, 2, 1, 1, 0, 2, 1, 2, 0, 1, 2, 1, 0],
        ]
        vectors = fcgr.create_vectors(self.dna_records, 2)
        self.assertEqual(vectors.tolist(), expected)
36 |
37 |
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ``fcgr.Distance`` on vectors built from the DNA records."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # self.dna_records is presumably provided by set_test_data() --
        # verify against tests/utils.
        utils.ModulesCommonTest.set_test_data()

    def test_distance1(self):
        # No distance name is passed: fcgr.Distance's default is used.
        expected_rows = [
            " 3",
            "seq1 0.0000000 7.5498344 5.7445626",
            "seq2 7.5498344 0.0000000 4.2426407",
            "seq3 5.7445626 4.2426407 0.0000000"
        ]
        vectors = fcgr.create_vectors(self.dna_records, 2)
        measure = fcgr.Distance(vectors)
        result = distmatrix.create(self.dna_records.id_list, measure)
        self.assertEqual(result.format(), "\n".join(expected_rows))

    def test_distance2(self):
        expected_rows = [
            " 3",
            "seq1 0.0000000 0.5833333 0.5416667",
            "seq2 0.5833333 0.0000000 0.4210526",
            "seq3 0.5416667 0.4210526 0.0000000"
        ]
        vectors = fcgr.create_vectors(self.dna_records, 2)
        measure = fcgr.Distance(vectors, 'google')
        result = distmatrix.create(self.dna_records.id_list, measure)
        self.assertEqual(result.format(), "\n".join(expected_rows))
67 |
68 |
69 | if __name__ == '__main__':
70 | unittest.main()
71 |
--------------------------------------------------------------------------------
/tests/test_graphdna.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import unittest
3 |
4 | from alfpy import graphdna
5 | from alfpy.utils import distmatrix
6 |
7 | from . import utils
8 |
9 |
class VectorTest(unittest.TestCase, utils.ModulesCommonTest):
    """Checks the graphdna vector builders against precomputed values."""

    # Plain DNA sequence shared by most single-sequence tests.
    _SEQ = 'CTAGGGAACATACCA'
    # Same sequence with N/X symbols inserted in the middle.
    _SEQ_AMBIGUOUS = 'CTAGGGAANNNXXXCATACCA'

    # Expected _2DSGraphVector output for _SEQ.
    _EXP_2DS = [2.99197183, -8.04298066, 9.16666667, -5.78272208,
                6.5, -1.75064326, 5, -2.92241364, 9.25, -3.81343559]
    # Expected _2DMGraphVector output for _SEQ with 10 dimensions.
    _EXP_2DM = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756,
                23.55763468, 29.38627489, 37.08035601, 47.23633868,
                60.66394053]
    # MD5 of the stringified _2DNGraphVector output for _SEQ.
    _MD5_2DN = '44829cc0277531646d656cdaacd3ae94'

    def __init__(self, *args, **kwargs):
        super(VectorTest, self).__init__(*args, **kwargs)
        # Reads the DNA/protein FASTA fixtures and stores them as class
        # attributes of ModulesCommonTest (e.g. dna_records).
        utils.ModulesCommonTest.set_test_data()

    def test_2DSGraphVector(self):
        result = graphdna._2DSGraphVector(self._SEQ)
        self.assertTrue(np.allclose(result, np.array(self._EXP_2DS)))

    def test_2DSGraphVector_ambiguousDNA(self):
        # The N/X symbols must not change the vector: the expected values
        # are the ones used for the plain sequence.
        result = graphdna._2DSGraphVector(self._SEQ_AMBIGUOUS)
        self.assertTrue(np.allclose(result, np.array(self._EXP_2DS)))

    def test_2DMGraphVector_ndim10(self):
        result = graphdna._2DMGraphVector(self._SEQ, 10)
        self.assertEqual(result.shape, (10,))
        self.assertTrue(np.allclose(result, np.array(self._EXP_2DM)))

    def test_2DMGraphVector_ndim10_ambiguousDNA(self):
        # NOTE(review): despite its name this test feeds the plain sequence
        # (no ambiguous symbols), so it duplicates test_2DMGraphVector_ndim10
        # -- confirm whether an ambiguous input was intended.
        result = graphdna._2DMGraphVector(self._SEQ, 10)
        self.assertEqual(result.shape, (10,))
        self.assertTrue(np.allclose(result, np.array(self._EXP_2DM)))

    def test_2DMGraphVector_ndim5(self):
        # With 5 dimensions the expected values are the first five of the
        # 10-dimensional reference.
        result = graphdna._2DMGraphVector(self._SEQ, 5)
        self.assertEqual(result.shape, (5,))
        self.assertTrue(np.allclose(result, np.array(self._EXP_2DM[:5])))

    def test_2DNGraphVector(self):
        result = graphdna._2DNGraphVector(self._SEQ)
        digest = utils.calc_md5(result)
        self.assertEqual(len(result), 48)
        self.assertEqual(digest, self._MD5_2DN)

    def test_2DNGraphVector_ambiguousDNA(self):
        # NOTE(review): despite its name this test feeds the plain sequence
        # (no ambiguous symbols), so it duplicates test_2DNGraphVector
        # -- confirm whether an ambiguous input was intended.
        result = graphdna._2DNGraphVector(self._SEQ)
        digest = utils.calc_md5(result)
        self.assertEqual(len(result), 48)
        self.assertEqual(digest, self._MD5_2DN)

    def test_create_2DSGraphVectors(self):
        vectors = graphdna.create_2DSGraphVectors(self.dna_records)
        self.assertEqual(utils.calc_md5(vectors),
                         'e2399897bb7eaa5ca3a81c84e2eeac84')

    def test_create_2DMGraphVectors(self):
        vectors = graphdna.create_2DMGraphVectors(self.dna_records, 10)
        self.assertEqual(utils.calc_md5(vectors),
                         '8c7d4dca912aeaf7c88d325799dadf00')

    def test_create_2DNGraphVectors(self):
        vectors = graphdna.create_2DNGraphVectors(self.dna_records)
        self.assertEqual(utils.calc_md5(vectors),
                         '3211fc3837b876521a6ab8b6a22b411c')
85 |
86 |
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Compares graphdna distance matrices with stored reference output."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (dna_records, ...).
        utils.ModulesCommonTest.set_test_data()

    def _assert_matrix(self, vectors, expected_rows):
        # Build the distance matrix for the DNA fixture from `vectors` and
        # compare its formatted text with the expected rows.
        dist = graphdna.Distance(vectors)
        matrix = distmatrix.create(self.dna_records.id_list, dist)
        self.assertEqual(matrix.format(), "\n".join(expected_rows))

    def test_distance_2DSG(self):
        vectors = graphdna.create_2DSGraphVectors(self.dna_records)
        self._assert_matrix(vectors, (
            ' 3',
            'seq1 0.0000000 9.4762599 14.6585286',
            'seq2 9.4762599 0.0000000 6.7199568',
            'seq3 14.6585286 6.7199568 0.0000000',
        ))

    def test_distance_2DMG(self):
        vectors = graphdna.create_2DMGraphVectors(self.dna_records, 10)
        self._assert_matrix(vectors, (
            ' 3',
            'seq1 0.0000000 22.2449494 55.9753388',
            'seq2 22.2449494 0.0000000 34.2064423',
            'seq3 55.9753388 34.2064423 0.0000000',
        ))

    def test_distance_2DNG(self):
        vectors = graphdna.create_2DNGraphVectors(self.dna_records)
        self._assert_matrix(vectors, (
            ' 3',
            'seq1 0.0000000 10.3711467 15.1355787',
            'seq2 10.3711467 0.0000000 7.8973545',
            'seq3 15.1355787 7.8973545 0.0000000',
        ))
128 |
129 |
130 | if __name__ == '__main__':
131 | unittest.main()
132 |
--------------------------------------------------------------------------------
/tests/test_lempelziv.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy import lempelziv
4 | from alfpy.utils import distmatrix
5 |
6 | from . import utils
7 |
8 |
class VectorTest(unittest.TestCase, utils.ModulesCommonTest):
    """Checks the Lempel-Ziv complexity helpers against known counts."""

    # Short peptide used by the single-sequence complexity tests.
    _PEPTIDE = 'MFTDNAKIIIVQLNASVEINCTRPNNNTR'

    def __init__(self, *args, **kwargs):
        super(VectorTest, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()

    def test_complexity(self):
        self.assertEqual(lempelziv.complexity(self._PEPTIDE), 19)

    def test_complexity1(self):
        self.assertEqual(lempelziv.complexity1(self._PEPTIDE), 20)

    def test_complexities(self):
        # Distance keeps its precomputed complexities in `_complexity`;
        # keys are 1- or 2-element tuples of sequence indices.
        dist = lempelziv.Distance(self.pep_records)
        expected = [
            ((0,), 40), ((0, 1), 47), ((0, 2), 53), ((0, 3), 43),
            ((1,), 38), ((1, 0), 47), ((1, 2), 47), ((1, 3), 41),
            ((2,), 35), ((2, 0), 50), ((2, 1), 45), ((2, 3), 37),
            ((3,), 19), ((3, 0), 39), ((3, 1), 37), ((3, 2), 36),
        ]
        self.assertEqual(sorted(dist._complexity.items()), expected)
34 |
35 |
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Compares every Lempel-Ziv distance type with stored matrices."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()
        # Starts as the 'd' distance; individual tests switch the type.
        self.dist = lempelziv.Distance(self.pep_records, 'd')

    def _matrix(self):
        # Distance matrix for the protein fixture using the current disttype.
        return distmatrix.create(self.pep_records.id_list, self.dist)

    def test_distance_d(self):
        expected = "\n".join((
            " 4",
            "seq1 0 9 15 20",
            "seq2 9 0 10 18",
            "seq3 15 10 0 17",
            "seq4 20 18 17 0",
        ))
        self.assertEqual(self._matrix().format(decimal_places=0), expected)

    def test_distance_d_star(self):
        self.dist.set_disttype('d_star')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.2250000 0.3750000 0.5000000",
            "seq2 0.2250000 0.0000000 0.2631579 0.4736842",
            "seq3 0.3750000 0.2631579 0.0000000 0.4857143",
            "seq4 0.5000000 0.4736842 0.4857143 0.0000000",
        ))
        self.assertEqual(self._matrix().format(), expected)

    def test_distance_d1(self):
        self.dist.set_disttype('d1')
        expected = "\n".join((
            " 4",
            "seq1 0 16 28 23",
            "seq2 16 0 19 21",
            "seq3 28 19 0 19",
            "seq4 23 21 19 0",
        ))
        self.assertEqual(self._matrix().format(0), expected)

    def test_distance_d1_star(self):
        self.dist.set_disttype('d1_star')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.3404255 0.5283019 0.5348837",
            "seq2 0.3404255 0.0000000 0.4042553 0.5121951",
            "seq3 0.5283019 0.4042553 0.0000000 0.5135135",
            "seq4 0.5348837 0.5121951 0.5135135 0.0000000",
        ))
        self.assertEqual(self._matrix().format(), expected)

    def test_distance_d1_star2(self):
        self.dist.set_disttype('d1_star2')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.3404255 0.5436893 0.5609756",
            "seq2 0.3404255 0.0000000 0.4130435 0.5384615",
            "seq3 0.5436893 0.4130435 0.0000000 0.5205479",
            "seq4 0.5609756 0.5384615 0.5205479 0.0000000",
        ))
        self.assertEqual(self._matrix().format(), expected)

    def test_set_disttype_throws_exception(self):
        with self.assertRaises(Exception) as raised:
            self.dist.set_disttype('nonexitent')
        # Checked after the `with` block: a statement placed after the
        # raising call inside the block would never execute.
        self.assertIn('unknown disttype', str(raised.exception))
106 |
107 |
108 | if __name__ == '__main__':
109 | unittest.main()
110 |
--------------------------------------------------------------------------------
/tests/test_ncd.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy import ncd
4 | from alfpy.utils import distmatrix
5 |
6 | from . import utils
7 |
8 |
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Checks ncd complexities and the resulting distance matrix."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()

    def test_complexity1(self):
        # DNA input.
        self.assertEqual(ncd.complexity('AACGTACCATTGAACGTACCGTAGG'), 26)

    def test_complexity2(self):
        # Protein input.
        self.assertEqual(ncd.complexity('MFTDNAKIIIVQLNASVEINCTRPNNNTR'), 37)

    def test_complexities(self):
        # Distance keeps its precomputed complexities in `_complexity`;
        # keys are 1- or 2-element tuples of sequence indices.
        dist = ncd.Distance(self.pep_records)
        expected = [
            ((0,), 63.0), ((0, 1), 77.0), ((0, 2), 85.0), ((0, 3), 70.0),
            ((1,), 60.0), ((1, 2), 78.0), ((1, 3), 65.0),
            ((2,), 61.0), ((2, 3), 66.0),
            ((3,), 37.0),
        ]
        self.assertEqual(expected, sorted(dist._complexity.items()))

    def test_distance(self):
        dist = ncd.Distance(self.pep_records)
        matrix = distmatrix.create(self.pep_records.id_list, dist)
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.2698413 0.3809524 0.5238095",
            "seq2 0.2698413 0.0000000 0.2950820 0.4666667",
            "seq3 0.3809524 0.2950820 0.0000000 0.4754098",
            "seq4 0.5238095 0.4666667 0.4754098 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)
46 |
47 |
48 | if __name__ == '__main__':
49 | unittest.main()
50 |
--------------------------------------------------------------------------------
/tests/test_seqrecords.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy.utils import seqrecords
4 |
5 | from . import utils
6 |
7 |
class SeqRecordsTest(unittest.TestCase):
    """Tests for the SeqRecords container and its FASTA reading/writing."""

    def __init__(self, *args, **kwargs):
        super(SeqRecordsTest, self).__init__(*args, **kwargs)
        # Reference ids, descriptions and sequences; test_read_fasta expects
        # the 'pep.fa' fixture to contain exactly these records.
        self.ID_LIST = ['seq1', 'seq2', 'seq3', 'seq4']
        self.DESC_LIST = ['seq1 desc', 'seq2 desc', 'seq3 desc', '']
        self.SEQ_LIST = [
            'MEVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDNTLKQVVTKLRE',
            'MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNIS',
            'MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW',
            'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
        ]

    def _validate_seqrecords(self, rec):
        # Shared check: `rec` must expose the reference ids and sequences,
        # plus matching per-sequence lengths and record count.
        self.assertEqual(rec.id_list, self.ID_LIST)
        self.assertEqual(rec.seq_list, self.SEQ_LIST)
        self.assertEqual(rec.length_list, [len(s) for s in self.SEQ_LIST])
        self.assertEqual(rec.count, len(self.SEQ_LIST))

    def test_SeqRecords_init(self):
        rec = seqrecords.SeqRecords(
            id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
        self._validate_seqrecords(rec)

    def test_SeqRecords_add(self):
        # Build the container one record at a time.
        rec = seqrecords.SeqRecords()
        for seq_id, seq in zip(self.ID_LIST, self.SEQ_LIST):
            rec.add(seq_id, seq)
        self._validate_seqrecords(rec)

    def test_SeqRecords_len(self):
        rec = seqrecords.SeqRecords(
            id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
        self.assertEqual(len(rec), 4)

    def test_read_fasta(self):
        # The context manager closes the handle even if read_fasta raises.
        with open(utils.get_test_data('pep.fa')) as fh:
            rec = seqrecords.read_fasta(fh)
        self._validate_seqrecords(rec)

    def test_fasta(self):
        # FASTA output wrapped at 30 characters per sequence line.
        rec = seqrecords.SeqRecords(
            id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
        exp = [
            ">seq1",
            "MEVVIRSANFTDNAKIIIVQLNASVEINCT",
            "RPNNYTRKGIRIGPGRAVYAAEEIIGDNTL",
            "KQVVTKLRE",
            ">seq2",
            "MVIRSANFTDNAKIIIVQLNASVEINCTRP",
            "NNNTRKGIRIGPGRAVYAAEEIIGDIRRAH",
            "CNIS",
            ">seq3",
            "MFTDNAKIIIVQLNASVEINCTRPNNNTRK",
            "GIHIGPGRAFYATGEIIGDIRQAHCNISGA",
            "KW",
            ">seq4",
            "MFTDNAKIIIVQLNASVEINCTRPNNNTR"
        ]
        self.assertEqual(rec.fasta(wrap=30), "\n".join(exp))
70 |
71 | if __name__ == '__main__':
72 | unittest.main()
73 |
--------------------------------------------------------------------------------
/tests/test_wmetric.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy import wmetric
4 | from alfpy.utils import distmatrix
5 | from alfpy.utils.data import subsmat
6 |
7 | from . import utils
8 |
9 |
class VectorTest(unittest.TestCase):
    """Checks wmetric character counting and frequency helpers."""

    # Expected per-letter counts, in utils.ALPHABET_PEP order, for the
    # ten countable residues used by the tests below.
    _EXP_COUNTS = [0, 0, 0, 0, 1, 2, 1, 0, 1, 0,
                   1, 0, 0, 0, 2, 1, 0, 1, 0, 0]

    def test_count_seq_chars(self):
        counts = wmetric.count_seq_chars('MKSTGWHFSG', utils.ALPHABET_PEP)
        self.assertEqual(counts, self._EXP_COUNTS)

    def test_count_seq_chars_pep_ambiguous(self):
        # X and O are not in ALPHABET_PEP; the expected counts are the same
        # as for the sequence without them.
        counts = wmetric.count_seq_chars(
            'MKSTGWXXXXXXXOOOOOOOHFSG', utils.ALPHABET_PEP)
        self.assertEqual(counts, self._EXP_COUNTS)

    def test_freq_seq_chars(self):
        counts = wmetric.count_seq_chars(
            'MKSTGWXXXXXXXOOOOOOOHFSG', utils.ALPHABET_PEP)
        freqs = wmetric.freq_seq_chars(counts)
        expected = [0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.1, 0.0, 0.1, 0.0,
                    0.1, 0.0, 0.0, 0.0, 0.2, 0.1, 0.0, 0.1, 0.0, 0.0]
        self.assertEqual(freqs, expected)
31 |
32 |
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Compares W-metric distance matrices with stored reference output."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()

    def _format_for(self, subsmat_name):
        # Formatted distance matrix of the protein fixture under the named
        # substitution matrix.
        substitution = subsmat.get(subsmat_name)
        dist = wmetric.Distance(self.pep_records, substitution)
        return distmatrix.create(self.pep_records.id_list, dist).format()

    def test_wmetric_blosum62(self):
        # The result of this method is identical to that from decaf+py.
        expected = "\n".join((
            ' 4',
            'seq1 0.0000000 0.0392559 0.0783026 0.1261381',
            'seq2 0.0392559 0.0000000 0.0377364 0.1166475',
            'seq3 0.0783026 0.0377364 0.0000000 0.1677386',
            'seq4 0.1261381 0.1166475 0.1677386 0.0000000',
        ))
        self.assertEqual(self._format_for('blosum62'), expected)

    def test_wmetric_pam250(self):
        expected = "\n".join((
            ' 4',
            'seq1 0.0000000 0.0289700 0.0467580 0.0353781',
            'seq2 0.0289700 0.0000000 0.0227122 0.0372699',
            'seq3 0.0467580 0.0227122 0.0000000 0.0578383',
            'seq4 0.0353781 0.0372699 0.0578383 0.0000000',
        ))
        self.assertEqual(self._format_for('pam250'), expected)
61 |
62 |
63 | if __name__ == '__main__':
64 | unittest.main()
65 |
--------------------------------------------------------------------------------
/tests/test_word_d2.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy import word_d2
4 | from alfpy import word_pattern
5 | from alfpy import word_vector
6 | from alfpy.utils import distmatrix
7 |
8 | from . import utils
9 |
10 |
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Compares word_d2 distance matrices with stored reference output."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()
        self.patterns = []
        self.counts = []
        self.freqs = []
        sequences = self.pep_records.seq_list
        lengths = self.pep_records.length_list
        # One pattern / count vector / frequency vector per word size 1..4;
        # index 0 of each list therefore holds the 1-mer data.
        for word_size in range(1, 5):
            pattern = word_pattern.create(sequences, word_size)
            self.patterns.append(pattern)
            self.counts.append(word_vector.Counts(lengths, pattern))
            self.freqs.append(word_vector.Freqs(lengths, pattern))

    def _matrix(self, dist):
        # Distance matrix for the protein fixture under `dist`.
        return distmatrix.create(self.pep_records.id_list, dist)

    def test_counts_from1_to4(self):
        matrix = self._matrix(word_d2.Distance(self.counts))
        expected = "\n".join((
            ' 4',
            'seq1 0 130 236 286',
            'seq2 130 0 142 258',
            'seq3 236 142 0 212',
            'seq4 286 258 212 0',
        ))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_freqs_from1_to4(self):
        matrix = self._matrix(word_d2.Distance(self.freqs))
        expected = "\n".join((
            ' 4',
            'seq1 0.0000000 0.0313590 0.0573154 0.1020235',
            'seq2 0.0313590 0.0000000 0.0373677 0.0907196',
            'seq3 0.0573154 0.0373677 0.0000000 0.0870581',
            'seq4 0.1020235 0.0907196 0.0870581 0.0000000',
        ))
        self.assertEqual(matrix.format(), expected)

    def test_counts_from1_to1(self):
        # Only the 1-mer count vector is passed.
        matrix = self._matrix(word_d2.Distance([self.counts[0]]))
        expected = "\n".join((
            ' 4',
            'seq1 0 37 57 140',
            'seq2 37 0 28 137',
            'seq3 57 28 0 111',
            'seq4 140 137 111 0',
        ))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_freqs_from1_to4_d2_squareroot(self):
        dist = word_d2.Distance(self.freqs)
        dist.set_disttype('d2_squareroot')
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.1770847 0.2394063 0.3194113",
            "seq2 0.1770847 0.0000000 0.1933073 0.3011969",
            "seq3 0.2394063 0.1933073 0.0000000 0.2950560",
            "seq4 0.3194113 0.3011969 0.2950560 0.0000000",
        ))
        self.assertEqual(self._matrix(dist).format(), expected)

    def test_set_disttype_throws_exception(self):
        dist = word_d2.Distance(self.freqs)
        with self.assertRaises(Exception) as raised:
            dist.set_disttype('nonexistent')
        # Checked after the `with` block: a statement placed after the
        # raising call inside the block would never execute.
        self.assertIn('unknown disttype', str(raised.exception))
83 |
84 |
85 | if __name__ == '__main__':
86 | unittest.main()
87 |
--------------------------------------------------------------------------------
/tests/test_word_rtd.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from alfpy import word_pattern
4 | from alfpy import word_rtd
5 | from alfpy.utils import distmatrix
6 |
7 | from . import utils
8 |
9 |
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Checks word_rtd statistics, vector shape and distance output."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()
        # 2-mer pattern of the protein fixture; the third argument (True) is
        # passed positionally to word_pattern.create.
        self.pep_2mer_pos = word_pattern.create(
            self.pep_records.seq_list, 2, True)

    def test_calc_rtd(self):
        dna = 'CTACACAACTTTGCGGGTAGCCGGAAACATTGTGAATGCGGTGAACA'
        # Indices of every 'A' in the sequence.
        a_positions = [idx for idx, base in enumerate(dna) if base == 'A']
        expected = (3.3846153846153846, 3.1510306381944679)
        self.assertEqual(word_rtd.calc_rtd(a_positions), expected)

    def test_create_vector(self):
        # Expected shape: one row per sequence, two columns per pattern.
        n_records = self.pep_records.count
        vector = word_rtd.create_vector(n_records, self.pep_2mer_pos)
        expected_shape = (n_records, len(self.pep_2mer_pos.pat_list)*2)
        self.assertEqual(vector.shape, expected_shape)

    def test_distance(self):
        vector = word_rtd.create_vector(
            self.pep_records.count, self.pep_2mer_pos)
        dist = word_rtd.Distance(vector, 'google')
        matrix = distmatrix.create(self.pep_records.id_list, dist)
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.4892241 0.6034483 0.9310345",
            "seq2 0.4892241 0.0000000 0.3673469 0.8802817",
            "seq3 0.6034483 0.3673469 0.0000000 0.8843537",
            "seq4 0.9310345 0.8802817 0.8843537 0.0000000",
        ))
        self.assertEqual(matrix.format(), expected)
42 |
43 |
44 | if __name__ == '__main__':
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/tests/test_word_sets_distance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import unittest
3 |
4 | from alfpy import word_pattern
5 | from alfpy import word_sets_distance
6 | from alfpy.utils import distmatrix
7 |
8 | from . import utils
9 |
10 |
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Compares word-set distances with stored reference matrices."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        # Reads the FASTA fixtures into class attributes (pep_records, ...).
        utils.ModulesCommonTest.set_test_data()
        self.p = word_pattern.create(self.pep_records.seq_list, 2)

    def _matrix(self, disttype):
        # Distance matrix of the protein fixture for 2-letter words under
        # the given distance type.
        dist = word_sets_distance.Distance(self.pep_records, 2, disttype)
        return distmatrix.create(self.pep_records.id_list, dist)

    def test_getwords(self):
        found = word_sets_distance._getwords('ATGCGTA', 2)
        self.assertSetEqual(found, {'GT', 'CG', 'GC', 'AT', 'TG', 'TA'})

    def test_distance_dice(self):
        # The result of this function is identical
        # to the Dice distance implemented in word_bool_distance.
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.1964286 0.3928571 0.4457831",
            "seq2 0.1964286 0.0000000 0.2452830 0.4025974",
            "seq3 0.3928571 0.2452830 0.0000000 0.3766234",
            "seq4 0.4457831 0.4025974 0.3766234 0.0000000",
        ))
        self.assertEqual(self._matrix('dice').format(), expected)

    def test_distance_hamming(self):
        expected = "\n".join((
            " 4",
            "seq1 0 22 44 37",
            "seq2 22 0 26 31",
            "seq3 44 26 0 29",
            "seq4 37 31 29 0",
        ))
        self.assertEqual(self._matrix('hamming').format(0), expected)

    def test_distance_jaccard(self):
        # The result of this function is identical
        # to the Jaccard distance implemented in word_bool_distance.
        expected = "\n".join((
            " 4",
            "seq1 0.0000000 0.3283582 0.5641026 0.6166667",
            "seq2 0.3283582 0.0000000 0.3939394 0.5740741",
            "seq3 0.5641026 0.3939394 0.0000000 0.5471698",
            "seq4 0.6166667 0.5740741 0.5471698 0.0000000",
        ))
        self.assertEqual(self._matrix('jaccard').format(), expected)
61 |
62 |
63 | if __name__ == '__main__':
64 | unittest.main()
65 |
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import subprocess
4 |
5 | from alfpy.utils import seqrecords
6 | from alfpy import __version__
7 |
8 |
9 | ALPHABET_DNA = 'ATGC'
10 | ALPHABET_PEP = 'ACDEFGHIKLMNPRSTQWVY'
11 |
12 |
def get_test_data(filename):
    """Return the path of *filename* in the ``data`` directory next to this module."""
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    return os.path.join(data_dir, filename)
16 |
17 |
def calc_md5(obj):
    """Return the hexadecimal MD5 digest of ``str(obj)`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(str(obj).encode("utf-8"))
    return hasher.hexdigest()
20 |
21 |
def runscript(scriptname, args):
    """Run *scriptname* with *args* and capture its output.

    Returns a ``(returncode, output)`` tuple where ``output`` is the
    captured stdout text followed by the captured stderr text.
    """
    command = [scriptname] + list(args)
    process = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
    # communicate() waits for the process to finish, so returncode is set
    # by the time it is read below.
    stdout_text, stderr_text = process.communicate()
    return process.returncode, stdout_text + stderr_text
31 |
32 |
class ScriptsCommonTest:
    """Methods testing arguments that are common to all scripts.

    The test methods read ``self.script_name`` and call unittest assertion
    methods, neither of which is defined here, so the class using these
    methods must supply both.
    """

    @classmethod
    def set_test_data(cls):
        # Paths of the FASTA input files to read from.
        cls.filename_dna = get_test_data('dna.fa')
        cls.filename_pep = get_test_data('pep.fa')

    def test_arg_version(self):
        # --version must exit cleanly and print the package version.
        return_code, out = runscript(self.script_name, ['--version'])
        self.assertEqual(return_code, 0)
        self.assertIn(__version__, out)

    def test_arg_help(self):
        # --help must exit cleanly; its text is not inspected.
        return_code, _ = runscript(self.script_name, ['--help'])
        self.assertEqual(return_code, 0)

    def test_arg_out_when_no_fasta(self):
        # Without --fasta the script must exit with code 2 and name the
        # missing option in its output.
        return_code, out = runscript(self.script_name, ['--out', 'out.txt'])
        self.assertEqual(return_code, 2)
        self.assertIn('--fasta/-f', out)

    def test_arg_outfmt_when_no_fasta(self):
        # Same check as above, triggered through --outfmt.
        return_code, out = runscript(self.script_name,
                                     ['--outfmt', 'pairwise'])
        self.assertEqual(return_code, 2)
        self.assertIn('--fasta/-f', out)

    def _test_output(self, script_name, args, outfile=True):
        """Run a script and return ``(returncode, result, md5)``.

        ``args`` must contain ``'--fasta'`` followed by the input filename.
        With ``outfile`` true, ``--out <input>.out`` is added to the
        command, ``result`` is the content of that file and the file is
        deleted afterwards; otherwise ``result`` is the captured output of
        the script. ``md5`` is the MD5 hex digest of ``result``.

        Raises ValueError if ``'--fasta'`` is missing from ``args``.
        """
        # Work on a copy so the caller's list is never modified.
        args = list(args)
        input_filename = args[args.index('--fasta') + 1]
        output_filename = None
        if outfile:
            output_filename = '{}.out'.format(input_filename)
            args.extend(['--out', output_filename])
        returncode, result = runscript(script_name, args)
        if outfile:
            try:
                with open(output_filename) as fh:
                    result = fh.read()
            finally:
                # Clean up even when reading fails; the file may not exist
                # at all if the script itself failed.
                if os.path.exists(output_filename):
                    os.remove(output_filename)
        md5 = calc_md5(result)
        return returncode, result, md5
80 |
81 |
class ScriptsWordCommonTest(ScriptsCommonTest):
    """ScriptsCommonTest plus paths of the word-based fixture files."""

    @classmethod
    def set_test_data(cls):
        # Sets filename_dna / filename_pep on ScriptsCommonTest itself;
        # subclasses see them through normal attribute lookup.
        ScriptsCommonTest.set_test_data()
        cls.filename_char_weights = get_test_data('char_weights.txt')
        cls.filename_char_freqs = get_test_data('char_freqs.txt')
        # For each word size k, register the k-mer file with word positions
        # and the plain k-mer file of the protein fixture.
        for k in (1, 2, 3):
            stem = 'pep.fa.{}mer'.format(k)
            setattr(cls, 'filename_pep_{}mer_wordpos'.format(k),
                    get_test_data(stem + '.wordpos.txt'))
            setattr(cls, 'filename_pep_{}mer'.format(k),
                    get_test_data(stem + '.txt'))
98 |
class ModulesCommonTest:
    """Provides the DNA and protein FASTA fixtures used by module tests."""

    @classmethod
    def set_test_data(cls):
        """Load the fixtures and store them as class attributes.

        Sets ``dna_filename`` / ``pep_filename`` (paths of 'dna.fa' and
        'pep.fa') and ``dna_records`` / ``pep_records`` (the results of
        ``seqrecords.read_fasta`` on those files).
        """
        cls.dna_filename = get_test_data('dna.fa')
        cls.pep_filename = get_test_data('pep.fa')
        # Context managers close each handle even if parsing raises.
        with open(cls.dna_filename) as fh:
            cls.dna_records = seqrecords.read_fasta(fh)
        with open(cls.pep_filename) as fh:
            cls.pep_records = seqrecords.read_fasta(fh)
111 |
--------------------------------------------------------------------------------