├── .gitignore
├── LICENSE.txt
├── README.rst
├── alfpy
    ├── __init__.py
    ├── bbc.py
    ├── fcgr.py
    ├── graphdna.py
    ├── lempelziv.py
    ├── ncd.py
    ├── utils
    │   ├── __init__.py
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── seqcontent.py
    │   │   └── subsmat.py
    │   ├── distance.py
    │   ├── distmatrix.py
    │   ├── fasta.py
    │   └── seqrecords.py
    ├── version.py
    ├── wmetric.py
    ├── word_bool_distance.py
    ├── word_d2.py
    ├── word_distance.py
    ├── word_pattern.py
    ├── word_rtd.py
    ├── word_sets_distance.py
    └── word_vector.py
├── bin
    ├── calc_bbc.py
    ├── calc_fcgr.py
    ├── calc_graphdna.py
    ├── calc_lempelziv.py
    ├── calc_ncd.py
    ├── calc_wmetric.py
    ├── calc_word.py
    ├── calc_word_bool.py
    ├── calc_word_cv.py
    ├── calc_word_d2.py
    ├── calc_word_ffp.py
    ├── calc_word_rtd.py
    ├── calc_word_sets.py
    └── create_wordpattern.py
├── example_data
    ├── input
    │   ├── aminoacid.freqs.swissprot.txt
    │   ├── aminoacid.weights.txt
    │   ├── bears.dna.fasta
    │   ├── gp120.pep.fasta
    │   ├── hiv.pep.fasta
    │   ├── sample.dna.fasta
    │   └── sample.pep.fasta
    └── output
    │   ├── bears.dna.fasta.1mer
    │   ├── bears.dna.fasta.2mer
    │   ├── bears.dna.fasta.3mer
    │   ├── bears.dna.fasta.pairwise
    │   ├── bears.dna.fasta.phylip
    │   ├── bears.dna.fasta.teiresias.2mer
    │   ├── bears.dna.fasta.teiresias.3mer
    │   ├── gp120.pep.fasta.1mer
    │   ├── gp120.pep.fasta.2mer
    │   ├── gp120.pep.fasta.3mer
    │   ├── gp120.pep.fasta.pairwise
    │   ├── gp120.pep.fasta.phylip
    │   ├── gp120.pep.fasta.teiresias.2mer
    │   ├── gp120.pep.fasta.teiresias.3mer
    │   ├── hiv.pep.fasta.1mer
    │   ├── hiv.pep.fasta.2mer
    │   ├── hiv.pep.fasta.3mer
    │   ├── hiv.pep.fasta.pairwise
    │   ├── hiv.pep.fasta.phylip
    │   ├── hiv.pep.fasta.teiresias.2mer
    │   └── hiv.pep.fasta.teiresias.3mer
├── setup.py
└── tests
    ├── __init__.py
    ├── data
        ├── char_freqs.txt
        ├── char_weights.txt
        ├── dna.fa
        ├── dna.fa.1mer.txt
        ├── dna.fa.1mer.wordpos.txt
        ├── dna.fa.2mer.txt
        ├── dna.fa.2mer.wordpos.txt
        ├── pep.fa
        ├── pep.fa.1mer.txt
        ├── pep.fa.1mer.wordpos.txt
        ├── pep.fa.2mer.txt
        ├── pep.fa.2mer.wordpos.txt
        ├── pep.fa.3mer.txt
        └── pep.fa.3mer.wordpos.txt
    ├── test_bbc.py
    ├── test_calc_bbc.py
    ├── test_calc_fcgr.py
    ├── test_calc_graphdna.py
    ├── test_calc_lempelziv.py
    ├── test_calc_ncd.py
    ├── test_calc_wmetric.py
    ├── test_calc_word.py
    ├── test_calc_word_bool.py
    ├── test_calc_word_cv.py
    ├── test_calc_word_d2.py
    ├── test_calc_word_ffp.py
    ├── test_calc_word_rtd.py
    ├── test_calc_word_sets.py
    ├── test_create_wordpattern.py
    ├── test_distance.py
    ├── test_distmatrix.py
    ├── test_fasta.py
    ├── test_fcgr.py
    ├── test_graphdna.py
    ├── test_lempelziv.py
    ├── test_ncd.py
    ├── test_seqrecords.py
    ├── test_wmetric.py
    ├── test_word_bool_distance.py
    ├── test_word_d2.py
    ├── test_word_distance.py
    ├── test_word_pattern.py
    ├── test_word_rtd.py
    ├── test_word_sets_distance.py
    ├── test_word_vector.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | *~
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | env/
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *,cover
 49 | .hypothesis/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # IPython Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # dotenv
 82 | .env
 83 | 
 84 | # virtualenv
 85 | venv/
 86 | ENV/
 87 | 
 88 | # Spyder project settings
 89 | .spyderproject
 90 | 
 91 | # Rope project settings
 92 | .ropeproject
 93 | 
 94 | # My
 95 | test.py
 96 | 
 97 | 
 98 | # cache files for sublime text
 99 | *.tmlanguage.cache
100 | *.tmPreferences.cache
101 | *.stTheme.cache
102 | 
103 | # workspace files are user-specific
104 | *.sublime-workspace
105 | 
106 | # project files should be checked into the repository, unless a significant
107 | # proportion of contributors will probably not be using SublimeText
108 | # *.sublime-project
109 | 
110 | # sftp configuration file
111 | sftp-config.json
112 | 
113 | # Package control specific files
114 | Package Control.last-run
115 | Package Control.ca-list
116 | Package Control.ca-bundle
117 | Package Control.system-ca-bundle
118 | Package Control.cache/
119 | Package Control.ca-certs/
120 | bh_unicode_properties.cache
121 | 
122 | # Sublime-github package stores a github token in this file
123 | # https://packagecontrol.io/packages/sublime-github
124 | GitHub.sublime-settings
125 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2016 Andrzej Zielezinski, combio.pl, http://combio.pl/alfree
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | alfpy
  2 | =====
  3 | 
  4 | alfpy is a bioinformatics Python package that provides an alignment-free framework
  5 | to compare biological sequences (DNA/RNA/protein) and infer their
  6 | phylogenetic relationships.
  7 | 
  8 | alfpy also contains Python scripts with user-friendly command-line interfaces 
  9 | that let you compare unaligned FASTA sequences with more than 40 distance methods.
 10 | 
 11 | 
 12 | Latest source code
 13 | ------------------
 14 | The official source code repository is at: https://github.com/aziele/alfpy
 15 | 
 16 | 
 17 | Web sites
 18 | ---------
 19 | alfpy is also available as a web app: http://www.combio.pl/alfree
 20 | 
 21 | 
 22 | Requirements
 23 | ============
 24 | 
 25 | 1. Python (https://www.python.org/) version 2.7 or >= 3.3
 26 | 2. NumPy (http://www.numpy.org/).
 27 | 
 28 | 
 29 | Installation
 30 | ============
 31 | 
 32 | Option 1: Get the latest official version
 33 | -----------------------------------------
 34 | 
 35 | Install the latest official version with `pip <https://pip.pypa.io/en/stable/installing/>`_
 36 | ::
 37 | 
 38 |    sudo pip install alfpy
 39 | 
 40 | If you are not allowed to use `sudo`, install alfpy as user::
 41 | 
 42 |    pip install --user alfpy
 43 | 
 44 | 
 45 | 
 46 | Option 2: Get the latest development version
 47 | --------------------------------------------
 48 | 
 49 | Get it using this shell command, which requires Git::
 50 | 
 51 |    git clone https://github.com/aziele/alfpy.git
 52 | 
 53 | If you don't feel like using git, just download the package manually as a `zip archive <https://github.com/aziele/alfpy/archive/master.zip/>`_.
 54 | 
 55 | Unpack the zip package, go to the directory and run the installation::
 56 | 
 57 |    cd alfpy
 58 |    python setup.py install
 59 | 
 60 | or::
 61 | 
 62 |    python setup.py install --user
 63 | 
 64 | Alfpy usage
 65 | ===========
 66 | 
 67 | The examples of using Alfpy are available at: http://www.combio.pl/alfree/download/.
 68 | 
 69 | 
 70 | Testing
 71 | =======
 72 | 
 73 | To run tests, go to the alfpy source code directory and type::
 74 | 
 75 |     python -m unittest discover
 76 | 
 77 | 
 78 | If you want to test a specific file (e.g. ``test_word_distance.py``), type::
 79 | 
 80 |     python -m unittest tests.test_word_distance
 81 | 
 82 | 
 83 | Contact
 84 | =======
 85 | 
 86 | Drop us any feedback at: bioinfo@amu.edu.pl or on twitter `@a_zielezinski <https://twitter.com/a_zielezinski>`_.
 87 | 
 88 | License
 89 | =======
 90 | 
 91 | alfpy is under the MIT license; see ``LICENSE.txt``. Distribution, 
 92 | modification and redistribution, incorporation into other software,
 93 | and pretty much everything else is allowed.
 94 | 
 95 | 
 96 | .. |Travis| image:: https://travis-ci.org/aziele/alfpy.svg?branch=master
 97 |     :target: https://travis-ci.org/aziele/alfpy
 98 | 
 99 | 
100 | .. |PyPI| image:: https://img.shields.io/pypi/v/alfpy.svg?branch=master
101 |     :target: https://pypi.python.org/pypi/alfpy
102 | 
103 | .. |Landscape| image:: https://landscape.io/github/aziele/alfpy/master/landscape.svg?style=flat
104 |    :target: https://landscape.io/github/aziele/alfpy/master
105 |    :alt: Code Health
106 | 
107 | .. |Codecov| image:: https://codecov.io/gh/aziele/alfpy/branch/master/graph/badge.svg
108 |    :target: https://codecov.io/gh/aziele/alfpy
109 | 


--------------------------------------------------------------------------------
/alfpy/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import __version__
2 | 
3 | version = __version__


--------------------------------------------------------------------------------
/alfpy/bbc.py:
--------------------------------------------------------------------------------
  1 | """This module computes distances between DNA/protein sequences based on the
  2 | sequence feature, named Base-Base Correlation (BBC).
  3 | 
  4 | References:
  5 |     1. Liu, Zhi-Hua, et al. (2007) Bioinformatics and Biomedical Engineering,
  6 |        ICBBE. The 1st International Conference on. IEEE, 2007.
  7 |        doi: 10.1109/ICBBE.2007.98
  8 | 
  9 |     2. Liu Z, Meng J, Sun X. (2008) Biochem Biophys Res Commun. 368(2):223-30.
 10 |        doi: 10.1016/j.bbrc.2008.01.070.
 11 | 
 12 | Todo:
 13 |     * handle sequence symbols not included in molecule's alphabet
 14 | 
 15 | """
 16 | 
 17 | import numpy as np
 18 | 
 19 | from .utils import distance
 20 | 
 21 | 
 22 | def base_base_correlation(seq, k, alphabet=None):
 23 |     """Compute the base base correlation (BBC) vector for a sequence.
 24 | 
 25 |     Args:
 26 |         seq (str) : sequence
 27 |         k (int)   : parameter of the BBC. Intuitively, it represents
 28 |                     the maximum distance to observe correlation between bases.
 29 |         alphabet (str/list) : List of possible characters. This can be used to
 30 |                     avoid autodetection of the alphabet in the case where
 31 |                     sequences with missing letters are to be compared.
 32 | 
 33 |     Returns:
 34 |         numpy.ndarray: shape (1, 16) for DNA and (1, 400) for protein.
 35 | 
 36 |     Examples:
 37 |         >>> print(base_base_correlation('ATGCATGC', 1, 'ATGC'))
 38 |         [[
 39 |          -0.12547302 -0.12547302  0.2281059   0.17169665  0.01815213
 40 |          -0.12547302 -0.12547302  0.04258163  0.04258163  0.17169665
 41 |          -0.12547302 -0.12547302 -0.12547302  0.2281059   0.17169665
 42 |          -0.12547302
 43 |         ]]
 44 | 
 45 |     Note:
 46 |         A description of the method can be found here:
 47 |         http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=4272582
 48 | 
 49 |         This implementation is generalized for any sequence type.
 50 |     """
 51 | 
 52 |     s = seq
 53 | 
 54 |     if k > len(s) - 2:
 55 |         raise Exception("Sequence too short to compute BBC with "
 56 |                         "k={}".format(k))
 57 | 
 58 |     if alphabet is None:
 59 |         alphabet = set(s)
 60 |     else:
 61 |         s = "".join([c for c in s if c in alphabet])
 62 | 
 63 |     alphabet = sorted(list(alphabet))
 64 |     alphabet = dict(zip(alphabet, range(len(alphabet))))
 65 |     L = len(alphabet)
 66 | 
 67 |     # Compute the base probabilities for every character.
 68 |     p = np.zeros(L)
 69 |     for c in s:
 70 |         p[alphabet[c]] += 1
 71 |     p /= np.sum(p)
 72 |     p.shape = (1, L)
 73 | 
 74 |     bbc = np.zeros((L, L))
 75 |     for l in range(1, k + 2):
 76 |         # Compute $p_{ij}(l)$ representing the probability of
 77 |         # observing the bases i and j separated by l "gaps".
 78 |         # Compute it for all 16 combinations of alleles.
 79 |         l_dist_correlations = np.zeros((L, L))
 80 |         for i in range(len(s) - l):
 81 |             nuc1 = alphabet[s[i]]
 82 |             nuc2 = alphabet[s[i + l]]
 83 |             l_dist_correlations[nuc1][nuc2] += 1
 84 |         l_dist_correlations /= np.sum(l_dist_correlations)
 85 | 
 86 |         # Compute the D_{ij}(l) which is the deviation from
 87 |         # statistical independance.
 88 |         # $D_{ij}(l) = p_{ij}(l) - p_i p_j$
 89 |         D = l_dist_correlations - np.dot(p.T, p)
 90 | 
 91 |         bbc += D + (D ** 2 / 2 * np.dot(p.T ** 2, p ** 2)) + D ** 3
 92 | 
 93 |     # Flatten the bbc into a 16 feature vector.
 94 |     bbc.shape = (1, L * L)
 95 |     return bbc
 96 | 
 97 | 
 98 | def create_vectors(seq_records, k=10, alphabet="ATGC"):
 99 |     """Create BBC's vectors for multiple sequence records.
100 | 
101 |     Args:
102 |         seq_records (obj SeqRecords)
103 |     """
104 |     data = np.zeros(shape=(seq_records.count, len(alphabet)**2))
105 |     for seqidx, seq in enumerate(seq_records.seq_list):
106 |         vector = base_base_correlation(seq, k=k, alphabet=alphabet)
107 |         data[seqidx] = vector
108 |     return data
109 | 
110 | 
class Distance(distance.Distance):
    """Distance between BBC vectors.

    Thin subclass of `utils.distance.Distance` whose only change is that
    `disttype` defaults to 'euclid_norm'.
    """

    def __init__(self, vector, disttype='euclid_norm'):
        # All work is delegated to the base class.
        super(Distance, self).__init__(vector, disttype)
115 | 
116 | 
def main():
    """Obtain sequence records from `utils.seqrecords.main()` and display
    the distance matrix computed from their BBC vectors (k=10, DNA)."""
    from .utils import seqrecords
    from .utils import distmatrix

    records = seqrecords.main()
    bbc_vectors = create_vectors(records, 10, alphabet="ATGC")
    matrix = distmatrix.create(records.id_list, Distance(bbc_vectors))
    matrix.display()


if __name__ == '__main__':
    main()
129 | 


--------------------------------------------------------------------------------
/alfpy/fcgr.py:
--------------------------------------------------------------------------------
  1 | """This module computes distances between DNA sequences based on the Frequency
  2 | Chaos Game Representation (FCGR)
  3 | 
  4 | References:
  5 |     1. Hatje K, Kollmar M (2012) Front Plant Sci 3: 192.
  6 |        doi: 10.3389/fpls.2012.00192
  7 | 
  8 | 
  9 | Functions for creating DNA-representing vectors were built upon:
 10 |     Cheng J, Cao F, Liu Z. (2013) Mol Biol Evol. 2013 30(5):1032-7.
 11 |     doi: 10.1093/molbev/mst021.
 12 | 
 13 | """
 14 | 
 15 | import numpy as np
 16 | 
 17 | from .utils import distance
 18 | 
 19 | 
 20 | def fcgr_vector(dnaseq, word_size):
 21 |     """Create a FCGR vector representing a DNA sequence.
 22 | 
 23 |     Args:
 24 |         dnaseq (str/list): dna sequence
 25 |         word_size (int): word size (>= 1)
 26 | 
 27 |     Returns:
 28 |         list (length equals 4^word_size)
 29 | 
 30 |     Examples:
 31 |         >>> s = 'ATGCTGATGGATG'
 32 |         >>> print(fcgr_vector(s, 1))
 33 |         [5, 3, 5]
 34 | 
 35 |         >>> print(fcgr_vector(s, 2))
 36 |         [1, 0, 1, 0, 0, 0, 4, 0, 2, 2, 0, 0, 1, 3, 0]
 37 | 
 38 |     """
 39 |     ndata = pow(4, word_size)
 40 |     genlen = len(dnaseq)
 41 |     CGRs = np.zeros((genlen + 1, 2))
 42 | 
 43 |     Apoint = np.array((0.0, 1.0))
 44 |     Tpoint = np.array((1.0, 1.0))
 45 |     Gpoint = np.array((1.0, 0.0))
 46 |     Cpoint = np.array((0.0, 0.0))
 47 |     CGRs[0, 0] = 0.5
 48 |     CGRs[0, 1] = 0.5
 49 |     for i in range(0, genlen):
 50 |         if dnaseq[i] == 'A':
 51 |             CGRs[i + 1] = 0.5 * (CGRs[i] + Apoint)
 52 |         if dnaseq[i] == 'T':
 53 |             CGRs[i + 1] = 0.5 * (CGRs[i] + Tpoint)
 54 |         if dnaseq[i] == 'G':
 55 |             CGRs[i + 1] = 0.5 * (CGRs[i] + Gpoint)
 56 |         if dnaseq[i] == 'C':
 57 |             CGRs[i + 1] = 0.5 * (CGRs[i] + Cpoint)
 58 |     temp = 1.0 / pow(2, word_size)
 59 | 
 60 |     vectors = np.zeros(shape=(1, ndata))  # numpy
 61 |     vectors = [0.0] * ndata  # list
 62 | 
 63 |     for point in CGRs:
 64 |         xx = int(point[0] / temp)
 65 |         yy = int(point[1] / temp)
 66 |         if yy == pow(2, word_size):
 67 |             yy = pow(2, word_size) - 1
 68 |         vectors[yy * pow(2, word_size) + xx] += 1
 69 |     vectors.pop(0)
 70 |     return vectors
 71 | 
 72 | 
 73 | def create_vectors(seq_records, word_size):
 74 |     """Create a matrix of FCGR vectors.
 75 | 
 76 |     Args:
 77 |         seq_records (obj: SeqRecords)
 78 |         word_size (int): word size (>= 1)
 79 | 
 80 |     Returns:
 81 |         numpy.ndarray
 82 | 
 83 |     """
 84 |     data = np.zeros(shape=(seq_records.count, pow(4, word_size) - 1))
 85 |     for seqidx, seq in enumerate(seq_records.seq_list):
 86 |         vector = fcgr_vector(seq, word_size)
 87 |         data[seqidx] = vector
 88 |     return data
 89 | 
 90 | 
class Distance(distance.Distance):
    """Distance between FCGR vectors.

    Thin subclass of `utils.distance.Distance` whose only change is that
    `disttype` defaults to 'euclid_norm'.
    """

    def __init__(self, vector, disttype='euclid_norm'):
        # All work is delegated to the base class.
        super(Distance, self).__init__(vector, disttype)
 95 | 
 96 | 
 97 | def main():
 98 |     from .utils.seqrecords import main
 99 |     from .utils import distmatrix
100 |     seq_records = main()
101 | 
102 |     vector = create_vectors(seq_records, 1)
103 |     dist = Distance(vector)
104 |     matrix = distmatrix.create(seq_records.id_list, dist)
105 |     matrix.display()
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     main()
110 | 


--------------------------------------------------------------------------------
/alfpy/ncd.py:
--------------------------------------------------------------------------------
 1 | """Normalized compression distance (NCD)
 2 | 
 3 | The NCD is a family of distances parametrized with the compressor Z.
 4 | The better Z is, the closer the NCD approaches the NID, and the better
 5 | the results are.
 6 | 
 7 | As described in:
 8 | 1. Bennett, Gacs, Li, Vitanyi, Zurek
 9 |    IEEE Transactions on Information Theory 1998. 44(4):1407-1423
10 |    doi: 10.1109/18.681318
11 | 
12 | 2. Li, Chen, Li, Ma, Vitanyi
13 |    IEEE Transactions on Information Theory 2004. 50(12):3250-3264
14 |    doi: 10.1109/TIT.2004.838101
15 | 
16 | 3. https://en.wikipedia.org/wiki/Normalized_compression_distance
17 | 
18 | """
19 | import itertools
20 | import zlib
21 | 
22 | 
def complexity(s):
    """Return the size (number of bytes, as a float) of `s` after it has been
    UTF-8 encoded and compressed with zlib."""
    raw = s.encode("utf-8")  # zlib works on bytes (Python 3 fix).
    return float(len(zlib.compress(raw)))
29 | 
30 | 
class Distance():
    """Normalized compression distance between sequence records.

    Attributes:
        seq_records (obj SeqRecords): object providing `count` and `seq_list`
        numseqs (int): number of sequences (`seq_records.count`)
        _complexity (dict): compressed sizes keyed by `(i,)` for a single
            sequence and by `(i, j)` with i < j for concatenated pairs
    """

    def __init__(self, seq_records):
        self.seq_records = seq_records
        self.numseqs = seq_records.count
        # Precomputed complexity for input sequences
        # as well as all pairwise concatenated sequences.
        self._complexity = self.__precompute_complexity()

    def __precompute_complexity(self):
        """Return a dict with the complexity of every sequence and of every
        concatenated pair of sequences (i < j)."""
        d = {}
        seqs = self.seq_records.seq_list
        # Complexity for single input sequences.
        for seqidx, seq in enumerate(seqs):
            d[(seqidx,)] = complexity(seq)
        # Complexity for pairwise concatenated sequences.
        for i, j in itertools.combinations(range(self.numseqs), 2):
            d[(i, j)] = complexity(seqs[i] + seqs[j])
        return d

    def _pair_complexity(self, idx1, idx2):
        """Return the complexity of the concatenation of two sequences.

        Pairs are stored once, under the key with the smaller index first,
        so the arguments may come in any order. A pair that was not
        precomputed (e.g. a sequence with itself) is computed on first use
        and cached.
        """
        key = (idx1, idx2) if idx1 <= idx2 else (idx2, idx1)
        try:
            return self._complexity[key]
        except KeyError:
            seqs = self.seq_records.seq_list
            value = complexity(seqs[key[0]] + seqs[key[1]])
            self._complexity[key] = value
            return value

    def pairwise_distance(self, seq1idx, seq2idx):
        r"""Compute NCD between two sequences.

        Formula:
        NCD_Z(x,y) = \frac{Z(xy) - \min \{Z(x),Z(y)\}}{\max \{Z(x),Z(y)\}}.

        where:
        Z(x) is the binary length of the sequence `x` compressed
        with compressor Z

        Args:
            seq1idx (int): index of the first sequence
            seq2idx (int): index of the second sequence

        Returns:
            float

        Raises:
            KeyError: if an index does not refer to a known sequence.
        """
        zx = self._complexity[(seq1idx,)]
        zy = self._complexity[(seq2idx,)]
        zxy = self._pair_complexity(seq1idx, seq2idx)
        return (zxy - min([zx, zy])) / max([zx, zy])
69 | 
70 | 
if __name__ == '__main__':
    # Script entry point: obtain sequence records from
    # `utils.seqrecords.main()`, compute NCD for every pair of sequences and
    # display the result in 'pairwise' format.
    # The relative imports below only work when the module is run as part
    # of its package (e.g. `python -m alfpy.ncd`).
    from .utils import distmatrix
    from .utils.seqrecords import main
    seq_records = main()

    dist = Distance(seq_records)
    matrix = distmatrix.create(seq_records.id_list, dist)
    matrix.display('pairwise')
79 | 


--------------------------------------------------------------------------------
/alfpy/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/alfpy/utils/__init__.py


--------------------------------------------------------------------------------
/alfpy/utils/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/alfpy/utils/data/__init__.py


--------------------------------------------------------------------------------
/alfpy/utils/data/seqcontent.py:
--------------------------------------------------------------------------------
  1 | """Collections of various bits of useful sequence data."""
  2 | 
# Frequencies of sequence symbols, keyed by molecule type and then by symbol.
# NOTE(review): protein values are presumably background amino-acid
# frequencies from Swiss-Prot (cf. example_data/input/
# aminoacid.freqs.swissprot.txt) -- confirm the source.
FREQS = {
    'protein': {
        'A': 0.0826,
        'Q': 0.0393,
        'L': 0.0965,
        'S': 0.0659,
        'R': 0.0553,
        'E': 0.0674,
        'K': 0.0583,
        'T': 0.0534,
        'N': 0.0406,
        'G': 0.0708,
        'M': 0.0241,
        'W': 0.0109,
        'D': 0.0546,
        'H': 0.0227,
        'F': 0.0386,
        'Y': 0.0292,
        'C': 0.0137,
        'I': 0.0594,
        'P': 0.0471,
        'V': 0.0687,
        # Ambiguity codes: X gets 1; B and Z get the sum of the frequencies
        # of N + D and Q + E respectively.
        'X': 1,
        'B': 0.0406 + 0.0546,
        'Z': 0.0393 + 0.0674
    },
    # Nucleotides are given equal frequencies.
    'dna': {
        'A': 0.25,
        'C': 0.25,
        'G': 0.25,
        'T': 0.25
    },
    'rna': {
        'A': 0.25,
        'C': 0.25,
        'G': 0.25,
        'U': 0.25
    }
}
 42 | 
# Weights of sequence symbols, keyed by molecule type and then by symbol.
# The protein values match 0.1 / FREQS['protein'][symbol] (e.g.
# 0.1 / 0.0826 = 1.2106...), so rare amino acids weigh more. Only the 20
# standard amino acids are listed (no X, B, Z). Nucleotides all weigh 1.
WEIGHTS = {
    'protein': {
        'A': 1.2106537530266344,
        'C': 7.299270072992702,
        'E': 1.4836795252225519,
        'D': 1.8315018315018312,
        'G': 1.4124293785310733,
        'F': 2.590673575129534,
        'I': 1.6835016835016834,
        'H': 4.405286343612334,
        'K': 1.7152658662092626,
        'M': 4.149377593360996,
        'L': 1.0362694300518134,
        'N': 2.4630541871921183,
        'Q': 2.5445292620865136,
        'P': 2.123142250530785,
        'S': 1.5174506828528072,
        'R': 1.8083182640144662,
        'T': 1.8726591760299625,
        'W': 9.174311926605505,
        'V': 1.4556040756914121,
        'Y': 3.4246575342465753
    },
    'dna': {
        'A': 1,
        'C': 1,
        'G': 1,
        'T': 1
    },
    'rna': {
        'A': 1,
        'C': 1,
        'G': 1,
        'U': 1
    }

}
 80 | 
 81 | 
# Symbols of each molecule type as a single string.
# NOTE: there is no 'rna' entry here, unlike FREQS and WEIGHTS.
ALPHABET = {
    'dna': 'ATGC',
    'protein': 'ACDEFGHIKLMNPQRSTVWY'
}
 86 | 
# Mapping from a symbol to the symbol that represents its group in a
# reduced alphabet. Protein symbols that are not listed have no entry
# (presumably callers keep them unchanged -- TODO confirm against callers).
REDUCED_ALPHABET = {
    # DNA: A and G map to R, T and C map to Y
    # (IUPAC codes for purine and pyrimidine).
    'dna': {
        'A': 'R',
        'G': 'R',
        'T': 'Y',
        'C': 'Y'
    },
    'protein': {
        'T': 'S',
        'E': 'D',
        'Q': 'K',
        'R': 'K',
        'V': 'I',
        'L': 'I',
        'M': 'I',
        'W': 'F',
        'Y': 'F'
    }
}
106 | 
107 | 
def get_alphabet(mol):
    """Return the ALPHABET entry for molecule type `mol` (e.g. 'dna').

    Raises:
        KeyError: if `mol` is not a key of ALPHABET.
    """
    return ALPHABET[mol]
110 | 
111 | 
def get_freqs(mol):
    """Return the FREQS dict (symbol -> frequency) for molecule type `mol`.

    Raises:
        KeyError: if `mol` is not a key of FREQS.
    """
    return FREQS[mol]
114 | 
115 | 
def get_weights(mol):
    """Return the WEIGHTS dict (symbol -> weight) for molecule type `mol`.

    Raises:
        KeyError: if `mol` is not a key of WEIGHTS.
    """
    return WEIGHTS[mol]
118 | 
119 | 
def get_reduced_alphabet(mol):
    """Return the REDUCED_ALPHABET dict (symbol -> group symbol) for
    molecule type `mol`.

    Raises:
        KeyError: if `mol` is not a key of REDUCED_ALPHABET.
    """
    return REDUCED_ALPHABET[mol]
122 | 


--------------------------------------------------------------------------------
/alfpy/utils/distance.py:
--------------------------------------------------------------------------------
  1 | """This module contains a `Distance` class that combines vector
  2 | with distance function.
  3 | 
  4 | """
  5 | 
  6 | import math
  7 | import numpy as np
  8 | 
  9 | 
 10 | class Distance(object):
 11 |     """Combine sequences-representing 2-D array of vectors
 12 |     with a distance function.
 13 | 
 14 |     Attributes:
 15 |         _vector (ndarray)
 16 |         _disttype (str): distance method name
 17 |         pairwise_distance (func): distance method
 18 | 
 19 |     """
 20 | 
 21 |     def __getitem__(self, seqnum):
 22 |         return self._vector[seqnum]
 23 | 
 24 |     @classmethod
 25 |     def get_disttypes(cls):
 26 |         """Return a list of available distance function names.
 27 | 
 28 |         Returns:
 29 |             list of strings
 30 |         """
 31 |         l = [x[7:] for x, y in cls.__dict__.items() if x.startswith('pwdist')]
 32 |         l.sort()
 33 |         return l
 34 | 
 35 |     def set_disttype(self, disttype):
 36 |         try:
 37 |             pwdist_func = getattr(self, 'pwdist_{}'.format(disttype))
 38 |             self.pairwise_distance = pwdist_func
 39 |         # Method does not exist.
 40 |         except AttributeError:
 41 |             msg = 'unknown disttype "{}"'.format(disttype)
 42 |             raise ValueError(msg)
 43 | 
 44 |     def __init__(self, vector, disttype):
 45 |         """Create instance of Distance.
 46 | 
 47 |         Args:
 48 |             vector (ndarray)
 49 |             disttype (str)
 50 | 
 51 |         Examples:
 52 |         >>> vector
 53 |         [[ 3.  6.  4.  1.  3.  4.  3.  0.  1.  1.  6.  4.  5.  0.  3.  4.]
 54 |          [ 0.  3.  0.  3.  0.  0.  0.  2.  9.  0.  3.  3.  0.  6.  3.  6.]
 55 |          [ 9.  0.  0.  3.  0.  0.  0.  2.  6.  0.  3.  3.  0.  3.  3.  3.]]
 56 |         >>> disttype = 'minkowski'
 57 |         >>> dist = Distance(vector, disttype)
 58 | 
 59 |         """
 60 |         self.set_disttype(disttype)
 61 |         self._vector = vector
 62 |         self._disttype = disttype
 63 | 
 64 |     def pwdist_euclid_squared(self, seq1idx, seq2idx):
 65 |         """Squared Euclidean distance
 66 | 
 67 |         References:
 68 |             1. Blaisdell BE (1986) Proc Natl Acad Sci U S A 83: 5155-5159.
 69 |                doi: 10.1073/pnas.83.14.5155
 70 | 
 71 |         """
 72 |         value = np.sum((self[seq1idx] - self[seq2idx])**2)
 73 |         return value
 74 | 
 75 |     def pwdist_euclid_norm(self, seq1idx, seq2idx):
 76 |         """Euclidean distance
 77 | 
 78 |         References:
 79 |             1. Vinga & Almeida (2003) Bioinformatics 19(4): 513-523.
 80 |                doi: 10.1093/bioinformatics/btg005
 81 |             2. http://web.ist.utl.pt/susanavinga/NASC/
 82 | 
 83 |         """
 84 |         value = math.sqrt(self.pwdist_euclid_squared(seq1idx, seq2idx))
 85 |         return value
 86 | 
 87 |     def pwdist_google(self, seq1idx, seq2idx):
 88 |         """Normalized Google Distance (NGD).
 89 | 
 90 |         The maximum values for NGD is 1.0, which means two sequences are
 91 |         totally not similar to each other, and the minimum values for
 92 |         NGD is 0.0. Therefore, the similarity of the two sequences can be
 93 |         obtained by NGS = 1 - NGD. Two sequences are treated as two different
 94 |         web pages and the each word frequency represents terms found in each
 95 |         webpage.
 96 | 
 97 |         References:
 98 |             1. Lee & Rashid (2008) Information Technology, ITSim 2008.
 99 |                doi:10.1109/ITSIM.2008.4631601
100 | 
101 |         """
102 |         v1 = self[seq1idx]
103 |         v2 = self[seq2idx]
104 | 
105 |         sumwx = float(np.sum(v1))
106 |         sumwy = float(np.sum(v2))
107 | 
108 |         summin = float(np.sum(np.minimum(v1, v2)))
109 | 
110 |         ngd = (max([sumwx, sumwy]) - summin) / \
111 |             ((sumwx + sumwy) - min([sumwx, sumwy]))
112 |         return ngd
113 | 


--------------------------------------------------------------------------------
/alfpy/utils/distmatrix.py:
--------------------------------------------------------------------------------
  1 | """This module creates and handles distance matrices"""
  2 | 
  3 | import itertools
  4 | import numpy as np
  5 | import sys
  6 | 
  7 | 
  8 | def create(id_list, distance):
  9 |     """Create a distance matrix (as Matrix object).
 10 | 
 11 |     Calculate distance measures between all pairs of sequences.
 12 | 
 13 |     Args:
 14 |         id_list (list): list of sequence identifiers
 15 |         distance (obj): instance of distance.Distance
 16 | 
 17 |     Returns:
 18 |         Matrix object
 19 | 
 20 |     Examples:
 21 |         >>> vector
 22 |         [[ 3.  6.  4.  1.  3.  4.  3.  0.  1.  1.  6.  4.  5.  0.  3.  4.]
 23 |          [ 0.  3.  0.  3.  0.  0.  0.  2.  9.  0.  3.  3.  0.  6.  3.  6.]
 24 |          [ 9.  0.  0.  3.  0.  0.  0.  2.  6.  0.  3.  3.  0.  3.  3.  3.]]
 25 |         >>> disttype = 'minkowski'
 26 |         >>> dist = Distance(vector, disttype)
 27 |         >>> id_list = ['seq1', 'seq2', 'seq3']
 28 |         >>> matrix = create(id_list, dist)
 29 | 
 30 |     """
 31 |     size = len(id_list)
 32 |     rows = np.zeros([size, size])
 33 |     for i, j in itertools.combinations(range(size), 2):
 34 |         value = distance.pairwise_distance(i, j)
 35 |         rows[i][j] = value
 36 |         rows[j][i] = value
 37 |     # No need to calculate distances between the same sequences.
 38 |     # The distance should be zero.
 39 |     # for i in range(size):
 40 |     #    value = distance.pairwise_distance(i, i)
 41 |     #    rows[i][i] = value
 42 |     return Matrix(id_list, rows)
 43 | 
 44 | 
 45 | def read_highcharts_matrix(id_list, data):
 46 |     """Create a distance matrix from a matrix in Highcharts format.
 47 | 
 48 |     Args:
 49 |         id_list (list): list of sequence identifiers
 50 |         data (list of 4-element tuples)
 51 |             e.g. [[0, 1, 0.35, 0.19], [0, 2, 1.0, 0.55], [1, 2, 0.88, 0.48]]
 52 | 
 53 |     Returns:
 54 |         Matrix object
 55 |     """
 56 |     size = len(id_list)
 57 |     rows = np.zeros([size, size])
 58 |     for i, j, _, value in data:
 59 |         rows[i][j] = value
 60 |         rows[j][i] = value
 61 |     return Matrix(id_list, rows)
 62 | 
 63 | 
 64 | class Matrix():
 65 |     """Distance matrix
 66 | 
 67 |     Attributes:
 68 |         id_list (list): list of sequence identifiers
 69 |         data (ndarray): 2-D array of distance values between pairs of seqs
 70 | 
 71 |     """
 72 | 
 73 |     def __init__(self, id_list, data):
 74 |         """
 75 |         Example:
 76 |             >>> id_list = ['seq1', 'seq2', 'seq3']
 77 |             >>> data
 78 |             [[ 0.          0.3531587   0.35509333]
 79 |              [ 0.3531587   0.          0.295394  ]
 80 |              [ 0.35509333  0.295394    0.        ]]
 81 |             >>> matrix = Matrix(id_list, data)
 82 | 
 83 |         """
 84 |         self.id_list = id_list
 85 |         self.data = data
 86 | 
 87 |     def normalize(self):
 88 |         """Normalize distance values to 0-1 range."""
 89 |         self.data /= self.max()
 90 | 
 91 |     def __iter__(self):
 92 |         """Iterate over a distance matrix."""
 93 |         size = self.data.shape[0]
 94 |         for i, j in itertools.combinations(range(size), 2):
 95 |             yield i, j, self.id_list[i], self.id_list[j], self.data[i][j]
 96 | 
 97 |     def writer(self, handle, f, decimal_places):
 98 |         """Return a distance matrix as a string in `phylip` or `pairwise`
 99 |         formats.
100 | 
101 |         Args:
102 |             handle : output file / sys.stdout
103 |             f (str): phylip / pairwise
104 |             decimal_places (int): round distance value to decimal places
105 | 
106 |         """
107 |         if f == 'phylip':
108 |             handle.write("   {0}\n".format(len(self.id_list)))
109 |             for i, line in enumerate(self.data):
110 |                 # PHYLIP requires that each sequence identifier
111 |                 # is maximum 10 characters long.
112 |                 seqid = self.id_list[i][:10]
113 |                 l = ['{0:.{1}f}'.format(line[i], decimal_places)
114 |                      for i in range(0, len(line))]
115 |                 l.insert(0, '{0: <10}'.format(seqid))
116 |                 handle.write(" ".join(l) + "\n")
117 |         elif f == 'pairwise':
118 |             for _, _, seqid1, seqid2, distval in self:
119 |                 handle.write("{0}\t{1}\t{2:.{3}f}\n".format(seqid1, seqid2,
120 |                                                             distval,
121 |                                                             decimal_places))
122 | 
123 |     def display(self, f="phylip", decimal_places=7):
124 |         """Write a distance matrix to the screen."""
125 |         return self.writer(sys.stdout, f, decimal_places)
126 | 
127 |     def write_to_file(self, handle, f="phylip", decimal_places=7):
128 |         """Write a distance matrix to a file."""
129 |         return self.writer(handle, f, decimal_places)
130 | 
131 |     def highcharts(self):
132 |         """Return a distance matrix as a list in the Highcharts format."""
133 |         data = []
134 |         maxval = self.max()
135 |         for i, j, _, _, distval in self:
136 |             data.append([i, j, distval / maxval, distval])
137 |         return data
138 | 
139 |     def format(self, decimal_places=7):
140 |         lines = ["   {0}".format(len(self.id_list))]
141 |         for i, line in enumerate(self.data):
142 |             seqid = self.id_list[i][:10]
143 |             l = ['{0:.{1}f}'.format(line[i], decimal_places)
144 |                  for i in range(0, len(line))]
145 |             l.insert(0, '{0: <10}'.format(seqid))
146 |             lines.append("\n" + " ".join(l))
147 |         return "".join(lines)
148 | 
149 |     def min(self):
150 |         """Return minimum distance value in matrix"""
151 |         return np.amin(self.data)
152 | 
153 |     def max(self):
154 |         """Return maximum distance value in matrix"""
155 |         return np.amax(self.data)
156 | 
157 |     def is_zero(self):
158 |         """Return True if matrix contains only zeros"""
159 |         return not np.count_nonzero(self.data)
160 | 
161 |     def __repr__(self):
162 |         return str(self.data)
163 | 
164 | 
165 | 
if __name__ == '__main__':
    # Small demonstration: print a 3x3 matrix in PHYLIP and Highcharts form.
    demo_ids = ['seq1', 'seq2', 'seq3']
    demo_values = np.array([
        [0, 0.3531587, 0.35509333],
        [0.3531587, 0, 0.295394],
        [0.35509333, 0.295394, 0.],
    ])
    demo_matrix = Matrix(demo_ids, demo_values)
    print(demo_matrix.format())
    print(demo_matrix.highcharts())
176 | 


--------------------------------------------------------------------------------
/alfpy/utils/fasta.py:
--------------------------------------------------------------------------------
  1 | """Reading and writing FASTA format files"""
  2 | 
  3 | from itertools import groupby
  4 | 
  5 | 
  6 | class FastaRecord():
  7 |     """Object representing a Fasta (aka Pearson) record.
  8 | 
  9 |     Attributes:
 10 |         seq (str)         : Sequence
 11 |         id  (str)         : Sequence identifier
 12 |         description (str) : Sequence description
 13 |     """
 14 | 
 15 |     def __init__(self, seq, seqid, description=False):
 16 |         """Create a FastaRecord.
 17 | 
 18 |         Example:
 19 |             >>> import Fasta
 20 |             >>> record = FastaRecord(seq='MRELEAKAT',
 21 |             ...                      seqid='NP_055309.2',
 22 |             ...                      description='TNRC6A')
 23 |             >>> print(record)
 24 |             >NP_055309.2 TNRC6A
 25 |             MRELEAKAT
 26 |         """
 27 |         self.seq = seq
 28 |         self.id = seqid
 29 |         self.description = description
 30 | 
 31 |     def __iter__(self):
 32 |         """Iterate over the letters in the sequence.
 33 | 
 34 |         Example:
 35 |             >>> import Fasta
 36 |             >>> record = Fasta.read(open('sequence.fasta'))
 37 |             >>> for amino_acid in record:
 38 |             ...     print(amino_acid)
 39 |             M
 40 |             R
 41 |             E
 42 |             L
 43 |             E
 44 | 
 45 |             This is equivalent to iterating over the sequence directly:
 46 |             >>> for amino_acid in record.seq:
 47 |             ...     print(amino_acid)
 48 |             M
 49 |             R
 50 |             E
 51 |             L
 52 |             E
 53 |         """
 54 |         return iter(self.seq)
 55 | 
 56 |     def __contains__(self, char):
 57 |         """Implements the 'in' keyword, searches the sequence.
 58 | 
 59 |         Example:
 60 |             >>> import Fasta
 61 |             >>> record = Fasta.read(open('sequence.fasta'))
 62 |             >>> print('M' in record)
 63 |             True
 64 |         """
 65 |         return char in self.seq
 66 | 
 67 |     def __str__(self):
 68 |         """Return the record as a string in the fasta format.
 69 | 
 70 |         Example:
 71 |             >>> import Fasta
 72 |             >>> record = FastaRecord(seq='MRELEAKAT',
 73 |             ...                      id='NP_055309.2',
 74 |             ...                      description='TNRC6A')
 75 |             >>> print(record)
 76 |             >NP_055309.2 TNRC6A
 77 |             MRELEAKAT
 78 |         """
 79 |         return self.format(wrap=70)
 80 | 
 81 |     def __len__(self):
 82 |         """Return the length of the sequence.
 83 | 
 84 |         Example:
 85 |             >>> import Fasta
 86 |             >>> record = Fasta.read(open('sequence.fasta'))
 87 |             >>> len(record)
 88 |             1240
 89 |         """
 90 |         return len(self.seq)
 91 | 
 92 |     def format(self, wrap=70):
 93 |         """Return a formatted Fasta record.
 94 | 
 95 |         Example:
 96 |             >>> import Fasta
 97 |             >>> record = SeqRecord(seq='MRELEAKAT',
 98 |                                    id='NP_055309.2',
 99 |                                    description='TNRC6A')
100 |             >>> print(record.format())
101 |             >NP_055309.2 TNRC6A
102 |             MRELEAKAT
103 |         """
104 |         header = ">{0}".format(self.id)
105 |         if self.description:
106 |             header += " " + self.description
107 |         header += "\n"
108 |         wseq = []
109 |         for i in range(0, len(self.seq), wrap):
110 |             wseq.append(self.seq[i:i + wrap])
111 |         return header + "\n".join(wseq)
112 | 
113 | 
def parse(handle):
    """
    Generator function to iterate over Fasta records (as FastaRecord objects).

    handle - input file (or any iterable of lines) containing fasta
             sequences.

    A line whose first character is ">" starts a new record. The first
    whitespace-separated token of the header is the sequence identifier and
    the rest is the description. All following lines, up to the next
    header, are stripped and concatenated into the sequence.

    Lines found before the first header are ignored. A header that is not
    followed by any sequence line (including one at the very end of the
    input) yields a record with an empty sequence.
    """
    seqid = None
    desc = ""
    chunks = []
    for line in handle:
        if line.startswith(">"):
            # Flush the record collected so far before starting a new one.
            if seqid is not None:
                yield FastaRecord("".join(chunks), seqid, description=desc)
            header = line[1:].strip()
            fields = header.split(None, 1)
            seqid = fields[0] if fields else ""
            desc = fields[1].strip() if len(fields) > 1 else ""
            chunks = []
        elif seqid is not None:
            chunks.append(line.strip())
    if seqid is not None:
        yield FastaRecord("".join(chunks), seqid, description=desc)
127 | 
128 | 
def read(handle):
    """Return the first FastaRecord found in a sequence file.

    EXAMPLE:
    >>> from alfpy.utils import fasta
    >>> record = fasta.read(open('sequence.fasta'))
    >>> print(record.id)
    NP_055309.2
    >>> print(record.seq)
    MRELEAKAT

    If the handle contains no records, None is returned.
    If the handle contains more than one record, only the very first one
    is read.

    Use the parse(handle) function if you want
    to read multiple records from the handle.

    """
    return next(parse(handle), None)
153 | 
154 | 
def to_dict(sequences):
    """Index Fasta records by their identifier.

    - sequences: an iterator or list of objects exposing an `id`
      attribute (e.g. FastaRecord objects).

    Uses record.id as key.

    Raises ValueError as soon as an identifier occurs a second time.

    EXAMPLE:
    >>> from alfpy.utils import fasta
    >>> pdict = fasta.to_dict(fasta.parse(open('test.fa')))
    >>> print(sorted(pdict.keys()))
    ['gi|195354411|', 'tr|Q8SY33|']
    >>> print(pdict['tr|Q8SY33|'].description)
    Gawky, isoform A [Drosophila melanogaster]
    >>> len(pdict)
    2

    NOTE:
    This approach is not suitable for very large sets of sequences,
    as all the record objects are held in memory.

    """
    by_id = {}
    for rec in sequences:
        identifier = rec.id
        if identifier in by_id:
            raise ValueError("Duplicate key '{}'".format(identifier))
        by_id[identifier] = rec
    return by_id
187 | 
188 | 
if __name__ == '__main__':
    # Small demonstration: parse two in-memory records and print them.
    demo_lines = [
        '>seq1 desc1', 'ATGCTGATGATAGATG', 'ATGTAGA',
        '>seq2 desc2', 'ATGCTGCT',
    ]
    for demo_record in parse(demo_lines):
        print(demo_record)
194 | 


--------------------------------------------------------------------------------
/alfpy/utils/seqrecords.py:
--------------------------------------------------------------------------------
  1 | from . import fasta
  2 | 
  3 | 
  4 | class SeqRecords:
  5 |     """Object representing an ordered collection of sequence records.
  6 | 
  7 |     Attributes:
  8 |         id_list (list)  : List of sequence record identifiers
  9 |         seq_list (list) : List of sequence strings
 10 |         count (int)     : Number of sequence records
 11 | 
 12 |     """
 13 | 
 14 |     def __init__(self, id_list=None, seq_list=None):
 15 |         """Create a collection (may be empty) of sequence records.
 16 | 
 17 |         Example:
 18 |             >>> ids = ['seq1', 'seq2']
 19 |             >>> seqs = ['ATGCTG', 'TGCTGATAGTA']
 20 |             >>> seq_records = SeqRecords(id_list=ids, seq_list=seqs)
 21 |             >>> print seq_records
 22 |             SeqRecords (noseqs: 2)
 23 | 
 24 |         """
 25 |         self.count = 0 if not id_list else len(seq_list)
 26 |         self.id_list = id_list if id_list else []
 27 |         # Make all sequences uppercased.
 28 |         self.seq_list = [s.upper() for s in seq_list] if seq_list else []
 29 | 
 30 |     def add(self, seqid, seq):
 31 |         """Add a sequence record to the existing collection.
 32 | 
 33 |         Args:
 34 |             id (str)  : sequence identifier
 35 |             seq (str) : sequence string
 36 | 
 37 |         Example:
 38 |             >>> seq_record.add("seq3", "TGCTGA")
 39 |         """
 40 |         self.id_list.append(seqid)
 41 |         self.seq_list.append(seq.upper())
 42 |         self.count += 1
 43 | 
 44 |     def fasta(self, wrap=70):
 45 |         """Return sequence records as a mutli-FASTA string.
 46 | 
 47 |         Example:
 48 |             >>> ids = ['seq1', 'seq2']
 49 |             >>> seqs = ['ATGCTG', 'TGCTGATAGTA']
 50 |             >>> seq_records = SeqRecords(id_list=ids, seq_list=seqs)
 51 |             >>> print seq_records.fasta()
 52 |             >seq1
 53 |             ATGCTG
 54 |             >seq2
 55 |             TGCTGATAGTA
 56 |         """
 57 |         l = []
 58 |         for seqid, seq in self:
 59 |             seq_record = fasta.FastaRecord(seq=seq, seqid=seqid)
 60 |             l.append(seq_record.format(wrap=wrap))
 61 |         return "\n".join(l)
 62 | 
 63 |     @property
 64 |     def length_list(self):
 65 |         """Return a list of the sequences' length_list"""
 66 |         return [len(seq) for seq in self.seq_list]
 67 | 
 68 |     def __iter__(self):
 69 |         """
 70 |         Iterate over sequence records in the collection.
 71 | 
 72 |         Example:
 73 |             >>> for amino_acid in record:
 74 |             ...     print(amino_acid)
 75 |             seq1
 76 |             ATGCTG
 77 |             seq2
 78 |             TGCTGATAGTA
 79 |         """
 80 |         for i in range(self.count):
 81 |             seqid = self.id_list[i]
 82 |             seq = self.seq_list[i]
 83 |             yield seqid, seq
 84 | 
 85 |     def __len__(self):
 86 |         """
 87 |         Return the number of sequence records in the collection.
 88 | 
 89 |         Example:
 90 |             >>> len(seq_records)
 91 |             3
 92 |         """
 93 |         return len(self.seq_list)
 94 | 
 95 |     def __repr__(self):
 96 |         return "{0} (noseqs: {1})".format(self.__class__.__name__,
 97 |                                           self.count)
 98 | 
 99 | 
def read_fasta(handle):
    """Load every record of a Fasta file into a SeqRecords object.

    Args:
        handle : a file (iterable of lines) containing Fasta sequences.

    Returns:
        SeqRecords holding the identifiers and sequences in file order.

    """
    parsed = list(fasta.parse(handle))
    identifiers = [rec.id for rec in parsed]
    sequences = [rec.seq for rec in parsed]
    return SeqRecords(id_list=identifiers, seq_list=sequences)
113 | 
114 | 
def main():
    """Build three demo DNA records and round-trip them through a file.

    The records are written as FASTA to a uniquely named scratch file in
    the current directory, read back with `read_fasta` and returned.
    The scratch file is removed even if writing or parsing fails.

    Returns:
        SeqRecords read back from the scratch file.
    """
    import os
    import uuid

    seq_records = SeqRecords()
    seq_records.add(
        'seq1', 'AACGTACCATTGAACGTACCATTGAACGTACCATTGATGCATGGTAGAT')
    seq_records.add('seq2', 'CTAGGGGACTTATCTAGGGGACTTATCTAGGGGACTTAT')
    seq_records.add('seq3', 'CTAGGGAAAATTCTAGGGAAAATTCTAGGGAAAATT')

    outfilename = uuid.uuid4().hex
    try:
        # Context managers close the handles even when an error occurs.
        with open(outfilename, 'w') as oh:
            oh.write(seq_records.fasta())
        with open(outfilename) as fh:
            seq_records = read_fasta(fh)
    finally:
        if os.path.exists(outfilename):
            os.remove(outfilename)

    return seq_records


if __name__ == '__main__':
    seq_records = main()
    print(seq_records.fasta())
140 | 


--------------------------------------------------------------------------------
/alfpy/version.py:
--------------------------------------------------------------------------------
# The package version is stored in this dedicated module so that:
# 1) reading it does not load the dependencies that would be pulled in
#    if it were stored in __init__.py,
# 2) it can be imported in setup.py for the same reason,
# 3) it can be imported into any module.
__version__ = '1.0.6'


--------------------------------------------------------------------------------
/alfpy/wmetric.py:
--------------------------------------------------------------------------------
  1 | """Calculate distances between protein sequences based on the W-metric (Wm).
  2 | 
  3 | Reference:
  4 |     1. Vinga, Gouveia-Oliveira, Almeida. (2004) Bioinformatics. 20(2):206-215
  5 |        doi: 10.1093/bioinformatics/btg392
  6 | 
  7 | W-metric includes one-tuple composition information (the difference
  8 | in amino acid frequencies between two proteins) and weights from
  9 | the scoring matrices used in alignment methods.
 10 | 
 11 | """
 12 | import numpy as np
 13 | 
 14 | 
 15 | def count_seq_chars(seq, alphabet):
 16 |     """Count characters from given alphabet that are present in sequence.
 17 | 
 18 |     Args:
 19 |        seq (str): sequence
 20 |        alphabet (str/list): list of allowed characters
 21 | 
 22 |     Returns:
 23 |        A list of characters' counting occurrences.
 24 | 
 25 |     Examples:
 26 |        >>> alphabet = 'ACDEFGHIKLMNPQRSTVWY'
 27 |        >>> seq = 'MKSTGWHFSG'
 28 |        >>> print(count_seq_chars(seq, alphabet))
 29 |        [0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 0]
 30 | 
 31 |     """
 32 |     l = [0 for c in alphabet]
 33 |     for i, c in enumerate(alphabet):
 34 |         l[i] += seq.count(c)
 35 |     return l
 36 | 
 37 | 
 38 | def freq_seq_chars(counts):
 39 |     """Calculate frequencies of characters (symbols) in a sequence based on
 40 |     characters' counts.
 41 | 
 42 |     Args:
 43 |        counts (list): result of the `count_seq_chars` function
 44 |        seqlen (int):  length of a sequence
 45 | 
 46 |     Returns:
 47 |        A list of frequencies corresponding to alphabet
 48 | 
 49 |     Examples:
 50 |         >>> l = [0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 0]
 51 |         >>> print(freq_seq_chars(l))
 52 |         [0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.1,
 53 |          0.0, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0,
 54 |          0.0, 0.2, 0.1, 0.0, 0.1, 0.0]
 55 | 
 56 |     """
 57 |     seqlen = float(sum(counts))
 58 |     return [c / seqlen for c in counts]
 59 | 
 60 | 
 61 | def freq_seqs_chars(seq_records, alphabet):
 62 |     """Calculate frequencies of characters from given alphabet
 63 |     for multiple sequences (stored as seq_records object).
 64 | 
 65 |     Args:
 66 |        seq_records (obj): instance of SeqRecords()
 67 |        alphabet (list): list of allowed characters
 68 | 
 69 |     Returns:
 70 |        numpy.ndarray
 71 |     """
 72 |     l = []
 73 |     for i in range(seq_records.count):
 74 |         seq = seq_records.seq_list[i]
 75 |         counts = count_seq_chars(seq, alphabet)
 76 |         freq = freq_seq_chars(counts)
 77 |         l.append(freq)
 78 |     return np.array(l)
 79 | 
 80 | 
 81 | class Distance:
 82 |     """Combine vector with a distance function.
 83 | 
 84 |     Attributes:
 85 |         freqs (ndarray): matrix of sequence-representing vectors
 86 |         matrix (ndarray): substitution matrix for amino acid changes
 87 | 
 88 |     """
 89 | 
 90 |     def __init__(self, seq_records, matrix):
 91 |         """Create a instance of Distance.
 92 | 
 93 |         Args:
 94 |             seq_records (obj: seqrecords.SeqRecords)
 95 |             matrix (obj: utils.data.subsmat.SubsMat)
 96 | 
 97 |         Examples:
 98 |             >>> from .utils.data import subsmat
 99 |             >>> from .utils.seqrecords import SeqRecords
100 |             >>> matrix = subsmat.get('blosum62')
101 |             >>> seq_records = SeqRecords()
102 |             >>> seq_records.add('seq1', 'MKSTGWHF')
103 |             >>> seq_records.add('seq2', 'MKSSSSTGWGWG')
104 |             >>> seq_records.add('seq3', 'MKSTLKNGTEQ')
105 | 
106 |             >>> dist = Distance(seq_records, matrix)
107 | 
108 |         """
109 | 
110 |         self.freqs = freq_seqs_chars(seq_records, matrix.alphabet_list)
111 |         self.matrix = matrix
112 | 
113 |     def pairwise_distance(self, seqnum1, seqnum2):
114 |         """Compute W-metric between two proteins.
115 | 
116 |         The distance is defined by one-tuple frequencies
117 |         fx and fy of two proteins, weighted by matrix W.
118 | 
119 |         Formula:
120 |         d^{w} = \sum_{i\in A}\sum_{j\in A}(f_{i}^{X}-f_{i}^{y})
121 |         \cdot (f_{j}^{X}-f_{j}^{y})\cdot w_{ij}
122 | 
123 |         """
124 |         freqs1 = self.freqs[seqnum1]
125 |         freqs2 = self.freqs[seqnum2]
126 |         f = freqs1 - freqs2
127 |         m = np.outer(f, f) * self.matrix.data
128 |         return np.sum(m)
129 | 
130 | 
def main():
    """Demo: print the W-metric distance matrix of three short proteins."""
    from .utils import distmatrix
    from .utils.data import subsmat
    from .utils.seqrecords import SeqRecords

    substitution_matrix = subsmat.get('blosum62')

    seq_records = SeqRecords()
    demo_proteins = (
        ('seq1', 'MKSTGWHF'),
        ('seq2', 'MKSSSSTGWGWG'),
        ('seq3', 'MKSTLKNGTEQ'),
    )
    for seqid, seq in demo_proteins:
        seq_records.add(seqid, seq)

    dist = Distance(seq_records, substitution_matrix)

    distance_matrix = distmatrix.create(seq_records.id_list, dist)
    distance_matrix.display()


if __name__ == '__main__':
    main()
152 | 


--------------------------------------------------------------------------------
/alfpy/word_bool_distance.py:
--------------------------------------------------------------------------------
  1 | """Distance methods between two boolean vectors (representing word
  2 | occurrences).
  3 | 
  4 | References:
  5 |     1. SciPy, https://www.scipy.org
  6 | 
  7 | """
  8 | 
  9 | import numpy as np
 10 | 
 11 | from .utils import distance
 12 | 
 13 | 
 14 | def _nbool_correspond_ft_tf(u, v):
 15 |     """Function used by some distance methods (in Distance class).
 16 |     Based on: https://github.com/scipy/scipy
 17 | 
 18 |     Args:
 19 |         u (numpy.ndarray) : boolean vector, shape: (N, 1)
 20 |         v (numpy.ndarray) : as above
 21 | 
 22 |     Returns:
 23 |         tuple of two numbers
 24 | 
 25 |     Examples:
 26 |         >>> u = np.array([True, False, True])
 27 |         >>> v = np.array([True, True, False])
 28 |         >>> print(_nbool_correspond_ft_tf(u, v))
 29 |         (1, 1)
 30 | 
 31 |     """
 32 |     not_u = ~u
 33 |     not_v = ~v
 34 |     nft = (not_u & v).sum()
 35 |     ntf = (u & not_v).sum()
 36 |     return (nft, ntf)
 37 | 
 38 | 
 39 | def _nbool_correspond_all(u, v):
 40 |     """Function used by some distance methods (in Distance class).
 41 |     Based on: https://github.com/scipy/scipy
 42 | 
 43 |     Args:
 44 |         u (numpy.ndarray) : bool, shape: (N, )
 45 |         v (numpy.ndarray) : as above
 46 | 
 47 |     Returns:
 48 |         tuple of four numbers
 49 | 
 50 |     Examples:
 51 |         >>> u = np.array([True, False, True])
 52 |         >>> v = np.array([True, True, False])
 53 |         >>> print(_nbool_correspond_all(u, v))
 54 |         (0, 1, 1, 1)
 55 | 
 56 |     """
 57 |     not_u = ~u
 58 |     not_v = ~v
 59 |     nff = (not_u & not_v).sum()
 60 |     nft = (not_u & v).sum()
 61 |     ntf = (u & not_v).sum()
 62 |     ntt = (u & v).sum()
 63 |     return (nff, nft, ntf, ntt)
 64 | 
 65 | 
 66 | class Distance(distance.Distance):
 67 |     """Combine vector boolean data (numpy.ndarray) with distance method.
 68 | 
 69 |     """
 70 | 
 71 |     def pwdist_dice(self, seq1idx, seq2idx):
 72 |         """Compute the Dice dissimilarity (Sorensen-Dice coefficient)
 73 |         between two boolean 1-D arrays.
 74 | 
 75 |         Returns:
 76 |             distance value (double)
 77 | 
 78 |         """
 79 |         u = self[seq1idx]
 80 |         v = self[seq2idx]
 81 |         ntt = (u & v).sum()
 82 |         (nft, ntf) = _nbool_correspond_ft_tf(u, v)
 83 |         return float(ntf + nft) / float(2.0 * ntt + ntf + nft)
 84 | 
 85 |     def pwdist_yule(self, seq1idx, seq2idx):
 86 |         """Compute the Yule dissimilarity between two boolean 1-D arrays.
 87 | 
 88 |         Returns:
 89 |             distance value (double)
 90 | 
 91 |         """
 92 |         u = self[seq1idx]
 93 |         v = self[seq2idx]
 94 |         (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
 95 |         return float(2.0 * ntf * nft) / float(ntt * nff + ntf * nft)
 96 | 
 97 |     def pwdist_rogerstanimoto(self, seq1idx, seq2idx):
 98 |         """Compute the Rogers-Tanimoto dissimilarity between two boolean
 99 |         1-D arrays.
100 | 
101 |         Returns:
102 |             distance value (double)
103 | 
104 |         """
105 |         u = self[seq1idx]
106 |         v = self[seq2idx]
107 |         (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
108 |         r = float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft)))
109 |         return r
110 | 
111 |     def pwdist_russellrao(self, seq1idx, seq2idx):
112 |         """Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.
113 | 
114 |         Returns:
115 |             distance value (double)
116 | 
117 |         """
118 |         u = self[seq1idx]
119 |         v = self[seq2idx]
120 | 
121 |         ntt = (u & v).sum()
122 |         return float(len(u) - ntt) / float(len(u))
123 | 
124 |     def pwdist_sokalmichener(self, seq1idx, seq2idx):
125 |         """Compute the Sokal-Michener dissimilarity
126 |         between two boolean 1-D arrays.
127 | 
128 |         Returns:
129 |             distance value (double)
130 | 
131 |         """
132 |         u = self[seq1idx]
133 |         v = self[seq2idx]
134 |         ntt = (u & v).sum()
135 |         nff = (~u & ~v).sum()
136 |         (nft, ntf) = _nbool_correspond_ft_tf(u, v)
137 |         return float(2.0 * (ntf + nft)) / float(ntt + nff + 2.0 * (ntf + nft))
138 | 
139 |     def pwdist_sokalsneath(self, seq1idx, seq2idx):
140 |         """Compute the Sokal-Sneath dissimilarity
141 |         between two boolean 1-D arrays.
142 | 
143 |         Returns:
144 |             distance value (double)
145 | 
146 |         """
147 |         u = self[seq1idx]
148 |         v = self[seq2idx]
149 |         ntt = (u & v).sum()
150 | 
151 |         (nft, ntf) = _nbool_correspond_ft_tf(u, v)
152 |         denom = ntt + 2.0 * (ntf + nft)
153 |         if denom == 0:
154 |             raise ValueError('Sokal-Sneath dissimilarity is not defined for '
155 |                              'vectors that are entirely false.')
156 |         return float(2.0 * (ntf + nft)) / denom
157 | 
158 |     def pwdist_jaccard(self, seq1idx, seq2idx):
159 |         """Compute the Jaccard-Needham dissimilarity
160 |         between two boolean 1-D arrays.
161 | 
162 |         Returns:
163 |             distance value (double)
164 | 
165 |         """
166 |         u = self[seq1idx]
167 |         v = self[seq2idx]
168 |         dist = (np.double(np.bitwise_and((u != v),
169 |                 np.bitwise_or(u != 0, v != 0)).sum()) /
170 |                 np.double(np.bitwise_or(u != 0, v != 0).sum()))
171 |         return dist
172 | 
173 |     def pwdist_hamming(self, seq1idx, seq2idx):
174 |         """Compute the Hamming distance between two 1-D arrays.
175 | 
176 |         The Hamming distance between 1-D arrays `u` and `v`, is simply the
177 |         proportion of disagreeing components in `u` and `v`.
178 | 
179 |         Returns:
180 |             distance value (double)
181 | 
182 |         """
183 |         u = self[seq1idx]
184 |         v = self[seq2idx]
185 |         return (u != v).mean()
186 | 
187 |     def pwdist_kulsinski(self, seq1idx, seq2idx):
188 |         """Compute the Kulsinski dissimilarity between two boolean 1-D arrays.
189 | 
190 |         Returns:
191 |             distance value (double)
192 | 
193 |         """
194 |         u = self[seq1idx]
195 |         v = self[seq2idx]
196 |         n = float(len(u))
197 |         (_nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
198 |         return (ntf + nft - ntt + n) / (ntf + nft + n)
199 | 
200 | 
def main():
    """Demo: print the Jaccard distance matrix of three short proteins
    based on the presence/absence of their 2-letter words."""
    from .utils.seqrecords import SeqRecords
    from . import word_vector
    from . import word_pattern
    from .utils import distmatrix

    seq_records = SeqRecords()
    demo_proteins = (
        ('seq1', 'MKSTGWHF'),
        ('seq2', 'MKSSSSTGWGWG'),
        ('seq3', 'MKSTLKNGTEQ'),
    )
    for seqid, seq in demo_proteins:
        seq_records.add(seqid, seq)

    pattern = word_pattern.create(seq_records.seq_list, 2)
    bools = word_vector.Bools(seq_records.length_list, pattern)
    dist = Distance(bools, 'jaccard')
    distance_matrix = distmatrix.create(seq_records.id_list, dist)
    distance_matrix.display()


if __name__ == '__main__':
    main()
221 | 


--------------------------------------------------------------------------------
/alfpy/word_d2.py:
--------------------------------------------------------------------------------
 1 | """This module computes distance between DNA/protein sequences based on
 2 | the d2 metric.
 3 | 
 4 | References:
 5 |     1. Hide, Burke, Davison (1994) J Comput Biol 1:199-215.
 6 |        doi: 10.1089/cmb.1994.1.199
 7 |     2. Vinga S, Almeida J (2003) Bioinformatics 19:513-523.
 8 |        doi: 10.1093/bioinformatics/btg005
 9 | 
10 | """
11 | 
12 | import math
13 | import numpy as np
14 | 
15 | 
class Distance:

    """Combine a list of vectors with distance function.

    Attributes:
        vector_list (list): indexable vector objects; `vector[i]` is used
            as the vector of sequence `i`
        pairwise_distance (callable): currently selected distance method
            (`pwdist_d2` by default, see `set_disttype`)
    """

    def __init__(self, vector_list):
        self.vector_list = vector_list
        self.pairwise_distance = self.pwdist_d2

    def pwdist_d2(self, seqidx1, seqidx2):
        """Return the sum, over all vectors in `vector_list`, of the
        squared differences between the two sequences' vectors."""
        return sum(
            np.sum((vector[seqidx1] - vector[seqidx2]) ** 2)
            for vector in self.vector_list
        )

    def pwdist_d2_squareroot(self, seqidx1, seqidx2):
        """Return the square root of `pwdist_d2` for the two sequences."""
        squared = self.pwdist_d2(seqidx1, seqidx2)
        return math.sqrt(squared)

    def set_disttype(self, disttype):
        """Select the method `pwdist_<disttype>` as `pairwise_distance`.

        Raises:
            ValueError: if no such method exists.
        """
        try:
            selected = getattr(self, 'pwdist_{}'.format(disttype))
        # Method does not exist.
        except AttributeError:
            raise ValueError('unknown disttype "{}"'.format(disttype))
        else:
            self.pairwise_distance = selected
42 | 
43 | 
def main():
    """Demo: print the d2 distance matrix of the demo DNA records, using
    weighted counts of words of length 1 to 5."""
    from .utils.seqrecords import main as load_demo_records
    from .utils.data import seqcontent
    from .utils import distmatrix
    from . import word_pattern
    from . import word_vector

    seq_records = load_demo_records()

    patterns = [word_pattern.create(seq_records.seq_list, word_size)
                for word_size in range(1, 5+1)]

    # Plain counts are built here but not used by the distance below.
    counts = [word_vector.Counts(seq_records.length_list, pattern)
              for pattern in patterns]

    weights = seqcontent.get_weights('dna')
    weightmodel = word_vector.WeightModel(weights)
    countsweight = [
        word_vector.CountsWeight(seq_records, pattern, weightmodel)
        for pattern in patterns
    ]
    dist = Distance(countsweight)
    distance_matrix = distmatrix.create(seq_records.id_list, dist)
    distance_matrix.display()


if __name__ == '__main__':
    main()
76 | 


--------------------------------------------------------------------------------
/alfpy/word_rtd.py:
--------------------------------------------------------------------------------
 1 | """Return Time Distribution distance (RTD)
 2 | 
 3 | In contrast to other word-based measures, RTD accounts for the words'
 4 | relative orders. Although, originally presented for DNA sequences, the
 5 | implementation handles proteins as well.
 6 | 
 7 | Return time can be defined as the time required for the reappearance of a
 8 | particular state without its appearance within the epoch. The `return time`
 9 | in the context of nucleotide sequence can be defined as the number of
10 | nucleotides between the successive appearances of a particular nucleotide(s)
11 | or k-mer. The frequency distribution of those RTs for a particular k-mer is
12 | referred as a return time distribution (RTD) of that k-mer.
13 | 
14 | References:
15 |     1. Kolekar, Kale, Kulkarni-Kale (2012) Mol Phylogenet Evol 65 510-522
16 |        doi: http://dx.doi.org/10.1016/j.ympev.2012.07.003.
17 | 
18 | """
19 | 
20 | import numpy as np
21 | from .utils import distance
22 | 
23 | 
def calc_rtd(word_positions):
    """Compute return time distribution (RTD) of a given word.

    The return times are the differences between consecutive positions;
    they are summarized by their mean and standard deviation (numpy's
    default, i.e. the population standard deviation).

    Args:
        word_positions (list) : list of sequence positions of a given word

    Returns:
        mean, stdev (tuple): (0.0, 0.0) if fewer than two positions
        are given.

    Examples:
        >>> seq = 'CTACACAACTTTGCGGGTAGCCGGAAACATTGTGAATGCGGTGAACA'
        >>> apos = [i for i, nt in enumerate(seq) if nt == 'A']
        >>> print(apos)
        [2, 4, 6, 7, 18, 24, 25, 26, 28, 34, 35, 43, 44, 46]
        >>> mean, stdev = calc_rtd(apos)
        >>> print('{:.4f} {:.4f}'.format(mean, stdev))
        3.3846 3.1510

    """
    # A single occurrence (or none) never "returns".
    if len(word_positions) < 2:
        return 0.0, 0.0
    return_times = np.diff(word_positions)
    return np.mean(return_times), np.std(return_times)
52 | 
53 | 
def create_vector(seqcount, pattern):
    """Build the matrix of RTD vectors, one row per sequence.

    For every word the RTD mean goes to column `2 * wordidx` and the RTD
    standard deviation to column `2 * wordidx + 1`. Cells of sequences
    that are not listed for a word stay at zero.

    Args:
        seqcount (int): number of sequences
        pattern (obj: word_pattern.Pattern)

    Returns:
        ndarray: matrix of RTD vectors
                 (shape: number of seqs, doubled number of words)

    """
    word_count = len(pattern.pat_list)
    data = np.zeros(shape=(seqcount, word_count * 2))
    for wordidx in range(word_count):
        positions_by_seq = pattern.pos_list[wordidx]
        mean_col = wordidx * 2
        for seqidx in positions_by_seq:
            rtd = calc_rtd(positions_by_seq[seqidx])
            data[seqidx, mean_col] = rtd[0]
            data[seqidx, mean_col + 1] = rtd[1]
    return data
75 | 
76 | 
class Distance(distance.Distance):
    """Distance class of this module; it adds nothing to and overrides
    nothing from `distance.Distance`."""
    pass
79 | 
80 | 
def main():
    """Display an RTD distance matrix ('google' distance, word size 2) for
    the records returned by `seqrecords.main()`."""
    # NOTE: inside this function `main` refers to seqrecords.main.
    from .utils.seqrecords import main
    from . import word_pattern
    from .utils import distmatrix

    seq_records = main()
    pattern = word_pattern.create(seq_records.seq_list, 2, True)
    rtd_vectors = create_vector(seq_records.count, pattern)
    dist = Distance(rtd_vectors, 'google')
    distmatrix.create(seq_records.id_list, dist).display()


if __name__ == '__main__':
    main()
96 | 


--------------------------------------------------------------------------------
/alfpy/word_sets_distance.py:
--------------------------------------------------------------------------------
 1 | """Distance methods measuring dissimilarity between sets of words.
 2 | 
 3 | These methods are also implemented in numpy and provided in the
 4 | `word_bool_distance` module. However, here are their faster
 5 | implementations based on python sets.
 6 | """
 7 | 
 8 | from .utils import distance
 9 | 
10 | 
def _getwords(seq, word_size):
    """Collect the distinct words of a given size found in a sequence.

    Args:
        seq (str)
        word_size (int): >= 1

    Returns:
        set: every substring of length `word_size` (empty if the sequence
        is shorter than `word_size`).

    Example:
        >>> seq = 'ATGCGTA'
        >>> print(sorted(_getwords(seq, 2)))
        ['AT', 'CG', 'GC', 'GT', 'TA', 'TG']

    """
    last_start = len(seq) - word_size
    return {seq[start:start + word_size] for start in range(last_start + 1)}
30 | 
31 | 
class Distance(distance.Distance):
    """Combine vector data with pairwise distance methods that measures
    dissimilarity between sets."""

    def __init__(self, seq_records, word_size, disttype='jaccard'):
        """Create an instance of Distance

        Args:
            seq_records (SeqRecords obj)
            word_size (int): >= 1
            disttype (str): name of the distance, passed to `set_disttype`
                (this class defines 'jaccard', 'dice' and 'hamming')

        """
        # One set of words per sequence, in the order of seq_records.
        self._vector = [_getwords(s, word_size) for s in seq_records.seq_list]
        self.set_disttype(disttype)

    def pwdist_jaccard(self, seq1idx, seq2idx):
        """Jaccard distance is complementary to the Jaccard coefficient
        and is obtained by subtracting the Jaccard coefficient from 1.

        Two empty word sets (e.g. sequences shorter than the word size)
        are treated as identical and get distance 0.0.
        """
        # NOTE(review): self[idx] presumably returns self._vector[idx] via
        # the base class -- confirm against utils.distance.Distance.
        s1 = self[seq1idx]
        s2 = self[seq2idx]
        union_size = len(s1 | s2)
        if not union_size:
            # Avoid ZeroDivisionError: J(empty, empty) is defined as 1.
            return 0.0
        return 1 - len(s1 & s2) / float(union_size)

    def pwdist_dice(self, seq1idx, seq2idx):
        """Sorensen-Dice coefficient (Czekanowski's binary index)

        Returned as a distance (1 - coefficient); 0.0 when both word
        sets are empty.
        """
        s1 = self[seq1idx]
        s2 = self[seq2idx]
        total_size = len(s1) + len(s2)
        if not total_size:
            # Same convention as for Jaccard: two empty sets are identical.
            return 0.0
        return 1 - (2 * len(s1 & s2) / float(total_size))

    def pwdist_hamming(self, seq1idx, seq2idx):
        """Hamming distance measures the number of words which are in either
        of the sets and not in their intersection.

        """
        s1 = self[seq1idx]
        s2 = self[seq2idx]
        return len(s1.symmetric_difference(s2))
68 | 
69 | 
def main():
    """Display the Jaccard distance matrix (word size 2) of three
    hard-coded protein sequences."""
    from .utils.seqrecords import SeqRecords
    from .utils import distmatrix

    seq_records = SeqRecords()
    samples = (
        ('seq1', 'MKSTGWHF'),
        ('seq2', 'MKSSSSTGWGWG'),
        ('seq3', 'MKSTLKNGTEQ'),
    )
    for seqid, seq in samples:
        seq_records.add(seqid, seq)
    dist = Distance(seq_records, 2, 'jaccard')
    distmatrix.create(seq_records.id_list, dist).display()


if __name__ == '__main__':
    main()
84 | 


--------------------------------------------------------------------------------
/bin/calc_bbc.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | 
 8 | from alfpy import bbc
 9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.utils.data.seqcontent import get_alphabet
12 | from alfpy.version import __version__
13 | 
14 | 
def get_parser():
    """Build the command-line parser of calc_bbc.py.

    Side effect: if the script was started without any argument, the usage
    line is printed and the program terminates via `parser.exit()`.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distance between DNA/protein sequences
        based on Base-Base Correlation (BBC).''',
        add_help=False, prog='calc_bbc.py'
    )
    group = parser.add_argument_group('REQUIRED ARGUMENTS')
    group.add_argument('--fasta', '-f',
                       help='input FASTA sequence filename', required=True,
                       type=argparse.FileType('r'), metavar="FILE")
    group.add_argument('--molecule', '-m', choices=['dna', 'rna', 'protein'],
                       help='choose sequence alphabet', required=True)

    group = parser.add_argument_group('OPTIONAL ARGUMENTS')
    group.add_argument('--k', '-k', help='''maximum distance to observe
                        correlation between bases [default: %(default)s]''',
                       type=int, default=10, metavar="INT")
    group.add_argument('--out', '-o', help="output filename",
                       metavar="FILE")
    group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
                       default='phylip',
                       help='distances output format [default: %(default)s]')

    group = parser.add_argument_group("OTHER OPTIONS")
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Nothing on the command line: show the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
49 | 
50 | 
def validate_args(parser):
    """Parse the command line and attach the alphabet of the molecule.

    Sets `args.alphabet` from `get_alphabet(args.molecule)`; a KeyError
    from that lookup is reported through `parser.error`.

    Returns:
        argparse.Namespace
    """
    args = parser.parse_args()
    try:
        alphabet = get_alphabet(args.molecule)
    except KeyError:
        parser.error("Unknown alphabet {}".format(args.molecule))
    else:
        args.alphabet = alphabet
    return args
58 | 
59 | 
def main():
    """Run calc_bbc.py: compute the BBC distance matrix of a FASTA file.

    The matrix is written to the --out file if one was given, otherwise
    it is shown with `matrix.display`.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    vector = bbc.create_vectors(seq_records, args.k, alphabet=args.alphabet)
    dist = bbc.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
79 | 


--------------------------------------------------------------------------------
/bin/calc_fcgr.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | 
 8 | from alfpy import fcgr
 9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 | 
13 | 
def get_parser():
    """Build the command-line parser of calc_fcgr.py.

    Side effect: if the script was started without any argument, the usage
    line is printed and the program terminates via `parser.exit()`.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distances between DNA sequences based on
        Frequency Chaos Game Representation (FCGR) patterns of
        word occurrences.''',
        add_help=False, prog='calc_fcgr.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', metavar="FILE", required=True,
        type=argparse.FileType('r'),
        help='input FASTA sequence filename')
    required.add_argument('--word_size', '-w', type=int, required=True,
                          help='word size')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Nothing on the command line: show the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
46 | 
47 | 
def validate_args(parser):
    """Parse the command line; report a word size below 1 via
    `parser.error`.

    Returns:
        argparse.Namespace
    """
    parsed = parser.parse_args()
    too_small = parsed.word_size < 1
    if too_small:
        parser.error('--word_size must be >= 1')
    return parsed
53 | 
54 | 
def main():
    """Run calc_fcgr.py: compute the FCGR distance matrix of a FASTA file.

    The matrix is written to the --out file if one was given, otherwise
    it is shown with `matrix.display`.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    vector = fcgr.create_vectors(seq_records, args.word_size)
    dist = fcgr.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
75 | 


--------------------------------------------------------------------------------
/bin/calc_graphdna.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | 
 8 | from alfpy import graphdna
 9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 | 
13 | 
def get_parser():
    """Build the command-line parser of calc_graphdna.py.

    Side effect: if the script was started without any argument, the usage
    line is printed and the program terminates via `parser.exit()`.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distance between DNA sequences based on
        the two-dimensional (2D) graphical DNA curve''',
        add_help=False, prog='calc_graphdna.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', metavar="FILE", required=True,
        type=argparse.FileType('r'),
        help='input FASTA sequence filename')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional.add_argument(
        '--vector', '-v', default='2DNV',
        choices=['2DSV', '2DNV', '2DMV'],
        help='vector type [default: %(default)s]')
    optional.add_argument(
        '--ndim', '-n', type=int, metavar='N', default=10,
        help='''number of dimensions representing a sequence.
                        (required if --vector 2DMV) [default: %(default)s]''')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE", help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [default: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Nothing on the command line: show the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
51 | 
52 | 
def validate_args(parser):
    """Parse the command line; report `--vector 2DMV` without a value for
    --ndim via `parser.error`.

    Returns:
        argparse.Namespace
    """
    args = parser.parse_args()
    ndim_missing = args.ndim is None
    if ndim_missing and args.vector == '2DMV':
        parser.error("--vector 2DMV requires the --ndim")
    # TODO: restrict --ndim to a valid range, see
    # stackoverflow.com/questions/18700634/python-argparse-integer-condition-12
    return args
60 | 
61 | 
def main():
    """Run calc_graphdna.py: compute the graphical-DNA distance matrix of
    a FASTA file.

    The vector builder is chosen by --vector (2DSV, 2DNV, otherwise 2DMV
    with --ndim). The matrix is written to the --out file if one was
    given, otherwise it is shown with `matrix.display`.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    if args.vector == '2DSV':
        vector = graphdna.create_2DSGraphVectors(seq_records)
    elif args.vector == '2DNV':
        vector = graphdna.create_2DNGraphVectors(seq_records)
    else:
        vector = graphdna.create_2DMGraphVectors(seq_records, args.ndim)
    dist = graphdna.Distance(vector)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
86 | 


--------------------------------------------------------------------------------
/bin/calc_lempelziv.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | 
 8 | from alfpy import lempelziv
 9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 | 
13 | 
def get_parser():
    """Build the command-line parser of calc_lempelziv.py.

    Side effect: if the script was started without any argument, the usage
    line is printed and the program terminates via `parser.exit()`.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distance between DNA/protein sequences based
        on Lempel-Ziv complexity.''',
        add_help=False, prog='calc_lempelziv.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', metavar="FILE", required=True,
        type=argparse.FileType('r'),
        help='input FASTA sequence filename')

    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    distlist = ['d', 'd_star', 'd1', 'd1_star', 'd1_star2']
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distlist))
    optional.add_argument('--distance', '-d', choices=distlist,
                          help=distance_help,
                          metavar='', default="d1_star2")

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Nothing on the command line: show the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()
    return parser
50 | 
51 | 
def validate_args(parser):
    """Parse the command line; no extra validation is performed.

    Returns:
        argparse.Namespace
    """
    return parser.parse_args()
55 | 
56 | 
def main():
    """Run calc_lempelziv.py: compute the Lempel-Ziv distance matrix of a
    FASTA file.

    The matrix is written to the --out file if one was given, otherwise
    it is shown with `matrix.display`.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = lempelziv.Distance(seq_records, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
75 | 


--------------------------------------------------------------------------------
/bin/calc_ncd.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | 
 8 | from alfpy import ncd
 9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.version import __version__
12 | 
13 | 
def get_parser():
    """Build the command-line parser of calc_ncd.py.

    Side effect: if the script was started without any argument, the usage
    line is printed and the program terminates via `parser.exit()`.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distances between DNA/protein sequences based
        on Normalized Compression Distance (NCD).''',
        add_help=False, prog='calc_ncd.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', metavar="FILE", required=True,
        type=argparse.FileType('r'),
        help='input FASTA sequence filename')

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Nothing on the command line: show the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
44 | 
45 | 
def validate_args(parser):
    """Parse the command line; no extra validation is performed.

    Returns:
        argparse.Namespace
    """
    return parser.parse_args()
49 | 
50 | 
def main():
    """Run calc_ncd.py: compute the NCD distance matrix of a FASTA file.

    The matrix is written to the --out file if one was given, otherwise
    it is shown with `matrix.display`.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = ncd.Distance(seq_records)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
69 | 


--------------------------------------------------------------------------------
/bin/calc_wmetric.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | 
 8 | from alfpy import wmetric
 9 | from alfpy.utils import distmatrix
10 | from alfpy.utils import seqrecords
11 | from alfpy.utils.data import subsmat
12 | from alfpy.version import __version__
13 | 
14 | 
def get_parser():
    """Build the command-line parser of calc_wmetric.py.

    The choices of --matrix come from `subsmat.list_subsmats()`.

    Side effect: if the script was started without any argument, the usage
    line is printed and the program terminates via `parser.exit()`.

    Returns:
        argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='''Calculate distances between protein sequences based
        on W-metric (Wm).''', add_help=False, prog='calc_wmetric.py'
    )

    required = parser.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', metavar="FILE", required=True,
        type=argparse.FileType('r'),
        help='input FASTA sequence filename')

    matrix_names = subsmat.list_subsmats()
    optional = parser.add_argument_group('OPTIONAL ARGUMENTS')
    matrix_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(matrix_names))
    optional.add_argument('--matrix', '-m', choices=matrix_names,
                          help=matrix_help, metavar='',
                          default="blosum62")

    output = parser.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument('--outfmt', default='phylip',
                        choices=['phylip', 'pairwise'],
                        help='distances output format [DEFAULT: %(default)s]')

    other = parser.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Nothing on the command line: show the usage line and quit.
    if not sys.argv[1:]:
        parser.print_usage()
        parser.exit()

    return parser
51 | 
52 | 
def validate_args(parser):
    """Parse the command line and replace the matrix name by the object
    returned from `subsmat.get`.

    A KeyError from that lookup is reported through `parser.error`.

    Returns:
        argparse.Namespace
    """
    args = parser.parse_args()
    matrix_name = args.matrix
    try:
        args.matrix = subsmat.get(matrix_name)
    except KeyError:
        parser.error("Unknown matrix {}".format(matrix_name))
    return args
60 | 
61 | 
def main():
    """Run calc_wmetric.py: compute the W-metric distance matrix of a
    FASTA file.

    The matrix is written to the --out file if one was given, otherwise
    it is shown with `matrix.display`.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    dist = wmetric.Distance(seq_records, args.matrix)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
80 | 


--------------------------------------------------------------------------------
/bin/calc_word_bool.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # Copyright (c) 2016 Zielezinski A, combio.pl
  4 | 
  5 | import argparse
  6 | import sys
  7 | 
  8 | from alfpy import word_bool_distance
  9 | from alfpy import word_pattern
 10 | from alfpy import word_vector
 11 | from alfpy.utils import distmatrix
 12 | from alfpy.utils import seqrecords
 13 | from alfpy.version import __version__
 14 | 
 15 | 
 16 | def get_parser():
 17 |     parser = argparse.ArgumentParser(
 18 |         description='''Calculate distances between DNA/protein sequences based
 19 |         on boolean 1-D vectors of word counting occurrences.''',
 20 |         add_help=False, prog='calc_word_bool.py'
 21 |     )
 22 |     group = parser.add_argument_group('REQUIRED ARGUMENTS')
 23 |     group.add_argument('--fasta', '-f',
 24 |                        help='input FASTA sequence filename', required=True,
 25 |                        type=argparse.FileType('r'), metavar="FILE")
 26 | 
 27 |     group = parser.add_argument_group('  Choose between the two options')
 28 |     g1 = group.add_mutually_exclusive_group()
 29 |     g1.add_argument('--word_size', '-s', metavar="N",
 30 |                     help='word size for creating word patterns',
 31 |                     type=int)
 32 |     g1.add_argument('--word_pattern', '-w',
 33 |                     help='input filename w/ pre-computed word patterns',
 34 |                     type=argparse.FileType('r'), metavar="FILE")
 35 | 
 36 |     group = parser.add_argument_group('OPTIONAL ARGUMENTS')
 37 |     distlist = word_bool_distance.Distance.get_disttypes()
 38 |     group.add_argument('--distance', '-d', choices=distlist,
 39 |                        help='choose from: {} [DEFAULT: %(default)s]'.format(
 40 |                            ", ".join(distlist)),
 41 |                        metavar='', default="jaccard")
 42 | 
 43 |     group = parser.add_argument_group('OUTPUT ARGUMENTS')
 44 |     group.add_argument('--out', '-o', help="output filename",
 45 |                        metavar="FILE")
 46 |     group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
 47 |                        default='phylip',
 48 |                        help='distances output format [DEFAULT: %(default)s]')
 49 | 
 50 |     group = parser.add_argument_group("OTHER OPTIONS")
 51 |     group.add_argument("-h", "--help", action="help",
 52 |                        help="show this help message and exit")
 53 |     group.add_argument('--version', action='version',
 54 |                        version='%(prog)s {}'.format(__version__))
 55 | 
 56 |     if len(sys.argv[1:]) == 0:
 57 |         # parser.print_help()
 58 |         parser.print_usage()  # for just the usage line
 59 |         parser.exit()
 60 | 
 61 |     return parser
 62 | 
 63 | 
 64 | def validate_args(parser):
 65 |     args = parser.parse_args()
 66 |     if args.word_size:
 67 |         if args.word_size < 1:
 68 |             parser.error('Word size must be >= 1.')
 69 |     elif args.word_pattern:
 70 |         pass
 71 |     else:
 72 |         parser.error("Specify either: --word_size or --word_pattern.")
 73 |     return args
 74 | 
 75 | 
 76 | def main():
 77 |     parser = get_parser()
 78 |     args = validate_args(parser)
 79 | 
 80 |     seq_records = seqrecords.read_fasta(args.fasta)
 81 |     if args.word_size:
 82 |         p = word_pattern.create(seq_records.seq_list, args.word_size)
 83 |     else:
 84 |         p = word_pattern.read(args.word_pattern)
 85 | 
 86 |     bools = word_vector.Bools(seq_records.length_list, p)
 87 |     dist = word_bool_distance.Distance(bools, args.distance)
 88 |     matrix = distmatrix.create(seq_records.id_list, dist)
 89 | 
 90 |     if args.out:
 91 |         oh = open(args.out, 'w')
 92 |         matrix.write_to_file(oh, args.outfmt)
 93 |         oh.close()
 94 |     else:
 95 |         matrix.display(args.outfmt)
 96 | 
 97 | 
 98 | if __name__ == '__main__':
 99 |     main()
100 | 


--------------------------------------------------------------------------------
/bin/calc_word_cv.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # Copyright (c) 2016 Zielezinski A, combio.pl
  4 | 
  5 | import argparse
  6 | import sys
  7 | 
  8 | from alfpy import word_vector
  9 | from alfpy import word_distance
 10 | from alfpy.utils import distmatrix
 11 | from alfpy.utils import seqrecords
 12 | from alfpy import word_pattern
 13 | from alfpy.version import __version__
 14 | 
 15 | 
 16 | def get_parser():
 17 |     parser = argparse.ArgumentParser(
 18 |         description='''Calculate compositional distances between DNA/protein
 19 |         sequences based on word (of length k) occurrences using a Markov model
 20 |         of k-2.''',
 21 |         add_help=False, prog='calc_word_cv.py'
 22 |     )
 23 |     group = parser.add_argument_group('REQUIRED ARGUMENTS')
 24 |     group.add_argument('--fasta', '-f',
 25 |                        help='input FASTA sequence filename', required=True,
 26 |                        type=argparse.FileType('r'), metavar="FILE")
 27 | 
 28 |     group = parser.add_argument_group('  Choose between the two options')
 29 |     g1 = group.add_mutually_exclusive_group()
 30 |     g1.add_argument('--word_size', '-s', metavar="k", type=int,
 31 |                     help='''word size (k-mer) for creating word patterns
 32 |                         (must be >= 3)'''
 33 |                     )
 34 |     g1.add_argument('--word_patterns', '-w', nargs=3,
 35 |                     help='''3 input word pattern files (k-, [k-1]-,
 36 |                         [k-2]-mers)''',
 37 |                     type=argparse.FileType('r'), metavar="FILE")
 38 | 
 39 |     group = parser.add_argument_group('OUTPUT ARGUMENTS')
 40 |     group.add_argument('--out', '-o', help="output filename",
 41 |                        metavar="FILE")
 42 |     group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
 43 |                        default='phylip',
 44 |                        help='distances output format [DEFAULT: %(default)s]')
 45 | 
 46 |     group = parser.add_argument_group("OTHER OPTIONS")
 47 |     group.add_argument("-h", "--help", action="help",
 48 |                        help="show this help message and exit")
 49 |     group.add_argument('--version', action='version',
 50 |                        version='%(prog)s {}'.format(__version__))
 51 | 
 52 |     if len(sys.argv[1:]) == 0:
 53 |         # parser.print_help()
 54 |         parser.print_usage()
 55 |         parser.exit()
 56 | 
 57 |     return parser
 58 | 
 59 | 
 60 | def validate_args(parser):
 61 |     args = parser.parse_args()
 62 |     if args.word_size:
 63 |         if args.word_size < 3:
 64 |             parser.error('Word size must be >= 3')
 65 | 
 66 |     elif args.word_patterns:
 67 |         l = []
 68 |         for i in range(0, 3):
 69 |             try:
 70 |                 p = word_pattern.read(args.word_patterns[i])
 71 |                 l.append(p)
 72 |             except Exception:
 73 |                 parser.error('Invalid format for word pattern: {0}'.format(
 74 |                     args.word_patterns[i].name))
 75 | 
 76 |         if len(l) == 3:
 77 |             # check if follow rule
 78 |             k, k1, k2 = [len(p.pat_list[0]) for p in l]
 79 |             if not (k == k1 + 1 == k2 + 2):
 80 |                 parser.error(
 81 |                     '''Word pattern lengths do not follow k, k-1, k-2''')
 82 | 
 83 |         args.word_patterns = l
 84 |     else:
 85 |         parser.error("Specify either: --word_size or --word_pattern.")
 86 |     return args
 87 | 
 88 | 
 89 | def main():
 90 |     parser = get_parser()
 91 |     args = validate_args(parser)
 92 | 
 93 |     seq_records = seqrecords.read_fasta(args.fasta)
 94 | 
 95 |     if args.word_patterns:
 96 |         l = args.word_patterns
 97 |     else:
 98 |         l = []
 99 |         for i in range(args.word_size, args.word_size - 3, -1):
100 |             p = word_pattern.create(seq_records.seq_list, i)
101 |             l.append(p)
102 | 
103 |     compos = word_vector.Composition(seq_records.length_list, *l)
104 |     dist = word_distance.Distance(compos, 'angle_cos_diss')
105 |     matrix = distmatrix.create(seq_records.id_list, dist)
106 | 
107 |     if args.out:
108 |         oh = open(args.out, 'w')
109 |         matrix.write_to_file(oh, args.outfmt)
110 |         oh.close()
111 |     else:
112 |         matrix.display(args.outfmt)
113 | 
114 | 
115 | if __name__ == '__main__':
116 |     main()
117 | 


--------------------------------------------------------------------------------
/bin/calc_word_d2.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # Copyright (c) 2016 Zielezinski A, combio.pl
  4 | 
  5 | import argparse
  6 | import sys
  7 | 
  8 | from alfpy import word_d2
  9 | from alfpy import word_pattern
 10 | from alfpy import word_vector
 11 | from alfpy.utils import distmatrix
 12 | from alfpy.utils import seqrecords
 13 | from alfpy.version import __version__
 14 | 
 15 | 
 16 | def get_parser():
 17 |     parser = argparse.ArgumentParser(
 18 |         description='''Calculate d2 distance between DNA/protein sequences based
 19 |         on subsequence (words) occurrences.''',
 20 |         add_help=False, prog='calc_word_d2.py'
 21 |     )
 22 |     group = parser.add_argument_group('REQUIRED ARGUMENTS')
 23 |     group.add_argument('--fasta', '-f',
 24 |                        help='input FASTA sequence filename', required=True,
 25 |                        type=argparse.FileType('r'), metavar="FILE")
 26 | 
 27 |     group = parser.add_argument_group('OPTIONAL ARGUMENTS')
 28 |     group.add_argument('--min_word_size', '-l',
 29 |                        help='minimum word size [default: %(default)s]',
 30 |                        type=int, metavar="WORD_SIZE", default=1,
 31 |                        )
 32 |     group.add_argument('--max_word_size', '-u',
 33 |                        help='maximum word size [default: %(default)s]',
 34 |                        type=int, metavar="WORD_SIZE", default=3,
 35 |                        )
 36 |     veclist = ['counts', 'freqs']
 37 |     group.add_argument('--vector', '-v', choices=veclist,
 38 |                        help='choose from: {} [DEFAULT: %(default)s]'.format(
 39 |                             ", ".join(veclist)),
 40 |                        metavar='', default="counts")
 41 |     group.add_argument('--char_weights', '-W', metavar="FILE",
 42 |                        help='''file w/ weights of background sequence characters
 43 |                        (nt/aa)''',
 44 |                        type=argparse.FileType('r'))
 45 | 
 46 |     group = parser.add_argument_group('OUTPUT ARGUMENTS')
 47 |     group.add_argument('--out', '-o', help="output filename",
 48 |                        metavar="FILE")
 49 |     group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
 50 |                        default='phylip',
 51 |                        help='distances output format [DEFAULT: %(default)s]')
 52 | 
 53 |     group = parser.add_argument_group("OTHER OPTIONS")
 54 |     group.add_argument("-h", "--help", action="help",
 55 |                        help="show this help message and exit")
 56 |     group.add_argument('--version', action='version',
 57 |                        version='%(prog)s {}'.format(__version__))
 58 | 
 59 |     if len(sys.argv[1:]) == 0:
 60 |         # parser.print_help()
 61 |         parser.print_usage()
 62 |         parser.exit()
 63 | 
 64 |     return parser
 65 | 
 66 | 
 67 | def validate_args(parser):
 68 |     args = parser.parse_args()
 69 |     if not args.min_word_size:
 70 |         parser.error("min_word_size must be greater than 0")
 71 |     elif args.min_word_size >= args.max_word_size:
 72 |         parser.error("max_word_size must be greater than min_word_size")
 73 |     if args.char_weights:
 74 |         try:
 75 |             weights = word_vector.read_weightfile(args.char_weights)
 76 |             args.char_weights = weights
 77 |         except Exception:
 78 |             e = 'Invalid format for --char_weights {0}'.format(
 79 |                 args.char_weights.name)
 80 |             parser.error(e)
 81 |     return args
 82 | 
 83 | 
 84 | def main():
 85 |     parser = get_parser()
 86 |     args = validate_args(parser)
 87 | 
 88 |     seq_records = seqrecords.read_fasta(args.fasta)
 89 | 
 90 |     patterns = []
 91 |     for i in range(args.min_word_size, args.max_word_size + 1):
 92 |         p = word_pattern.create(seq_records.seq_list, i)
 93 |         patterns.append(p)
 94 | 
 95 |     vecs = []
 96 |     if args.char_weights is not None:
 97 |         weightmodel = word_vector.WeightModel(char_weights=args.char_weights)
 98 |         vecklas = {'counts': word_vector.CountsWeight,
 99 |                    'freqs': word_vector.FreqsWeight}[args.vector]
100 |         kwargs = {'seq_lengths': seq_records.length_list,
101 |                   'weightmodel': weightmodel}
102 |     else:
103 |         vecklas = {'counts': word_vector.Counts,
104 |                    'freqs': word_vector.Freqs}[args.vector]
105 |         kwargs = {'seq_lengths': seq_records.length_list}
106 |     for p in patterns:
107 |         v = vecklas(patterns=p, **kwargs)
108 |         vecs.append(v)
109 | 
110 |     dist = word_d2.Distance(vecs)
111 |     matrix = distmatrix.create(seq_records.id_list, dist)
112 | 
113 |     if args.out:
114 |         oh = open(args.out, 'w')
115 |         matrix.write_to_file(oh, args.outfmt)
116 |         oh.close()
117 |     else:
118 |         matrix.display(args.outfmt)
119 | 
120 | 
121 | if __name__ == '__main__':
122 |     main()
123 | 


--------------------------------------------------------------------------------
/bin/calc_word_ffp.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # Copyright (c) 2016 Zielezinski A, combio.pl
  4 | 
  5 | import argparse
  6 | import sys
  7 | 
  8 | from alfpy import word_vector
  9 | from alfpy import word_distance
 10 | from alfpy.utils import distmatrix
 11 | from alfpy.utils import seqrecords
 12 | from alfpy import word_pattern
 13 | from alfpy.utils.data import seqcontent
 14 | from alfpy.version import __version__
 15 | 
 16 | 
 17 | def get_parser():
 18 |     parser = argparse.ArgumentParser(
 19 |         description='''Calculate distance between DNA/protein sequences based
 20 |         on feature frequency profiles (FFPs) of words.''',
 21 |         add_help=False, prog='calc_word_ffp.py'
 22 |     )
 23 |     group = parser.add_argument_group('REQUIRED ARGUMENTS')
 24 |     group.add_argument('--fasta', '-f',
 25 |                        help='input FASTA sequence filename', required=True,
 26 |                        type=argparse.FileType('r'), metavar="FILE")
 27 |     group.add_argument('--molecule', '-m', choices=['dna', 'rna', 'protein'],
 28 |                        help='choose sequence alphabet', required=True)
 29 | 
 30 |     group = parser.add_argument_group('  Choose between the two options')
 31 |     g1 = group.add_mutually_exclusive_group()
 32 |     g1.add_argument('--word_size', '-s', metavar="N",
 33 |                     help='word size for creating word patterns',
 34 |                     type=int)
 35 |     g1.add_argument('--word_pattern', '-w',
 36 |                     help='input filename w/ pre-computed word patterns',
 37 |                     type=argparse.FileType('r'), metavar="FILE")
 38 | 
 39 |     group = parser.add_argument_group('OPTIONAL ARGUMENTS')
 40 |     distlist = word_distance.Distance.get_disttypes()
 41 |     group.add_argument('--distance', '-d', choices=distlist,
 42 |                        help='choose from: {} [DEFAULT: %(default)s]'.format(
 43 |                            ", ".join(distlist)),
 44 |                        metavar='', default="jsd")
 45 |     group.add_argument('--reduce_alphabet', '-r', action="store_true",
 46 |                        help='''reduce the words' nt/aa alphabet to smaller
 47 |                        number of symbols''')
 48 |     group.add_argument('--merge_revcomp', '-M', action="store_true",
 49 |                        help='''merge together DNA words with their reverse
 50 |                        complement words''')
 51 | 
 52 |     group = parser.add_argument_group('OUTPUT ARGUMENTS')
 53 |     group.add_argument('--out', '-o', help="output filename",
 54 |                        metavar="FILE")
 55 |     group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
 56 |                        default='phylip',
 57 |                        help='distances output format [DEFAULT: %(default)s]')
 58 | 
 59 |     group = parser.add_argument_group("OTHER OPTIONS")
 60 |     group.add_argument("-h", "--help", action="help",
 61 |                        help="show this help message and exit")
 62 |     group.add_argument('--version', action='version',
 63 |                        version='%(prog)s {}'.format(__version__))
 64 | 
 65 |     if len(sys.argv[1:]) == 0:
 66 |         # parser.print_help()
 67 |         parser.print_usage()
 68 |         parser.exit()
 69 | 
 70 |     return parser
 71 | 
 72 | 
 73 | def validate_args(parser):
 74 |     args = parser.parse_args()
 75 |     if args.word_size:
 76 |         if args.word_size < 1:
 77 |             parser.error('word size must be >= 1')
 78 |     elif args.word_pattern:
 79 |         pass
 80 |     else:
 81 |         parser.error("Specify either: --word_size or --word_pattern.")
 82 | 
 83 |     if args.molecule == 'protein' and args.merge_revcomp:
 84 |         parser.error("Incompatible arguments: -m protein --merge_revcomp")
 85 | 
 86 |     return args
 87 | 
 88 | 
 89 | def main():
 90 |     parser = get_parser()
 91 |     args = validate_args(parser)
 92 | 
 93 |     seq_records = seqrecords.read_fasta(args.fasta)
 94 |     if args.word_size:
 95 |         p = word_pattern.create(seq_records.seq_list, args.word_size)
 96 |     else:
 97 |         p = word_pattern.read(args.word_pattern)
 98 | 
 99 |     if args.reduce_alphabet:
100 |         p = p.reduce_alphabet(seqcontent.get_reduced_alphabet(args.molecule))
101 |     if args.merge_revcomp:
102 |         p = p.merge_revcomp()
103 | 
104 |     freqs = word_vector.Freqs(seq_records.length_list, p)
105 | 
106 |     dist = word_distance.Distance(freqs, args.distance)
107 |     matrix = distmatrix.create(seq_records.id_list, dist)
108 | 
109 |     if args.out:
110 |         oh = open(args.out, 'w')
111 |         matrix.write_to_file(oh, args.outfmt)
112 |         oh.close()
113 |     else:
114 |         matrix.display(args.outfmt)
115 | 
116 | 
117 | if __name__ == '__main__':
118 |     main()
119 | 


--------------------------------------------------------------------------------
/bin/calc_word_rtd.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # Copyright (c) 2016 Zielezinski A, combio.pl
  4 | 
  5 | import argparse
  6 | import sys
  7 | 
  8 | from alfpy import word_distance
  9 | from alfpy import word_pattern
 10 | from alfpy import word_rtd
 11 | from alfpy.utils import distmatrix
 12 | from alfpy.utils import seqrecords
 13 | from alfpy.version import __version__
 14 | 
 15 | 
 16 | def get_parser():
 17 |     parser = argparse.ArgumentParser(
 18 |         description='''Calculate distances between protein/DNA sequences based
 19 |         on Return Time Distribution (RTD) of words\' occurrences and their
 20 |         relative orders''',
 21 |         add_help=False, prog='calc_word_rtd.py'
 22 |     )
 23 |     group = parser.add_argument_group('REQUIRED ARGUMENTS')
 24 |     group.add_argument('--fasta', '-f',
 25 |                        help='input FASTA sequence filename', required=True,
 26 |                        type=argparse.FileType('r'), metavar="FILE")
 27 | 
 28 |     group = parser.add_argument_group('  Choose between the two options')
 29 |     g1 = group.add_mutually_exclusive_group()
 30 |     g1.add_argument('--word_size', '-s', metavar="N",
 31 |                     help='word size for creating word patterns',
 32 |                     type=int)
 33 |     g1.add_argument('--word_pattern', '-w',
 34 |                     help='input filename w/ pre-computed word patterns',
 35 |                     type=argparse.FileType('r'), metavar="FILE")
 36 | 
 37 |     group = parser.add_argument_group('OPTIONAL ARGUMENTS')
 38 |     distlist = word_distance.Distance.get_disttypes()
 39 |     group.add_argument('--distance', '-d', choices=distlist,
 40 |                        help='choose from: {} [DEFAULT: %(default)s]'.format(
 41 |                            ", ".join(distlist)),
 42 |                        metavar='', default="google")
 43 | 
 44 |     group = parser.add_argument_group('OUTPUT ARGUMENTS')
 45 |     group.add_argument('--out', '-o', help="output filename",
 46 |                        metavar="FILE")
 47 |     group.add_argument('--outfmt', choices=['phylip', 'pairwise'],
 48 |                        default='phylip',
 49 |                        help='distances output format [DEFAULT: %(default)s]')
 50 | 
 51 |     group = parser.add_argument_group("OTHER OPTIONS")
 52 |     group.add_argument("-h", "--help", action="help",
 53 |                        help="show this help message and exit")
 54 |     group.add_argument('--version', action='version',
 55 |                        version='%(prog)s {}'.format(__version__))
 56 | 
 57 |     if len(sys.argv[1:]) == 0:
 58 |         # parser.print_help()
 59 |         parser.print_usage()
 60 |         parser.exit()
 61 | 
 62 |     return parser
 63 | 
 64 | 
 65 | def validate_args(parser):
 66 |     args = parser.parse_args()
 67 |     if args.word_size:
 68 |         if args.word_size < 1:
 69 |             parser.error('word size must be >= 1')
 70 |     elif args.word_pattern:
 71 |         p = word_pattern.read(args.word_pattern)
 72 |         if not p.pos_list:
 73 |             e = "{0} does not contain info on word positions.\n"
 74 |             e += "Please use: create_wordpattern.py with"
 75 |             e += " --word_position option."
 76 |             parser.error(e.format(args.word_pattern.name))
 77 |         else:
 78 |             args.word_pattern = p
 79 |     else:
 80 |         parser.error("Specify either: --word_size or --word_pattern.")
 81 |     return args
 82 | 
 83 | 
 84 | def main():
 85 |     parser = get_parser()
 86 |     args = validate_args(parser)
 87 | 
 88 |     seq_records = seqrecords.read_fasta(args.fasta)
 89 |     if args.word_size:
 90 |         p = word_pattern.create(seq_records.seq_list, args.word_size, True)
 91 |     else:
 92 |         p = args.word_pattern
 93 | 
 94 |     vector = word_rtd.create_vector(seq_records.count, p)
 95 |     dist = word_rtd.Distance(vector, args.distance)
 96 | 
 97 |     matrix = distmatrix.create(seq_records.id_list, dist)
 98 | 
 99 |     if args.out:
100 |         oh = open(args.out, 'w')
101 |         matrix.write_to_file(oh, args.outfmt)
102 |         oh.close()
103 |     else:
104 |         matrix.display(args.outfmt)
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     main()
109 | 


--------------------------------------------------------------------------------
/bin/calc_word_sets.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # Copyright (c) 2016 Zielezinski A, combio.pl
 4 | 
 5 | import argparse
 6 | import sys
 7 | from alfpy import word_sets_distance
 8 | from alfpy.utils import distmatrix
 9 | from alfpy.utils import seqrecords
10 | from alfpy.version import __version__
11 | 
12 | 
def get_parser():
    """Build the argument parser for calc_word_sets.py.

    If the script was started without any command-line arguments, the usage
    line is printed and the process exits instead of returning.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    cli = argparse.ArgumentParser(
        prog='calc_word_sets.py',
        add_help=False,
        description='''Calculate distances between DNA/protein sequences based
        on boolean 1-D vectors of word counting occurrences.''',
    )

    required = cli.add_argument_group('REQUIRED ARGUMENTS')
    required.add_argument(
        '--fasta', '-f', metavar="FILE", required=True,
        type=argparse.FileType('r'),
        help='input FASTA sequence filename',
    )
    required.add_argument('--word_size', '-s', type=int, required=True,
                          metavar="N",
                          help='word size for creating word patterns')

    optional = cli.add_argument_group('OPTIONAL ARGUMENTS')
    distance_types = ['dice', 'hamming', 'jaccard']
    distance_help = 'choose from: {} [DEFAULT: %(default)s]'.format(
        ", ".join(distance_types))
    optional.add_argument('--distance', '-d', choices=distance_types,
                          default="dice", metavar='', help=distance_help)

    output = cli.add_argument_group('OUTPUT ARGUMENTS')
    output.add_argument('--out', '-o', metavar="FILE",
                        help="output filename")
    output.add_argument(
        '--outfmt', default='phylip', choices=['phylip', 'pairwise'],
        help='distances output format [DEFAULT: %(default)s]',
    )

    other = cli.add_argument_group("OTHER OPTIONS")
    other.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    other.add_argument('--version', action='version',
                       version='%(prog)s {}'.format(__version__))

    # Bare invocation: print the short usage line and stop.
    if not sys.argv[1:]:
        cli.print_usage()
        cli.exit()

    return cli
54 | 
55 | 
def validate_args(parser):
    """Parse the command line and check that the word size is positive.

    Args:
        parser: the ``argparse.ArgumentParser`` to parse ``sys.argv`` with.

    Returns:
        The parsed argument namespace.  A word size below 1 is reported
        through ``parser.error``, which prints the message and exits.
    """
    parsed = parser.parse_args()
    size_too_small = parsed.word_size < 1
    if size_too_small:
        parser.error('Word size must be >= 1.')
    return parsed
61 | 
62 | 
def main():
    """Run the calc_word_sets.py workflow.

    Reads the FASTA records, computes the requested word-set distance for
    the given word size and writes or displays the distance matrix.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)
    # The handle is not needed after reading; create_wordpattern.py closes
    # it at the same point.
    args.fasta.close()

    dist = word_sets_distance.Distance(seq_records, args.word_size,
                                       args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing fails.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)


if __name__ == '__main__':
    main()
82 | 


--------------------------------------------------------------------------------
/bin/create_wordpattern.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # Copyright (c) 2016 Zielezinski A, combio.pl
  4 | 
  5 | import argparse
  6 | import sys
  7 | 
  8 | from alfpy import word_pattern
  9 | from alfpy.utils import seqrecords
 10 | from alfpy.version import __version__
 11 | 
 12 | 
 13 | def get_parser():
 14 |     parser = argparse.ArgumentParser(
 15 |         description='''Count subsequences (words) of a given length (size)
 16 |         for each sequence in input FASTA-formatted file.''',
 17 |         add_help=False, prog='create_wordpattern.py'
 18 |     )
 19 |     group = parser.add_argument_group('REQUIRED ARGUMENTS')
 20 |     group.add_argument('--fasta', '-f',
 21 |                        help='input FASTA sequence filename', required=True,
 22 |                        type=argparse.FileType('r'), metavar="FILE")
 23 |     group.add_argument('--word_size', '-w', required=True, type=int,
 24 |                        metavar="k", help='word size (>=1)')
 25 | 
 26 |     group = parser.add_argument_group('OPTIONAL ARGUMENTS')
 27 |     group.add_argument('--word_position', '-p', action="store_true",
 28 |                        help='''report word positions in output''')
 29 |     group.add_argument('--out', '-o', help="output pattern filename",
 30 |                        metavar="FILE")
 31 | 
 32 |     t = '  Teiresias options'
 33 |     d = '  more info @ https://cm.jefferson.edu/data-tools-downloads/'
 34 |     d += 'teiresias-code/\n'
 35 |     group = parser.add_argument_group(t, d)
 36 |     group.add_argument('--teiresias', '-t', action="store_true",
 37 |                        help='''Teiresias program creates word patterns.
 38 |                        [by default: disabled]''',
 39 |                        )
 40 |     group.add_argument('--l', '-l', type=int,
 41 |                        help='minimum number of literals and/or brackets')
 42 |     group.add_argument('--k', '-k', type=int,
 43 |                        help='minimum support that any word can have')
 44 | 
 45 |     group = parser.add_argument_group("OTHER OPTIONS")
 46 |     group.add_argument("-h", "--help", action="help",
 47 |                        help="show this help message and exit")
 48 |     group.add_argument('--version', action='version',
 49 |                        version='%(prog)s {}'.format(__version__))
 50 | 
 51 |     if len(sys.argv[1:]) == 0:
 52 |         # parser.print_help()
 53 |         parser.print_usage()  # for just the usage line
 54 |         parser.exit()
 55 | 
 56 |     return parser
 57 | 
 58 | 
 59 | def validate_args(parser):
 60 |     args = parser.parse_args()
 61 |     if args.teiresias:
 62 |         if args.l is None:
 63 |             parser.error("Teiresias requires --l")
 64 |         if args.k is None:
 65 |             parser.error("Teiresias requires --k")
 66 |         if args.word_size < 2:
 67 |             parser.error("Teiresias requires --word_size to be >= 2")
 68 |         if args.l < 2:
 69 |             parser.error("--l must be at least 2")
 70 |         if args.l > args.word_size:
 71 |             parser.error("--word_size must be >= than --l")
 72 |     elif args.word_size < 1:
 73 |         parser.error("--word_size must be >= 1")
 74 |     return args
 75 | 
 76 | 
 77 | def main():
 78 |     parser = get_parser()
 79 |     args = validate_args(parser)
 80 | 
 81 |     if args.teiresias:
 82 |         args.fasta.close()
 83 |         p = word_pattern.run_teiresias(args.fasta.name,
 84 |                                        w=args.word_size,
 85 |                                        l=args.l,
 86 |                                        k=args.k,
 87 |                                        output_filename=args.out)
 88 |     else:
 89 |         seq_records = seqrecords.read_fasta(args.fasta)
 90 |         args.fasta.close()
 91 |         p = word_pattern.create(seq_records.seq_list,
 92 |                                 args.word_size,
 93 |                                 args.word_position)
 94 | 
 95 |     if args.out:
 96 |         oh = open(args.out, 'w')
 97 |         oh.write(p.format())
 98 |         oh.close()
 99 |     else:
100 |         print(p.format())
101 |         # or sys.stdout(p.format()+'\n')
102 | 
103 | 
104 | if __name__ == '__main__':
105 |     main()
106 | 


--------------------------------------------------------------------------------
/example_data/input/aminoacid.freqs.swissprot.txt:
--------------------------------------------------------------------------------
 1 | # UniProtKB/Swiss-Prot protein knowledgebase release 2016_09 statistics
 2 | # Release 2016_09 of 05-Oct-16 of UniProtKB/Swiss-Prot contains 552259 sequence entries,
 3 | # comprising 197423140 amino acids abstracted from 247204 references.
 4 | # http://web.expasy.org/docs/relnotes/relstat.html
 5 | A	0.0826
 6 | Q	0.0393
 7 | L	0.0965
 8 | S	0.0659
 9 | R	0.0553
10 | E	0.0674
11 | K	0.0583
12 | T	0.0534
13 | N	0.0406
14 | G	0.0708
15 | M	0.0241
16 | W	0.0109
17 | D	0.0546
18 | H	0.0227
19 | F	0.0386
20 | Y	0.0292
21 | C	0.0137
22 | I	0.0594
23 | P	0.0471
24 | V	0.0687


--------------------------------------------------------------------------------
/example_data/input/aminoacid.weights.txt:
--------------------------------------------------------------------------------
 1 | # Based on amino acid frequencies
 2 | # Weight = 1 / amino acid freq / 10
 3 | # should be greater than 1.
 4 | A	1.21065375303
 5 | C	7.29927007299
 6 | E	1.48367952522
 7 | D	1.8315018315
 8 | G	1.41242937853
 9 | F	2.59067357513
10 | I	1.6835016835
11 | H	4.40528634361
12 | K	1.71526586621
13 | M	4.14937759336
14 | L	1.03626943005
15 | N	2.46305418719
16 | Q	2.54452926209
17 | P	2.12314225053
18 | S	1.51745068285
19 | R	1.80831826401
20 | T	1.87265917603
21 | W	9.17431192661
22 | V	1.45560407569
23 | Y	3.42465753425


--------------------------------------------------------------------------------
/example_data/input/hiv.pep.fasta:
--------------------------------------------------------------------------------
 1 | >DENTIST
 2 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDIRRAHCNISREKWN
 3 | NTLKQVVTKLREQFVNKTIIFTHPSGGDPEIVMHSVNCGGEFFY
 4 | >PATIENT_A
 5 | VIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNISREKWNNT
 6 | LKQVVTKLREQFVNKTIIFNHSSGGDPEIVMHSFNCGGEFFY
 7 | >PATIENT_B
 8 | FTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKWNNTLEQVKT
 9 | KLREQFGNTTIFFNHSSG
10 | >PATIENT_C
11 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAVYATDRIIGDIRQAHCNISREKWN
12 | NTLKQVVTKLREQFVNKTIIFTHPSGGDPEIVMHSVNCGGEFFY
13 | >PATIENT_D
14 | EVVIRSANFSDNAKTIIVQLNKSVKITCIRPSNNTRQSIPIGPGKAVYATGQIIGDIRQAHCNLSEAKWN
15 | NTLAQIVKKLKEQFRNRTIVFNQSSGGDPEIVMHSFNCGGEFFYC
16 | >PATIENT_E
17 | ASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGEKWNNTLKQVVTKLREQFGDKTIIF
18 | NHSSGGDPEIVM
19 | >PATIENT_F
20 | EVVIRSENFTDNVKTIIVQLNESVQINCTRPNNNTRKSIHIAPGRAFYATGEIIGDIRQAHCNLSSTKWN
21 | NTLRQIAKKLKEQFGNKTIVFNQSSGGDPEIVMHSFNCGGEFFYC
22 | >PATIENT_G
23 | EVVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRRGIHIGPGRAFYATDRIVGDIRQAYCNISREKWN
24 | NTLKQVVAKLREQFVNKTIIFNHSSGGDPEIVMHSVNCGGEFFYCNT
25 | >PATIENT_H
26 | LAEGEVIIRSENFTDNAKTIIVQLNATINITCERPHNNTRKSIHIGPGRAFFATGDITGDIRQAHCNLSK
27 | GDWDNALKQIVTKLGEQFGRNKTIVFKQSSGGDPEIIMHSFNCAGEFSYCN
28 | >DENTIST_WIFE
29 | NFTNNAKTIIVQLNTSVEINCTRPSNNTSKGIHIGPGRAFHATDRITGDIRQAHCNISKAKWNDTLQQVV
30 | KKLREQFGGNKTIVFNQSSGGDPEIVLHSFNCGGEFFYCNTT
31 | >Local_Control_1
32 | FTDNAKTIIVQLKNSVVINCTRPNNNTRRSVHIGPGSSLYTTDIIGDIRQAHCNLSRANWNKTLEQIVTK
33 | LGEQFGNNTTIVFNSSSGG
34 | >Local_Control_2
35 | SENFTDNTKTIIVQLNTSVTINCTRPGNNTRKSITMGPGKVFYAGEIIGDIRQAHCNLSRAAWNDTLKQI
36 | VGKLQEQFGNKTIVFNHSSGGDPEIVMHSF
37 | >Local_Control_3
38 | RSENFTNNAKIIIVHLNKTVNITCTRPNNNTRRSIPIGPGKAFYTTDIIGNIRQAHCNLSRAEWNNTLKQ
39 | IVKKLREQFKNKTIVFNHSSGGDPEIVMHSF
40 | >Local_Control_4
41 | LAEEEVVIRSENFTNNAKIIIVHLNKTVNITCTRPNNNTRRSIPMGPGKAFYTTEIIGNIRQAHCNLSKA
42 | EWNNTLRQIVKKLRDNLRIKQ
43 | >Local_Control_5
44 | LAEKEVVIRSENFTDNTKTIIIQLNTSVTINCTRPGNNTRKSITMGPGKVFYAGEIIGDIRQAHCNLSRT
45 | AWNDTLKQIVGKLQEQFGNKTIVFNHSSGGDPEIVMHSF
46 | 


--------------------------------------------------------------------------------
/example_data/input/sample.dna.fasta:
--------------------------------------------------------------------------------
1 | >seq1
2 | AACGTACCATTGAACGTACCATTGAACGTACCATTG
3 | >seq2
4 | CTAGGGGACTTATCTAGGGGACTTATCTAGGGGACTTAT
5 | >seq3
6 | CTAGGGAAAATTCTAGGGAAAATTCTAGGGAAAATT
7 | 


--------------------------------------------------------------------------------
/example_data/input/sample.pep.fasta:
--------------------------------------------------------------------------------
1 | >seq1
2 | MKSTGWHF
3 | >seq2
4 | MKSSSSTGWGWG
5 | >seq3
6 | MKSTLKNGTEQ


--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.1mer:
--------------------------------------------------------------------------------
1 | 2693	11	A 0:333 1:132 2:133 3:130 4:132 5:342 6:131 7:346 8:352 9:351 10:311
2 | 1717	11	T 0:226 1:86 2:83 3:81 4:83 5:236 6:87 7:232 8:225 9:216 10:162
3 | 1650	11	C 0:219 1:69 2:71 3:73 4:72 5:210 6:70 7:210 8:213 9:217 10:226
4 | 1337	11	G 0:188 1:60 2:59 3:63 4:59 5:178 6:61 7:166 8:172 9:172 10:159


--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.2mer:
--------------------------------------------------------------------------------
 1 | 1096	11	AA 0:139 1:57 2:60 3:58 4:59 5:138 6:57 7:129 8:140 9:136 10:123
 2 | 697	11	TA 0:83 1:31 2:36 3:30 4:35 5:102 6:36 7:99 8:94 9:95 10:56
 3 | 559	11	AG 0:70 1:28 2:26 3:27 4:26 5:74 6:29 7:75 8:68 9:72 10:64
 4 | 550	11	AC 0:66 1:23 2:22 3:22 4:24 5:67 6:21 7:74 8:69 9:75 10:87
 5 | 516	11	CA 0:64 1:24 2:16 3:20 4:17 5:53 6:18 7:71 8:74 9:72 10:87
 6 | 487	11	AT 0:58 1:24 2:25 3:23 4:23 5:63 6:24 7:68 8:75 9:68 10:36
 7 | 470	11	CC 0:67 1:17 2:22 3:24 4:23 5:57 6:20 7:60 8:62 9:61 10:57
 8 | 464	11	TT 0:65 1:26 2:23 3:24 4:24 5:61 6:24 7:68 8:54 9:50 10:45
 9 | 456	11	CT 0:57 1:23 2:24 3:21 4:24 5:70 6:26 7:54 8:51 9:57 10:49
10 | 379	11	GA 0:47 1:19 2:20 3:21 4:20 5:49 6:19 7:47 8:44 9:48 10:45
11 | 342	11	GC 0:49 1:11 2:12 3:12 4:11 5:49 6:15 7:42 8:43 9:48 10:50
12 | 307	11	GT 0:46 1:13 2:11 3:13 4:12 5:41 6:13 7:41 8:44 9:41 10:32
13 | 305	11	GG 0:46 1:16 2:15 3:16 4:15 5:39 6:14 7:36 8:41 9:35 10:32
14 | 285	11	TC 0:36 1:18 2:15 3:15 4:14 5:37 6:14 7:34 8:39 9:32 10:31
15 | 267	11	TG 0:42 1:11 2:9 3:12 4:10 5:36 6:12 7:30 8:37 9:38 10:30
16 | 206	11	CG 0:30 1:5 2:9 3:8 4:8 5:29 6:6 7:25 8:26 9:27 10:33


--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.3mer:
--------------------------------------------------------------------------------
 1 | 462	11	AAA 0:56 1:24 2:29 3:26 4:29 5:60 6:25 7:47 8:56 9:57 10:53
 2 | 322	11	TAA 0:43 1:14 2:16 3:13 4:15 5:46 6:14 7:49 8:48 9:44 10:20
 3 | 243	11	AAG 0:38 1:11 2:10 3:11 4:9 5:35 6:12 7:33 8:28 9:28 10:28
 4 | 215	11	AAC 0:26 1:13 2:12 3:13 4:14 5:25 6:10 7:24 8:22 9:26 10:30
 5 | 210	11	CTA 0:24 1:10 2:13 3:10 4:13 5:33 6:13 7:24 8:23 9:27 10:20
 6 | 196	11	TTA 0:25 1:9 2:9 3:8 4:10 5:30 6:8 7:28 8:24 9:28 10:17
 7 | 195	11	ATA 0:22 1:8 2:10 3:7 4:9 5:23 6:11 7:32 8:31 9:28 10:14
 8 | 176	11	AAT 0:19 1:9 2:9 3:8 4:7 5:18 6:10 7:25 8:34 9:25 10:12
 9 | 171	11	ACA 0:21 1:6 2:3 3:4 4:4 5:19 6:4 7:25 8:23 9:24 10:38
10 | 169	11	CAA 0:23 1:10 2:6 3:9 4:6 5:15 6:7 7:19 8:22 9:20 10:32
11 | 168	11	AGC 0:24 1:5 2:4 3:4 4:4 5:27 6:8 7:23 8:22 9:25 10:22
12 | 165	11	ACC 0:20 1:9 2:10 3:10 4:11 5:16 6:8 7:20 8:18 9:23 10:20
13 | 165	11	CAC 0:22 1:4 2:3 3:3 4:3 5:17 6:3 7:25 8:27 9:23 10:35
14 | 148	11	CCC 0:25 1:5 2:8 3:10 4:8 5:18 6:7 7:18 8:17 9:20 10:12
15 | 143	11	ACT 0:15 1:7 2:8 3:6 4:8 5:21 6:8 7:20 8:16 9:15 10:19
16 | 143	11	AGA 0:14 1:10 2:9 3:10 4:9 5:16 6:8 7:17 8:13 9:15 10:22
17 | 140	11	ATT 0:16 1:8 2:7 3:6 4:7 5:21 6:7 7:18 8:22 9:19 10:9
18 | 140	11	GAA 0:17 1:8 2:9 3:9 4:9 5:17 6:10 7:14 8:14 9:15 10:18
19 | 137	11	CCA 0:18 1:4 2:5 3:6 4:5 5:13 6:4 7:20 8:22 9:20 10:20
20 | 134	11	TAG 0:13 1:7 2:7 3:7 4:7 5:19 6:9 7:20 8:16 9:18 10:11
21 | 133	11	CCT 0:16 1:6 2:6 3:6 4:6 5:21 6:7 7:15 8:17 9:15 10:18
22 | 129	11	AGT 0:19 1:6 2:6 3:6 4:6 5:17 6:7 7:17 8:18 9:16 10:11
23 | 126	11	TTT 0:18 1:7 2:8 3:8 4:8 5:14 6:7 7:23 8:12 9:5 10:16
24 | 122	11	CTT 0:17 1:8 2:6 3:7 4:6 5:17 6:7 7:17 8:11 9:14 10:12
25 | 122	11	TAT 0:16 1:6 2:7 3:6 4:7 5:19 6:6 7:14 8:16 9:16 10:9
26 | 119	11	TAC 0:11 1:4 2:6 3:4 4:6 5:18 6:7 7:16 8:14 9:17 10:16
27 | 115	11	AGG 0:13 1:6 2:6 3:6 4:6 5:14 6:6 7:18 8:15 9:16 10:9
28 | 115	11	TCA 0:12 1:10 2:6 3:7 4:6 5:10 6:5 7:16 8:16 9:11 10:16
29 | 108	11	GCC 0:13 1:2 2:3 3:3 4:3 5:14 6:4 7:17 8:17 9:14 10:18
30 | 99	11	GGA 0:14 1:5 2:6 3:6 4:6 5:11 6:5 7:12 8:13 9:13 10:8
31 | 98	11	GCT 0:17 1:4 2:4 3:4 4:4 5:17 6:5 7:11 8:9 9:14 10:9
32 | 97	11	CAT 0:10 1:6 2:4 3:5 4:4 5:12 6:5 7:16 8:12 9:16 10:7
33 | 96	11	GAG 0:10 1:6 2:5 3:6 4:5 5:11 6:5 7:11 8:11 9:13 10:13
34 | 93	11	GTA 0:12 1:4 2:4 3:5 4:3 5:15 6:4 7:14 8:15 9:12 10:5
35 | 92	11	GAT 0:13 1:3 2:5 3:4 4:5 5:14 6:3 7:13 8:13 9:11 10:8
36 | 91	11	GCA 0:12 1:4 2:2 3:3 4:2 5:11 6:5 7:10 8:13 9:16 10:13
37 | 90	11	GGT 0:15 1:5 2:3 3:4 4:4 5:11 6:4 7:13 8:13 9:11 10:7
38 | 84	11	CAG 0:9 1:4 2:3 3:3 4:4 5:9 6:3 7:11 8:13 9:13 10:12
39 | 82	11	TCT 0:9 1:6 2:6 3:5 4:6 5:11 6:6 7:8 8:9 9:13 10:3
40 | 82	11	TGG 0:15 1:5 2:4 3:5 4:5 5:10 6:5 7:7 8:12 9:9 10:5
41 | 80	11	CTC 0:10 1:4 2:4 3:2 4:4 5:13 6:3 7:9 8:12 9:9 10:10
42 | 80	11	GTG 0:13 1:4 2:2 3:3 4:3 5:10 6:4 7:11 8:11 9:11 10:8
43 | 78	11	ATG 0:10 1:3 2:4 3:4 4:4 5:12 6:3 7:8 8:11 9:12 10:7
44 | 76	11	GTT 0:14 1:3 2:2 3:3 4:3 5:9 6:3 7:10 8:9 9:12 10:8
45 | 76	11	TTC 0:9 1:7 2:4 3:5 4:4 5:10 6:6 7:10 8:8 9:9 10:4
46 | 71	11	ATC 0:10 1:5 2:4 3:6 4:3 5:7 6:3 7:9 8:10 9:8 10:6
47 | 71	11	TGA 0:11 1:1 2:1 3:1 4:1 5:11 6:3 7:11 8:10 9:12 10:9
48 | 70	11	ACG 0:9 1:1 2:1 3:2 4:1 5:11 6:1 7:9 8:12 9:13 10:10
49 | 66	11	CGA 0:8 1:3 2:4 3:4 4:4 5:11 6:3 7:7 8:8 9:8 10:6
50 | 66	11	TTG 0:13 1:3 2:2 3:3 4:2 5:7 6:3 7:7 8:10 9:8 10:8
51 | 65	11	TGC 0:9 1:3 2:2 3:3 4:2 5:7 6:2 7:7 8:9 9:10 10:11
52 | 60	11	GGG 0:9 1:4 2:3 3:3 4:3 5:9 6:2 7:6 8:7 9:6 10:8
53 | 58	11	GTC 0:7 1:2 2:3 3:2 4:3 5:7 6:2 7:6 8:9 9:6 10:11
54 | 56	11	GGC 0:8 1:2 2:3 3:3 4:2 5:8 6:3 7:5 8:8 9:5 10:9
55 | 53	11	CGC 0:8 1:1 2:3 3:2 4:3 5:7 6:2 7:7 8:4 9:8 10:8
56 | 52	11	CCG 0:8 1:2 2:3 3:2 4:4 5:5 6:2 7:7 8:6 9:6 10:7
57 | 51	11	GAC 0:7 1:2 2:1 3:2 4:1 5:7 6:1 7:9 8:6 9:9 10:6
58 | 49	11	TGT 0:7 1:2 2:2 3:3 4:2 5:8 6:2 7:5 8:6 9:7 10:5
59 | 48	11	CGG 0:9 1:1 2:2 3:2 4:1 5:6 6:1 7:5 8:7 9:4 10:10
60 | 48	11	TCC 0:9 1:1 2:1 3:1 4:1 5:9 6:1 7:5 8:10 9:4 10:6
61 | 44	11	GCG 0:7 1:1 2:3 3:2 4:2 5:6 6:1 7:4 8:4 9:4 10:10
62 | 43	11	CTG 0:6 1:1 2:1 3:2 4:1 5:7 6:2 7:4 8:5 9:7 10:7
63 | 40	11	TCG 0:6 1:1 2:2 3:2 4:1 5:7 6:2 7:5 8:4 9:4 10:6
64 | 39	6	CGT 0:5 5:5 7:6 8:7 9:7 10:9


--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.pairwise:
--------------------------------------------------------------------------------
 1 | American_Black_Bear	American_Brown_Bear	0.7106017
 2 | American_Black_Bear	Spectacled_Bear	0.7765043
 3 | American_Black_Bear	Asiatic_Black_Bear	0.7020057
 4 | American_Black_Bear	Polar_Bear	0.7736390
 5 | American_Black_Bear	Giant_Panda	0.5702006
 6 | American_Black_Bear	Red_Panda	0.8080229
 7 | American_Black_Bear	Dog	0.6131805
 8 | American_Black_Bear	Raccoon	0.5873926
 9 | American_Black_Bear	Cow	0.6704871
10 | American_Black_Bear	Crocodilian_skink	0.7822350
11 | American_Brown_Bear	Spectacled_Bear	0.4545455
12 | American_Brown_Bear	Asiatic_Black_Bear	0.3034483
13 | American_Brown_Bear	Polar_Bear	0.4405594
14 | American_Brown_Bear	Giant_Panda	0.7761628
15 | American_Brown_Bear	Red_Panda	0.5174825
16 | American_Brown_Bear	Dog	0.7953216
17 | American_Brown_Bear	Raccoon	0.7777778
18 | American_Brown_Bear	Cow	0.8171091
19 | American_Brown_Bear	Crocodilian_skink	0.8705502
20 | Spectacled_Bear	Asiatic_Black_Bear	0.3655172
21 | Spectacled_Bear	Polar_Bear	0.1478873
22 | Spectacled_Bear	Giant_Panda	0.7877907
23 | Spectacled_Bear	Red_Panda	0.5352113
24 | Spectacled_Bear	Dog	0.7982456
25 | Spectacled_Bear	Raccoon	0.7836257
26 | Spectacled_Bear	Cow	0.8289086
27 | Spectacled_Bear	Crocodilian_skink	0.8705502
28 | Asiatic_Black_Bear	Polar_Bear	0.3655172
29 | Asiatic_Black_Bear	Giant_Panda	0.7906977
30 | Asiatic_Black_Bear	Red_Panda	0.5448276
31 | Asiatic_Black_Bear	Dog	0.8157895
32 | Asiatic_Black_Bear	Raccoon	0.7923977
33 | Asiatic_Black_Bear	Cow	0.8436578
34 | Asiatic_Black_Bear	Crocodilian_skink	0.8673139
35 | Polar_Bear	Giant_Panda	0.8052326
36 | Polar_Bear	Red_Panda	0.5177305
37 | Polar_Bear	Dog	0.8070175
38 | Polar_Bear	Raccoon	0.7894737
39 | Polar_Bear	Cow	0.8289086
40 | Polar_Bear	Crocodilian_skink	0.8770227
41 | Giant_Panda	Red_Panda	0.7994186
42 | Giant_Panda	Dog	0.5930233
43 | Giant_Panda	Raccoon	0.5755814
44 | Giant_Panda	Cow	0.6424419
45 | Giant_Panda	Crocodilian_skink	0.8081395
46 | Red_Panda	Dog	0.7807018
47 | Red_Panda	Raccoon	0.7690058
48 | Red_Panda	Cow	0.8318584
49 | Red_Panda	Crocodilian_skink	0.8705502
50 | Dog	Raccoon	0.5497076
51 | Dog	Cow	0.6228070
52 | Dog	Crocodilian_skink	0.7982456
53 | Raccoon	Cow	0.6608187
54 | Raccoon	Crocodilian_skink	0.8070175
55 | Cow	Crocodilian_skink	0.7994100
56 | 


--------------------------------------------------------------------------------
/example_data/output/bears.dna.fasta.phylip:
--------------------------------------------------------------------------------
 1 |    11
 2 | American_B 0.0000000 0.6865672 0.7423168 0.6650000 0.7290168 0.6136784 0.7734554 0.6832740 0.6509946 0.7013889 0.8239203
 3 | American_B 0.6865672 0.0000000 0.5071770 0.3027027 0.4878049 0.7410926 0.5636364 0.7677725 0.7476190 0.7806005 0.8517647
 4 | Spectacled 0.7423168 0.5071770 0.0000000 0.3939394 0.1197605 0.7345972 0.5777778 0.7813953 0.7558685 0.7656613 0.8504673
 5 | Asiatic_Bl 0.6650000 0.3027027 0.3939394 0.0000000 0.3877551 0.7328605 0.6000000 0.7908046 0.7570093 0.7752294 0.8403756
 6 | Polar_Bear 0.7290168 0.4878049 0.1197605 0.3877551 0.0000000 0.7393365 0.5739910 0.7832168 0.7488152 0.7731481 0.8524590
 7 | Giant_Pand 0.6136784 0.7410926 0.7345972 0.7328605 0.7393365 0.0000000 0.7660550 0.6410256 0.6275229 0.6772487 0.8283828
 8 | Red_Panda  0.7734554 0.5636364 0.5777778 0.6000000 0.5739910 0.7660550 0.0000000 0.7458432 0.7405660 0.7790433 0.8465116
 9 | Dog        0.6832740 0.7677725 0.7813953 0.7908046 0.7832168 0.6410256 0.7458432 0.0000000 0.6022727 0.6642599 0.8195616
10 | Raccoon    0.6509946 0.7476190 0.7558685 0.7570093 0.7488152 0.6275229 0.7405660 0.6022727 0.0000000 0.6725979 0.8252912
11 | Cow        0.7013889 0.7806005 0.7656613 0.7752294 0.7731481 0.6772487 0.7790433 0.6642599 0.6725979 0.0000000 0.8219634
12 | Crocodilia 0.8239203 0.8517647 0.8504673 0.8403756 0.8524590 0.8283828 0.8465116 0.8195616 0.8252912 0.8219634 0.0000000
13 | 


--------------------------------------------------------------------------------
/example_data/output/gp120.pep.fasta.1mer:
--------------------------------------------------------------------------------
 1 | 1331	27	T 0:46 1:44 2:49 3:52 4:40 5:47 6:39 7:45 8:45 9:44 10:49 11:42 12:46 13:43 14:44 15:50 16:51 17:44 18:57 19:57 20:53 21:56 22:50 23:55 24:62 25:60 26:61
 2 | 1152	27	N 0:47 1:46 2:49 3:48 4:44 5:49 6:41 7:45 8:43 9:43 10:47 11:43 12:49 13:49 14:43 15:37 16:40 17:38 18:38 19:39 20:42 21:42 22:39 23:39 24:35 25:37 26:40
 3 | 841	27	V 0:28 1:30 2:28 3:39 4:37 5:33 6:37 7:33 8:34 9:33 10:39 11:35 12:32 13:39 14:39 15:23 16:26 17:32 18:29 19:31 20:28 21:22 22:22 23:23 24:29 25:29 26:31
 4 | 835	27	I 0:36 1:40 2:36 3:32 4:38 5:39 6:35 7:34 8:35 9:36 10:31 11:36 12:36 13:32 14:31 15:29 16:28 17:24 18:25 19:23 20:25 21:25 22:25 23:30 24:25 25:25 26:24
 5 | 820	27	K 0:28 1:28 2:34 3:28 4:31 5:30 6:36 7:31 8:33 9:33 10:33 11:36 12:35 13:30 14:27 15:25 16:30 17:28 18:29 19:28 20:29 21:28 22:27 23:29 24:30 25:32 26:32
 6 | 758	27	S 0:33 1:31 2:23 3:24 4:30 5:24 6:29 7:38 8:35 9:37 10:31 11:27 12:32 13:26 14:31 15:23 16:22 17:25 18:22 19:21 20:27 21:28 22:23 23:27 24:30 25:30 26:29
 7 | 751	27	E 0:26 1:29 2:30 3:27 4:31 5:26 6:29 7:26 8:25 9:25 10:27 11:26 12:26 13:28 14:21 15:28 16:27 17:33 18:31 19:31 20:27 21:30 22:27 23:28 24:28 25:29 26:30
 8 | 731	27	L 0:34 1:31 2:32 3:33 4:26 5:28 6:26 7:26 8:26 9:26 10:31 11:27 12:28 13:29 14:28 15:25 16:22 17:26 18:26 19:26 20:22 21:23 22:23 23:20 24:29 25:29 26:29
 9 | 715	27	G 0:26 1:25 2:29 3:29 4:31 5:29 6:27 7:31 8:30 9:29 10:32 11:29 12:27 13:28 14:26 15:25 16:29 17:20 18:23 19:23 20:24 21:22 22:23 23:19 24:28 25:26 26:25
10 | 664	27	R 0:25 1:24 2:20 3:26 4:25 5:25 6:21 7:21 8:22 9:22 10:21 11:20 12:21 13:24 14:22 15:31 16:25 17:25 18:25 19:25 20:24 21:24 22:29 23:27 24:30 25:29 26:31
11 | 613	27	A 0:27 1:23 2:27 3:22 4:21 5:22 6:20 7:24 8:22 9:21 10:21 11:22 12:21 13:20 14:26 15:26 16:24 17:21 18:23 19:23 20:19 21:25 22:28 23:24 24:20 25:20 26:21
12 | 604	27	P 0:22 1:22 2:25 3:21 4:21 5:22 6:21 7:22 8:22 9:22 10:22 11:23 12:20 13:23 14:25 15:23 16:25 17:24 18:21 19:21 20:22 21:21 22:20 23:25 24:23 25:23 26:23
13 | 544	27	C 0:18 1:18 2:18 3:18 4:18 5:18 6:18 7:18 8:18 9:18 10:18 11:18 12:18 13:19 14:18 15:23 16:23 17:23 18:23 19:23 20:22 21:23 22:22 23:23 24:22 25:22 26:24
14 | 520	27	D 0:15 1:18 2:19 3:16 4:15 5:17 6:19 7:16 8:17 9:18 10:16 11:17 12:20 13:17 14:18 15:21 16:22 17:26 18:19 19:19 20:19 21:20 22:21 23:23 24:25 25:25 26:22
15 | 450	27	Q 0:18 1:17 2:16 3:21 4:15 5:18 6:19 7:19 8:18 9:19 10:19 11:18 12:18 13:19 14:16 15:16 16:12 17:14 18:13 19:13 20:16 21:18 22:17 23:15 24:16 25:16 26:14
16 | 436	27	Y 0:13 1:13 2:13 3:12 4:13 5:11 6:12 7:11 8:11 9:11 10:13 11:12 12:13 13:13 14:12 15:21 16:23 17:21 18:20 19:20 20:24 21:23 22:18 23:21 24:21 25:21 26:20
17 | 432	27	F 0:13 1:13 2:13 3:15 4:16 5:16 6:16 7:19 8:19 9:18 10:15 11:16 12:15 13:15 14:15 15:18 16:17 17:16 18:18 19:18 20:16 21:16 22:20 23:18 24:14 25:14 26:13
18 | 333	27	W 0:9 1:10 2:10 3:12 4:10 5:10 6:9 7:10 8:10 9:10 10:9 11:10 12:10 13:10 14:9 15:14 16:15 17:14 18:15 19:15 20:15 21:16 22:16 23:15 24:17 25:17 26:16
19 | 266	27	M 0:7 1:9 2:10 3:7 4:6 5:7 6:9 7:10 8:9 9:9 10:8 11:8 12:9 13:9 14:8 15:13 16:11 17:12 18:11 19:11 20:10 21:10 22:13 23:12 24:13 25:13 26:12
20 | 233	27	H 0:8 1:7 2:8 3:8 4:13 5:9 6:8 7:7 8:7 9:7 10:8 11:8 12:8 13:9 14:11 15:9 16:9 17:8 18:8 19:9 20:9 21:12 22:9 23:9 24:8 25:8 26:9


--------------------------------------------------------------------------------
/example_data/output/hiv.pep.fasta.1mer:
--------------------------------------------------------------------------------
 1 | 179	15	I 0:14 1:14 2:11 3:14 4:13 5:10 6:12 7:13 8:14 9:10 10:9 11:10 12:12 13:11 14:12
 2 | 176	15	N 0:11 1:13 2:11 3:12 4:11 5:8 6:13 7:14 8:11 9:13 10:12 11:10 12:14 13:13 14:10
 3 | 124	15	T 0:7 1:6 2:8 3:8 4:6 5:6 6:8 7:7 8:9 9:11 10:10 11:10 12:9 13:8 14:11
 4 | 119	15	G 0:8 1:8 2:8 3:8 4:8 5:9 6:8 7:8 8:11 9:10 10:7 11:9 12:5 13:3 14:9
 5 | 99	15	V 0:10 1:8 2:3 3:10 4:8 5:4 6:7 7:10 8:4 9:6 10:6 11:6 12:5 13:5 14:7
 6 | 97	15	R 0:9 1:9 2:5 3:8 4:6 5:5 6:6 7:9 8:6 9:5 10:5 11:4 12:7 13:8 14:5
 7 | 94	15	K 0:6 1:6 2:5 3:6 4:8 5:5 6:7 7:5 8:7 9:7 10:4 11:6 12:8 13:7 14:7
 8 | 92	15	S 0:5 1:6 2:4 3:5 4:9 5:4 6:8 7:6 8:7 9:7 10:8 11:7 12:6 13:3 14:7
 9 | 82	15	A 0:7 1:7 2:6 3:6 4:7 5:4 6:5 7:7 8:8 9:5 10:3 11:4 12:4 13:5 14:4
10 | 81	15	E 0:8 1:7 2:4 3:6 4:5 5:5 6:7 7:6 8:7 9:4 10:2 11:4 12:4 13:6 14:6
11 | 77	15	F 0:5 1:6 2:5 3:5 4:6 5:3 6:7 7:6 8:7 9:7 10:3 11:5 12:5 13:2 14:5
12 | 65	15	Q 0:3 1:3 2:4 3:4 4:7 5:3 6:6 7:4 8:5 9:6 10:4 11:5 12:3 13:3 14:5
13 | 58	15	L 0:3 1:3 2:3 3:3 4:4 5:2 6:4 7:3 8:5 9:4 10:5 11:4 12:4 13:6 14:5
14 | 49	15	D 0:3 1:3 2:2 3:4 4:3 5:3 6:3 7:4 8:6 9:4 10:3 11:4 12:2 13:1 14:4
15 | 47	15	P 0:4 1:3 2:2 3:4 4:4 5:3 6:3 7:3 8:3 9:3 10:2 11:3 12:4 13:3 14:3
16 | 46	15	H 0:3 1:3 2:3 3:4 4:2 5:3 6:3 7:3 8:4 9:4 10:2 11:3 12:4 13:2 14:3
17 | 43	15	C 0:3 1:3 2:2 3:3 4:4 5:2 6:4 7:4 8:4 9:4 10:2 11:2 12:2 13:2 14:2
18 | 23	15	Y 0:3 1:2 2:1 3:2 4:2 5:1 6:2 7:3 8:1 9:1 10:1 11:1 12:1 13:1 14:1
19 | 15	15	W 0:1 1:1 2:1 3:1 4:1 5:1 6:1 7:1 8:1 9:1 10:1 11:1 12:1 13:1 14:1
20 | 14	12	M 0:1 1:1 3:1 4:1 5:1 6:1 7:1 8:1 11:2 12:1 13:1 14:2


--------------------------------------------------------------------------------
/example_data/output/hiv.pep.fasta.pairwise:
--------------------------------------------------------------------------------
  1 | DENTIST	PATIENT_A	0.1910112
  2 | DENTIST	PATIENT_B	0.4886364
  3 | DENTIST	PATIENT_C	0.2111111
  4 | DENTIST	PATIENT_D	0.5384615
  5 | DENTIST	PATIENT_E	0.4886364
  6 | DENTIST	PATIENT_F	0.5326087
  7 | DENTIST	PATIENT_G	0.3516484
  8 | DENTIST	PATIENT_H	0.6210526
  9 | DENTIST	DENTIST_WIFE	0.5280899
 10 | DENTIST	Local_Control_1	0.6363636
 11 | DENTIST	Local_Control_2	0.6022727
 12 | DENTIST	Local_Control_3	0.5909091
 13 | DENTIST	Local_Control_4	0.6477273
 14 | DENTIST	Local_Control_5	0.6022727
 15 | PATIENT_A	PATIENT_B	0.4382022
 16 | PATIENT_A	PATIENT_C	0.2444444
 17 | PATIENT_A	PATIENT_D	0.4945055
 18 | PATIENT_A	PATIENT_E	0.3932584
 19 | PATIENT_A	PATIENT_F	0.4782609
 20 | PATIENT_A	PATIENT_G	0.3076923
 21 | PATIENT_A	PATIENT_H	0.5789474
 22 | PATIENT_A	DENTIST_WIFE	0.4943820
 23 | PATIENT_A	Local_Control_1	0.6067416
 24 | PATIENT_A	Local_Control_2	0.5280899
 25 | PATIENT_A	Local_Control_3	0.5280899
 26 | PATIENT_A	Local_Control_4	0.6292135
 27 | PATIENT_A	Local_Control_5	0.5280899
 28 | PATIENT_B	PATIENT_C	0.4444444
 29 | PATIENT_B	PATIENT_D	0.6043956
 30 | PATIENT_B	PATIENT_E	0.3200000
 31 | PATIENT_B	PATIENT_F	0.5108696
 32 | PATIENT_B	PATIENT_G	0.4725275
 33 | PATIENT_B	PATIENT_H	0.6210526
 34 | PATIENT_B	DENTIST_WIFE	0.5505618
 35 | PATIENT_B	Local_Control_1	0.4605263
 36 | PATIENT_B	Local_Control_2	0.5421687
 37 | PATIENT_B	Local_Control_3	0.5487805
 38 | PATIENT_B	Local_Control_4	0.5584416
 39 | PATIENT_B	Local_Control_5	0.5909091
 40 | PATIENT_C	PATIENT_D	0.5054945
 41 | PATIENT_C	PATIENT_E	0.4222222
 42 | PATIENT_C	PATIENT_F	0.4673913
 43 | PATIENT_C	PATIENT_G	0.2747253
 44 | PATIENT_C	PATIENT_H	0.5578947
 45 | PATIENT_C	DENTIST_WIFE	0.5000000
 46 | PATIENT_C	Local_Control_1	0.6000000
 47 | PATIENT_C	Local_Control_2	0.5444444
 48 | PATIENT_C	Local_Control_3	0.5444444
 49 | PATIENT_C	Local_Control_4	0.6111111
 50 | PATIENT_C	Local_Control_5	0.5444444
 51 | PATIENT_D	PATIENT_E	0.6043956
 52 | PATIENT_D	PATIENT_F	0.4347826
 53 | PATIENT_D	PATIENT_G	0.5384615
 54 | PATIENT_D	PATIENT_H	0.5578947
 55 | PATIENT_D	DENTIST_WIFE	0.4835165
 56 | PATIENT_D	Local_Control_1	0.6153846
 57 | PATIENT_D	Local_Control_2	0.5494505
 58 | PATIENT_D	Local_Control_3	0.5494505
 59 | PATIENT_D	Local_Control_4	0.6153846
 60 | PATIENT_D	Local_Control_5	0.5384615
 61 | PATIENT_E	PATIENT_F	0.5217391
 62 | PATIENT_E	PATIENT_G	0.4615385
 63 | PATIENT_E	PATIENT_H	0.6000000
 64 | PATIENT_E	DENTIST_WIFE	0.5393258
 65 | PATIENT_E	Local_Control_1	0.5263158
 66 | PATIENT_E	Local_Control_2	0.5301205
 67 | PATIENT_E	Local_Control_3	0.5121951
 68 | PATIENT_E	Local_Control_4	0.5584416
 69 | PATIENT_E	Local_Control_5	0.5340909
 70 | PATIENT_F	PATIENT_G	0.4782609
 71 | PATIENT_F	PATIENT_H	0.5052632
 72 | PATIENT_F	DENTIST_WIFE	0.4782609
 73 | PATIENT_F	Local_Control_1	0.5869565
 74 | PATIENT_F	Local_Control_2	0.4673913
 75 | PATIENT_F	Local_Control_3	0.5434783
 76 | PATIENT_F	Local_Control_4	0.5760870
 77 | PATIENT_F	Local_Control_5	0.4673913
 78 | PATIENT_G	PATIENT_H	0.5684211
 79 | PATIENT_G	DENTIST_WIFE	0.5054945
 80 | PATIENT_G	Local_Control_1	0.6373626
 81 | PATIENT_G	Local_Control_2	0.5824176
 82 | PATIENT_G	Local_Control_3	0.5494505
 83 | PATIENT_G	Local_Control_4	0.6373626
 84 | PATIENT_G	Local_Control_5	0.5714286
 85 | PATIENT_H	DENTIST_WIFE	0.5473684
 86 | PATIENT_H	Local_Control_1	0.5684211
 87 | PATIENT_H	Local_Control_2	0.5578947
 88 | PATIENT_H	Local_Control_3	0.6105263
 89 | PATIENT_H	Local_Control_4	0.6526316
 90 | PATIENT_H	Local_Control_5	0.5473684
 91 | DENTIST_WIFE	Local_Control_1	0.6067416
 92 | DENTIST_WIFE	Local_Control_2	0.5505618
 93 | DENTIST_WIFE	Local_Control_3	0.5955056
 94 | DENTIST_WIFE	Local_Control_4	0.6629213
 95 | DENTIST_WIFE	Local_Control_5	0.5505618
 96 | Local_Control_1	Local_Control_2	0.5421687
 97 | Local_Control_1	Local_Control_3	0.5487805
 98 | Local_Control_1	Local_Control_4	0.5584416
 99 | Local_Control_1	Local_Control_5	0.5909091
100 | Local_Control_2	Local_Control_3	0.5060241
101 | Local_Control_2	Local_Control_4	0.6144578
102 | Local_Control_2	Local_Control_5	0.1818182
103 | Local_Control_3	Local_Control_4	0.3902439
104 | Local_Control_3	Local_Control_5	0.5340909
105 | Local_Control_4	Local_Control_5	0.6136364
106 | 


--------------------------------------------------------------------------------
/example_data/output/hiv.pep.fasta.phylip:
--------------------------------------------------------------------------------
 1 |    15
 2 | DENTIST    0.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
 3 | PATIENT_A  1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
 4 | PATIENT_B  1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
 5 | PATIENT_C  1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
 6 | PATIENT_D  1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
 7 | PATIENT_E  1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.6636364 0.5888889 1.0000000 1.0000000 0.2884615 1.0000000 0.6400000 0.0270270 1.0000000
 8 | PATIENT_F  1.0000000 0.6636364 0.6636364 0.6636364 0.6636364 0.6636364 0.0000000 0.6636364 0.4822695 0.3363636 0.6636364 0.3454545 0.6727273 0.6727273 0.3454545
 9 | PATIENT_G  1.0000000 0.5888889 0.5888889 0.5888889 0.5888889 0.5888889 0.6636364 0.0000000 1.0000000 1.0000000 0.5888889 1.0000000 0.6400000 0.6000000 1.0000000
10 | PATIENT_H  1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.4822695 1.0000000 0.0000000 0.4751773 1.0000000 0.4893617 1.0000000 1.0000000 0.4893617
11 | DENTIST_WI 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3363636 1.0000000 0.4751773 0.0000000 1.0000000 0.1724138 1.0000000 1.0000000 0.1724138
12 | Local_Cont 1.0000000 0.2884615 0.2884615 0.2884615 0.2884615 0.2884615 0.6636364 0.5888889 1.0000000 1.0000000 0.0000000 1.0000000 0.6400000 0.3076923 1.0000000
13 | Local_Cont 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3454545 1.0000000 0.4893617 0.1724138 1.0000000 0.0000000 1.0000000 1.0000000 0.0000000
14 | Local_Cont 1.0000000 0.6400000 0.6400000 0.6400000 0.6400000 0.6400000 0.6727273 0.6400000 1.0000000 1.0000000 0.6400000 1.0000000 0.0000000 0.6400000 1.0000000
15 | Local_Cont 1.0000000 0.0270270 0.0270270 0.0270270 0.0270270 0.0270270 0.6727273 0.6000000 1.0000000 1.0000000 0.3076923 1.0000000 0.6400000 0.0000000 1.0000000
16 | Local_Cont 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.3454545 1.0000000 0.4893617 0.1724138 1.0000000 0.0000000 1.0000000 1.0000000 0.0000000
17 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | # Read a __version__
 4 | exec(open('alfpy/version.py').read())
 5 | 
 6 | # Long description
 7 | fh = open('README.rst')
 8 | long_description = fh.read()
 9 | fh.close()
10 | 
11 | setup(
12 |    name='alfpy',
13 |    version=__version__,
14 |    description="Alignment-free package to compare DNA/RNA/protein sequences (bioinformatics).",
15 |    long_description=long_description,
16 |    author='Andrzej Zielezinski',
17 |    keywords='alignment-free bioinformatics sequence DNA protein homology phylogeny',
18 |    license="MIT",
19 |    author_email='andrzejz@amu.edu.pl',
20 |    url="http://www.combio.pl/alfree",
21 |    packages=['alfpy', 'alfpy.utils', 'alfpy.utils.data'],
22 |    #setup_requires=["numpy"],
23 |    install_requires=["numpy"],
24 |    scripts=[
25 |      'bin/calc_bbc.py',
26 |      'bin/calc_graphdna.py',
27 |      'bin/calc_fcgr.py',
28 |      'bin/calc_lempelziv.py',
29 |      'bin/calc_ncd.py',
30 |      'bin/calc_wmetric.py',
31 |      'bin/calc_word.py',
32 |      'bin/calc_word_bool.py',
33 |      'bin/calc_word_sets.py',
34 |      'bin/calc_word_cv.py',
35 |      'bin/calc_word_d2.py',
36 |      'bin/calc_word_ffp.py',
37 |      'bin/calc_word_rtd.py',
38 |      'bin/create_wordpattern.py'
39 |    ],
40 |    classifiers=[
41 |      'License :: OSI Approved :: MIT License',
42 |      'Environment :: Console',
43 |      'Operating System :: MacOS',
44 |      'Operating System :: POSIX :: Linux',     
45 |      'Programming Language :: Python :: 2',
46 |      'Programming Language :: Python :: 2.7',
47 |      'Programming Language :: Python :: 3',
48 |      'Programming Language :: Python :: 3.3',
49 |      'Programming Language :: Python :: 3.4',
50 |      'Programming Language :: Python :: 3.5',
51 |      'Topic :: Scientific/Engineering',
52 |      'Topic :: Scientific/Engineering :: Bio-Informatics',     
53 |    ],   
54 | 
55 | )


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aziele/alfpy/25545be14affa7d7e89e5b5ebcfe4f3e688108b7/tests/__init__.py


--------------------------------------------------------------------------------
/tests/data/char_freqs.txt:
--------------------------------------------------------------------------------
 1 | # UniProtKB/Swiss-Prot protein knowledgebase release 2016_09 statistics
 2 | # Release 2016_09 of 05-Oct-16 of UniProtKB/Swiss-Prot contains 552259 sequence entries,
 3 | # comprising 197423140 amino acids abstracted from 247204 references.
 4 | # http://web.expasy.org/docs/relnotes/relstat.html
 5 | A	0.0826
 6 | Q	0.0393
 7 | L	0.0965
 8 | S	0.0659
 9 | R	0.0553
10 | E	0.0674
11 | K	0.0583
12 | T	0.0534
13 | N	0.0406
14 | G	0.0708
15 | M	0.0241
16 | W	0.0109
17 | D	0.0546
18 | H	0.0227
19 | F	0.0386
20 | Y	0.0292
21 | C	0.0137
22 | I	0.0594
23 | P	0.0471
24 | V	0.0687


--------------------------------------------------------------------------------
/tests/data/char_weights.txt:
--------------------------------------------------------------------------------
 1 | # Based on amino acid frequencies
 2 | # Weight = 1 / amino acid freq / 10
 3 | # should be greater than 1.
 4 | A	1.21065375303
 5 | C	7.29927007299
 6 | E	1.48367952522
 7 | D	1.8315018315
 8 | G	1.41242937853
 9 | F	2.59067357513
10 | I	1.6835016835
11 | H	4.40528634361
12 | K	1.71526586621
13 | M	4.14937759336
14 | L	1.03626943005
15 | N	2.46305418719
16 | Q	2.54452926209
17 | P	2.12314225053
18 | S	1.51745068285
19 | R	1.80831826401
20 | T	1.87265917603
21 | W	9.17431192661
22 | V	1.45560407569
23 | Y	3.42465753425


--------------------------------------------------------------------------------
/tests/data/dna.fa:
--------------------------------------------------------------------------------
1 | >seq1
2 | AACGTACCATTGAACGTACCGTAGG
3 | >seq2
4 | CTAGGGGACTTATCTAGG
5 | >seq3
6 | CTAGGGAACATACCA


--------------------------------------------------------------------------------
/tests/data/dna.fa.1mer.txt:
--------------------------------------------------------------------------------
1 | 18	3	A 0:8 1:4 2:6
2 | 15	3	G 0:6 1:6 2:3
3 | 13	3	C 0:6 1:3 2:4
4 | 12	3	T 0:5 1:5 2:2


--------------------------------------------------------------------------------
/tests/data/dna.fa.1mer.wordpos.txt:
--------------------------------------------------------------------------------
1 | 18	3	A 0 0 0 1 0 5 0 8 0 12 0 13 0 17 0 22 1 2 1 7 1 11 1 15 2 2 2 6 2 7 2 9 2 11 2 14
2 | 15	3	G 0 3 0 11 0 15 0 20 0 23 0 24 1 3 1 4 1 5 1 6 1 16 1 17 2 3 2 4 2 5
3 | 13	3	C 0 2 0 6 0 7 0 14 0 18 0 19 1 0 1 8 1 13 2 0 2 8 2 12 2 13
4 | 12	3	T 0 4 0 9 0 10 0 16 0 21 1 1 1 9 1 10 1 12 1 14 2 1 2 10


--------------------------------------------------------------------------------
/tests/data/dna.fa.2mer.txt:
--------------------------------------------------------------------------------
 1 | 8	3	TA 0:3 1:3 2:2
 2 | 7	3	AC 0:4 1:1 2:2
 3 | 7	3	GG 0:1 1:4 2:2
 4 | 4	3	AG 0:1 1:2 2:1
 5 | 4	2	CT 1:3 2:1
 6 | 3	3	AT 0:1 1:1 2:1
 7 | 3	3	GA 0:1 1:1 2:1
 8 | 3	2	AA 0:2 2:1
 9 | 3	2	CA 0:1 2:2
10 | 3	2	CC 0:2 2:1
11 | 3	1	CG 0:3
12 | 3	1	GT 0:3
13 | 2	2	TT 0:1 1:1
14 | 1	1	TC 1:1
15 | 1	1	TG 0:1


--------------------------------------------------------------------------------
/tests/data/dna.fa.2mer.wordpos.txt:
--------------------------------------------------------------------------------
 1 | 8	3	TA 0 4 0 16 0 21 1 1 1 10 1 14 2 1 2 10
 2 | 7	3	AC 0 1 0 5 0 13 0 17 1 7 2 7 2 11
 3 | 7	3	GG 0 23 1 3 1 4 1 5 1 16 2 3 2 4
 4 | 4	3	AG 0 22 1 2 1 15 2 2
 5 | 4	2	CT 1 0 1 8 1 13 2 0
 6 | 3	3	AT 0 8 1 11 2 9
 7 | 3	3	GA 0 11 1 6 2 5
 8 | 3	2	AA 0 0 0 12 2 6
 9 | 3	2	CA 0 7 2 8 2 13
10 | 3	2	CC 0 6 0 18 2 12
11 | 3	1	CG 0 2 0 14 0 19
12 | 3	1	GT 0 3 0 15 0 20
13 | 2	2	TT 0 9 1 9
14 | 1	1	TC 1 12
15 | 1	1	TG 0 10


--------------------------------------------------------------------------------
/tests/data/pep.fa:
--------------------------------------------------------------------------------
 1 | >seq1 seq1 desc
 2 | MEVVIRSANFTDNAKIIIVQLNASVEINC
 3 | TRPNNYTRKGIRIGPGRAVYAAEEIIGDN
 4 | TLKQVVTKLRE
 5 | >seq2 seq2 desc
 6 | MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIR
 7 | IGPGRAVYAAEEIIGDIRRAHCNIS
 8 | >seq3 seq3 desc
 9 | MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW
10 | >seq4
11 | MFTDNAKIIIVQLNASVEINCTRPNNNTR
12 | 


--------------------------------------------------------------------------------
/tests/data/pep.fa.1mer.txt:
--------------------------------------------------------------------------------
 1 | 34	4	I 0:9 1:11 2:10 3:4
 2 | 28	4	N 0:7 1:8 2:7 3:6
 3 | 21	4	A 0:6 1:7 2:6 3:2
 4 | 19	4	R 0:6 1:7 2:4 3:2
 5 | 15	4	T 0:5 1:3 2:4 3:3
 6 | 15	4	V 0:7 1:4 2:2 3:2
 7 | 14	3	G 0:4 1:4 2:6
 8 | 11	4	E 0:5 1:3 2:2 3:1
 9 | 10	4	K 0:4 1:2 2:3 3:1
10 | 8	4	S 0:2 1:3 2:2 3:1
11 | 7	4	D 0:2 1:2 2:2 3:1
12 | 7	4	P 0:2 1:2 2:2 3:1
13 | 6	4	C 0:1 1:2 2:2 3:1
14 | 6	4	L 0:3 1:1 2:1 3:1
15 | 6	4	Q 0:2 1:1 2:2 3:1
16 | 5	4	F 0:1 1:1 2:2 3:1
17 | 4	4	M 0:1 1:1 2:1 3:1
18 | 4	3	Y 0:2 1:1 2:1
19 | 3	2	H 1:1 2:2
20 | 1	1	W 2:1


--------------------------------------------------------------------------------
/tests/data/pep.fa.1mer.wordpos.txt:
--------------------------------------------------------------------------------
 1 | 34	4	I 0 4 0 15 0 16 0 17 0 26 0 39 0 41 0 53 0 54 1 2 1 13 1 14 1 15 1 24 1 37 1 39 1 51 1 52 1 55 1 62 2 7 2 8 2 9 2 18 2 31 2 33 2 45 2 46 2 49 2 56 3 7 3 8 3 9 3 18
 2 | 28	4	N 0 8 0 12 0 21 0 27 0 32 0 33 0 57 1 6 1 10 1 19 1 25 1 30 1 31 1 32 1 61 2 4 2 13 2 19 2 24 2 25 2 26 2 55 3 4 3 13 3 19 3 24 3 25 3 26
 3 | 21	4	A 0 7 0 13 0 22 0 46 0 49 0 50 1 5 1 11 1 20 1 44 1 47 1 48 1 58 2 5 2 14 2 38 2 41 2 52 2 59 3 5 3 14
 4 | 19	4	R 0 5 0 30 0 36 0 40 0 45 0 67 1 3 1 28 1 34 1 38 1 43 1 56 1 57 2 22 2 28 2 37 2 50 3 22 3 28
 5 | 15	4	T 0 10 0 29 0 35 0 58 0 64 1 8 1 27 1 33 2 2 2 21 2 27 2 42 3 2 3 21 3 27
 6 | 15	4	V 0 2 0 3 0 18 0 24 0 47 0 62 0 63 1 1 1 16 1 22 1 45 2 10 2 16 3 10 3 16
 7 | 14	3	G 0 38 0 42 0 44 0 55 1 36 1 40 1 42 1 53 2 30 2 34 2 36 2 43 2 47 2 58
 8 | 11	4	E 0 1 0 25 0 51 0 52 0 68 1 23 1 49 1 50 2 17 2 44 3 17
 9 | 10	4	K 0 14 0 37 0 60 0 65 1 12 1 35 2 6 2 29 2 60 3 6
10 | 8	4	S 0 6 0 23 1 4 1 21 1 63 2 15 2 57 3 15
11 | 7	4	D 0 11 0 56 1 9 1 54 2 3 2 48 3 3
12 | 7	4	P 0 31 0 43 1 29 1 41 2 23 2 35 3 23
13 | 6	4	C 0 28 1 26 1 60 2 20 2 54 3 20
14 | 6	4	L 0 20 0 59 0 66 1 18 2 12 3 12
15 | 6	4	Q 0 19 0 61 1 17 2 11 2 51 3 11
16 | 5	4	F 0 9 1 7 2 1 2 39 3 1
17 | 4	4	M 0 0 1 0 2 0 3 0
18 | 4	3	Y 0 34 0 48 1 46 2 40
19 | 3	2	H 1 59 2 32 2 53
20 | 1	1	W 2 61


--------------------------------------------------------------------------------
/tests/data/pep.fa.2mer.txt:
--------------------------------------------------------------------------------
 1 | 11	4	II 0:3 1:3 2:3 3:2
 2 | 8	4	NA 0:2 1:2 2:2 3:2
 3 | 8	4	TR 0:2 1:2 2:2 3:2
 4 | 7	4	EI 0:2 1:2 2:2 3:1
 5 | 7	4	NN 0:1 1:2 2:2 3:2
 6 | 6	3	IG 0:2 1:2 2:2
 7 | 6	3	IR 0:2 1:3 2:1
 8 | 5	4	AK 0:1 1:1 2:2 3:1
 9 | 5	4	DN 0:2 1:1 2:1 3:1
10 | 4	4	AS 0:1 1:1 2:1 3:1
11 | 4	4	CT 0:1 1:1 2:1 3:1
12 | 4	4	FT 0:1 1:1 2:1 3:1
13 | 4	4	IN 0:1 1:1 2:1 3:1
14 | 4	4	IV 0:1 1:1 2:1 3:1
15 | 4	4	KI 0:1 1:1 2:1 3:1
16 | 4	4	LN 0:1 1:1 2:1 3:1
17 | 4	4	NC 0:1 1:1 2:1 3:1
18 | 4	4	NT 0:1 1:1 2:1 3:1
19 | 4	4	PN 0:1 1:1 2:1 3:1
20 | 4	4	QL 0:1 1:1 2:1 3:1
21 | 4	4	RP 0:1 1:1 2:1 3:1
22 | 4	4	SV 0:1 1:1 2:1 3:1
23 | 4	4	TD 0:1 1:1 2:1 3:1
24 | 4	4	VE 0:1 1:1 2:1 3:1
25 | 4	4	VQ 0:1 1:1 2:1 3:1
26 | 4	3	RA 0:1 1:2 2:1
27 | 3	3	GD 0:1 1:1 2:1
28 | 3	3	GI 0:1 1:1 2:1
29 | 3	3	GP 0:1 1:1 2:1
30 | 3	3	GR 0:1 1:1 2:1
31 | 3	3	KG 0:1 1:1 2:1
32 | 3	3	PG 0:1 1:1 2:1
33 | 3	3	RK 0:1 1:1 2:1
34 | 3	3	YA 0:1 1:1 2:1
35 | 2	2	AA 0:1 1:1
36 | 2	2	AE 0:1 1:1
37 | 2	2	AH 1:1 2:1
38 | 2	2	AN 0:1 1:1
39 | 2	2	AV 0:1 1:1
40 | 2	2	CN 1:1 2:1
41 | 2	2	DI 1:1 2:1
42 | 2	2	EE 0:1 1:1
43 | 2	2	HC 1:1 2:1
44 | 2	2	IS 1:1 2:1
45 | 2	2	MF 2:1 3:1
46 | 2	2	NF 0:1 1:1
47 | 2	2	NI 1:1 2:1
48 | 2	2	RI 0:1 1:1
49 | 2	2	RS 0:1 1:1
50 | 2	2	SA 0:1 1:1
51 | 2	2	VI 0:1 1:1
52 | 2	2	VY 0:1 1:1
53 | 2	1	VV 0:2
54 | 1	1	AF 2:1
55 | 1	1	AT 2:1
56 | 1	1	EV 0:1
57 | 1	1	FY 2:1
58 | 1	1	GA 2:1
59 | 1	1	GE 2:1
60 | 1	1	HI 2:1
61 | 1	1	IH 2:1
62 | 1	1	KL 0:1
63 | 1	1	KQ 0:1
64 | 1	1	KW 2:1
65 | 1	1	LK 0:1
66 | 1	1	LR 0:1
67 | 1	1	ME 0:1
68 | 1	1	MV 1:1
69 | 1	1	NY 0:1
70 | 1	1	QA 2:1
71 | 1	1	QV 0:1
72 | 1	1	RE 0:1
73 | 1	1	RQ 2:1
74 | 1	1	RR 1:1
75 | 1	1	SG 2:1
76 | 1	1	TG 2:1
77 | 1	1	TK 0:1
78 | 1	1	TL 0:1
79 | 1	1	VT 0:1
80 | 1	1	YT 0:1


--------------------------------------------------------------------------------
/tests/data/pep.fa.2mer.wordpos.txt:
--------------------------------------------------------------------------------
 1 | 11	4	II 0 15 0 16 0 53 1 13 1 14 1 51 2 7 2 8 2 45 3 7 3 8
 2 | 8	4	NA 0 12 0 21 1 10 1 19 2 4 2 13 3 4 3 13
 3 | 8	4	TR 0 29 0 35 1 27 1 33 2 21 2 27 3 21 3 27
 4 | 7	4	EI 0 25 0 52 1 23 1 50 2 17 2 44 3 17
 5 | 7	4	NN 0 32 1 30 1 31 2 24 2 25 3 24 3 25
 6 | 6	3	IG 0 41 0 54 1 39 1 52 2 33 2 46
 7 | 6	3	IR 0 4 0 39 1 2 1 37 1 55 2 49
 8 | 5	4	AK 0 13 1 11 2 5 2 59 3 5
 9 | 5	4	DN 0 11 0 56 1 9 2 3 3 3
10 | 4	4	AS 0 22 1 20 2 14 3 14
11 | 4	4	CT 0 28 1 26 2 20 3 20
12 | 4	4	FT 0 9 1 7 2 1 3 1
13 | 4	4	IN 0 26 1 24 2 18 3 18
14 | 4	4	IV 0 17 1 15 2 9 3 9
15 | 4	4	KI 0 14 1 12 2 6 3 6
16 | 4	4	LN 0 20 1 18 2 12 3 12
17 | 4	4	NC 0 27 1 25 2 19 3 19
18 | 4	4	NT 0 57 1 32 2 26 3 26
19 | 4	4	PN 0 31 1 29 2 23 3 23
20 | 4	4	QL 0 19 1 17 2 11 3 11
21 | 4	4	RP 0 30 1 28 2 22 3 22
22 | 4	4	SV 0 23 1 21 2 15 3 15
23 | 4	4	TD 0 10 1 8 2 2 3 2
24 | 4	4	VE 0 24 1 22 2 16 3 16
25 | 4	4	VQ 0 18 1 16 2 10 3 10
26 | 4	3	RA 0 45 1 43 1 57 2 37
27 | 3	3	GD 0 55 1 53 2 47
28 | 3	3	GI 0 38 1 36 2 30
29 | 3	3	GP 0 42 1 40 2 34
30 | 3	3	GR 0 44 1 42 2 36
31 | 3	3	KG 0 37 1 35 2 29
32 | 3	3	PG 0 43 1 41 2 35
33 | 3	3	RK 0 36 1 34 2 28
34 | 3	3	YA 0 48 1 46 2 40
35 | 2	2	AA 0 49 1 47
36 | 2	2	AE 0 50 1 48
37 | 2	2	AH 1 58 2 52
38 | 2	2	AN 0 7 1 5
39 | 2	2	AV 0 46 1 44
40 | 2	2	CN 1 60 2 54
41 | 2	2	DI 1 54 2 48
42 | 2	2	EE 0 51 1 49
43 | 2	2	HC 1 59 2 53
44 | 2	2	IS 1 62 2 56
45 | 2	2	MF 2 0 3 0
46 | 2	2	NF 0 8 1 6
47 | 2	2	NI 1 61 2 55
48 | 2	2	RI 0 40 1 38
49 | 2	2	RS 0 5 1 3
50 | 2	2	SA 0 6 1 4
51 | 2	2	VI 0 3 1 1
52 | 2	2	VY 0 47 1 45
53 | 2	1	VV 0 2 0 62
54 | 1	1	AF 2 38
55 | 1	1	AT 2 41
56 | 1	1	EV 0 1
57 | 1	1	FY 2 39
58 | 1	1	GA 2 58
59 | 1	1	GE 2 43
60 | 1	1	HI 2 32
61 | 1	1	IH 2 31
62 | 1	1	KL 0 65
63 | 1	1	KQ 0 60
64 | 1	1	KW 2 60
65 | 1	1	LK 0 59
66 | 1	1	LR 0 66
67 | 1	1	ME 0 0
68 | 1	1	MV 1 0
69 | 1	1	NY 0 33
70 | 1	1	QA 2 51
71 | 1	1	QV 0 61
72 | 1	1	RE 0 67
73 | 1	1	RQ 2 50
74 | 1	1	RR 1 56
75 | 1	1	SG 2 57
76 | 1	1	TG 2 42
77 | 1	1	TK 0 64
78 | 1	1	TL 0 58
79 | 1	1	VT 0 63
80 | 1	1	YT 0 34


--------------------------------------------------------------------------------
/tests/data/pep.fa.3mer.txt:
--------------------------------------------------------------------------------
 1 | 4	4	AKI 0:1 1:1 2:1 3:1
 2 | 4	4	ASV 0:1 1:1 2:1 3:1
 3 | 4	4	CTR 0:1 1:1 2:1 3:1
 4 | 4	4	DNA 0:1 1:1 2:1 3:1
 5 | 4	4	EIN 0:1 1:1 2:1 3:1
 6 | 4	4	FTD 0:1 1:1 2:1 3:1
 7 | 4	4	III 0:1 1:1 2:1 3:1
 8 | 4	4	IIV 0:1 1:1 2:1 3:1
 9 | 4	4	INC 0:1 1:1 2:1 3:1
10 | 4	4	IVQ 0:1 1:1 2:1 3:1
11 | 4	4	KII 0:1 1:1 2:1 3:1
12 | 4	4	LNA 0:1 1:1 2:1 3:1
13 | 4	4	NAK 0:1 1:1 2:1 3:1
14 | 4	4	NAS 0:1 1:1 2:1 3:1
15 | 4	4	NCT 0:1 1:1 2:1 3:1
16 | 4	4	PNN 0:1 1:1 2:1 3:1
17 | 4	4	QLN 0:1 1:1 2:1 3:1
18 | 4	4	RPN 0:1 1:1 2:1 3:1
19 | 4	4	SVE 0:1 1:1 2:1 3:1
20 | 4	4	TDN 0:1 1:1 2:1 3:1
21 | 4	4	TRP 0:1 1:1 2:1 3:1
22 | 4	4	VEI 0:1 1:1 2:1 3:1
23 | 4	4	VQL 0:1 1:1 2:1 3:1
24 | 3	3	EII 0:1 1:1 2:1
25 | 3	3	GPG 0:1 1:1 2:1
26 | 3	3	GRA 0:1 1:1 2:1
27 | 3	3	IGD 0:1 1:1 2:1
28 | 3	3	IGP 0:1 1:1 2:1
29 | 3	3	IIG 0:1 1:1 2:1
30 | 3	3	KGI 0:1 1:1 2:1
31 | 3	3	NNN 1:1 2:1 3:1
32 | 3	3	NNT 1:1 2:1 3:1
33 | 3	3	NTR 1:1 2:1 3:1
34 | 3	3	PGR 0:1 1:1 2:1
35 | 3	3	RKG 0:1 1:1 2:1
36 | 3	3	TRK 0:1 1:1 2:1
37 | 2	2	AAE 0:1 1:1
38 | 2	2	AEE 0:1 1:1
39 | 2	2	AHC 1:1 2:1
40 | 2	2	ANF 0:1 1:1
41 | 2	2	AVY 0:1 1:1
42 | 2	2	CNI 1:1 2:1
43 | 2	2	DIR 1:1 2:1
44 | 2	2	EEI 0:1 1:1
45 | 2	2	GDI 1:1 2:1
46 | 2	2	GIR 0:1 1:1
47 | 2	2	HCN 1:1 2:1
48 | 2	2	IRI 0:1 1:1
49 | 2	2	IRS 0:1 1:1
50 | 2	2	MFT 2:1 3:1
51 | 2	2	NFT 0:1 1:1
52 | 2	2	NIS 1:1 2:1
53 | 2	2	RAV 0:1 1:1
54 | 2	2	RIG 0:1 1:1
55 | 2	2	RSA 0:1 1:1
56 | 2	2	SAN 0:1 1:1
57 | 2	2	VIR 0:1 1:1
58 | 2	2	VYA 0:1 1:1
59 | 2	2	YAA 0:1 1:1
60 | 1	1	AFY 2:1
61 | 1	1	AKW 2:1
62 | 1	1	ATG 2:1
63 | 1	1	DNT 0:1
64 | 1	1	EVV 0:1
65 | 1	1	FYA 2:1
66 | 1	1	GAK 2:1
67 | 1	1	GDN 0:1
68 | 1	1	GEI 2:1
69 | 1	1	GIH 2:1
70 | 1	1	HIG 2:1
71 | 1	1	IHI 2:1
72 | 1	1	IRQ 2:1
73 | 1	1	IRR 1:1
74 | 1	1	ISG 2:1
75 | 1	1	KLR 0:1
76 | 1	1	KQV 0:1
77 | 1	1	LKQ 0:1
78 | 1	1	LRE 0:1
79 | 1	1	MEV 0:1
80 | 1	1	MVI 1:1
81 | 1	1	NNY 0:1
82 | 1	1	NTL 0:1
83 | 1	1	NYT 0:1
84 | 1	1	QAH 2:1
85 | 1	1	QVV 0:1
86 | 1	1	RAF 2:1
87 | 1	1	RAH 1:1
88 | 1	1	RQA 2:1
89 | 1	1	RRA 1:1
90 | 1	1	SGA 2:1
91 | 1	1	TGE 2:1
92 | 1	1	TKL 0:1
93 | 1	1	TLK 0:1
94 | 1	1	VTK 0:1
95 | 1	1	VVI 0:1
96 | 1	1	VVT 0:1
97 | 1	1	YAT 2:1
98 | 1	1	YTR 0:1


--------------------------------------------------------------------------------
/tests/data/pep.fa.3mer.wordpos.txt:
--------------------------------------------------------------------------------
 1 | 4	4	AKI 0 13 1 11 2 5 3 5
 2 | 4	4	ASV 0 22 1 20 2 14 3 14
 3 | 4	4	CTR 0 28 1 26 2 20 3 20
 4 | 4	4	DNA 0 11 1 9 2 3 3 3
 5 | 4	4	EIN 0 25 1 23 2 17 3 17
 6 | 4	4	FTD 0 9 1 7 2 1 3 1
 7 | 4	4	III 0 15 1 13 2 7 3 7
 8 | 4	4	IIV 0 16 1 14 2 8 3 8
 9 | 4	4	INC 0 26 1 24 2 18 3 18
10 | 4	4	IVQ 0 17 1 15 2 9 3 9
11 | 4	4	KII 0 14 1 12 2 6 3 6
12 | 4	4	LNA 0 20 1 18 2 12 3 12
13 | 4	4	NAK 0 12 1 10 2 4 3 4
14 | 4	4	NAS 0 21 1 19 2 13 3 13
15 | 4	4	NCT 0 27 1 25 2 19 3 19
16 | 4	4	PNN 0 31 1 29 2 23 3 23
17 | 4	4	QLN 0 19 1 17 2 11 3 11
18 | 4	4	RPN 0 30 1 28 2 22 3 22
19 | 4	4	SVE 0 23 1 21 2 15 3 15
20 | 4	4	TDN 0 10 1 8 2 2 3 2
21 | 4	4	TRP 0 29 1 27 2 21 3 21
22 | 4	4	VEI 0 24 1 22 2 16 3 16
23 | 4	4	VQL 0 18 1 16 2 10 3 10
24 | 3	3	EII 0 52 1 50 2 44
25 | 3	3	GPG 0 42 1 40 2 34
26 | 3	3	GRA 0 44 1 42 2 36
27 | 3	3	IGD 0 54 1 52 2 46
28 | 3	3	IGP 0 41 1 39 2 33
29 | 3	3	IIG 0 53 1 51 2 45
30 | 3	3	KGI 0 37 1 35 2 29
31 | 3	3	NNN 1 30 2 24 3 24
32 | 3	3	NNT 1 31 2 25 3 25
33 | 3	3	NTR 1 32 2 26 3 26
34 | 3	3	PGR 0 43 1 41 2 35
35 | 3	3	RKG 0 36 1 34 2 28
36 | 3	3	TRK 0 35 1 33 2 27
37 | 2	2	AAE 0 49 1 47
38 | 2	2	AEE 0 50 1 48
39 | 2	2	AHC 1 58 2 52
40 | 2	2	ANF 0 7 1 5
41 | 2	2	AVY 0 46 1 44
42 | 2	2	CNI 1 60 2 54
43 | 2	2	DIR 1 54 2 48
44 | 2	2	EEI 0 51 1 49
45 | 2	2	GDI 1 53 2 47
46 | 2	2	GIR 0 38 1 36
47 | 2	2	HCN 1 59 2 53
48 | 2	2	IRI 0 39 1 37
49 | 2	2	IRS 0 4 1 2
50 | 2	2	MFT 2 0 3 0
51 | 2	2	NFT 0 8 1 6
52 | 2	2	NIS 1 61 2 55
53 | 2	2	RAV 0 45 1 43
54 | 2	2	RIG 0 40 1 38
55 | 2	2	RSA 0 5 1 3
56 | 2	2	SAN 0 6 1 4
57 | 2	2	VIR 0 3 1 1
58 | 2	2	VYA 0 47 1 45
59 | 2	2	YAA 0 48 1 46
60 | 1	1	AFY 2 38
61 | 1	1	AKW 2 59
62 | 1	1	ATG 2 41
63 | 1	1	DNT 0 56
64 | 1	1	EVV 0 1
65 | 1	1	FYA 2 39
66 | 1	1	GAK 2 58
67 | 1	1	GDN 0 55
68 | 1	1	GEI 2 43
69 | 1	1	GIH 2 30
70 | 1	1	HIG 2 32
71 | 1	1	IHI 2 31
72 | 1	1	IRQ 2 49
73 | 1	1	IRR 1 55
74 | 1	1	ISG 2 56
75 | 1	1	KLR 0 65
76 | 1	1	KQV 0 60
77 | 1	1	LKQ 0 59
78 | 1	1	LRE 0 66
79 | 1	1	MEV 0 0
80 | 1	1	MVI 1 0
81 | 1	1	NNY 0 32
82 | 1	1	NTL 0 57
83 | 1	1	NYT 0 33
84 | 1	1	QAH 2 51
85 | 1	1	QVV 0 61
86 | 1	1	RAF 2 37
87 | 1	1	RAH 1 57
88 | 1	1	RQA 2 50
89 | 1	1	RRA 1 56
90 | 1	1	SGA 2 57
91 | 1	1	TGE 2 42
92 | 1	1	TKL 0 64
93 | 1	1	TLK 0 58
94 | 1	1	VTK 0 63
95 | 1	1	VVI 0 2
96 | 1	1	VVT 0 62
97 | 1	1	YAT 2 40
98 | 1	1	YTR 0 34


--------------------------------------------------------------------------------
/tests/test_calc_bbc.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'calc_bbc.py'
12 | 
13 |     def test_arg_molecule_when_no_fasta(self):
14 |         args = ['--molecule', 'dna']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_molecule_invalid_choice(self):
20 |         args = ['--fasta', self.filename_dna,
21 |                 '--molecule', 'nonexistent_mol']
22 |         returncode, out = utils.runscript(self.script_name, args)
23 |         self.assertEqual(returncode, 2)
24 |         self.assertIn('--molecule/-m', out)
25 | 
26 |     def test_output_on_dna1(self):
27 |         args = ['--fasta', self.filename_dna, '--m', 'dna']
28 |         returncode, out, md5 = self._test_output(self.script_name, args)
29 |         self.assertEqual(returncode, 0)
30 |         self.assertEqual(md5, '6cfc27479ca5fb3d5d2d468544005d8b')
31 | 
32 |     def test_output_on_dna_k2(self):
33 |         args = ['--fasta', self.filename_dna, '--m', 'dna', '--k', '2']
34 |         returncode, out, md5 = self._test_output(self.script_name, args)
35 |         self.assertEqual(returncode, 0)
36 |         self.assertEqual(md5, '1ea7e82d6bb7b8648e0dcca9e089361c')
37 | 
38 |     def test_output_on_dna_k2_pairwise(self):
39 |         args = ['--fasta', self.filename_dna, '--m', 'dna',
40 |                 '--k', '2', '--outfmt', 'pairwise']
41 |         returncode, out, md5 = self._test_output(self.script_name, args)
42 |         self.assertEqual(returncode, 0)
43 |         self.assertEqual(md5, '74de6627e68cfb609701c13637ba4090')
44 | 
45 |     def test_output_on_protein(self):
46 |         args = ['--fasta', self.filename_pep, '--m', 'protein']
47 |         returncode, out, md5 = self._test_output(self.script_name, args)
48 |         self.assertEqual(returncode, 0)
49 |         self.assertEqual(md5, '154f2788be2ec349092f22ce359acf80')
50 | 
51 |     def test_output_on_protein_no_outfile(self):
52 |         args = ['--fasta', self.filename_pep, '--m', 'protein']
53 |         returncode, out, md5 = self._test_output(self.script_name, args, False)
54 |         self.assertEqual(returncode, 0)
55 |         self.assertEqual(md5, '154f2788be2ec349092f22ce359acf80')
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     unittest.main()
60 | 


--------------------------------------------------------------------------------
/tests/test_calc_fcgr.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'calc_fcgr.py'
12 | 
13 |     def test_arg_word_size_2_when_no_fasta(self):
14 |         args = ['--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_fasta_when_no_word_size(self):
20 |         args = ['--fasta', self.filename_dna]
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('--word_size/-w', out)
24 | 
25 |     def test_arg_word_size_too_small(self):
26 |         args = ['--fasta', self.filename_dna, '--word_size', '0']
27 |         returncode, out = utils.runscript(self.script_name, args)
28 |         self.assertEqual(returncode, 2)
29 |         self.assertIn('--word_size must be >= 1', out)
30 | 
31 |     def test_output_word_size_1(self):
32 |         args = ['--fasta', self.filename_dna, '--word_size', '1']
33 |         returncode, out, md5 = self._test_output(self.script_name, args)
34 |         self.assertEqual(returncode, 0)
35 |         self.assertEqual(md5, 'bee51f3214f06f4e4265aa05bf9d6a7e')
36 | 
37 |     def test_output_word_size_2(self):
38 |         args = ['--fasta', self.filename_dna, '--word_size', '2']
39 |         returncode, out, md5 = self._test_output(self.script_name, args)
40 |         self.assertEqual(returncode, 0)
41 |         self.assertEqual(md5, '7175a91fb9fc31661ce07aea28743605')
42 | 
43 | 
44 | if __name__ == '__main__':
45 |     unittest.main()
46 | 


--------------------------------------------------------------------------------
/tests/test_calc_graphdna.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'calc_graphdna.py'
12 | 
13 |     def test_arg_vector_when_no_fasta(self):
14 |         args = ['--vector', '2DSV']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_vector_invalid_choice(self):
20 |         args = ['--fasta', self.filename_dna, '--vector', 'nonexistent']
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('invalid choice', out)
24 | 
25 |     def test_output_default(self):
26 |         args = ['--fasta', self.filename_dna]
27 |         returncode, out, md5 = self._test_output(self.script_name, args)
28 |         self.assertEqual(returncode, 0)
29 |         self.assertEqual(md5, '496832ba4841a988a46c81770ee54668')
30 | 
31 |     def test_output_vector_2DSV(self):
32 |         args = ['--fasta', self.filename_dna, '--vector', '2DSV']
33 |         returncode, out, md5 = self._test_output(self.script_name, args)
34 |         self.assertEqual(returncode, 0)
35 |         self.assertEqual(md5, 'e35a44622d4f0411b26e12e8eedcdb64')
36 | 
37 |     def test_output_vector_2DMV(self):
38 |         args = ['--fasta', self.filename_dna, '--vector', '2DMV']
39 |         returncode, out, md5 = self._test_output(self.script_name, args)
40 |         self.assertEqual(returncode, 0)
41 |         self.assertEqual(md5, '7638015e1c25657cd572071f3b9ae7c4')
42 | 
43 |     def test_script_output_vector_2DNV_pairwise(self):
44 |         args = ['--fasta', self.filename_dna, '--vector', '2DNV',
45 |                 '--outfmt', 'pairwise']
46 |         returncode, out, md5 = self._test_output(self.script_name, args)
47 |         self.assertEqual(returncode, 0)
48 |         self.assertEqual(md5, '2921e374b468b6de81a1c9140681a3b4')
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     unittest.main()
53 | 


--------------------------------------------------------------------------------
/tests/test_calc_lempelziv.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'calc_lempelziv.py'
12 | 
13 |     def test_agr_fasta_when_invalid_distance(self):
14 |         args = ['--fasta', self.filename_dna,
15 |                 '--distance', 'nonexistent']
16 |         returncode, out = utils.runscript(self.script_name, args)
17 |         self.assertEqual(returncode, 2)
18 |         self.assertIn('invalid choice', out)
19 | 
20 |     def test_agr_distance_when_no_fasta(self):
21 |         args = ['--distance', 'd1']
22 |         returncode, out = utils.runscript(self.script_name, args)
23 |         self.assertEqual(returncode, 2)
24 |         self.assertIn('--fasta/-f', out)
25 | 
26 |     def test_output_default(self):
27 |         args = ['--fasta', self.filename_pep]
28 |         returncode, out, md5 = self._test_output(self.script_name, args)
29 |         self.assertEqual(returncode, 0)
30 |         self.assertEqual(md5, '89d18a9ac1e573743fa0214c48dde40c')
31 | 
32 |     def test_output_distance_d(self):
33 |         args = ['--fasta', self.filename_pep, '--distance', 'd']
34 |         returncode, out, md5 = self._test_output(self.script_name, args)
35 |         self.assertEqual(returncode, 0)
36 |         self.assertEqual(md5, 'c71cb1521d0fc9084eee21c8599785ef')
37 | 
38 |     def test_output_distance_d_star_pairwise(self):
39 |         args = ['--fasta', self.filename_pep, '--distance', 'd_star',
40 |                 '--outfmt', 'pairwise']
41 |         returncode, out, md5 = self._test_output(self.script_name, args)
42 |         self.assertEqual(returncode, 0)
43 |         self.assertEqual(md5, '3ed3ca10d198fe4f44ea85134dbcb481')
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     unittest.main()
48 | 


--------------------------------------------------------------------------------
/tests/test_calc_ncd.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'calc_ncd.py'
12 | 
13 |     def test_output_default(self):
14 |         args = ['--fasta', self.filename_pep]
15 |         returncode, out, md5 = self._test_output(self.script_name, args)
16 |         self.assertEqual(returncode, 0)
17 |         self.assertEqual(md5, 'e5491c3e4197bf1abb92e7f76bdefeaf')
18 | 
19 |     def test_output_pairwise(self):
20 |         args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
21 |         returncode, out, md5 = self._test_output(self.script_name, args)
22 |         self.assertEqual(returncode, 0)
23 |         self.assertEqual(md5, 'cb69bbabd9a4286a9596f8af3b2b82d5')
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     unittest.main()
28 | 


--------------------------------------------------------------------------------
/tests/test_calc_wmetric.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'calc_wmetric.py'
12 | 
13 |     def test_arg_matrix_when_no_fasta(self):
14 |         args = ['--matrix', 'blosum62']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_matrix_invalid_choice(self):
20 |         args = ['--matrix', 'nonexistent']
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('--matrix/-m', out)
24 | 
25 |     def test_output_default(self):
26 |         args = ['--fasta', self.filename_pep]
27 |         returncode, out, md5 = self._test_output(self.script_name, args)
28 |         self.assertEqual(returncode, 0)
29 |         self.assertEqual(md5, '27ad675a7a2e5c2872a8ab495f2d4494')
30 | 
31 |     def test_output_phylip(self):
32 |         args = ['--fasta', self.filename_pep, '--outfmt', 'phylip']
33 |         returncode, out, md5 = self._test_output(self.script_name, args)
34 |         self.assertEqual(returncode, 0)
35 |         self.assertEqual(md5, '27ad675a7a2e5c2872a8ab495f2d4494')
36 | 
37 |     def test_output_pairwise(self):
38 |         args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise']
39 |         returncode, out, md5 = self._test_output(self.script_name, args)
40 |         self.assertEqual(returncode, 0)
41 |         self.assertEqual(md5, '195fb45ed46a80473e1d004b9ce40e94')
42 | 
43 |     def test_output_pam250(self):
44 |         args = ['--fasta', self.filename_pep, '--outfmt', 'phylip',
45 |                 '--matrix', 'pam250']
46 |         returncode, out, md5 = self._test_output(self.script_name, args)
47 |         self.assertEqual(returncode, 0)
48 |         self.assertEqual(md5, '217ed91de43b091205add32a673cf8fe')
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     unittest.main()
53 | 


--------------------------------------------------------------------------------
/tests/test_calc_word_bool.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsWordCommonTest.set_test_data()
11 |         self.script_name = 'calc_word_bool.py'
12 | 
13 |     def test_arg_word_size_when_no_fasta(self):
14 |         args = ['--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_word_pattern_when_no_fasta(self):
20 |         args = ['--word_pattern', self.filename_pep_2mer]
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('--fasta/-f', out)
24 | 
25 |     def test_arg_fasta_when_no_wordsize_or_wordpattern(self):
26 |         args = ['--fasta', self.filename_pep]
27 |         returncode, out = utils.runscript(self.script_name, args)
28 |         self.assertEqual(returncode, 2)
29 |         self.assertIn('Specify either: --word_size or --word', out)
30 | 
31 |     def test_arg_fasta_when_no_wordsize_or_wordpattern(self):
32 |         args = ['--fasta', self.filename_pep]
33 |         returncode, out = utils.runscript(self.script_name, args)
34 |         self.assertEqual(returncode, 2)
35 |         self.assertIn('Specify either: --word_size or --word', out)
36 | 
37 |     def test_arg_word_size_too_small(self):
38 |         args = ['--fasta', self.filename_pep, '--word_size', '-1']
39 |         returncode, out = utils.runscript(self.script_name, args)
40 |         self.assertEqual(returncode, 2)
41 |         self.assertIn('Word size must be >= 1.', out)
42 | 
43 |     def test_output_word_size1(self):
44 |         args = ['--fasta', self.filename_pep, '--word_size', '1']
45 |         returncode, out, md5 = self._test_output(self.script_name, args)
46 |         self.assertEqual(returncode, 0)
47 |         self.assertEqual(md5, '4caed60c7590f45e9a6de19482839e9c')
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     unittest.main()
52 | 


--------------------------------------------------------------------------------
/tests/test_calc_word_cv.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsWordCommonTest.set_test_data()
11 |         self.script_name = 'calc_word_cv.py'
12 | 
13 |     def test_word_size_smaller_than_3(self):
14 |         args = ['--fasta', self.filename_pep, '--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('error: Word size must be >= 3', out)
18 | 
19 |     def test_word_pattern_only_one_file(self):
20 |         args = ['--fasta', self.filename_pep, '--word_pattern',
21 |                 self.filename_pep_2mer]
22 |         returncode, out = utils.runscript(self.script_name, args)
23 |         self.assertEqual(returncode, 2)
24 |         self.assertIn('expected 3 argument', out)
25 | 
26 |     def test_word_pattern_not_follow_rule(self):
27 |         args = ['--fasta', self.filename_pep, '--word_pattern',
28 |                 self.filename_pep_2mer, self.filename_pep_2mer,
29 |                 self.filename_pep_2mer]
30 |         returncode, out = utils.runscript(self.script_name, args)
31 |         self.assertEqual(returncode, 2)
32 |         self.assertIn(' do not follow k, k-1, k-2', out)
33 | 
34 |     def test_fasta_when_no_word_size_or_pattern(self):
35 |         args = ['--fasta', self.filename_pep]
36 |         returncode, out = utils.runscript(self.script_name, args)
37 |         self.assertEqual(returncode, 2)
38 |         self.assertIn('Specify either: --word_size or --word_pattern', out)
39 | 
40 |     def test_output_word_size(self):
41 |         args = ['--fasta', self.filename_pep, '--word_size', '3']
42 |         returncode, out, md5 = self._test_output(self.script_name, args)
43 |         self.assertEqual(returncode, 0)
44 |         self.assertEqual(md5, '4fbba77e4f7a64601e7d0cb3b0b6878d')
45 | 
46 |     def test_output_word_pattern(self):
47 |         args = ['--fasta', self.filename_pep, '--word_patterns',
48 |                 self.filename_pep_3mer, self.filename_pep_2mer,
49 |                 self.filename_pep_1mer
50 |                 ]
51 |         returncode, out, md5 = self._test_output(self.script_name, args)
52 |         self.assertEqual(returncode, 0)
53 |         self.assertEqual(md5, '4fbba77e4f7a64601e7d0cb3b0b6878d')
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     unittest.main()
58 | 


--------------------------------------------------------------------------------
/tests/test_calc_word_d2.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsWordCommonTest.set_test_data()
11 |         self.script_name = 'calc_word_d2.py'
12 | 
13 |     def test_arg_when_u_smaller_than_l(self):
14 |         args = ['--fasta', self.filename_pep, '-l', '3', '-u', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('error: max_word_size must be greater than ', out)
18 | 
19 |     def test_arg_char_weights_invalid_format(self):
20 |         args = ['--fasta', self.filename_pep,
21 |                 '-l', '1', '-u', '4',
22 |                 '--char_weights', self.filename_pep,
23 |                 '--vector', 'freqs']
24 |         returncode, out = utils.runscript(self.script_name, args)
25 |         self.assertEqual(returncode, 2)
26 |         self.assertIn('Invalid format for --char_weights', out)
27 | 
28 |     def test_arg_word_size_0(self):
29 |         args = ['--fasta', self.filename_pep, '-l', '0']
30 |         returncode, out = utils.runscript(self.script_name, args)
31 |         self.assertEqual(returncode, 2)
32 |         self.assertIn('min_word_size must be greater than 0', out)
33 | 
34 |     def test_output_default(self):
35 |         args = ['--fasta', self.filename_pep]
36 |         returncode, out, md5 = self._test_output(self.script_name, args)
37 |         self.assertEqual(returncode, 0)
38 |         self.assertEqual(md5, 'f651314b77dcd4fe9b3143de28000ca8')
39 | 
40 |     def test_output_l1_u4(self):
41 |         args = ['--fasta', self.filename_pep, '-l', '1', '-u', '4']
42 |         returncode, out, md5 = self._test_output(self.script_name, args)
43 |         self.assertEqual(returncode, 0)
44 |         self.assertEqual(md5, '164ef1a902f74517e6b7cff7798c595f')
45 | 
46 |     def test_output_l1_u4_freqs(self):
47 |         args = ['--fasta', self.filename_pep, '-l', '1', '-u', '4',
48 |                 '--vector', 'freqs']
49 |         returncode, out, md5 = self._test_output(self.script_name, args)
50 |         self.assertEqual(returncode, 0)
51 |         self.assertEqual(md5, '8340c1687a0e6ae50c5f6bcc24196247')
52 | 
53 |     def test_output_l1_u4_char_weights(self):
54 |         args = ['--fasta', self.filename_pep, '-l', '1', '-u', '4',
55 |                 '--char_weights', self.filename_char_weights]
56 |         returncode, out, md5 = self._test_output(self.script_name, args)
57 |         self.assertEqual(returncode, 0)
58 |         self.assertEqual(md5, '81873a0cb36f7e05698fa664311f38ee')
59 | 
60 |     def test_script_l1_u4_char_weights_freqs(self):
61 |         args = ['--fasta', self.filename_pep, '-l', '1', '-u', '4',
62 |                 '--vector', 'freqs',
63 |                 '--char_weights', self.filename_char_weights]
64 |         returncode, out, md5 = self._test_output(self.script_name, args)
65 |         self.assertEqual(returncode, 0)
66 |         self.assertEqual(md5, '96c944f9e8e4d2b8ca67bc2620f47d3a')
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     unittest.main()
71 | 


--------------------------------------------------------------------------------
/tests/test_calc_word_ffp.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsWordCommonTest.set_test_data()
11 |         self.script_name = 'calc_word_ffp.py'
12 | 
13 |     def test_arg_word_size_when_no_fasta(self):
14 |         args = ['--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_no_molecule(self):
20 |         args = ['--fasta', self.filename_pep]
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('--molecule/-m', out)
24 | 
25 |     def test_arg_no_word_size(self):
26 |         args = ['--fasta', self.filename_pep, '--molecule', 'protein']
27 |         returncode, out = utils.runscript(self.script_name, args)
28 |         self.assertEqual(returncode, 2)
29 |         self.assertIn('--word_size', out)
30 | 
31 |     def test_arg_incompatible_args_protein_merge_revcomp(self):
32 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
33 |                 '--molecule', 'protein', '--merge_revcomp']
34 |         returncode, out = utils.runscript(self.script_name, args)
35 |         self.assertEqual(returncode, 2)
36 |         self.assertIn('Incompatible arguments', out)
37 | 
38 |     def test_arg_distance_invalid_choice(self):
39 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
40 |                 '--molecule', 'protein', '--distance', 'nonexistent']
41 |         returncode, out = utils.runscript(self.script_name, args)
42 |         self.assertEqual(returncode, 2)
43 |         self.assertIn('invalid choice', out)
44 | 
45 |     def test_output_pep_word_size2(self):
46 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
47 |                 '--molecule', 'protein']
48 |         returncode, out, md5 = self._test_output(self.script_name, args)
49 |         self.assertEqual(returncode, 0)
50 |         self.assertEqual(md5, '79caa37b67848c52b41a8cb074d810e1')
51 | 
52 |     def test_output_pep_word_size2_reduce_alphabet(self):
53 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
54 |                 '--molecule', 'protein', '--reduce_alphabet']
55 |         returncode, out, md5 = self._test_output(self.script_name, args)
56 |         self.assertEqual(returncode, 0)
57 |         self.assertEqual(md5, '2e03fddfa6a10d810c3481fd53ada4a3')
58 | 
59 |     def test_output_pep_word_pattern2_reduce_alphabet(self):
60 |         args = ['--fasta', self.filename_pep, '--molecule', 'protein',
61 |                 '--word_pattern', self.filename_pep_2mer, '--reduce_alphabet']
62 |         returncode, out, md5 = self._test_output(self.script_name, args)
63 |         self.assertEqual(returncode, 0)
64 |         self.assertEqual(md5, '2e03fddfa6a10d810c3481fd53ada4a3')
65 | 
66 |     def test_output_dna_word_size2(self):
67 |         args = ['--fasta', self.filename_dna, '--molecule', 'dna',
68 |                 '--word_size', '2']
69 |         returncode, out, md5 = self._test_output(self.script_name, args)
70 |         self.assertEqual(returncode, 0)
71 |         self.assertEqual(md5, '69d68abfe5cb8e855f77f9f8fff20178')
72 | 
73 |     def test_output_dna_word_size2_mergerevcomp(self):
74 |         args = ['--fasta', self.filename_dna, '--molecule', 'dna',
75 |                 '--word_size', '2', '--merge_revcomp']
76 |         returncode, out, md5 = self._test_output(self.script_name, args)
77 |         self.assertEqual(returncode, 0)
78 |         self.assertEqual(md5, 'd3fd336b21aac9922ed7831b8d9f5f83')
79 | 
80 |     def test_output_dna_word_size2_mergerevcomp_reduce(self):
81 |         args = ['--fasta', self.filename_dna, '--molecule', 'dna',
82 |                 '--word_size', '2', '--merge_revcomp', '--reduce_alphabet']
83 |         returncode, out, md5 = self._test_output(self.script_name, args)
84 |         self.assertEqual(returncode, 0)
85 |         self.assertEqual(md5, '83fd63884c64c88ee3ff6e4eb2183e8b')
86 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
89 | 


--------------------------------------------------------------------------------
/tests/test_calc_word_rtd.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsWordCommonTest.set_test_data()
11 |         self.script_name = 'calc_word_rtd.py'
12 | 
13 |     def test_arg_word_size_when_no_fasta(self):
14 |         args = ['--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_fasta_when_no_word_size(self):
20 |         args = ['--fasta', self.filename_pep]
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('Specify either: --word_size or --word_pattern.', out)
24 | 
25 |     def test_arg_word_pattern_invalid_format(self):
26 |         args = ['--fasta', self.filename_pep,
27 |                 '--word_pattern', self.filename_pep_2mer]
28 |         returncode, out = utils.runscript(self.script_name, args)
29 |         self.assertEqual(returncode, 2)
30 |         self.assertIn('does not contain info on word positions', out)
31 | 
32 |     def test_arg_distance_invalid_choice(self):
33 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
34 |                 '--distance', 'nonexistent']
35 |         returncode, out = utils.runscript(self.script_name, args)
36 |         self.assertEqual(returncode, 2)
37 |         self.assertIn('invalid choice', out)
38 | 
39 |     def test_output_word_size_2(self):
40 |         args = ['--fasta', self.filename_pep, '--word_size', '2']
41 |         returncode, out, md5 = self._test_output(self.script_name, args)
42 |         self.assertEqual(returncode, 0)
43 |         self.assertEqual(md5, '1e1a089908495d60275c039272e8e45f')
44 | 
45 |     def test_output_wordpattern(self):
46 |         args = ['--fasta', self.filename_pep,
47 |                 '--word_pattern', self.filename_pep_2mer_wordpos]
48 |         returncode, out, md5 = self._test_output(self.script_name, args)
49 |         self.assertEqual(returncode, 0)
50 |         self.assertEqual(md5, '1e1a089908495d60275c039272e8e45f')
51 | 
52 |     def test_output_word_size_1(self):
53 |         args = ['--fasta', self.filename_pep, '--outfmt', 'pairwise',
54 |                 '--word_size', '1']
55 |         returncode, out, md5 = self._test_output(self.script_name, args)
56 |         self.assertEqual(returncode, 0)
57 |         self.assertEqual(md5, 'b4f581dabfa83b2f1ff4f5d367865711')
58 | 
59 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
62 | 


--------------------------------------------------------------------------------
/tests/test_calc_word_sets.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsWordCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsWordCommonTest.set_test_data()
11 |         self.script_name = 'calc_word_sets.py'
12 | 
13 |     def test_arg_word_size_when_no_fasta(self):
14 |         args = ['--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_fasta_when_no_wordsize(self):
20 |         args = ['--fasta', self.filename_pep]
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('--word_size', out)
24 | 
25 |     def test_arg_word_size_too_small(self):
26 |         args = ['--fasta', self.filename_pep, '--word_size', '-1']
27 |         returncode, out = utils.runscript(self.script_name, args)
28 |         self.assertEqual(returncode, 2)
29 |         self.assertIn('Word size must be >= 1.', out)
30 | 
31 |     def test_arg_distance_invalid_choice(self):
32 |         args = ['--fasta', self.filename_pep, '--word_size', '-1',
33 |                 '--distance', 'nonexistent']
34 |         returncode, out = utils.runscript(self.script_name, args)
35 |         self.assertEqual(returncode, 2)
36 |         self.assertIn('invalid choice', out)
37 | 
38 |     def test_output_word_size2(self):
39 |         args = ['--fasta', self.filename_pep, '--word_size', '2']
40 |         returncode, out, md5 = self._test_output(self.script_name, args)
41 |         print(out)
42 |         self.assertEqual(returncode, 0)
43 |         self.assertEqual(md5, 'f1b4cf9538d2d2a2a4f1e81ac1b1251d')
44 | 
45 |     def test_output_word_size2(self):
46 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
47 |                 '--distance', 'jaccard']
48 |         returncode, out, md5 = self._test_output(self.script_name, args)
49 |         self.assertEqual(returncode, 0)
50 |         self.assertEqual(md5, '7a744c4665ac06483c5eb36ee03d4fa8')
51 | 
52 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
55 | 


--------------------------------------------------------------------------------
/tests/test_create_wordpattern.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import utils
 4 | 
 5 | 
 6 | class ScriptTest(unittest.TestCase, utils.ScriptsCommonTest):
 7 | 
 8 |     def __init__(self, *args, **kwargs):
 9 |         super(ScriptTest, self).__init__(*args, **kwargs)
10 |         utils.ScriptsCommonTest.set_test_data()
11 |         self.script_name = 'create_wordpattern.py'
12 | 
13 |     def test_arg_word_size_when_no_fasta(self):
14 |         args = ['--word_size', '2']
15 |         returncode, out = utils.runscript(self.script_name, args)
16 |         self.assertEqual(returncode, 2)
17 |         self.assertIn('--fasta/-f', out)
18 | 
19 |     def test_arg_word_size_0(self):
20 |         args = ['--fasta', self.filename_pep, '--word_size', '0']
21 |         returncode, out = utils.runscript(self.script_name, args)
22 |         self.assertEqual(returncode, 2)
23 |         self.assertIn('--word_size must be >= 1', out)
24 | 
25 |     def test_arg_teiresias_when_no_l(self):
26 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
27 |                 '--teiresias']
28 |         returncode, out = utils.runscript(self.script_name, args)
29 |         self.assertEqual(returncode, 2)
30 |         self.assertIn('Teiresias requires --l', out)
31 | 
32 |     def test_arg_teiresias_when_no_k(self):
33 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
34 |                 '--teiresias', '--l', '2']
35 |         returncode, out = utils.runscript(self.script_name, args)
36 |         self.assertEqual(returncode, 2)
37 |         self.assertIn('Teiresias requires --k', out)
38 | 
39 |     def test_arg_teiresias_when_k_and_not_l(self):
40 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
41 |                 '--teiresias', '--k', '2']
42 |         returncode, out = utils.runscript(self.script_name, args)
43 |         self.assertEqual(returncode, 2)
44 |         self.assertIn('Teiresias requires --l', out)
45 | 
46 |     def test_teiresias_when_l_too_small(self):
47 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
48 |                 '--teiresias', '--k', '2', '--l', '1']
49 |         returncode, out = utils.runscript(self.script_name, args)
50 |         self.assertEqual(returncode, 2)
51 |         self.assertIn('--l must be at least 2', out)
52 | 
53 |     def test_output_word_size_2(self):
54 |         args = ['--fasta', self.filename_pep, '--word_size', '2']
55 |         returncode, out, md5 = self._test_output(self.script_name, args)
56 |         self.assertEqual(returncode, 0)
57 |         self.assertEqual(md5, '2aea23ad3e883708dc2f95111f7f04ec')
58 | 
59 |     def test_output_word_size_2_wordpos(self):
60 |         args = ['--fasta', self.filename_pep, '--word_size', '2',
61 |                 '--word_position']
62 |         returncode, out, md5 = self._test_output(self.script_name, args)
63 |         self.assertEqual(returncode, 0)
64 |         self.assertEqual(md5, '040e121be77617191c7d7c847edafc8e')
65 | 
66 |     def test_output_word_size_1(self):
67 |         args = ['--fasta', self.filename_pep, '--word_size', '1']
68 |         returncode, out, md5 = self._test_output(self.script_name, args)
69 |         self.assertEqual(returncode, 0)
70 |         self.assertEqual(md5, '2d4dd98798cb6320975f6919fe43b777')
71 | 
72 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
75 | 


--------------------------------------------------------------------------------
/tests/test_distance.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from alfpy import word_pattern
 4 | from alfpy import word_vector
 5 | from alfpy.utils import distance
 6 | from alfpy.utils import distmatrix
 7 | 
 8 | from . import utils
 9 | 
10 | 
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ``alfpy.utils.distance.Distance`` on word vectors.

    The count and frequency vectors are built in ``__init__`` from a
    word pattern of size 2 created over ``self.dna_records``.
    """

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        self.pattern = word_pattern.create(self.dna_records.seq_list, 2)
        lengths = self.dna_records.length_list
        self.counts = word_vector.Counts(lengths, self.pattern)
        self.freqs = word_vector.Freqs(self.dna_records.length_list,
                                       self.pattern)

    def test_euclid_squared_counts(self):
        # The result of this method is identical to that from decaf+py.
        measure = distance.Distance(self.counts, 'euclid_squared')
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            '   3',
            'seq1       0.0000000 57.0000000 30.0000000',
            'seq2       57.0000000 0.0000000 19.0000000',
            'seq3       30.0000000 19.0000000 0.0000000',
        ])
        self.assertEqual(result.format(), expected)

    def test_euclid_squared_freqs(self):
        # The result of this method is identical to that from decaf+py.
        measure = distance.Distance(self.freqs, 'euclid_squared')
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            '   3',
            'seq1       0.0000000 0.1416402 0.0641298',
            'seq2       0.1416402 0.0000000 0.0677565',
            'seq3       0.0641298 0.0677565 0.0000000',
        ])
        self.assertEqual(result.format(), expected)

    def test_euclid_norm_counts(self):
        # The result of this method is identical to that from decaf+py.
        measure = distance.Distance(self.counts, 'euclid_norm')
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            '   3',
            'seq1       0.0000000 7.5498344 5.4772256',
            'seq2       7.5498344 0.0000000 4.3588989',
            'seq3       5.4772256 4.3588989 0.0000000',
        ])
        self.assertEqual(result.format(), expected)

    def test_euclid_norm_freqs(self):
        # The result of this method is identical to that from decaf+py.
        measure = distance.Distance(self.freqs, 'euclid_norm')
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            '   3',
            'seq1       0.0000000 0.3763512 0.2532387',
            'seq2       0.3763512 0.0000000 0.2603008',
            'seq3       0.2532387 0.2603008 0.0000000',
        ])
        self.assertEqual(result.format(), expected)

    def test_google_freqs(self):
        # 'google' distance on the frequency vectors.
        measure = distance.Distance(self.freqs, 'google')
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            '   3',
            'seq1       0.0000000 0.6078431 0.3809524',
            'seq2       0.6078431 0.0000000 0.3949580',
            'seq3       0.3809524 0.3949580 0.0000000',
        ])
        self.assertEqual(result.format(), expected)

    def test_get_disttypes(self):
        # The list of available distance names, in the expected order.
        available = distance.Distance.get_disttypes()
        self.assertListEqual(
            available, ['euclid_norm', 'euclid_squared', 'google'])

    def test_set_disttypes_throws_exception(self):
        # Switching to an unknown distance name must raise with a message.
        measure = distance.Distance(self.freqs, 'google')
        with self.assertRaises(Exception) as caught:
            measure.set_disttype('nonexistent')
        self.assertIn('unknown disttype', str(caught.exception))
81 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
84 | 


--------------------------------------------------------------------------------
/tests/test_distmatrix.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import unittest
  4 | 
  5 | from alfpy import word_distance
  6 | from alfpy.utils import distmatrix
  7 | 
  8 | from . import utils
  9 | 
 10 | 
 11 | class TestDistMatrix(unittest.TestCase):
 12 | 
 13 |     def setUp(self):
 14 |         id_list = ['seq1', 'seq2', 'seq3']
 15 |         data = np.array([[0, 0.3531587, 0.35509333],
 16 |                          [0.3531587, 0, 0.295394],
 17 |                          [0.35509333, 0.295394, 0.]
 18 |                          ])
 19 |         self.matrix = distmatrix.Matrix(id_list, data)
 20 |         self.output_filename = utils.get_test_data('distmatrix.txt')
 21 | 
 22 |     def test_format(self):
 23 |         exp = [
 24 |             '   3',
 25 |             'seq1       0.0000000 0.3531587 0.3550933',
 26 |             'seq2       0.3531587 0.0000000 0.2953940',
 27 |             'seq3       0.3550933 0.2953940 0.0000000'
 28 |         ]
 29 |         self.assertEqual(self.matrix.format(), "\n".join(exp))
 30 | 
 31 |     def test_format_decimal3(self):
 32 |         exp = [
 33 |             '   3',
 34 |             'seq1       0.000 0.353 0.355',
 35 |             'seq2       0.353 0.000 0.295',
 36 |             'seq3       0.355 0.295 0.000'
 37 |         ]
 38 |         self.assertEqual(self.matrix.format(3), "\n".join(exp))
 39 | 
 40 |     def test_min(self):
 41 |         self.assertEqual(self.matrix.min(), 0)
 42 | 
 43 |     def test_max(self):
 44 |         self.assertEqual(self.matrix.max(), 0.35509332999999998)
 45 | 
 46 |     def test_is_zero(self):
 47 |         self.assertFalse(self.matrix.is_zero())
 48 | 
 49 |     def test_normalize(self):
 50 |         self.matrix.normalize()
 51 |         exp = [
 52 |             "   3",
 53 |             "seq1       0.0000000 0.9945518 1.0000000",
 54 |             "seq2       0.9945518 0.0000000 0.8318771",
 55 |             "seq3       1.0000000 0.8318771 0.0000000",
 56 |         ]
 57 |         self.assertEqual(self.matrix.format(), "\n".join(exp))
 58 | 
 59 |     def test_write_to_file_phylip(self):
 60 |         oh = open(self.output_filename, 'w')
 61 |         self.matrix.write_to_file(oh)
 62 |         oh.close()
 63 |         fh = open(self.output_filename)
 64 |         result = fh.read()
 65 |         fh.close()
 66 |         os.remove(self.output_filename)
 67 |         exp = [
 68 |             '   3',
 69 |             'seq1       0.0000000 0.3531587 0.3550933',
 70 |             'seq2       0.3531587 0.0000000 0.2953940',
 71 |             'seq3       0.3550933 0.2953940 0.0000000\n'
 72 |         ]
 73 |         self.assertEqual(result, "\n".join(exp))
 74 | 
 75 |     def test_write_to_file_pairwise(self):
 76 |         oh = open(self.output_filename, 'w')
 77 |         self.matrix.write_to_file(oh, 'pairwise')
 78 |         oh.close()
 79 |         fh = open(self.output_filename)
 80 |         result = fh.read()
 81 |         fh.close()
 82 |         os.remove(self.output_filename)
 83 |         exp = [
 84 |             "seq1\tseq2\t0.3531587",
 85 |             "seq1\tseq3\t0.3550933",
 86 |             "seq2\tseq3\t0.2953940\n"
 87 |         ]
 88 |         self.assertEqual(result, "\n".join(exp))
 89 | 
 90 |     def test_write_to_file_pairwise_decimal3(self):
 91 |         oh = open(self.output_filename, 'w')
 92 |         self.matrix.write_to_file(oh, 'pairwise', 3)
 93 |         oh.close()
 94 |         fh = open(self.output_filename)
 95 |         result = fh.read()
 96 |         fh.close()
 97 |         os.remove(self.output_filename)
 98 |         exp = [
 99 |             "seq1\tseq2\t0.353",
100 |             "seq1\tseq3\t0.355",
101 |             "seq2\tseq3\t0.295\n"
102 |         ]
103 |         self.assertEqual(result, "\n".join(exp))
104 | 
105 |     def test_iter(self):
106 |         exp = [(0, 1, 'seq1', 'seq2', 0.35315869999999999),
107 |                (0, 2, 'seq1', 'seq3', 0.35509332999999998),
108 |                (1, 2, 'seq2', 'seq3', 0.29539399999999999)]
109 |         self.assertEqual(list(self.matrix), exp)
110 | 
111 |     def test_create_matrix(self):
112 |         l = [[3, 6, 4, 1, 3, 4, 3, 0, 1, 1, 6, 4, 5, 0, 3, 4],
113 |              [0, 3, 0, 3, 0, 0, 0, 2, 9, 0, 3, 3, 0, 6, 3, 6],
114 |              [9, 0, 0, 3, 0, 0, 0, 2, 6, 0, 3, 3, 0, 3, 3, 3]]
115 |         vector = np.array(l)
116 |         dist = word_distance.Distance(vector, 'minkowski')
117 |         id_list = ['seq1', 'seq2', 'seq3']
118 |         matrix = distmatrix.create(id_list, dist)
119 |         exp = [
120 |             '   3',
121 |             'seq1       0.0000000 14.6969385 14.1774469',
122 |             'seq2       14.6969385 0.0000000 10.8166538',
123 |             'seq3       14.1774469 10.8166538 0.0000000'
124 |         ]
125 |         self.assertEqual(matrix.format(), "\n".join(exp))
126 | 
127 |     def test_highcharts(self):
128 |         self.assertEqual(len(self.matrix.highcharts()), 3)
129 | 
130 |     def test_read_highcharts_matrix(self):
131 |         id_list = ['seq1', 'seq2', 'seq3']
132 |         data = [[0, 1, 0.35, 0.19], [0, 2, 1.0, 0.55], [1, 2, 0.88, 0.48]]
133 |         matrix = distmatrix.read_highcharts_matrix(id_list, data)
134 |         md5 = utils.calc_md5(matrix.format())
135 |         self.assertEqual(md5, "476c8f5d284a84ee3c7c419bde2d7658")
136 |         
137 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
140 | 


--------------------------------------------------------------------------------
/tests/test_fasta.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import unittest
  3 | 
  4 | from alfpy.utils import fasta
  5 | 
  6 | from . import utils
  7 | 
  8 | 
  9 | class FastaTest(unittest.TestCase):
 10 | 
 11 |     def __init__(self, *args, **kwargs):
 12 |         super(FastaTest, self).__init__(*args, **kwargs)
 13 |         self.ID_LIST = ['seq1', 'seq2', 'seq3', 'seq4']
 14 |         self.DESC_LIST = ['seq1 desc', 'seq2 desc', 'seq3 desc', '']
 15 |         self.SEQ_LIST = [
 16 |          'MEVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDNTLKQVVTKLRE',
 17 |          'MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNIS',
 18 |          'MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW',
 19 |          'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
 20 |         ]
 21 | 
 22 |     def _validate_FastaRecord_init(self, fasta_record, seqidx):
 23 |         self.assertIsInstance(fasta_record, fasta.FastaRecord)
 24 |         self.assertEqual(fasta_record.seq, self.SEQ_LIST[seqidx])
 25 |         self.assertEqual(fasta_record.id, self.ID_LIST[seqidx])
 26 |         self.assertEqual(fasta_record.description, self.DESC_LIST[seqidx])
 27 |         self.assertEqual(len(fasta_record), len(self.SEQ_LIST[seqidx]))
 28 | 
 29 |     def test_single_FastaRecord_init(self):
 30 |         r = fasta.FastaRecord(self.SEQ_LIST[0],
 31 |                               self.ID_LIST[0],
 32 |                               self.DESC_LIST[0])
 33 |         self._validate_FastaRecord_init(r, seqidx=0)
 34 | 
 35 |     def test_single_FastaRecord_iter(self):
 36 |         r = fasta.FastaRecord(self.SEQ_LIST[3],
 37 |                               self.ID_LIST[3],
 38 |                               self.DESC_LIST[3])
 39 |         i = iter(r)
 40 |         self.assertEqual(next(i), 'M')
 41 |         self.assertEqual(next(i), 'F')
 42 | 
 43 |     def test_single_FastaRecord_contains(self):
 44 |         r = fasta.FastaRecord(self.SEQ_LIST[3],
 45 |                               self.ID_LIST[3],
 46 |                               self.DESC_LIST[3])
 47 |         self.assertTrue('MFT' in r)
 48 | 
 49 |     def test_multiple_FastaRecord_init(self):
 50 |         for i in range(len(self.ID_LIST)):
 51 |             r = fasta.FastaRecord(self.SEQ_LIST[i],
 52 |                                   self.ID_LIST[i],
 53 |                                   self.DESC_LIST[i])
 54 |             self._validate_FastaRecord_init(r, seqidx=i)
 55 | 
 56 |     def test_read_fasta(self):
 57 |         fh = open(utils.get_test_data('pep.fa'))
 58 |         r = fasta.read(fh)
 59 |         fh.close()
 60 |         self._validate_FastaRecord_init(r, seqidx=0)
 61 | 
 62 |     def test_parse_fasta(self):
 63 |         fh = open(utils.get_test_data('pep.fa'))
 64 |         for i, rec in enumerate(fasta.parse(fh)):
 65 |             self._validate_FastaRecord_init(rec, seqidx=i)
 66 |         fh.close()
 67 | 
 68 |     def test_to_dict(self):
 69 |         fh = open(utils.get_test_data('pep.fa'))
 70 |         d = fasta.to_dict(fasta.parse(fh))
 71 |         fh.close()
 72 |         self.assertEqual(len(d), 4)
 73 | 
 74 |     def test_to_dict_value_error(self):
 75 |         h = ['>seq1\n', 'ATG\n', '>seq1\n', 'ATGC']
 76 |         with self.assertRaises(ValueError) as context:
 77 |             d = fasta.to_dict(fasta.parse(h))
 78 |         self.assertIn('Duplicate key', str(context.exception))
 79 |         
 80 | 
 81 |     def test_parse_fasta_missing_sequences(self):
 82 |         ids = ['seq1', 'seq2']
 83 |         seqs = ['ATGC', '']
 84 |         l = ['>{}\n'.format(ids[0]),
 85 |              '{}\n\n\n'.format(seqs[0]),
 86 |              '>{}\n'.format(ids[1]),
 87 |              '{}\n'.format(seqs[1])
 88 |              ]
 89 |         for i, fasta_record in enumerate(fasta.parse(l)):
 90 |             self.assertIsInstance(fasta_record, fasta.FastaRecord)
 91 |             self.assertEqual(fasta_record.seq, seqs[i])
 92 | 
 93 |     def test_fasta_format(self, wrap=70):
 94 |         l = ['>seq1 seq1 desc\n',
 95 |              'A' * wrap + '\n',
 96 |              'B' * wrap]
 97 |         r = fasta.read(l)
 98 |         self.assertEqual(''.join(l), r.format(wrap=wrap))
 99 | 
100 |     def test_input_output_file_fasta(self):
101 |         filename = 'temp.fa'
102 |         oh = open(utils.get_test_data(filename), 'w')
103 |         l1 = []
104 |         fh = open(utils.get_test_data('pep.fa'))
105 |         for seq_record in fasta.parse(fh):
106 |             l1.append(seq_record.format())
107 |             oh.write(seq_record.format())
108 |             oh.write('\n')
109 |         fh.close()
110 |         oh.close()
111 |         fh = open(utils.get_test_data(filename))
112 |         l2 = [seq_record.format() for seq_record in fasta.parse(fh)]
113 |         fh.close()
114 |         os.remove(utils.get_test_data(filename))
115 |         self.assertEqual(l1, l2)
116 | 
117 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
120 | 


--------------------------------------------------------------------------------
/tests/test_fcgr.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from alfpy import fcgr
 5 | from alfpy.utils import distmatrix
 6 | 
 7 | from . import utils
 8 | 
 9 | 
class VectorTest(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ``fcgr.fcgr_vector`` and ``fcgr.create_vectors``."""

    def __init__(self, *args, **kwargs):
        super(VectorTest, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()

    def test_fcgr_vector1(self):
        # Second argument 1 gives a three-element vector for this sequence.
        result = fcgr.fcgr_vector('CTAGGGAACATACCA', 1)
        self.assertEqual(result, [3.0, 6.0, 3.0])

    def test_fcgr_vector2(self):
        # Second argument 2 gives a fifteen-element vector.
        result = fcgr.fcgr_vector('CTAGGGAACATACCA', 2)
        expected = [
            0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 0.0, 2.0,
            1.0, 2.0, 0.0, 1.0, 2.0, 1.0, 0.0,
        ]
        self.assertEqual(result, expected)

    def test_fcgr_vector3(self):
        # Inserting 'XX' into the sequence is expected to leave the vector
        # equal to the one checked in test_fcgr_vector1.
        result = fcgr.fcgr_vector('CTAGGGAACATACCXXA', 1)
        self.assertEqual(result, [3.0, 6.0, 3.0])

    def test_create_vectors(self):
        # One row per record in self.dna_records.
        result = fcgr.create_vectors(self.dna_records, 2)
        expected = [
            [0, 3, 1, 4, 0, 1, 1, 1, 1, 1, 3, 2, 4, 1, 1],
            [0, 0, 4, 1, 2, 2, 0, 0, 1, 4, 0, 0, 3, 1, 1],
            [0, 0, 2, 2, 1, 1, 0, 2, 1, 2, 0, 1, 2, 1, 0],
        ]
        self.assertEqual(result.tolist(), expected)
36 | 
37 | 
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for ``fcgr.Distance`` rendered through ``distmatrix``."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()

    def test_distance1(self):
        # No distance name is passed, so fcgr.Distance uses its default.
        vectors = fcgr.create_vectors(self.dna_records, 2)
        measure = fcgr.Distance(vectors)
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            "   3",
            "seq1       0.0000000 7.5498344 5.7445626",
            "seq2       7.5498344 0.0000000 4.2426407",
            "seq3       5.7445626 4.2426407 0.0000000",
        ])
        self.assertEqual(result.format(), expected)

    def test_distance2(self):
        # Same vectors, with the 'google' distance selected explicitly.
        vectors = fcgr.create_vectors(self.dna_records, 2)
        measure = fcgr.Distance(vectors, 'google')
        result = distmatrix.create(self.dna_records.id_list, measure)
        expected = "\n".join([
            "   3",
            "seq1       0.0000000 0.5833333 0.5416667",
            "seq2       0.5833333 0.0000000 0.4210526",
            "seq3       0.5416667 0.4210526 0.0000000",
        ])
        self.assertEqual(result.format(), expected)
67 | 
68 | 
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
71 | 


--------------------------------------------------------------------------------
/tests/test_graphdna.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import unittest
  3 | 
  4 | from alfpy import graphdna
  5 | from alfpy.utils import distmatrix
  6 | 
  7 | from . import utils
  8 | 
  9 | 
 10 | class VectorTest(unittest.TestCase, utils.ModulesCommonTest):
 11 | 
 12 |     def __init__(self, *args, **kwargs):
 13 |         super(VectorTest, self).__init__(*args, **kwargs)
 14 |         utils.ModulesCommonTest.set_test_data()
 15 | 
 16 |     def test_2DSGraphVector(self):
 17 |         seq = 'CTAGGGAACATACCA'
 18 |         vec = graphdna._2DSGraphVector(seq)
 19 | 
 20 |         exp = [2.99197183, -8.04298066, 9.16666667, -5.78272208,
 21 |                6.5, -1.75064326, 5, -2.92241364, 9.25, -3.81343559]
 22 |         self.assertTrue(np.allclose(vec, np.array(exp)))
 23 | 
 24 |     def test_2DSGraphVector_ambiguousDNA(self):
 25 |         seq = 'CTAGGGAANNNXXXCATACCA'
 26 |         vec = graphdna._2DSGraphVector(seq)
 27 | 
 28 |         exp = [2.99197183, -8.04298066, 9.16666667, -5.78272208,
 29 |                6.5, -1.75064326, 5, -2.92241364, 9.25, -3.81343559]
 30 |         self.assertTrue(np.allclose(vec, np.array(exp)))
 31 | 
 32 |     def test_2DMGraphVector_ndim10(self):
 33 |         seq = 'CTAGGGAACATACCA'
 34 |         vec = graphdna._2DMGraphVector(seq, 10)
 35 |         exp = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756,
 36 |                23.55763468, 29.38627489, 37.08035601, 47.23633868,
 37 |                60.66394053]
 38 |         self.assertEqual(vec.shape, (10,))
 39 |         self.assertTrue(np.allclose(vec, np.array(exp)))
 40 | 
 41 |     def test_2DMGraphVector_ndim10_ambiguousDNA(self):
 42 |         seq = 'CTAGGGAACATACCA'
 43 |         vec = graphdna._2DMGraphVector(seq, 10)
 44 |         exp = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756,
 45 |                23.55763468, 29.38627489, 37.08035601, 47.23633868,
 46 |                60.66394053]
 47 |         self.assertEqual(vec.shape, (10,))
 48 |         self.assertTrue(np.allclose(vec, np.array(exp)))
 49 | 
 50 |     def test_2DMGraphVector_ndim5(self):
 51 |         seq = 'CTAGGGAACATACCA'
 52 |         vec = graphdna._2DMGraphVector(seq, 5)
 53 |         exp = [15, 12.14790682, 13.5804606, 15.88980624, 19.16010756]
 54 |         self.assertEqual(vec.shape, (5,))
 55 |         self.assertTrue(np.allclose(vec, np.array(exp)))
 56 | 
 57 |     def test_2DNGraphVector(self):
 58 |         seq = 'CTAGGGAACATACCA'
 59 |         vec = graphdna._2DNGraphVector(seq)
 60 |         md5 = utils.calc_md5(vec)
 61 |         self.assertEqual(len(vec), 48)
 62 |         self.assertEqual(md5, '44829cc0277531646d656cdaacd3ae94')
 63 | 
 64 |     def test_2DNGraphVector_ambiguousDNA(self):
 65 |         seq = 'CTAGGGAACATACCA'
 66 |         vec = graphdna._2DNGraphVector(seq)
 67 |         md5 = utils.calc_md5(vec)
 68 |         self.assertEqual(len(vec), 48)
 69 |         self.assertEqual(md5, '44829cc0277531646d656cdaacd3ae94')
 70 | 
 71 |     def test_create_2DSGraphVectors(self):
 72 |         data = graphdna.create_2DSGraphVectors(self.dna_records)
 73 |         md5 = utils.calc_md5(data)
 74 |         self.assertEqual(md5, 'e2399897bb7eaa5ca3a81c84e2eeac84')
 75 | 
 76 |     def test_create_2DMGraphVectors(self):
 77 |         data = graphdna.create_2DMGraphVectors(self.dna_records, 10)
 78 |         md5 = utils.calc_md5(data)
 79 |         self.assertEqual(md5, '8c7d4dca912aeaf7c88d325799dadf00')
 80 | 
 81 |     def test_create_2DNGraphVectors(self):
 82 |         data = graphdna.create_2DNGraphVectors(self.dna_records)
 83 |         md5 = utils.calc_md5(data)
 84 |         self.assertEqual(md5, '3211fc3837b876521a6ab8b6a22b411c')
 85 | 
 86 | 
 87 | class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
 88 | 
 89 |     def __init__(self, *args, **kwargs):
 90 |         super(DistanceTest, self).__init__(*args, **kwargs)
 91 |         utils.ModulesCommonTest.set_test_data()
 92 | 
 93 |     def test_distance_2DSG(self):
 94 |         data = graphdna.create_2DSGraphVectors(self.dna_records)
 95 |         dist = graphdna.Distance(data)
 96 |         matrix = distmatrix.create(self.dna_records.id_list, dist)
 97 |         exp = [
 98 |             '   3',
 99 |             'seq1       0.0000000 9.4762599 14.6585286',
100 |             'seq2       9.4762599 0.0000000 6.7199568',
101 |             'seq3       14.6585286 6.7199568 0.0000000',
102 |         ]
103 |         self.assertEqual(matrix.format(), "\n".join(exp))
104 | 
105 |     def test_distance_2DMG(self):
106 |         data = graphdna.create_2DMGraphVectors(self.dna_records, 10)
107 |         dist = graphdna.Distance(data)
108 |         matrix = distmatrix.create(self.dna_records.id_list, dist)
109 |         exp = [
110 |             '   3',
111 |             'seq1       0.0000000 22.2449494 55.9753388',
112 |             'seq2       22.2449494 0.0000000 34.2064423',
113 |             'seq3       55.9753388 34.2064423 0.0000000'
114 |         ]
115 |         self.assertEqual(matrix.format(), "\n".join(exp))
116 | 
117 |     def test_distance_2DNG(self):
118 |         data = graphdna.create_2DNGraphVectors(self.dna_records)
119 |         dist = graphdna.Distance(data)
120 |         matrix = distmatrix.create(self.dna_records.id_list, dist)
121 |         exp = [
122 |             '   3',
123 |             'seq1       0.0000000 10.3711467 15.1355787',
124 |             'seq2       10.3711467 0.0000000 7.8973545',
125 |             'seq3       15.1355787 7.8973545 0.0000000'
126 |         ]
127 |         self.assertEqual(matrix.format(), "\n".join(exp))
128 | 
129 | 
130 | if __name__ == '__main__':
131 |     unittest.main()
132 | 


--------------------------------------------------------------------------------
/tests/test_lempelziv.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | from alfpy import lempelziv
  4 | from alfpy.utils import distmatrix
  5 | 
  6 | from . import utils
  7 | 
  8 | 
  9 | class VectorTest(unittest.TestCase, utils.ModulesCommonTest):
 10 | 
 11 |     def __init__(self, *args, **kwargs):
 12 |         super(VectorTest, self).__init__(*args, **kwargs)
 13 |         utils.ModulesCommonTest.set_test_data()
 14 | 
 15 |     def test_complexity(self):
 16 |         seq = 'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
 17 |         c = lempelziv.complexity(seq)
 18 |         self.assertEqual(c, 19)
 19 | 
 20 |     def test_complexity1(self):
 21 |         seq = 'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
 22 |         c = lempelziv.complexity1(seq)
 23 |         self.assertEqual(c, 20)
 24 | 
 25 |     def test_complexities(self):
 26 |         dist = lempelziv.Distance(self.pep_records)
 27 |         exp = [((0,), 40), ((0, 1), 47), ((0, 2), 53),
 28 |                ((0, 3), 43), ((1,), 38), ((1, 0), 47),
 29 |                ((1, 2), 47), ((1, 3), 41), ((2,), 35),
 30 |                ((2, 0), 50), ((2, 1), 45), ((2, 3), 37),
 31 |                ((3,), 19), ((3, 0), 39), ((3, 1), 37),
 32 |                ((3, 2), 36)]
 33 |         self.assertEqual(sorted(dist._complexity.items()), exp)
 34 | 
 35 | 
 36 | class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
 37 | 
 38 |     def __init__(self, *args, **kwargs):
 39 |         super(DistanceTest, self).__init__(*args, **kwargs)
 40 |         utils.ModulesCommonTest.set_test_data()
 41 |         self.dist = lempelziv.Distance(self.pep_records, 'd')
 42 | 
 43 |     def test_distance_d(self):
 44 |         matrix = distmatrix.create(self.pep_records.id_list, self.dist)
 45 |         exp = [
 46 |             "   4",
 47 |             "seq1       0 9 15 20",
 48 |             "seq2       9 0 10 18",
 49 |             "seq3       15 10 0 17",
 50 |             "seq4       20 18 17 0"
 51 |         ]
 52 |         self.assertEqual(matrix.format(decimal_places=0), "\n".join(exp))
 53 | 
 54 |     def test_distance_d_star(self):
 55 |         self.dist.set_disttype('d_star')
 56 |         matrix = distmatrix.create(self.pep_records.id_list, self.dist)
 57 |         exp = [
 58 |             "   4",
 59 |             "seq1       0.0000000 0.2250000 0.3750000 0.5000000",
 60 |             "seq2       0.2250000 0.0000000 0.2631579 0.4736842",
 61 |             "seq3       0.3750000 0.2631579 0.0000000 0.4857143",
 62 |             "seq4       0.5000000 0.4736842 0.4857143 0.0000000"
 63 |         ]
 64 |         self.assertEqual(matrix.format(), "\n".join(exp))
 65 | 
 66 |     def test_distance_d1(self):
 67 |         self.dist.set_disttype('d1')
 68 |         matrix = distmatrix.create(self.pep_records.id_list, self.dist)
 69 |         exp = [
 70 |             "   4",
 71 |             "seq1       0 16 28 23",
 72 |             "seq2       16 0 19 21",
 73 |             "seq3       28 19 0 19",
 74 |             "seq4       23 21 19 0"
 75 |         ]
 76 |         self.assertEqual(matrix.format(0), "\n".join(exp))
 77 | 
 78 |     def test_distance_d1_star(self):
 79 |         self.dist.set_disttype('d1_star')
 80 |         matrix = distmatrix.create(self.pep_records.id_list, self.dist)
 81 |         exp = [
 82 |             "   4",
 83 |             "seq1       0.0000000 0.3404255 0.5283019 0.5348837",
 84 |             "seq2       0.3404255 0.0000000 0.4042553 0.5121951",
 85 |             "seq3       0.5283019 0.4042553 0.0000000 0.5135135",
 86 |             "seq4       0.5348837 0.5121951 0.5135135 0.0000000"
 87 |         ]
 88 |         self.assertEqual(matrix.format(), "\n".join(exp))
 89 | 
 90 |     def test_distance_d1_star2(self):
 91 |         self.dist.set_disttype('d1_star2')
 92 |         matrix = distmatrix.create(self.pep_records.id_list, self.dist)
 93 |         exp = [
 94 |             "   4",
 95 |             "seq1       0.0000000 0.3404255 0.5436893 0.5609756",
 96 |             "seq2       0.3404255 0.0000000 0.4130435 0.5384615",
 97 |             "seq3       0.5436893 0.4130435 0.0000000 0.5205479",
 98 |             "seq4       0.5609756 0.5384615 0.5205479 0.0000000"
 99 |         ]
100 |         self.assertEqual(matrix.format(), "\n".join(exp))
101 | 
102 |     def test_set_disttype_throws_exception(self):
103 |         with self.assertRaises(Exception) as context:
104 |             self.dist.set_disttype('nonexitent')
105 |         self.assertIn('unknown disttype', str(context.exception))
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     unittest.main()
110 | 


--------------------------------------------------------------------------------
/tests/test_ncd.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from alfpy import ncd
 4 | from alfpy.utils import distmatrix
 5 | 
 6 | from . import utils
 7 | 
 8 | 
 9 | class Test(unittest.TestCase, utils.ModulesCommonTest):
10 | 
11 |     def __init__(self, *args, **kwargs):
12 |         super(Test, self).__init__(*args, **kwargs)
13 |         utils.ModulesCommonTest.set_test_data()
14 | 
15 |     def test_complexity1(self):
16 |         seq = 'AACGTACCATTGAACGTACCGTAGG'
17 |         c = ncd.complexity(seq)
18 |         self.assertEqual(c, 26)
19 | 
20 |     def test_complexity2(self):
21 |         seq = 'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
22 |         c = ncd.complexity(seq)
23 |         self.assertEqual(c, 37)
24 | 
25 |     def test_complexities(self):
26 |         dist = ncd.Distance(self.pep_records)
27 |         exp = [
28 |             ((0,), 63.0), ((0, 1), 77.0), ((0, 2), 85.0),
29 |             ((0, 3), 70.0), ((1,), 60.0), ((1, 2), 78.0),
30 |             ((1, 3), 65.0), ((2,), 61.0), ((2, 3), 66.0),
31 |             ((3,), 37.0)
32 |         ]
33 |         self.assertEqual(exp, sorted(dist._complexity.items()))
34 | 
35 |     def test_distance(self):
36 |         dist = ncd.Distance(self.pep_records)
37 |         matrix = distmatrix.create(self.pep_records.id_list, dist)
38 |         exp = [
39 |             "   4",
40 |             "seq1       0.0000000 0.2698413 0.3809524 0.5238095",
41 |             "seq2       0.2698413 0.0000000 0.2950820 0.4666667",
42 |             "seq3       0.3809524 0.2950820 0.0000000 0.4754098",
43 |             "seq4       0.5238095 0.4666667 0.4754098 0.0000000"
44 |         ]
45 |         self.assertEqual(matrix.format(), "\n".join(exp))
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     unittest.main()
50 | 


--------------------------------------------------------------------------------
/tests/test_seqrecords.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from alfpy.utils import seqrecords
 4 | 
 5 | from . import utils
 6 | 
 7 | 
 8 | class SeqRecordsTest(unittest.TestCase):
 9 | 
10 |     def __init__(self, *args, **kwargs):
11 |         super(SeqRecordsTest, self).__init__(*args, **kwargs)
12 |         self.ID_LIST = ['seq1', 'seq2', 'seq3', 'seq4']
13 |         self.DESC_LIST = ['seq1 desc', 'seq2 desc', 'seq3 desc', '']
14 |         self.SEQ_LIST = [
15 |             'MEVVIRSANFTDNAKIIIVQLNASVEINCTRPNNYTRKGIRIGPGRAVYAAEEIIGDNTLKQVVTKLRE',
16 |             'MVIRSANFTDNAKIIIVQLNASVEINCTRPNNNTRKGIRIGPGRAVYAAEEIIGDIRRAHCNIS',
17 |             'MFTDNAKIIIVQLNASVEINCTRPNNNTRKGIHIGPGRAFYATGEIIGDIRQAHCNISGAKW',
18 |             'MFTDNAKIIIVQLNASVEINCTRPNNNTR'
19 |         ]
20 | 
21 |     def _validate_seqrecords(self, rec):
22 |         self.assertEqual(rec.id_list, self.ID_LIST)
23 |         self.assertEqual(rec.seq_list, self.SEQ_LIST)
24 |         self.assertEqual(rec.length_list, [len(s) for s in self.SEQ_LIST])
25 |         self.assertEqual(rec.count, len(self.SEQ_LIST))
26 | 
27 |     def test_SeqRecords_init(self):
28 |         rec = seqrecords.SeqRecords(
29 |             id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
30 |         self._validate_seqrecords(rec)
31 | 
32 |     def test_SeqRecords_add(self):
33 |         rec = seqrecords.SeqRecords()
34 |         for i in range(len(self.ID_LIST)):
35 |             rec.add(self.ID_LIST[i], self.SEQ_LIST[i])
36 |         self._validate_seqrecords(rec)
37 | 
38 |     def test_SeqRecords_len(self):
39 |         rec = seqrecords.SeqRecords(
40 |             id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
41 |         self.assertEqual(len(rec), 4)
42 | 
43 |     def test_read_fasta(self):
44 |         fh = open(utils.get_test_data('pep.fa'))
45 |         rec = seqrecords.read_fasta(fh)
46 |         fh.close()
47 |         self._validate_seqrecords(rec)
48 | 
49 |     def test_fasta(self):
50 |         rec = seqrecords.SeqRecords(
51 |             id_list=self.ID_LIST, seq_list=self.SEQ_LIST)
52 |         md5 = utils.calc_md5(rec.fasta(wrap=30))
53 |         exp = [
54 |             ">seq1",
55 |             "MEVVIRSANFTDNAKIIIVQLNASVEINCT",
56 |             "RPNNYTRKGIRIGPGRAVYAAEEIIGDNTL",
57 |             "KQVVTKLRE",
58 |             ">seq2",
59 |             "MVIRSANFTDNAKIIIVQLNASVEINCTRP",
60 |             "NNNTRKGIRIGPGRAVYAAEEIIGDIRRAH",
61 |             "CNIS",
62 |             ">seq3",
63 |             "MFTDNAKIIIVQLNASVEINCTRPNNNTRK",
64 |             "GIHIGPGRAFYATGEIIGDIRQAHCNISGA",
65 |             "KW",
66 |             ">seq4",
67 |             "MFTDNAKIIIVQLNASVEINCTRPNNNTR"
68 |         ]
69 |         self.assertEqual(rec.fasta(wrap=30), "\n".join(exp))
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 | 


--------------------------------------------------------------------------------
/tests/test_wmetric.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from alfpy import wmetric
 4 | from alfpy.utils import distmatrix
 5 | from alfpy.utils.data import subsmat
 6 | 
 7 | from . import utils
 8 | 
 9 | 
class VectorTest(unittest.TestCase):
    """Character counting and frequency helpers of the wmetric module."""

    def test_count_seq_chars(self):
        # One count per letter of utils.ALPHABET_PEP.
        counts = wmetric.count_seq_chars('MKSTGWHFSG', utils.ALPHABET_PEP)
        expected = [0, 0, 0, 0, 1, 2, 1, 0, 1, 0,
                    1, 0, 0, 0, 2, 1, 0, 1, 0, 0]
        self.assertEqual(counts, expected)

    def test_count_seq_chars_pep_ambiguous(self):
        # 'X' and 'O' are not in utils.ALPHABET_PEP, so the expected counts
        # equal those of the same sequence without them.
        counts = wmetric.count_seq_chars(
            'MKSTGWXXXXXXXOOOOOOOHFSG', utils.ALPHABET_PEP)
        expected = [0, 0, 0, 0, 1, 2, 1, 0, 1, 0,
                    1, 0, 0, 0, 2, 1, 0, 1, 0, 0]
        self.assertEqual(counts, expected)

    def test_freq_seq_chars(self):
        counts = wmetric.count_seq_chars(
            'MKSTGWXXXXXXXOOOOOOOHFSG', utils.ALPHABET_PEP)
        frequencies = wmetric.freq_seq_chars(counts)
        expected = [
            0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.1, 0.0, 0.1, 0.0,
            0.1, 0.0, 0.0, 0.0, 0.2, 0.1, 0.0, 0.1, 0.0, 0.0,
        ]
        self.assertEqual(frequencies, expected)
31 | 
32 | 
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """W-metric distance matrices for two substitution matrices."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        # Populate the shared fixture attributes (pep_records is used below).
        utils.ModulesCommonTest.set_test_data()

    def test_wmetric_blosum62(self):
        # The result of this method is identical to that from decaf+py.
        substitution = subsmat.get('blosum62')
        distance = wmetric.Distance(self.pep_records, substitution)
        dist_matrix = distmatrix.create(self.pep_records.id_list, distance)
        expected = "\n".join([
            '   4',
            'seq1       0.0000000 0.0392559 0.0783026 0.1261381',
            'seq2       0.0392559 0.0000000 0.0377364 0.1166475',
            'seq3       0.0783026 0.0377364 0.0000000 0.1677386',
            'seq4       0.1261381 0.1166475 0.1677386 0.0000000',
        ])
        self.assertEqual(dist_matrix.format(), expected)

    def test_wmetric_pam250(self):
        substitution = subsmat.get('pam250')
        distance = wmetric.Distance(self.pep_records, substitution)
        dist_matrix = distmatrix.create(self.pep_records.id_list, distance)
        expected = "\n".join([
            '   4',
            'seq1       0.0000000 0.0289700 0.0467580 0.0353781',
            'seq2       0.0289700 0.0000000 0.0227122 0.0372699',
            'seq3       0.0467580 0.0227122 0.0000000 0.0578383',
            'seq4       0.0353781 0.0372699 0.0578383 0.0000000',
        ])
        self.assertEqual(dist_matrix.format(), expected)
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     unittest.main()
65 | 


--------------------------------------------------------------------------------
/tests/test_word_d2.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from alfpy import word_d2
 4 | from alfpy import word_pattern
 5 | from alfpy import word_vector
 6 | from alfpy.utils import distmatrix
 7 | 
 8 | from . import utils
 9 | 
10 | 
class DistanceTest(unittest.TestCase, utils.ModulesCommonTest):
    """Distance matrices from word_d2 over count and frequency vectors."""

    def __init__(self, *args, **kwargs):
        super(DistanceTest, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        # One pattern, Counts and Freqs object per word size 1..4,
        # stored in parallel lists (index 0 holds word size 1).
        self.patterns = []
        self.counts = []
        self.freqs = []
        for word_size in (1, 2, 3, 4):
            pattern = word_pattern.create(
                self.pep_records.seq_list, word_size)
            self.patterns.append(pattern)
            self.counts.append(
                word_vector.Counts(self.pep_records.length_list, pattern))
            self.freqs.append(
                word_vector.Freqs(self.pep_records.length_list, pattern))

    def _create_matrix(self, distance):
        # Distance matrix over the protein record ids.
        return distmatrix.create(self.pep_records.id_list, distance)

    def test_counts_from1_to4(self):
        expected = "\n".join([
            '   4',
            'seq1       0 130 236 286',
            'seq2       130 0 142 258',
            'seq3       236 142 0 212',
            'seq4       286 258 212 0',
        ])
        matrix = self._create_matrix(word_d2.Distance(self.counts))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_freqs_from1_to4(self):
        expected = "\n".join([
            '   4',
            'seq1       0.0000000 0.0313590 0.0573154 0.1020235',
            'seq2       0.0313590 0.0000000 0.0373677 0.0907196',
            'seq3       0.0573154 0.0373677 0.0000000 0.0870581',
            'seq4       0.1020235 0.0907196 0.0870581 0.0000000',
        ])
        matrix = self._create_matrix(word_d2.Distance(self.freqs))
        self.assertEqual(matrix.format(), expected)

    def test_counts_from1_to1(self):
        # Only the first Counts object (word size 1) is used here.
        expected = "\n".join([
            '   4',
            'seq1       0 37 57 140',
            'seq2       37 0 28 137',
            'seq3       57 28 0 111',
            'seq4       140 137 111 0',
        ])
        matrix = self._create_matrix(word_d2.Distance([self.counts[0]]))
        self.assertEqual(matrix.format(decimal_places=0), expected)

    def test_freqs_from1_to4_d2_squareroot(self):
        expected = "\n".join([
            "   4",
            "seq1       0.0000000 0.1770847 0.2394063 0.3194113",
            "seq2       0.1770847 0.0000000 0.1933073 0.3011969",
            "seq3       0.2394063 0.1933073 0.0000000 0.2950560",
            "seq4       0.3194113 0.3011969 0.2950560 0.0000000",
        ])
        distance = word_d2.Distance(self.freqs)
        distance.set_disttype('d2_squareroot')
        self.assertEqual(self._create_matrix(distance).format(), expected)

    def test_set_disttype_throws_exception(self):
        # An unrecognised disttype must raise with a telling message.
        distance = word_d2.Distance(self.freqs)
        with self.assertRaises(Exception) as caught:
            distance.set_disttype('nonexistent')
        message = str(caught.exception)
        self.assertIn('unknown disttype', message)
83 |         
84 | 
85 | if __name__ == '__main__':
86 |     unittest.main()
87 | 


--------------------------------------------------------------------------------
/tests/test_word_rtd.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from alfpy import word_pattern
 4 | from alfpy import word_rtd
 5 | from alfpy.utils import distmatrix
 6 | 
 7 | from . import utils
 8 | 
 9 | 
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Tests for the word_rtd module: calc_rtd, create_vector and Distance."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        # Pattern of 2-letter words over the protein records; the third
        # argument (True) is passed through to word_pattern.create.
        self.pep_2mer_pos = word_pattern.create(
            self.pep_records.seq_list, 2, True)

    def test_calc_rtd(self):
        # Feed calc_rtd the indices at which 'A' occurs in the sequence.
        sequence = 'CTACACAACTTTGCGGGTAGCCGGAAACATTGTGAATGCGGTGAACA'
        a_positions = []
        for index, nucleotide in enumerate(sequence):
            if nucleotide == 'A':
                a_positions.append(index)
        observed = word_rtd.calc_rtd(a_positions)
        expected = (3.3846153846153846, 3.1510306381944679)
        self.assertEqual(observed, expected)

    def test_create_vector(self):
        # Expected shape: one row per record, two columns per pattern word.
        record_count = self.pep_records.count
        vector = word_rtd.create_vector(record_count, self.pep_2mer_pos)
        expected_shape = (self.pep_records.count,
                          len(self.pep_2mer_pos.pat_list)*2)
        self.assertEqual(vector.shape, expected_shape)

    def test_distance(self):
        expected = "\n".join([
            "   4",
            "seq1       0.0000000 0.4892241 0.6034483 0.9310345",
            "seq2       0.4892241 0.0000000 0.3673469 0.8802817",
            "seq3       0.6034483 0.3673469 0.0000000 0.8843537",
            "seq4       0.9310345 0.8802817 0.8843537 0.0000000",
        ])
        vector = word_rtd.create_vector(
            self.pep_records.count, self.pep_2mer_pos)
        distance = word_rtd.Distance(vector, 'google')
        matrix = distmatrix.create(self.pep_records.id_list, distance)
        self.assertEqual(matrix.format(), expected)
42 | 
43 | 
44 | if __name__ == '__main__':
45 |     unittest.main()
46 | 


--------------------------------------------------------------------------------
/tests/test_word_sets_distance.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import unittest
 3 | 
 4 | from alfpy import word_pattern
 5 | from alfpy import word_sets_distance
 6 | from alfpy.utils import distmatrix
 7 | 
 8 | from . import utils
 9 | 
10 | 
class Test(unittest.TestCase, utils.ModulesCommonTest):
    """Word extraction and set-based distances of word_sets_distance."""

    def __init__(self, *args, **kwargs):
        super(Test, self).__init__(*args, **kwargs)
        utils.ModulesCommonTest.set_test_data()
        # Pattern of 2-letter words over the protein records.
        self.p = word_pattern.create(self.pep_records.seq_list, 2)

    def _create_matrix(self, disttype):
        # Distance matrix over the protein records for words of size 2.
        distance = word_sets_distance.Distance(self.pep_records, 2, disttype)
        return distmatrix.create(self.pep_records.id_list, distance)

    def test_getwords(self):
        expected = {'GT', 'CG', 'GC', 'AT', 'TG', 'TA'}
        observed = word_sets_distance._getwords('ATGCGTA', 2)
        self.assertSetEqual(observed, expected)

    def test_distance_dice(self):
        # The result of this function is identical
        # to the Dice distance implemented in word_bool_distance.
        expected = "\n".join([
            "   4",
            "seq1       0.0000000 0.1964286 0.3928571 0.4457831",
            "seq2       0.1964286 0.0000000 0.2452830 0.4025974",
            "seq3       0.3928571 0.2452830 0.0000000 0.3766234",
            "seq4       0.4457831 0.4025974 0.3766234 0.0000000",
        ])
        self.assertEqual(self._create_matrix('dice').format(), expected)

    def test_distance_hamming(self):
        expected = "\n".join([
            "   4",
            "seq1       0 22 44 37",
            "seq2       22 0 26 31",
            "seq3       44 26 0 29",
            "seq4       37 31 29 0",
        ])
        self.assertEqual(self._create_matrix('hamming').format(0), expected)

    def test_distance_jaccard(self):
        # The result of this function is identical
        # to the Jaccard distance implemented in word_bool_distance.
        expected = "\n".join([
            "   4",
            "seq1       0.0000000 0.3283582 0.5641026 0.6166667",
            "seq2       0.3283582 0.0000000 0.3939394 0.5740741",
            "seq3       0.5641026 0.3939394 0.0000000 0.5471698",
            "seq4       0.6166667 0.5740741 0.5471698 0.0000000",
        ])
        self.assertEqual(self._create_matrix('jaccard').format(), expected)
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     unittest.main()
65 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import os
  3 | import subprocess
  4 | 
  5 | from alfpy.utils import seqrecords
  6 | from alfpy import __version__
  7 | 
  8 | 
  9 | ALPHABET_DNA = 'ATGC'
 10 | ALPHABET_PEP = 'ACDEFGHIKLMNPRSTQWVY'
 11 | 
 12 | 
 13 | def get_test_data(filename):
 14 |     filepath = os.path.join(os.path.dirname(__file__), 'data', filename)
 15 |     return filepath
 16 | 
 17 | 
 18 | def calc_md5(obj):
 19 |     return hashlib.md5(str(obj).encode("utf-8")).hexdigest()
 20 | 
 21 | 
 22 | def runscript(scriptname, args):
 23 |     cmd = [scriptname]
 24 |     for arg in args:
 25 |         cmd.append(arg)
 26 |     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
 27 |                          stderr=subprocess.PIPE,
 28 |                          universal_newlines=True)
 29 |     out = "".join(p.communicate())
 30 |     return p.returncode, out
 31 | 
 32 | 
 33 | class ScriptsCommonTest:
 34 |     """Methods testing arguments that are common to all scripts."""
 35 | 
 36 |     # the name of the file to read from
 37 | 
 38 |     @classmethod
 39 |     def set_test_data(cls):
 40 |         cls.filename_dna = get_test_data('dna.fa')
 41 |         cls.filename_pep = get_test_data('pep.fa')
 42 | 
 43 |     def test_arg_version(self):
 44 |         cmd = ['--version']
 45 |         return_code, out = runscript(self.script_name, cmd)
 46 |         self.assertEqual(return_code, 0)
 47 |         self.assertIn(__version__, out)
 48 | 
 49 |     def test_arg_help(self):
 50 |         cmd = ['--help']
 51 |         return_code, out = runscript(self.script_name, cmd)
 52 |         self.assertEqual(return_code, 0)
 53 | 
 54 |     def test_arg_out_when_no_fasta(self):
 55 |         cmd = ['--out', 'out.txt']
 56 |         return_code, out = runscript(self.script_name, cmd)
 57 |         self.assertEqual(return_code, 2)
 58 |         self.assertIn('--fasta/-f', out)
 59 | 
 60 |     def test_arg_outfmt_when_no_fasta(self):
 61 |         cmd = ['--outfmt', 'pairwise']
 62 |         return_code, out = runscript(self.script_name, cmd)
 63 |         self.assertEqual(return_code, 2)
 64 |         self.assertIn('--fasta/-f', out)
 65 | 
 66 |     def _test_output(self, script_name, args, outfile=True):
 67 |         input_filename = args[args.index('--fasta') + 1]
 68 |         if outfile:
 69 |             args.append('--out')
 70 |             output_filename = '{}.out'.format(input_filename)
 71 |             args.append(output_filename)
 72 |         returncode, result = runscript(script_name, args)
 73 |         if outfile:
 74 |             fh = open(output_filename)
 75 |             result = fh.read()
 76 |             fh.close()
 77 |             os.remove(output_filename)
 78 |         md5 = calc_md5(result)
 79 |         return returncode, result, md5
 80 | 
 81 | 
 82 | class ScriptsWordCommonTest(ScriptsCommonTest):
 83 | 
 84 |     @classmethod
 85 |     def set_test_data(cls):
 86 |         ScriptsCommonTest.set_test_data()
 87 |         cls.filename_char_weights = get_test_data('char_weights.txt')
 88 |         cls.filename_char_freqs = get_test_data('char_freqs.txt')
 89 |         cls.filename_pep_1mer_wordpos = get_test_data(
 90 |             'pep.fa.1mer.wordpos.txt')
 91 |         cls.filename_pep_1mer = get_test_data('pep.fa.1mer.txt')        
 92 |         cls.filename_pep_2mer_wordpos = get_test_data(
 93 |             'pep.fa.2mer.wordpos.txt')
 94 |         cls.filename_pep_2mer = get_test_data('pep.fa.2mer.txt')
 95 |         cls.filename_pep_3mer_wordpos = get_test_data(
 96 |             'pep.fa.3mer.wordpos.txt')
 97 |         cls.filename_pep_3mer = get_test_data('pep.fa.3mer.txt')
 98 | 
 99 | class ModulesCommonTest:
100 | 
101 |     @classmethod
102 |     def set_test_data(cls):
103 |         fh = open(get_test_data('dna.fa'))
104 |         cls.dna_records = seqrecords.read_fasta(fh)
105 |         fh.close()
106 |         fh = open(get_test_data('pep.fa'))
107 |         cls.pep_records = seqrecords.read_fasta(fh)
108 |         fh.close()
109 |         cls.dna_filename = get_test_data('dna.fa')
110 |         cls.pep_filename = get_test_data('pep.fa')
111 | 


--------------------------------------------------------------------------------