├── requirements.txt ├── docs ├── images │ ├── output_10_1.png │ ├── output_10_2.png │ ├── output_4_1.png │ ├── output_6_1.png │ ├── output_8_2.png │ └── output_8_3.png ├── modules.rst ├── installation.rst ├── requirements.txt ├── pyckmeans.knee.rst ├── pyckmeans.ordering.rst ├── documentation.rst ├── index.rst ├── pyckmeans.rst ├── pyckmeans.distance.rst ├── pyckmeans.ordination.rst ├── pyckmeans.utils.rst ├── makefile ├── make.bat ├── pyckmeans.core.rst ├── pyckmeans.io.rst └── conf.py ├── .coveragerc ├── .readthedocs.yaml ├── pyckmeans ├── core │ ├── __init__.py │ ├── utils.py │ ├── tests │ │ └── test_core.py │ └── multickmeans.py ├── utils │ ├── __init__.py │ ├── progressbar.py │ └── plotting.py ├── __init__.py ├── ordination │ ├── utils.py │ └── tests │ │ └── test_pcoa.py ├── io │ ├── __init__.py │ ├── c_interop.py │ ├── tests │ │ ├── test_fasta.py │ │ ├── test_csv.py │ │ ├── test_nucleotidealignment.py │ │ └── test_phylip.py │ ├── fasta.py │ ├── src │ │ └── nucencode.cpp │ ├── csv.py │ ├── phylip.py │ └── nucleotide_alignment.py ├── knee │ ├── tests │ │ └── test_knee.py │ └── __init__.py ├── ordering │ ├── tests │ │ └── test_reordering.py │ └── __init__.py ├── distance │ ├── tests │ │ ├── test_distance.py │ │ └── test_c_interop.py │ ├── c_interop.py │ ├── __init__.py │ └── src │ │ └── distance.cpp └── tests │ ├── manual_test_2.py │ ├── manual_test.py │ ├── manual_tests.ipynb │ └── test_workflow.py ├── .github └── workflows │ ├── start_readthedocs_build.yaml │ ├── publish_coverage.yaml │ ├── publish_wheels_pypi.yaml │ └── publish_anaconda.yaml ├── LICENSE ├── conda.recipe └── meta.yaml ├── setup.py ├── .gitignore └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | scikit-learn 5 | matplotlib 6 | tqdm 7 | -------------------------------------------------------------------------------- /docs/images/output_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_10_1.png -------------------------------------------------------------------------------- /docs/images/output_10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_10_2.png -------------------------------------------------------------------------------- /docs/images/output_4_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_4_1.png -------------------------------------------------------------------------------- /docs/images/output_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_6_1.png -------------------------------------------------------------------------------- /docs/images/output_8_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_8_2.png -------------------------------------------------------------------------------- /docs/images/output_8_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_8_3.png 
-------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | pyckmeans 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pyckmeans 8 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | Requirements 7 | ------------ 8 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | scikit-learn 5 | matplotlib 6 | tqdm 7 | numpydoc 8 | pyckmeans 9 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = *tests*, setup.py 3 | 4 | [report] 5 | exclude_lines = 6 | if TYPE_CHECKING: 7 | import matplotlib 8 | import matplotlib.figure -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | python: 7 | version: 3.8 8 | install: 9 | - requirements: docs/requirements.txt 10 | -------------------------------------------------------------------------------- /pyckmeans/core/__init__.py: -------------------------------------------------------------------------------- 1 | ''' pyckmeans core module 2 | ''' 3 | 4 | from .ckmeans import CKmeans, CKmeansResult 5 | from .multickmeans import MultiCKMeans, MultiCKmeansResult 6 | from .wecr import WECR, WECRResult 7 | -------------------------------------------------------------------------------- /docs/pyckmeans.knee.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.knee package 2 | ====================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: pyckmeans.knee 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/pyckmeans.ordering.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.ordering package 2 | ========================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: pyckmeans.ordering 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/documentation.rst: -------------------------------------------------------------------------------- 1 | .. _Documentation: 2 | 3 | Documentation 4 | ============= 5 | 6 | .. 
toctree:: 7 | :maxdepth: 3 8 | 9 | pyckmeans 10 | 11 | Indices and tables 12 | ------------------ 13 | 14 | * :ref:`genindex` 15 | * :ref:`modindex` 16 | -------------------------------------------------------------------------------- /pyckmeans/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Utilities module 2 | ''' 3 | 4 | from .progressbar import MultiCKMeansProgressBars 5 | from .plotting import ( 6 | plot_ckmeans_result, plot_multickmeans_metrics, 7 | plot_wecr_result, plot_wecr_result_metrics, 8 | plot_cmatrix, 9 | ) 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _index: 2 | 3 | Welcome to pyckmeans' documentation! 4 | ==================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Contents: 9 | 10 | installation.rst 11 | documentation.rst 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | -------------------------------------------------------------------------------- /docs/pyckmeans.rst: -------------------------------------------------------------------------------- 1 | pyckmeans package 2 | ================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | pyckmeans.core 11 | pyckmeans.distance 12 | pyckmeans.io 13 | pyckmeans.knee 14 | pyckmeans.ordering 15 | pyckmeans.ordination 16 | pyckmeans.utils 17 | 18 | Module contents 19 | --------------- 20 | 21 | .. automodule:: pyckmeans 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | -------------------------------------------------------------------------------- /pyckmeans/__init__.py: -------------------------------------------------------------------------------- 1 | ''' pyckmeans 2 | 3 | pyckmeans, a Python package for Consensus K-Means clustering. 4 | ''' 5 | 6 | __version__ = '0.9.4' 7 | 8 | __all__ = [ 9 | 'CKmeans', 10 | 'MultiCKMeans', 11 | 'WECR', 12 | 'NucleotideAlignment', 13 | 'DistanceMatrix', 14 | 'pcoa', 15 | ] 16 | 17 | from .core import CKmeans, MultiCKMeans, WECR 18 | from .io import NucleotideAlignment 19 | from .distance import DistanceMatrix 20 | from .ordination import pcoa 21 | -------------------------------------------------------------------------------- /docs/pyckmeans.distance.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.distance package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.distance.c\_interop module 8 | ------------------------------------ 9 | 10 | .. automodule:: pyckmeans.distance.c_interop 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: pyckmeans.distance 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/pyckmeans.ordination.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.ordination package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.ordination.utils module 8 | --------------------------------- 9 | 10 | .. automodule:: pyckmeans.ordination.utils 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | ..
automodule:: pyckmeans.ordination 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /.github/workflows/start_readthedocs_build.yaml: -------------------------------------------------------------------------------- 1 | name: Start readthedocs build 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Start readthedocs build tags' 11 | jobs: 12 | start-readthedocs-build: 13 | runs-on: "ubuntu-latest" 14 | steps: 15 | - run: curl -d "token=${{secrets.READTHEDOCS_TOKEN}}" -X POST https://readthedocs.org/api/v2/webhook/pyckmeans/163759/ 16 | -------------------------------------------------------------------------------- /pyckmeans/core/utils.py: -------------------------------------------------------------------------------- 1 | '''core utilities''' 2 | 3 | import json 4 | import numpy 5 | 6 | # source: https://stackoverflow.com/a/49677241 7 | class NumpyEncoder(json.JSONEncoder): 8 | ''' Special json encoder for numpy types ''' 9 | def default(self, obj): 10 | if isinstance(obj, numpy.integer): 11 | return int(obj) 12 | elif isinstance(obj, numpy.floating): 13 | return float(obj) 14 | elif isinstance(obj, numpy.ndarray): 15 | return obj.tolist() 16 | return json.JSONEncoder.default(self, obj) 17 | -------------------------------------------------------------------------------- /pyckmeans/ordination/utils.py: -------------------------------------------------------------------------------- 1 | '''ordination utilities''' 2 | 3 | import json 4 | import numpy 5 | 6 | # source: https://stackoverflow.com/a/49677241 7 | class NumpyEncoder(json.JSONEncoder): 8 | ''' Special json encoder for numpy types ''' 9 | def default(self, obj): 10 | if isinstance(obj, numpy.integer): 11 | return int(obj) 12 | elif isinstance(obj, numpy.floating): 13 | return float(obj) 14 | elif isinstance(obj, numpy.ndarray): 15 | return obj.tolist() 16 | return json.JSONEncoder.default(self, obj) 17 | -------------------------------------------------------------------------------- /pyckmeans/io/__init__.py: -------------------------------------------------------------------------------- 1 | ''' io 2 | 3 | Module containing input and output functionality. 4 | ''' 5 | 6 | from .nucleotide_alignment import \ 7 | NucleotideAlignment, \ 8 | read_alignment, \ 9 | InvalidAlignmentFileExtensionError, \ 10 | InvalidAlignmentFileFormatError 11 | from .phylip import \ 12 | read_phylip_alignment, \ 13 | InvalidPhylipAlignmentError, \ 14 | read_phylip_distmat, \ 15 | InvalidPhylipMatrixError 16 | from .fasta import \ 17 | read_fasta_alignment, \ 18 | InvalidFastaAlignmentError 19 | from .csv import \ 20 | read_csv_distmat 21 | -------------------------------------------------------------------------------- /docs/pyckmeans.utils.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.utils package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.utils.plotting module 8 | ------------------------------- 9 | 10 | .. automodule:: pyckmeans.utils.plotting 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyckmeans.utils.progressbar module 16 | ---------------------------------- 17 | 18 | .. automodule:: pyckmeans.utils.progressbar 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: pyckmeans.utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd -------------------------------------------------------------------------------- /docs/pyckmeans.core.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.core package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.core.ckmeans module 8 | ----------------------------- 9 | 10 | .. automodule:: pyckmeans.core.ckmeans 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyckmeans.core.multickmeans module 16 | ---------------------------------- 17 | 18 | .. automodule:: pyckmeans.core.multickmeans 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pyckmeans.core.utils module 24 | --------------------------- 25 | 26 | .. automodule:: pyckmeans.core.utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pyckmeans.core.wecr module 32 | -------------------------- 33 | 34 | .. automodule:: pyckmeans.core.wecr 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: pyckmeans.core 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Tankred Ott 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /docs/pyckmeans.io.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.io package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.io.c\_interop module 8 | ------------------------------ 9 | 10 | .. automodule:: pyckmeans.io.c_interop 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyckmeans.io.csv module 16 | ----------------------- 17 | 18 | .. automodule:: pyckmeans.io.csv 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pyckmeans.io.fasta module 24 | ------------------------- 25 | 26 | .. automodule:: pyckmeans.io.fasta 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pyckmeans.io.nucleotide\_alignment module 32 | ----------------------------------------- 33 | 34 | .. automodule:: pyckmeans.io.nucleotide_alignment 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pyckmeans.io.phylip module 40 | -------------------------- 41 | 42 | .. automodule:: pyckmeans.io.phylip 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. 
automodule:: pyckmeans.io 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /pyckmeans/knee/tests/test_knee.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy 3 | 4 | from pyckmeans.knee import KneeLocator 5 | 6 | @pytest.mark.parametrize('direction', ['increasing', 'decreasing']) 7 | @pytest.mark.parametrize('curve', ['convex', 'concave']) 8 | def test_simple(direction, curve): 9 | x = numpy.array([1.0, 2.0, 3.0 ,4.0, 5.0, 6.0, 7.0, 8.0, 9.0 ]) 10 | y = numpy.array([1.0, 2.2, 3.4, 4.5, 7.0, 10.0, 15.0, 22.0, 30.0]) 11 | 12 | kl_0 = KneeLocator(x, y, curve=curve, direction=direction, interp_method='interp1d') 13 | print('kl_0.knee:', kl_0.knee) 14 | print('kl_0.elbow:', kl_0.elbow) 15 | print('kl_0.norm_elbow:', kl_0.norm_elbow) 16 | print('kl_0.elbow_y:', kl_0.elbow_y) 17 | print('kl_0.norm_elbow_y:', kl_0.norm_elbow_y) 18 | print('kl_0.all_elbows:', kl_0.all_elbows) 19 | print('kl_0.all_norm_elbows:', kl_0.all_norm_elbows) 20 | print('kl_0.all_elbows_y:', kl_0.all_elbows_y) 21 | print('kl_0.all_norm_elbows_y:', kl_0.all_norm_elbows_y) 22 | 23 | kl_1 = KneeLocator(x, y, curve=curve, direction=direction, interp_method='polynomial') 24 | 25 | with pytest.raises(ValueError): 26 | KneeLocator(x, y, curve=curve, direction=direction, interp_method='XYZ') 27 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %} 2 | 3 | package: 4 | name: {{ data.get('name') }} 5 | version: {{ data.get('version') }} 6 | 7 | source: 8 | path: '../' 9 | 10 | build: 11 | # noarch: python 12 | number: 0 13 | script: "{{ PYTHON }} -m pip install --no-deps --ignore-installed -vv ." 14 | 15 | requirements: 16 | build: 17 | - {{ compiler('cxx') }} 18 | host: 19 | - python 20 | - pip 21 | run: 22 | - python 23 | - numpy 24 | - pandas 25 | - scipy 26 | - scikit-learn 27 | - matplotlib 28 | - tqdm 29 | 30 | about: 31 | home: https://github.com/TankredO/pyckmeans 32 | license: MIT 33 | license_family: MIT 34 | license_file: LICENSE 35 | summary: {{ data.get('description') }} 36 | 37 | # The remaining entries in this section are optional, but recommended. 38 | description: | 39 | pyckmeans is a Python package for Consensus K-Means and Weighted Ensemble Consensus of Random (WECR) K-Means clustering, especially in the context of DNA sequence data. 40 | doc_url: https://pyckmeans.readthedocs.io 41 | dev_url: https://github.com/TankredO/pyckmeans 42 | 43 | extra: 44 | recipe-maintainers: 45 | - TankredO 46 | -------------------------------------------------------------------------------- /.github/workflows/publish_coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Publish coverage report to coveralls 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Publish coverage report to coveralls tags' 11 | release: 12 | types: [published] 13 | jobs: 14 | Publish-coverage: 15 | name: Publish coverage 16 | runs-on: "windows-latest" 17 | env: 18 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_TOKEN }} 19 | steps: 20 | - run: echo "This job was triggered by ${{ github.event_name }}." 
21 | - run: echo "This job is now running on ${{ runner.os }}." 22 | - run: echo "The branch name is ${{ github.ref }}" 23 | - name: Check out repository code 24 | uses: actions/checkout@v2 25 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 26 | - name: List files in the repository 27 | run: | 28 | ls ${{ github.workspace }} 29 | - name: Set up Python 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: "3.9" 33 | - name: Which version 34 | run: which python 35 | - name: Install coveralls and pytest 36 | run: python -m pip install coveralls pytest 37 | - name: Install pyckmeans 38 | run: | 39 | python -m pip install -r requirements.txt 40 | python -m pip install biopython 41 | python -m pip install --no-deps --ignore-installed -vv -e . 42 | - name: Generate coverage report 43 | run: python -m coverage run --source=pyckmeans -m pytest .\pyckmeans\ 44 | - name: Publish coverage report to coveralls 45 | run: python -m coveralls 46 | - run: echo "This job's status is ${{ job.status }}." 47 | -------------------------------------------------------------------------------- /.github/workflows/publish_wheels_pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish wheels to PyPi 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Publish wheels to PyPi tags' 11 | release: 12 | types: [published] 13 | jobs: 14 | Publish-to-PyPi: 15 | name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }}) 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | # os: ["ubuntu-latest", "macos-latest", "windows-latest"] 21 | os: ["windows-latest", "macos-latest"] 22 | python-version: ["3.6", "3.7", "3.8", "3.9"] 23 | steps: 24 | - run: echo "This job was triggered by ${{ github.event_name }}." 25 | - run: echo "This job is now running on ${{ runner.os }}." 26 | - run: echo "The branch name is ${{ github.ref }}" 27 | - name: Check out repository code 28 | uses: actions/checkout@v2 29 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 30 | - name: List files in the repository 31 | run: | 32 | ls ${{ github.workspace }} 33 | - name: Set up Python 34 | uses: actions/setup-python@v2 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Which version 38 | run: which python 39 | - name: Install wheel and twine 40 | run: python -m pip install wheel twine 41 | - name: Install pyckmeans 42 | run: | 43 | python -m pip install --no-deps --ignore-installed -vv . 44 | python setup.py bdist_wheel 45 | - name: Publish to PyPI 46 | run: twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} 47 | - run: echo "This job's status is ${{ job.status }}." 
48 | -------------------------------------------------------------------------------- /.github/workflows/publish_anaconda.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to Anaconda Cloud 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Publish to Anaconda Cloud tags' 11 | release: 12 | types: [published] 13 | jobs: 14 | Publish-to-Anaconda-Cloud: 15 | name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }}) 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | #os: ["ubuntu-latest", "macos-latest", "windows-latest"] 21 | os: ["ubuntu-latest", "windows-latest"] 22 | python-version: ["3.6", "3.7", "3.8", "3.9"] 23 | steps: 24 | - run: echo "This job was triggered by ${{ github.event_name }}." 25 | - run: echo "This job is now running on ${{ runner.os }}." 26 | - run: echo "The branch name is ${{ github.ref }}" 27 | - name: Check out repository code 28 | uses: actions/checkout@v2 29 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 30 | - name: List files in the repository 31 | run: | 32 | ls ${{ github.workspace }} 33 | - uses: s-weigand/setup-conda@v1 34 | with: 35 | auto-update-conda: true 36 | python-version: ${{ matrix.python-version }} 37 | - name: Conda info 38 | run: conda info 39 | - name: Conda list 40 | run: conda list 41 | - name: Which version 42 | run: which python 43 | - name: Install conda packages 44 | run: conda install conda-build anaconda-client 45 | - name: Build and publish 46 | run: conda build ${{ github.workspace }}/conda.recipe --user ${{secrets.ANACONDA_USER}} --token ${{secrets.ANACONDA_TOKEN}} 47 | - run: echo "This job's status is ${{ job.status }}." 
48 | -------------------------------------------------------------------------------- /pyckmeans/ordering/tests/test_reordering.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sklearn.datasets import make_blobs 4 | from scipy.spatial.distance import squareform, pdist 5 | 6 | from pyckmeans.ordering import InvalidLinkageType, InvalidReorderMethod, distance_order, reorder_distance 7 | from pyckmeans.distance import DistanceMatrix 8 | 9 | @pytest.fixture(scope='session') 10 | def prepare_distances(): 11 | x0, _ = make_blobs(n_samples=10, n_features=2, centers=2) 12 | d0_np = squareform(pdist(x0)) 13 | d0_dm = DistanceMatrix(d0_np) 14 | 15 | x1, _ = make_blobs(n_samples=50, n_features=3, centers=3) 16 | d1_np = squareform(pdist(x1)) 17 | d1_dm = DistanceMatrix(d1_np) 18 | 19 | return ( 20 | d0_np, 21 | d0_dm, 22 | d1_np, 23 | d1_dm, 24 | ) 25 | 26 | def test_reorder(prepare_distances): 27 | d0, d1, d2, d3 = prepare_distances 28 | 29 | d0_o = reorder_distance(d0) 30 | print('d0_o:', d0_o) 31 | d1_o = reorder_distance(d1) 32 | print('d1_o:', d1_o) 33 | d2_o = reorder_distance(d2) 34 | print('d2_o:', d2_o) 35 | d3_o = reorder_distance(d3) 36 | print('d3_o:', d3_o) 37 | 38 | print('d0_o olo:', reorder_distance(d0, method='OLO')) 39 | 40 | with pytest.raises(InvalidReorderMethod): 41 | reorder_distance(d0, method='XYZ') 42 | with pytest.raises(InvalidLinkageType): 43 | reorder_distance(d0, linkage_type='XYZ') 44 | 45 | def test_order(prepare_distances): 46 | d0, d1, d2, d3 = prepare_distances 47 | 48 | o0 = distance_order(d0) 49 | print('o0:', o0) 50 | o1 = distance_order(d1) 51 | print('o1:', o1) 52 | o2 = distance_order(d2) 53 | print('o2:', o2) 54 | o3 = distance_order(d3) 55 | print('o3:', o3) 56 | 57 | print('o0 olo:', distance_order(d0, method='OLO')) 58 | 59 | with pytest.raises(InvalidReorderMethod): 60 | distance_order(d0, method='XYZ') 61 | with pytest.raises(InvalidLinkageType): 62 | distance_order(d0, linkage_type='XYZ') 63 | 64 | -------------------------------------------------------------------------------- /pyckmeans/distance/tests/test_distance.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | from pyckmeans.io import NucleotideAlignment 6 | from pyckmeans.distance import alignment_distance, p_distance, InvalidDistanceTypeError 7 | 8 | @pytest.fixture() 9 | def prepare_alignments(): 10 | aln_0 = NucleotideAlignment( 11 | ['s0', 's1', 's2', 's3'], 12 | np.array([ 13 | ['A', 'C', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 14 | ['T', 'C', 'T', 'G', 'C', 'C', 'T', 'T', 'G', 'A'], 15 | ['A', 'G', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 16 | ['A', 'C', 'T', 'A', 'A', 'A', 'T', 'A', 'G', 'A'], 17 | ]) 18 | ) 19 | d_0_p = np.array([ 20 | [0.0, 0.2, 0.1, 0.3], 21 | [0.2, 0.0, 0.3, 0.5], 22 | [0.1, 0.3, 0.0, 0.4], 23 | [0.3, 0.5, 0.4, 0.0], 24 | ]) 25 | 26 | return ( 27 | (aln_0, d_0_p), 28 | ) 29 | 30 | def test_p_distance(prepare_alignments): 31 | eps = 0.0001 32 | 33 | d_0 = alignment_distance(prepare_alignments[0][0], 'p') 34 | d_0_expected = prepare_alignments[0][1] 35 | assert np.all(np.abs(d_0.dist_mat - d_0_expected) < eps) 36 | 37 | print('d_0', d_0) 38 | 39 | d_0_p = p_distance(prepare_alignments[0][0].sequences) 40 | assert np.all(np.abs(d_0_p - d_0_expected) < eps) 41 | 42 | def test_distances_simple(prepare_alignments): 43 | alignment_distance(prepare_alignments[0][0], 'p', True) 44 | 
alignment_distance(prepare_alignments[0][0], 'p', False) 45 | alignment_distance(prepare_alignments[0][0], 'jc', True) 46 | alignment_distance(prepare_alignments[0][0], 'jc', False) 47 | alignment_distance(prepare_alignments[0][0], 'k2p', True) 48 | alignment_distance(prepare_alignments[0][0], 'k2p', False) 49 | 50 | with pytest.raises(InvalidDistanceTypeError): 51 | alignment_distance(prepare_alignments[0][0], 'xyz', True) 52 | with pytest.raises(InvalidDistanceTypeError): 53 | alignment_distance(prepare_alignments[0][0], 'xyz', False) 54 | -------------------------------------------------------------------------------- /pyckmeans/tests/manual_test_2.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import time 3 | 4 | import numpy as np 5 | import numpy.random as random 6 | import matplotlib.pyplot as plt 7 | 8 | import pyckmeans 9 | 10 | if __name__ == '__main__': 11 | 12 | path = pathlib.Path(__file__).parent.absolute() 13 | 14 | p = 10 15 | 16 | n0 = 50 17 | x0 = np.random.normal(0, 2, (n0, p)) 18 | n1 = 50 19 | x1 = np.random.normal(-3, 1.5, (n1, p)) 20 | n2 = 50 21 | x2 = np.random.normal(3, 2, (n2, p)) 22 | 23 | x_0 = np.r_[x0, x1, x2] 24 | 25 | k = np.arange(2, 8) 26 | n_rep = 200 27 | p_feat = 0.8 28 | p_samp = 0.8 29 | gamma = 0.5 30 | must_link = np.array([ 31 | [0, 10], 32 | [12, 21], 33 | [52, 56], 34 | [75, 61], 35 | [101, 142], 36 | # [1, 51], 37 | # [2, 51], 38 | # [3, 51], 39 | # [4, 51], 40 | # [5, 51], 41 | # [6, 51], 42 | # [7, 51], 43 | # [8, 51], 44 | # [9, 51], 45 | # [10, 51] 46 | ]) 47 | must_not_link = np.array([ 48 | [0, 64], 49 | [88, 15], 50 | [112, 56], 51 | [140, 1], 52 | # [1, 2], 53 | # [1, 3], 54 | # [1, 4], 55 | # [1, 5], 56 | # [1, 6], 57 | # [1, 7], 58 | ]) 59 | 60 | wecr_0 = pyckmeans.WECR(k=k, n_rep=n_rep, p_samp=p_samp, p_feat=p_feat, gamma=gamma) 61 | 62 | t0 = time.time() 63 | wecr_0.fit(x_0, must_link=must_link, must_not_link=must_not_link) 64 | t1 = time.time() 65 | 66 | 67 | t2 = time.time() 68 | cmatrix = wecr_0.predict(x_0) 69 | t3 = time.time() 70 | 71 | # print(cmatrix) 72 | 73 | print(t1 - t0) 74 | print(t3 - t2) 75 | 76 | fig, ax = plt.subplots(1, 1) 77 | ax.imshow(cmatrix) 78 | fig.savefig(path / 'manual_test_2_img0.png') 79 | 80 | print(cmatrix) 81 | 82 | fig, ax = plt.subplots(1, 1) 83 | ax.scatter(x_0[:, 0], x_0[:, 1]) 84 | fig.savefig(path / 'manual_test_2_img1.png') 85 | 86 | # print(wecr_0.qualities) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | 2 | # Configuration file for the Sphinx documentation builder. 3 | # 4 | # This file only contains a selection of the most common options. For a full 5 | # list see the documentation: 6 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 7 | 8 | # -- Path setup -------------------------------------------------------------- 9 | 10 | # If extensions (or modules to document with autodoc) are in another directory, 11 | # add these directories to sys.path here. If the directory is relative to the 12 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
13 | # 14 | import os 15 | import sys 16 | sys.path.insert(0, os.path.abspath('../pyckmeans')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'pyckmeans' 22 | copyright = '2021, Tankred Ott' 23 | author = 'Tankred Ott' 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx_rtd_theme', 33 | 'sphinx.ext.autodoc', 34 | 'numpydoc', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | html_theme_options = { 41 | "collapse_navigation": False, 42 | } 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_theme = 'sphinx_rtd_theme' 56 | 57 | # Add any paths that contain custom static files (such as style sheets) here, 58 | # relative to this directory. They are copied after the builtin static files, 59 | # so a file named "default.css" will overwrite the builtin "default.css". 60 | html_static_path = ['_static'] 61 | 62 | # numpydoc_show_class_members = False 63 | -------------------------------------------------------------------------------- /pyckmeans/utils/progressbar.py: -------------------------------------------------------------------------------- 1 | ''' Progress bar utilities 2 | ''' 3 | 4 | from typing import Dict, Any 5 | 6 | import tqdm 7 | from pyckmeans.core import MultiCKMeans 8 | 9 | class MultiCKMeansProgressBars: 10 | '''MultiCKMeansProgressBars 11 | 12 | Context Manager for a MultiCKMeans progress bars. 13 | 14 | Parameters 15 | ---------- 16 | mckm : MultiCKMeans 17 | MultiCKMeans object to display progress bars for. 18 | kwargs : Dict[str, Any] 19 | Additional keyword arguments passed to tqdm.tqdm. 20 | ''' 21 | def __init__( 22 | self, 23 | mckm: MultiCKMeans, 24 | **kwargs: Dict[str, Any], 25 | ): 26 | self.mckm = mckm 27 | 28 | self.ks = mckm.k 29 | self.n_rep = mckm.n_rep 30 | 31 | self._ckm_idx = 0 32 | self._iter = 0 33 | self._done = False 34 | 35 | # tqdm options 36 | self._tqdm_kwargs = { 37 | } 38 | self._tqdm_kwargs.update(kwargs) 39 | 40 | # init first progress bar 41 | self._tqdm = tqdm.tqdm( 42 | total=self.n_rep, 43 | mininterval=0.5, 44 | desc=f'k={self.ks[self._ckm_idx]}', 45 | **self._tqdm_kwargs, 46 | ) 47 | 48 | def update( 49 | self, 50 | n: int = 1, 51 | ): 52 | '''update 53 | 54 | Update progress by n iterations. 
55 | 56 | Parameters 57 | ---------- 58 | n : int, optional 59 | Progress increment in iterations, by default 1 60 | ''' 61 | if self._done: 62 | return 63 | 64 | self._iter += n 65 | self._tqdm.update(n) 66 | 67 | if self._iter >= self.n_rep: 68 | self._tqdm.close() 69 | self._iter = 0 70 | self._ckm_idx += 1 71 | if self._ckm_idx >= len(self.ks): 72 | self._done = True 73 | else: 74 | self._tqdm = tqdm.tqdm( 75 | total=self.n_rep, 76 | desc=f'k={self.ks[self._ckm_idx]}', 77 | **self._tqdm_kwargs, 78 | ) 79 | 80 | def __enter__(self): 81 | return self 82 | 83 | def __exit__(self, exc_type, exc_value, exc_traceback): 84 | self._tqdm.close() 85 | return 86 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | ''' setup 2 | ''' 3 | 4 | import re 5 | import io 6 | from distutils.command.build_ext import build_ext as build_ext_orig 7 | from setuptools import setup, find_packages, Extension 8 | 9 | # source: https://stackoverflow.com/a/39671214 10 | __version__ = re.search( 11 | r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 12 | io.open('pyckmeans/__init__.py', encoding='utf_8_sig').read() 13 | ).group(1) 14 | 15 | # ==== ctypes extensions 16 | class CTypesExtension(Extension): 17 | '''CTypesExtension''' 18 | 19 | class build_ext(build_ext_orig): 20 | '''build_ext''' 21 | def build_extension(self, ext): 22 | self._ctypes = isinstance(ext, CTypesExtension) 23 | return super().build_extension(ext) 24 | 25 | def get_export_symbols(self, ext): 26 | if self._ctypes: 27 | return ext.export_symbols 28 | return super().get_export_symbols(ext) 29 | 30 | def get_ext_filename(self, ext_name): 31 | if self._ctypes: 32 | return ext_name + '.so' 33 | return super().get_ext_filename(ext_name) 34 | 35 | distance_module = CTypesExtension( 36 | 'pyckmeans.distance.lib.distance', 37 | sources=['pyckmeans/distance/src/distance.cpp'], 38 | language='c++', 39 | ) 40 | 41 | nucencode_module = CTypesExtension( 42 | 'pyckmeans.io.lib.nucencode', 43 | sources=['pyckmeans/io/src/nucencode.cpp'], 44 | language='c++', 45 | ) 46 | 47 | ext_modules = [ 48 | distance_module, 49 | nucencode_module, 50 | ] 51 | 52 | install_requires = [ 53 | 'numpy', 54 | 'pandas', 55 | 'scipy', 56 | 'scikit-learn', 57 | 'matplotlib', 58 | 'tqdm', 59 | ] 60 | 61 | # ==== 62 | description = 'A consensus K-Means implementation.' 
63 | 64 | long_description = io.open('README.md').read() 65 | long_description_content_type = 'text/markdown' 66 | # ==== 67 | setup( 68 | name='pyckmeans', 69 | version=__version__, 70 | packages=find_packages(), 71 | description=description, 72 | long_description=long_description, 73 | long_description_content_type=long_description_content_type, 74 | author='Tankred Ott', 75 | platforms=['any'], 76 | python_requires='>=3.6', 77 | install_requires=install_requires, 78 | cmdclass={'build_ext': build_ext}, 79 | ext_modules=ext_modules, 80 | url='https://github.com/TankredO/pyckmeans', 81 | ) 82 | -------------------------------------------------------------------------------- /pyckmeans/io/c_interop.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | from os import error 3 | import pathlib 4 | 5 | import numpy 6 | 7 | # load the shared library 8 | libfile = pathlib.Path(__file__).parent / 'lib' / 'nucencode.so' 9 | lib = ctypes.CDLL(str(libfile)) 10 | 11 | # == nucleotide encoding 12 | lib.encodeNucleotides.restype = None 13 | lib.encodeNucleotides.argtypes = [ 14 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 15 | dtype=numpy.uint8, 16 | ndim=2, 17 | flags='C_CONTIGUOUS', 18 | ), 19 | ctypes.c_int, # n: number of entries 20 | ctypes.c_int, # m: number of sites 21 | ] 22 | 23 | lib.encodeNucleotides_uint32.restype = None 24 | lib.encodeNucleotides_uint32.argtypes = [ 25 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 26 | dtype=numpy.uint32, 27 | ndim=2, 28 | flags='C_CONTIGUOUS', 29 | ), 30 | ctypes.c_int, # n: number of entries 31 | ctypes.c_int, # m: number of sites 32 | numpy.ctypeslib.ndpointer( # encodedAlignment: n * m matrix 33 | dtype=numpy.uint8, 34 | ndim=2, 35 | flags='C_CONTIGUOUS', 36 | ), 37 | ] 38 | 39 | def encode_nucleotides( 40 | alignment: numpy.ndarray, 41 | ) -> numpy.ndarray: 42 | '''encode_nucleotides 43 | 44 | Encode nucleotide alignment INPLACE. 45 | 46 | Parameters 47 | ---------- 48 | alignment : numpy.ndarray 49 | n*m numpy alignment, where n is the number of entries and m is 50 | the number of sites. Dtype must be 'U1' or 'S'. 51 | 52 | Returns 53 | ------- 54 | numpy.ndarray 55 | The encoded alignment. 56 | 57 | Raises 58 | ------ 59 | Exception 60 | Raised if alignment has invalid dtype. 61 | ''' 62 | if not alignment.flags['C_CONTIGUOUS']: 63 | alignment = numpy.ascontiguousarray(alignment) 64 | 65 | n, m = alignment.shape 66 | 67 | # ASCII encoding? 1 byte per character 68 | if alignment.dtype.type == numpy.dtype('S'): 69 | lib.encodeNucleotides(alignment.view(numpy.uint8), n, m) 70 | return alignment.view(numpy.uint8) 71 | # Unicode encoding. Expecting 4 bytes per character 72 | elif alignment.dtype.type == numpy.dtype('U'): 73 | alignment_encoded = numpy.zeros_like(alignment, dtype=numpy.uint8) 74 | lib.encodeNucleotides_uint32(alignment.view(numpy.uint32), n, m, alignment_encoded) 75 | return alignment_encoded 76 | else: 77 | msg = f'Can not encode sequences with dtype {alignment.dtype}.' 
78 | raise Exception(msg) 79 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_fasta.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | import os 4 | 5 | from pyckmeans.io import fasta 6 | from pyckmeans.io.fasta import InvalidFastaAlignmentError 7 | 8 | FASTA_STR_0 = \ 9 | ''' 10 | >Sample 0 11 | ACTGTCATG 12 | >Sample 1 13 | ACT--CATC 14 | ''' 15 | 16 | FASTA_STR_1 = \ 17 | ''' 18 | >Sample 0 19 | ACT GTC ATG 20 | >Sample 1 21 | ACT --C ATC 22 | ''' 23 | 24 | FASTA_STR_2 = \ 25 | ''' 26 | >Sample 0 27 | ACT 28 | GTC 29 | ATG 30 | >Sample 1 31 | ACT 32 | --C 33 | ATC 34 | ''' 35 | 36 | FASTA_STR_3 = \ 37 | ''' 38 | >Sample 0 39 | ACTGTCAT 40 | >Sample 1 41 | ACT--CATC 42 | ''' 43 | 44 | FASTA_STR_4 = \ 45 | ''' 46 | >Sample 0 47 | ACTGTCATA 48 | >Sample 1 49 | ACT--CAT 50 | ''' 51 | 52 | @pytest.fixture(scope='session') 53 | def prep_fasta_files(): 54 | with tempfile.TemporaryDirectory() as tempdir: 55 | print(f'Created temporary directory {tempdir}.') 56 | 57 | fasta_file_0 = os.path.join(tempdir, 'fasta_0.fasta') 58 | with open(fasta_file_0, 'w') as f: 59 | f.write(FASTA_STR_0) 60 | 61 | fasta_file_1 = os.path.join(tempdir, 'fasta_1.fasta') 62 | with open(fasta_file_1, 'w') as f: 63 | f.write(FASTA_STR_1) 64 | 65 | fasta_file_2 = os.path.join(tempdir, 'fasta_2.fasta') 66 | with open(fasta_file_2, 'w') as f: 67 | f.write(FASTA_STR_2) 68 | 69 | fasta_file_3 = os.path.join(tempdir, 'fasta_3.fasta') 70 | with open(fasta_file_3, 'w') as f: 71 | f.write(FASTA_STR_3) 72 | 73 | fasta_file_4 = os.path.join(tempdir, 'fasta_4.fasta') 74 | with open(fasta_file_4, 'w') as f: 75 | f.write(FASTA_STR_4) 76 | 77 | yield ( 78 | # should work 79 | fasta_file_0, 80 | fasta_file_1, 81 | fasta_file_2, 82 | 83 | # shouldn't work 84 | fasta_file_3, 85 | fasta_file_4, 86 | ) 87 | 88 | print(f'Deleted temporary directory {tempdir}.') 89 | 90 | def test_read_fasta_alignment(prep_fasta_files): 91 | r_0 = fasta.read_fasta_alignment(prep_fasta_files[0]) 92 | r_1 = fasta.read_fasta_alignment(prep_fasta_files[1]) 93 | r_2 = fasta.read_fasta_alignment(prep_fasta_files[2]) 94 | 95 | print('r_0', r_0) 96 | print('r_1', r_1) 97 | print('r_2', r_2) 98 | 99 | with pytest.raises(InvalidFastaAlignmentError): 100 | r_3 = fasta.read_fasta_alignment(prep_fasta_files[3]) 101 | with pytest.raises(InvalidFastaAlignmentError): 102 | r_4 = fasta.read_fasta_alignment(prep_fasta_files[4]) 103 | -------------------------------------------------------------------------------- /pyckmeans/io/fasta.py: -------------------------------------------------------------------------------- 1 | ''' fasta 2 | 3 | Module for reading and writing FASTA files. 4 | ''' 5 | 6 | 7 | import itertools 8 | import re 9 | from typing import Tuple, Union 10 | 11 | import numpy 12 | 13 | class InvalidFastaAlignmentError(Exception): 14 | '''InvalidFastaAlignmentError 15 | ''' 16 | 17 | WHITESPACE_RE = re.compile(r'\s+') 18 | 19 | def read_fasta_alignment( 20 | fasta_file: str, 21 | dtype: Union[str, numpy.dtype] = 'U', 22 | ) -> Tuple[numpy.ndarray, numpy.ndarray]: 23 | '''read_fasta_alignment 24 | 25 | Read fasta alignment file. This function expects the fasta to be a valid alignment, 26 | meaning that it should contain at least 2 sequences of the same length, including 27 | gaps. 28 | 29 | Parameters 30 | ---------- 31 | fasta_file : str 32 | Path to a fasta file. 
33 | dtype: Union[str, numpy.dtype] 34 | Data type to use for the sequence array. 35 | 36 | Returns 37 | ------- 38 | Tuple[numpy.ndarray, numpy.ndarray] 39 | Tuple of sequences and names, each as numpy array. 40 | 41 | Raises 42 | ------ 43 | InvalidFastaAlignmentError 44 | Raised if less than 2 sequences are present in fasta_file. 45 | InvalidFastaAlignmentError 46 | Raised if the sequences have different lengths. 47 | ''' 48 | 49 | names = [] 50 | seqs = [] 51 | first = True 52 | with open(fasta_file) as fasta_f: 53 | seq_buffer = [] 54 | for line in fasta_f: 55 | _line = line.strip() 56 | 57 | # empty line 58 | if not _line: 59 | continue 60 | 61 | # name line 62 | if _line[0] == '>': 63 | names.append(_line[1:]) 64 | if not first: 65 | seqs.append(list(itertools.chain(*seq_buffer))) 66 | seq_buffer = [] 67 | else: 68 | first = False 69 | # sequence line 70 | else: 71 | seq_buffer.append(re.sub(WHITESPACE_RE, '', _line).upper()) 72 | 73 | seqs.append(list(itertools.chain(*seq_buffer))) 74 | 75 | # check alignment validity 76 | n_seq = len(seqs) 77 | if len(seqs) < 2: 78 | msg = f'Expected at least 2 entries but found only {n_seq}.' 79 | raise InvalidFastaAlignmentError(msg) 80 | 81 | seq_len = len(seqs[0]) 82 | for i, seq in enumerate(seqs[1:]): 83 | cur_seq_len = len(seq) 84 | if cur_seq_len != seq_len: 85 | msg = f'Expected all sequences to have length {seq_len}' +\ 86 | f'(length of sequence #0) but sequence #{i+1} has length {cur_seq_len}.' 87 | raise InvalidFastaAlignmentError(msg) 88 | 89 | seqs = numpy.array(seqs, dtype=dtype) 90 | names = numpy.array(names) 91 | 92 | return seqs, names 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # vscode 141 | .vscode 142 | 143 | # test data 144 | *test*.png 145 | pyckmeans/tests/*.pickle 146 | pyckmeans/tests/*.csv 147 | pyckmeans/tests/*.tsv 148 | pyckmeans/tests/*.json 149 | -------------------------------------------------------------------------------- /pyckmeans/io/src/nucencode.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #define LIBRARY_API extern "C" __declspec(dllexport) 3 | #else 4 | #define LIBRARY_API extern "C" 5 | #endif 6 | 7 | #include <cstdint> // std::uint8_t, std::uint32_t 8 | #include <cstddef> // size_t 9 | 10 | /* 11 | * Base encoding as used by R package ape. 12 | * See http://ape-package.ird.fr/misc/BitLevelCodingScheme.html 13 | * 14 | * Summary: 15 | * Most significant four bits are base information (A, G, C, T) 16 | * 76543210 17 | * 0b00001000 -> base is known 18 | * 0b00000100 -> gap 19 | * 0b00000010 -> unknown base 20 | * 21 | * bases 22 | * A 0b10001000 23 | * G 0b01001000 24 | * C 0b00101000 25 | * T 0b00011000 26 | * 27 | * wobbles 28 | * R 0b11000000 A|G 29 | * M 0b10100000 A|C 30 | * W 0b10010000 A|T 31 | * S 0b01100000 G|C 32 | * K 0b01010000 G|T 33 | * Y 0b00110000 C|T 34 | * V 0b11100000 A|G|C 35 | * H 0b10110000 A|C|T 36 | * D 0b11010000 A|G|T 37 | * B 0b01110000 G|C|T 38 | * N 0b11110000 A|G|C|T 39 | * 40 | * gap 41 | * - 0b00000100 42 | * 43 | * unknown/missing state 44 | * ?
0b00000010 45 | * 46 | */ 47 | 48 | // ASCII code to nucleotide encoding map 49 | const std::uint8_t asciiToEncoding[128] = { 50 | // 0 1 2 3 4 5 6 7 8 9 51 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 000 52 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 010 53 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 020 54 | 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, // 030 55 | 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, // 040 56 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 050 57 | 0, 0, 0, 2, 0, 136, 112, 40, 208, 0, // 060 58 | 0, 72, 176, 0, 0, 80, 0, 160, 240, 0, // 070 59 | 0, 0, 192, 96, 24, 0, 224, 144, 0, 48, // 080 60 | 0, 0, 0, 0, 0, 0, 0, 136, 112, 40, // 090 61 | 208, 0, 0, 72, 176, 0, 0, 80, 0, 160, // 100 62 | 240, 0, 0, 0, 192, 96, 24, 0, 224, 144, // 110 63 | 0, 48, 0, 0, 0, 0, 4, 0 // 120 64 | }; 65 | 66 | 67 | // encode nucleotides in place 68 | LIBRARY_API void encodeNucleotides( 69 | std::uint8_t* alignment, // nucleotide alignment 70 | int n, // number of entries 71 | int m // number of sites 72 | ) { 73 | for (size_t i = 0; i < n; i++) { 74 | for (size_t j = 0; j < m; j++) { 75 | alignment[i * m + j] = asciiToEncoding[alignment[i * m + j]]; 76 | } 77 | } 78 | } 79 | 80 | LIBRARY_API void encodeNucleotides_uint32( 81 | std::uint32_t* alignment, // nucleotide alignment 82 | int n, // number of entries 83 | int m, // number of sites 84 | std::uint8_t* alignmentEncoded 85 | ) { 86 | for (size_t i = 0; i < n; i++) { 87 | for (size_t j = 0; j < m; j++) { 88 | alignmentEncoded[i * m + j] = asciiToEncoding[alignment[i * m + j]]; 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /pyckmeans/distance/tests/test_c_interop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | from pyckmeans.io import NucleotideAlignment 6 | from pyckmeans.distance.c_interop import \ 7 | p_distance,\ 8 | jc_distance,\ 9 | k2p_distance 10 | 11 | @pytest.fixture(scope='session') 12 | def prepare_alignments(): 13 | aln_0 = NucleotideAlignment( 14 | ['s0', 's1', 's2', 's3'], 15 | np.array([ 16 | ['A', 'C', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 17 | ['T', 'C', '-', 'G', 'C', 'C', 'T', 'T', 'G', 'A'], 18 | ['A', 'G', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 19 | ['A', 'C', 'T', 'A', 'A', 'A', 'T', 'A', 'G', 'A'], 20 | ]) 21 | ) 22 | p_d_0_pd = np.array([ 23 | [0.0000000, 0.2222222, 0.1000000, 0.3000000], 24 | [0.2222222, 0.0000000, 0.3333333, 0.5555556], 25 | [0.1000000, 0.3333333, 0.0000000, 0.4000000], 26 | [0.3000000, 0.5555556, 0.4000000, 0.0000000] 27 | ]) 28 | p_d_0_cd = np.array([ 29 | [0.0000000, 0.2222222, 0.1111111, 0.3333333], 30 | [0.2222222, 0.0000000, 0.3333333, 0.5555556], 31 | [0.1111111, 0.3333333, 0.0000000, 0.4444444], 32 | [0.3333333, 0.5555556, 0.4444444, 0.0000000] 33 | ]) 34 | jc_d_0_pd = np.array([ 35 | [0.0000000, 0.2635484, 0.1073256, 0.3831192], 36 | [0.2635484, 0.0000000, 0.4408400, 1.0124450], 37 | [0.1073256, 0.4408400, 0.0000000, 0.5716050], 38 | [0.3831192, 1.0124450, 0.5716050, 0.0000000] 39 | ]) 40 | jc_d_0_cd = np.array([ 41 | [0.0000000, 0.2635484, 0.1202570, 0.4408400], 42 | [0.2635484, 0.0000000, 0.4408400, 1.0124450], 43 | [0.1202570, 0.4408400, 0.0000000, 0.6734562], 44 | [0.4408400, 1.0124450, 0.6734562, 0.0000000] 45 | ]) 46 | k2p_d_0_pd = np.array([ 47 | [0.0000000, 0.2726039, 0.1084661, 0.3831192], 48 | [0.2726039, 0.0000000, 0.4773856, 1.0986123], 49 | [0.1084661, 0.4773856, 0.0000000, 0.5756463], 50 | [0.3831192, 1.0986123, 0.5756463, 0.0000000] 51 | ]) 52 | k2p_d_0_cd = np.array([ 53 | 
[0.0000000,0.2726039,0.1217201,0.4408400], 54 | [0.2726039,0.0000000,0.4773856,1.0986123], 55 | [0.1217201,0.4773856,0.0000000,0.6801182], 56 | [0.4408400,1.0986123,0.6801182,0.0000000], 57 | ]) 58 | 59 | 60 | return ( 61 | ( 62 | aln_0, 63 | { 64 | 'p_pd': p_d_0_pd, 'p_cd': p_d_0_cd, 65 | 'jc_pd': jc_d_0_pd, 'jc_cd': jc_d_0_cd, 66 | 'k2p_pd': k2p_d_0_pd, 'k2p_cd': k2p_d_0_cd, 67 | } 68 | ), 69 | ) 70 | 71 | def test_p_distance(prepare_alignments): 72 | eps = 0.001 73 | 74 | aln_0, d_expected_0 = prepare_alignments[0] 75 | p_d_0_pd_expected = d_expected_0['p_pd'] 76 | p_d_0_cd_expected = d_expected_0['p_cd'] 77 | 78 | print(aln_0.sequences) 79 | 80 | p_d_0_pd = p_distance(aln_0.sequences, True) 81 | print('p_d_0_pd:', p_d_0_pd) 82 | assert np.max(np.abs(p_d_0_pd - p_d_0_pd_expected)) < eps 83 | p_d_0_cd = p_distance(aln_0.sequences, False) 84 | print('p_d_0_cd:', p_d_0_cd) 85 | assert np.max(np.abs(p_d_0_cd - p_d_0_cd_expected)) < eps 86 | 87 | def test_jc_distance(prepare_alignments): 88 | eps = 0.001 89 | 90 | aln_0, d_expected_0 = prepare_alignments[0] 91 | jc_d_0_pd_expected = d_expected_0['jc_pd'] 92 | jc_d_0_cd_expected = d_expected_0['jc_cd'] 93 | 94 | print(aln_0.sequences) 95 | 96 | jc_d_0_pd = jc_distance(aln_0.sequences, True) 97 | print('jc_d_0_pd:', jc_d_0_pd) 98 | assert np.max(np.abs(jc_d_0_pd - jc_d_0_pd_expected)) < eps 99 | jc_d_0_cd = jc_distance(aln_0.sequences, False) 100 | print('jc_d_0_cd:', jc_d_0_cd) 101 | assert np.max(np.abs(jc_d_0_cd - jc_d_0_cd_expected)) < eps 102 | 103 | def test_k2p_distance(prepare_alignments): 104 | eps = 0.001 105 | 106 | aln_0, d_expected_0 = prepare_alignments[0] 107 | k2p_d_0_pd_expected = d_expected_0['k2p_pd'] 108 | k2p_d_0_cd_expected = d_expected_0['k2p_cd'] 109 | 110 | print(aln_0.sequences) 111 | 112 | k2p_d_0_pd = k2p_distance(aln_0.sequences, True) 113 | print('k2p_d_0_pd:', k2p_d_0_pd) 114 | assert np.max(np.abs(k2p_d_0_pd - k2p_d_0_pd_expected)) < eps 115 | k2p_d_0_cd = k2p_distance(aln_0.sequences, False) 116 | print('k2p_d_0_cd:', k2p_d_0_cd) 117 | assert np.max(np.abs(k2p_d_0_cd - k2p_d_0_cd_expected)) < eps 118 | -------------------------------------------------------------------------------- /pyckmeans/tests/manual_test.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import time 3 | import pickle 4 | 5 | import numpy as np 6 | import numpy.random as random 7 | import matplotlib.pyplot as plt 8 | try: 9 | import tqdm 10 | except: 11 | tqdm = None 12 | 13 | import pyckmeans 14 | import pyckmeans.utils 15 | 16 | if __name__ == '__main__': 17 | path = pathlib.Path(__file__).parent.absolute() 18 | 19 | p = 10 20 | 21 | n0 = 50 22 | x0 = np.random.normal(0, 2, (n0, p)) 23 | n1 = 50 24 | x1 = np.random.normal(-5, 1.5, (n1, p)) 25 | n2 = 50 26 | x2 = np.random.normal(5, 2, (n2, p)) 27 | 28 | x_0 = np.r_[x0, x1, x2] 29 | 30 | k = 3 31 | n_rep = 100 32 | p_feat = 0.5 33 | p_samp = 0.5 34 | 35 | 36 | ckm_0 = pyckmeans.CKmeans( 37 | k=k, 38 | n_rep=n_rep, 39 | p_samp=p_samp, 40 | p_feat=p_feat, 41 | metrics=[ 42 | 'sil', 43 | 'bic', 44 | 'db', 45 | 'ch', 46 | ], 47 | n_init=5, 48 | ) 49 | 50 | print('fitting ...') 51 | if tqdm: 52 | with tqdm.tqdm(total=n_rep) as bar: 53 | t0 = time.time() 54 | ckm_0.fit(x_0, progress_callback=bar.update) 55 | t1 = time.time() 56 | else: 57 | t0 = time.time() 58 | ckm_0.fit(x_0) 59 | t1 = time.time() 60 | 61 | print('predicting ...') 62 | if tqdm: 63 | with tqdm.tqdm(total=n_rep) as bar: 64 | t2 = time.time() 65 | ckm_0_res = 
ckm_0.predict(x_0, progress_callback=bar.update, return_cls=True) 66 | t3 = time.time() 67 | else: 68 | t2 = time.time() 69 | ckm_0_res = ckm_0.predict(x_0, return_cls=True) 70 | t3 = time.time() 71 | 72 | print(ckm_0_res.cmatrix) 73 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_00.tsv', one_hot=False, row_names=False, col_names=False) 74 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_10.tsv', one_hot=False, row_names=True, col_names=False) 75 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_01.tsv', one_hot=False, row_names=False, col_names=True) 76 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_11.tsv', one_hot=False, row_names=True, col_names=True) 77 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_00.tsv', one_hot=True, row_names=False, col_names=False) 78 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_10.tsv', one_hot=True, row_names=True, col_names=False) 79 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_01.tsv', one_hot=True, row_names=False, col_names=True) 80 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_11.tsv', one_hot=True, row_names=True, col_names=True) 81 | 82 | print(t1 - t0) 83 | print(t3 - t2) 84 | 85 | # fig, ax = plt.subplots(1,1) 86 | # ax.imshow(ckm_0_res.sort().cmatrix) 87 | # fig.savefig(path / 'manual_test_img0.png') 88 | 89 | fig = ckm_0_res.plot(figsize=(10, 10)) 90 | fig.savefig(path / 'manual_test_img0.png') 91 | 92 | fig = ckm_0_res.plot( 93 | names=np.arange(x_0.shape[0]).astype('str'), 94 | figsize=(10, 10), 95 | ) 96 | fig.savefig(path / 'manual_test_img1.png') 97 | 98 | fig = ckm_0_res.plot( 99 | names=np.arange(x_0.shape[0]).astype('str'), 100 | figsize=(10, 10), 101 | order=None, 102 | ) 103 | fig.savefig(path / 'manual_test_img2.png') 104 | 105 | print('sils:', ckm_0.sils) 106 | print('bics:', ckm_0.bics) 107 | print('dbs:', ckm_0.dbs) 108 | print('chs:', ckm_0.chs) 109 | 110 | ks = [2,3,4,5,6,7,8,9,10] 111 | n_rep = 100 112 | mckm_0 = pyckmeans.MultiCKMeans(k=ks, n_rep=n_rep) 113 | 114 | print('fitting multi ...') 115 | with pyckmeans.utils.MultiCKMeansProgressBars(mckm_0) as pb: 116 | mckm_0.fit(x_0, pb.update) 117 | 118 | with open(path / 'mckm_0.pickle', 'wb') as f: 119 | pickle.dump(mckm_0, f) 120 | 121 | print('predicting multi ...') 122 | with pyckmeans.utils.MultiCKMeansProgressBars(mckm_0) as pb: 123 | mckm_0_res = mckm_0.predict(x_0, progress_callback=pb.update) 124 | 125 | print('sils:', mckm_0_res.sils) 126 | print('bics:', mckm_0_res.bics) 127 | print('dbs:', mckm_0_res.dbs) 128 | print('chs:', mckm_0_res.chs) 129 | 130 | fig = mckm_0_res.plot_metrics(figsize=(10, 10)) 131 | fig.savefig(path / f'manual_test_img_metrics0.png') 132 | 133 | for k, ckm_res in zip(ks, mckm_0_res.ckmeans_results): 134 | fig = ckm_res.plot(figsize=(10, 10)) 135 | fig.savefig(path / f'manual_test_img_k-{k}.png') 136 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_csv.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from pyckmeans.io.csv import InvalidMatrixShapeError, read_csv_distmat, write_csv_distmat 9 | 10 | d_0 = np.array([ 11 | [0.0, 1.0, 2.0], 12 | [1.0, 0.0, 3.0], 13 | [2.0, 3.0, 0.0], 14 | ]) 15 | nm_0 = ['a', 'b', 'c'] 16 | df_0 = pd.DataFrame(d_0, columns=nm_0, index=nm_0) 17 | 18 | d_1 = np.array([ 19 | [0.0, 1.0, 2.0, 0.5], 20 | [1.0, 0.0, 3.0, 2.4], 21 | [2.0, 3.0, 0.0, 1.5], 22 | [0.5, 2.4, 1.5, 0.0] 23 | ]) 24 | nm_1 = 
['a', 'b', 'c', 'd'] 25 | df_1 = pd.DataFrame(d_1, columns=nm_1, index=nm_1) 26 | 27 | # invalid input 28 | d_2 = np.array([ 29 | [0.0, 1.0, 2.0], 30 | [1.0, 0.0, 3.0], 31 | [2.0, 3.0, 0.0], 32 | [0.5, 2.4, 1.5] 33 | ]) 34 | nm_2 = ['a', 'b', 'c'] 35 | df_2 = pd.DataFrame(d_2, columns=nm_2, index=['c', 'b', 'a', '0']) 36 | 37 | @pytest.fixture(scope='session') 38 | def prep_csv_files(): 39 | with tempfile.TemporaryDirectory() as tempdir: 40 | print(f'Created temporary directory {tempdir}.') 41 | 42 | csv_file_0 = os.path.join(tempdir, 'dist_0.csv') 43 | df_0.to_csv(csv_file_0) 44 | 45 | csv_file_1 = os.path.join(tempdir, 'dist_1.csv') 46 | df_1.to_csv(csv_file_1, index=None) 47 | 48 | csv_file_2 = os.path.join(tempdir, 'dist_2.csv') 49 | df_0.to_csv(csv_file_2, header=None) 50 | 51 | csv_file_3 = os.path.join(tempdir, 'dist_3.csv') 52 | df_0.to_csv(csv_file_3, index=None, header=None) 53 | 54 | csv_file_4 = os.path.join(tempdir, 'dist_4.csv') 55 | df_2.to_csv(csv_file_4, index=None, header=None) 56 | 57 | yield ( 58 | # should work 59 | (csv_file_0, d_0, nm_0), 60 | (csv_file_1, d_1, nm_1), 61 | (csv_file_2, d_0, nm_0), 62 | (csv_file_3, d_0, None), 63 | (csv_file_4, d_0, None), 64 | ) 65 | 66 | print(f'Deleted temporary directory {tempdir}.') 67 | 68 | @pytest.fixture(scope='session') 69 | def prep_outdir(): 70 | with tempfile.TemporaryDirectory() as tempdir: 71 | print(f'Created temporary directory {tempdir}.') 72 | 73 | yield tempdir 74 | 75 | print(f'Deleted temporary directory {tempdir}.') 76 | 77 | def test_csv(prep_csv_files, prep_outdir): 78 | eps = 0.00001 79 | 80 | csv_f_0, d_0_expected, nm_0_expected = prep_csv_files[0] 81 | dm_0 = read_csv_distmat(csv_f_0, 0, 0, ',') 82 | assert np.max(np.abs(dm_0.dist_mat - d_0_expected)) < eps 83 | assert all([a == b for a, b in zip(dm_0.names, nm_0_expected)]) 84 | csv_of_0 = os.path.join(prep_outdir, 'dist_0.csv') 85 | write_csv_distmat(dm_0, csv_of_0) 86 | dm_0_r = read_csv_distmat(csv_of_0, 0, 0, ',') 87 | assert np.max(np.abs(dm_0.dist_mat - dm_0_r.dist_mat)) < eps 88 | assert all([a == b for a, b in zip(dm_0.names, dm_0_r.names)]) 89 | 90 | csv_f_1, d_1_expected, nm_1_expected = prep_csv_files[1] 91 | dm_1 = read_csv_distmat(csv_f_1, 0, None, ',') 92 | assert np.max(np.abs(dm_1.dist_mat - d_1_expected)) < eps 93 | assert all([a == b for a, b in zip(dm_1.names, nm_1_expected)]) 94 | csv_of_1 = os.path.join(prep_outdir, 'dist_1.csv') 95 | write_csv_distmat(dm_1, csv_of_1) 96 | dm_1_r = read_csv_distmat(csv_of_1, 0, 0, ',') 97 | assert np.max(np.abs(dm_1.dist_mat - dm_1_r.dist_mat)) < eps 98 | assert all([a == b for a, b in zip(dm_1.names, dm_1_r.names)]) 99 | 100 | csv_f_2, d_2_expected, nm_2_expected = prep_csv_files[2] 101 | dm_2 = read_csv_distmat(csv_f_2, None, 0, ',') 102 | assert np.max(np.abs(dm_2.dist_mat - d_2_expected)) < eps 103 | assert all([a == b for a, b in zip(dm_2.names, nm_2_expected)]) 104 | 105 | csv_f_3, d_3_expected, nm_3_expected = prep_csv_files[3] 106 | dm_3 = read_csv_distmat(csv_f_3, None, None, ',') 107 | assert np.max(np.abs(dm_3.dist_mat - d_3_expected)) < eps 108 | assert dm_3.names == nm_3_expected 109 | csv_of_3 = os.path.join(prep_outdir, 'dist_3.csv') 110 | write_csv_distmat(dm_3, csv_of_3) 111 | dm_3_r = read_csv_distmat(csv_of_3) 112 | assert np.max(np.abs(dm_3.dist_mat - dm_3_r.dist_mat)) < eps 113 | 114 | with pytest.raises(FileExistsError): 115 | write_csv_distmat(dm_3, os.path.join(prep_outdir, 'dist_3.csv')) 116 | with pytest.raises(FileExistsError): 117 | d_path = os.path.join(prep_outdir, 
'SOMEDIR') 118 | os.mkdir(d_path) 119 | write_csv_distmat(dm_3, d_path) 120 | 121 | csv_f_4, d_4_expected, nm_4_expected = prep_csv_files[4] 122 | with pytest.raises(InvalidMatrixShapeError): 123 | read_csv_distmat(csv_f_4, None, None, ',') -------------------------------------------------------------------------------- /pyckmeans/io/csv.py: -------------------------------------------------------------------------------- 1 | ''' csv 2 | 3 | Comma Separated Value (CSV) input and output. 4 | ''' 5 | import os 6 | from typing import Optional 7 | 8 | import pandas 9 | 10 | import pyckmeans.distance 11 | 12 | class InvalidMatrixShapeError(Exception): 13 | '''InvalidMatrixShapeError''' 14 | 15 | class IncompatibleNamesError(Exception): 16 | '''IncompatibleNamesError''' 17 | 18 | def read_csv_distmat( # pylint: disable=missing-param-doc 19 | file_path: str, 20 | header: Optional[int] = 0, 21 | index_col: Optional[int] = 0, 22 | sep: str = ',', 23 | **kwargs, 24 | ) -> 'pyckmeans.distance.DistanceMatrix': 25 | '''read_csv_distmat 26 | 27 | Read distance matrix from CSV file. 28 | 29 | Parameters 30 | ---------- 31 | file_path : str 32 | Path to CSV file. 33 | header : Optional[int] 34 | Determines the row in the CSV file containing 35 | sample names. Is passed to pandas.read_csv(). By default 0, meaning 36 | the first row. 37 | index_col : Optional[int] 38 | Determines the index column. By default, the first column is expected 39 | to contain sample names. Passed to pandas.read_csv(). 40 | sep : str 41 | Column separator, be default ','. Passed to Passed to pandas.read_csv(). 42 | **kwargs 43 | Additional keyword arguments passed to pandas.read_csv(). 44 | Returns 45 | ------- 46 | pyckmeans.distance.DistanceMatrix 47 | DistanceMatrix object. 48 | 49 | Raises 50 | ------ 51 | InvalidMatrixShapeError 52 | Raised if matrix is not square. 53 | IncompatibleNamesError 54 | Raised if column and row names do not match. 55 | ''' 56 | dist_df = pandas.read_csv( 57 | file_path, 58 | header=header, 59 | index_col=index_col, 60 | sep=sep, 61 | **kwargs 62 | ) 63 | 64 | dist_mat = dist_df.values 65 | 66 | # distance matrix must be a square matrix 67 | if dist_mat.shape[0] != dist_mat.shape[1]: 68 | msg = 'Expected a square matrix but matrix has dimensions '+ \ 69 | f'{dist_mat.shape[0]}x{dist_mat.shape[1]}.' 70 | raise InvalidMatrixShapeError(msg) 71 | 72 | names = None 73 | # names are present in file 74 | if (not header is None) or (not index_col is None): 75 | # row and column names are present 76 | if (not header is None) and (not index_col is None): 77 | names_a = [nm.strip() for nm in dist_df.index.astype(str)] 78 | names_b = [nm.strip() for nm in dist_df.columns.astype(str)] 79 | 80 | # if row names and column names do not match, something 81 | # is probably wrong 82 | if not all([a == b for a, b in zip(names_a, names_b)]): 83 | raise IncompatibleNamesError('Column and row names do not match.') 84 | 85 | names = names_a 86 | # column names are present 87 | elif not header is None: 88 | names = [nm.strip() for nm in dist_df.columns.astype(str)] 89 | # row names are present 90 | elif not index_col is None: 91 | names = [nm.strip() for nm in dist_df.index.astype(str)] 92 | 93 | 94 | return pyckmeans.distance.DistanceMatrix( 95 | dist_mat, 96 | names, 97 | ) 98 | 99 | def write_csv_distmat( 100 | dist: 'pyckmeans.distance.DistanceMatrix', 101 | file_path: str, 102 | force: bool = False, 103 | ) -> None: 104 | '''write_csv_distmat 105 | 106 | Write DistanceMatrix object to CSV. 
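# A minimal round-trip sketch for the CSV helpers in this module, following the
# usage shown in the tests above; 'dist.csv' is a placeholder path.
import numpy as np
from pyckmeans.distance import DistanceMatrix
from pyckmeans.io.csv import read_csv_distmat, write_csv_distmat

dm = DistanceMatrix(
    np.array([
        [0.0, 1.0, 2.0],
        [1.0, 0.0, 3.0],
        [2.0, 3.0, 0.0],
    ]),
    names=['a', 'b', 'c'],
)
write_csv_distmat(dm, 'dist.csv')          # refuses to overwrite unless force=True
dm_loaded = read_csv_distmat('dist.csv')   # defaults: header=0, index_col=0, sep=','
assert np.allclose(dm.dist_mat, dm_loaded.dist_mat)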
107 | 108 | Parameters 109 | ---------- 110 | dist : pyckmeans.distance.DistanceMatrix 111 | DistanceMatrix object. 112 | file_path : str 113 | CSV file path. 114 | force : bool, optional 115 | Force overwrite if file_path already exists, by default False 116 | 117 | Raises 118 | ------ 119 | FileExistsError 120 | Raised if file at file_path already exists and force is False. 121 | FileExistsError 122 | Raised if file_path points to an existing directory. 123 | ''' 124 | if os.path.exists(file_path): 125 | if os.path.isfile(file_path) and not force: 126 | msg = f'File {file_path} already exists. If you want to overwrite ' +\ 127 | 'it run the function with force=True.' 128 | raise FileExistsError(msg) 129 | elif os.path.isdir(file_path): 130 | msg = f'A directory exists at path {file_path}.' 131 | raise FileExistsError(msg) 132 | 133 | dist_df = pandas.DataFrame( 134 | dist.dist_mat, 135 | columns=dist.names, 136 | index=dist.names, 137 | ) 138 | 139 | dist_df.to_csv(file_path, index_label='sample') 140 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_nucleotidealignment.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from pyckmeans.io.nucleotide_alignment import InvalidAlignmentCharacterError 4 | import numpy 5 | import pytest 6 | 7 | from pyckmeans.io import \ 8 | NucleotideAlignment, \ 9 | read_alignment, \ 10 | InvalidAlignmentFileExtensionError, \ 11 | InvalidAlignmentFileFormatError 12 | 13 | from test_fasta import prep_fasta_files 14 | from test_phylip import prep_phylip_files 15 | 16 | Bio = None 17 | try: 18 | import Bio 19 | from Bio import SeqIO, AlignIO 20 | except: 21 | warnings.warn('Could not test Biopython since it is not installed.') 22 | 23 | 24 | def test_simple(prep_fasta_files, prep_phylip_files): 25 | na_fa_0 = NucleotideAlignment.from_file(prep_fasta_files[0]) 26 | na_fa_1 = NucleotideAlignment.from_file(prep_fasta_files[1]) 27 | na_fa_2 = NucleotideAlignment.from_file(prep_fasta_files[2], 'fasta') 28 | na_fa_3 = NucleotideAlignment.from_file(prep_fasta_files[2], 'fasta', fast_encoding=True) 29 | 30 | assert (na_fa_2.sequences == na_fa_3.sequences).all() 31 | 32 | print('na_fa_0:', na_fa_0) 33 | print('na_fa_1:', na_fa_1) 34 | print('na_fa_2:', na_fa_2) 35 | 36 | na_phy_0 = NucleotideAlignment.from_file(prep_phylip_files[0]) 37 | na_phy_1 = NucleotideAlignment.from_file(prep_phylip_files[1]) 38 | na_phy_2 = NucleotideAlignment.from_file(prep_phylip_files[2], 'phylip') 39 | na_phy_3 = NucleotideAlignment.from_file(prep_phylip_files[2], 'phylip', fast_encoding=True) 40 | 41 | assert (na_phy_2.sequences == na_phy_3.sequences).all() 42 | 43 | print('na_phy_0:', na_phy_0) 44 | print('na_phy_1:', na_phy_1) 45 | print('na_phy_2:', na_phy_2) 46 | 47 | with pytest.raises(InvalidAlignmentFileFormatError): 48 | NucleotideAlignment.from_file(prep_fasta_files[0], 'xyz') 49 | with pytest.raises(InvalidAlignmentFileExtensionError): 50 | NucleotideAlignment.from_file('test.png', 'auto') 51 | 52 | na_phy_0_di = na_phy_0.drop_invariant_sites(in_place=False) 53 | assert not na_phy_0_di is na_phy_0 54 | na_phy_0_di = na_phy_0.drop_invariant_sites(in_place=True) 55 | assert na_phy_0_di is na_phy_0 56 | 57 | na_phy_0_cp = na_phy_0.copy() 58 | assert (na_phy_0_cp.names == na_phy_0.names).all() 59 | assert (na_phy_0_cp.sequences == na_phy_0.sequences).all() 60 | 61 | with pytest.raises(Exception): 62 | NucleotideAlignment( 63 | ['a', 'b'], 64 | numpy.array([ 65 | ['A', 
'C'], 66 | ['A', 'T'], 67 | ['A', 'T'] 68 | ]), 69 | fast_encoding=False 70 | ) 71 | 72 | with pytest.raises(InvalidAlignmentCharacterError): 73 | NucleotideAlignment( 74 | ['a', 'b', 'C'], 75 | numpy.array([ 76 | ['A', '3'], 77 | ['A', 'T'], 78 | ['A', 'T'] 79 | ]), 80 | fast_encoding=False, 81 | ) 82 | 83 | if not Bio is None: 84 | bio_aln = AlignIO.read(prep_fasta_files[0], format='fasta') 85 | aln_b = NucleotideAlignment.from_bp_seqio_records(bio_aln) 86 | print('aln_b:', aln_b) 87 | 88 | 89 | def test_read_alignment(prep_fasta_files, prep_phylip_files): 90 | na_fa_0 = read_alignment(prep_fasta_files[0]) 91 | na_fa_1 = read_alignment(prep_fasta_files[1]) 92 | na_fa_2 = read_alignment(prep_fasta_files[2], 'fasta') 93 | 94 | print('na_fa_0:', na_fa_0) 95 | print('na_fa_1:', na_fa_1) 96 | print('na_fa_2:', na_fa_2) 97 | 98 | na_phy_0 = read_alignment(prep_phylip_files[0]) 99 | na_phy_1 = read_alignment(prep_phylip_files[1]) 100 | na_phy_2 = read_alignment(prep_phylip_files[2], 'phylip') 101 | 102 | print('na_phy_0:', na_phy_0) 103 | print('na_phy_1:', na_phy_1) 104 | print('na_phy_2:', na_phy_2) 105 | 106 | with pytest.raises(InvalidAlignmentFileFormatError): 107 | read_alignment(prep_fasta_files[0], 'xyz') 108 | with pytest.raises(InvalidAlignmentFileExtensionError): 109 | read_alignment('test.png', 'auto') 110 | 111 | def test_utils(): 112 | na_0 = NucleotideAlignment( 113 | ['a', 'b', 'c', 'd', 'e'], 114 | numpy.array([ 115 | ['a', 't', 'a', 't', 't', 'g', 'c'], 116 | ['a', 'a', '-', 't', 't', 'g', 'c'], 117 | ['a', 'a', '-', 't', 't', 'g', 'c'], 118 | ['a', 't', 'a', 't', 'g', 'g', 'c'], 119 | ['a', 't', 'a', 't', 'g', 'g', 'c'], 120 | ]), 121 | ) 122 | na_0_0 = na_0[:2] 123 | assert na_0_0.shape == (2, na_0.shape[1]) 124 | assert (na_0_0.names == na_0.names[:2]).all() 125 | assert (na_0_0.sequences == na_0.sequences[:2]).all() 126 | 127 | na_0_1 = na_0[::2] 128 | assert na_0_1.shape == (3, na_0.shape[1]) 129 | assert (na_0_1.names == na_0.names[::2]).all() 130 | assert (na_0_1.sequences == na_0.sequences[::2]).all() 131 | 132 | na_0_2 = na_0[:4, :3] 133 | assert na_0_2.shape == (4, 3) 134 | assert (na_0_2.names == na_0.names[:4]).all() 135 | assert (na_0_2.sequences == na_0.sequences[:4, :3]).all() 136 | 137 | assert na_0.drop_invariant_sites().shape == (5, 3) 138 | -------------------------------------------------------------------------------- /pyckmeans/distance/c_interop.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pathlib 3 | 4 | import numpy 5 | 6 | # load the shared library 7 | libfile = pathlib.Path(__file__).parent / 'lib' / 'distance.so' 8 | lib = ctypes.CDLL(str(libfile)) 9 | 10 | # == p distance 11 | lib.pDistance.restype = None 12 | lib.pDistance.argtypes = [ 13 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 14 | dtype=numpy.uint8, 15 | ndim=2, 16 | flags='C_CONTIGUOUS', 17 | ), 18 | ctypes.c_int, # n: number of entries 19 | ctypes.c_int, # m: number of sites 20 | ctypes.c_int, # pairwiseDeletion 21 | numpy.ctypeslib.ndpointer( # (output) distMat: n * n distance matrixmatrix 22 | dtype=numpy.double, 23 | ndim=2, 24 | flags='C_CONTIGUOUS', 25 | ), 26 | ] 27 | 28 | def p_distance( 29 | alignment: numpy.ndarray, 30 | pairwise_deletion: bool = True, 31 | ) -> numpy.ndarray: 32 | '''p_distance 33 | 34 | Calculate p-distance for a nucleotide alignment. 
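# A short sketch of how p_distance is driven in the tests above: the raw
# characters are wrapped in a NucleotideAlignment (which encodes the bases),
# and its .sequences array is passed in together with the pairwise-deletion flag.
import numpy as np
from pyckmeans.io import NucleotideAlignment
from pyckmeans.distance.c_interop import p_distance

aln = NucleotideAlignment(
    ['s0', 's1'],
    np.array([
        ['A', 'C', 'T', 'G'],
        ['A', 'C', '-', 'G'],
    ]),
)
d_pairwise = p_distance(aln.sequences, True)    # pairwise deletion
d_complete = p_distance(aln.sequences, False)   # complete deletion
print(d_pairwise.shape)                         # (2, 2)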
35 | 36 | Parameters 37 | ---------- 38 | alignment : numpy.ndarray 39 | n*m numpy alignment, where n is the number of entries and m is 40 | the number of sites. Bases must be encoded in the format of 41 | pyckmeans.io.NucleotideAlignment. 42 | pairwise_deletion : bool, optional 43 | Calculate distances with pairwise-deletion in case of missing 44 | data, by default True 45 | 46 | Returns 47 | ------- 48 | numpy.ndarray 49 | n*n distance matrix. 50 | ''' 51 | if not alignment.flags['C_CONTIGUOUS']: 52 | alignment = numpy.ascontiguousarray(alignment) 53 | 54 | n, m = alignment.shape 55 | 56 | dist_mat = numpy.zeros((n, n), dtype=numpy.double) 57 | 58 | lib.pDistance(alignment, n, m, pairwise_deletion, dist_mat) 59 | 60 | return dist_mat 61 | 62 | # == Jukes-Cantor distance 63 | lib.jcDistance.restype = None 64 | lib.jcDistance.argtypes = [ 65 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 66 | dtype=numpy.uint8, 67 | ndim=2, 68 | flags='C_CONTIGUOUS', 69 | ), 70 | ctypes.c_int, # n: number of entries 71 | ctypes.c_int, # m: number of sites 72 | ctypes.c_int, # pairwiseDeletion 73 | numpy.ctypeslib.ndpointer( # (output) distMat: n * n distance matrixmatrix 74 | dtype=numpy.double, 75 | ndim=2, 76 | flags='C_CONTIGUOUS', 77 | ), 78 | ] 79 | 80 | def jc_distance( 81 | alignment: numpy.ndarray, 82 | pairwise_deletion: bool = True, 83 | ) -> numpy.ndarray: 84 | '''jc_distance 85 | 86 | Calculate Jukes-Cantor distance for a nucleotide alignment. 87 | 88 | Parameters 89 | ---------- 90 | alignment : numpy.ndarray 91 | n*m numpy alignment, where n is the number of entries and m is 92 | the number of sites. Bases must be encoded in the format of 93 | pyckmeans.io.NucleotideAlignment. 94 | pairwise_deletion : bool, optional 95 | Calculate distances with pairwise-deletion in case of missing 96 | data, by default True 97 | 98 | Returns 99 | ------- 100 | numpy.ndarray 101 | n*n distance matrix. 102 | ''' 103 | if not alignment.flags['C_CONTIGUOUS']: 104 | alignment = numpy.ascontiguousarray(alignment) 105 | 106 | n, m = alignment.shape 107 | 108 | dist_mat = numpy.zeros((n, n), dtype=numpy.double) 109 | 110 | lib.jcDistance(alignment, n, m, pairwise_deletion, dist_mat) 111 | 112 | return dist_mat 113 | 114 | # == Kimura 2-parameter distance 115 | lib.k2pDistance.restype = None 116 | lib.k2pDistance.argtypes = [ 117 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 118 | dtype=numpy.uint8, 119 | ndim=2, 120 | flags='C_CONTIGUOUS', 121 | ), 122 | ctypes.c_int, # n: number of entries 123 | ctypes.c_int, # m: number of sites 124 | ctypes.c_int, # pairwiseDeletion 125 | numpy.ctypeslib.ndpointer( # (output) distMat: n * n distance matrixmatrix 126 | dtype=numpy.double, 127 | ndim=2, 128 | flags='C_CONTIGUOUS', 129 | ), 130 | ] 131 | 132 | def k2p_distance( 133 | alignment: numpy.ndarray, 134 | pairwise_deletion: bool = True, 135 | ) -> numpy.ndarray: 136 | '''jc_distance 137 | 138 | Calculate Kimura 2-parameter distance for a nucleotide alignment. 139 | 140 | Parameters 141 | ---------- 142 | alignment : numpy.ndarray 143 | n*m numpy alignment, where n is the number of entries and m is 144 | the number of sites. Bases must be encoded in the format of 145 | pyckmeans.io.NucleotideAlignment. 146 | pairwise_deletion : bool, optional 147 | Calculate distances with pairwise-deletion in case of missing 148 | data, by default True 149 | 150 | Returns 151 | ------- 152 | numpy.ndarray 153 | n*n distance matrix. 
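# The Jukes-Cantor and Kimura 2-parameter variants share the calling convention
# of p_distance above; a small self-contained sketch:
import numpy as np
from pyckmeans.io import NucleotideAlignment
from pyckmeans.distance.c_interop import jc_distance, k2p_distance

aln = NucleotideAlignment(
    ['s0', 's1', 's2'],
    np.array([
        ['A', 'C', 'T', 'G', 'A'],
        ['A', 'C', '-', 'G', 'T'],
        ['A', 'C', 'T', 'G', 'T'],
    ]),
)
d_jc = jc_distance(aln.sequences, pairwise_deletion=True)
d_k2p = k2p_distance(aln.sequences, pairwise_deletion=False)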
154 | ''' 155 | if not alignment.flags['C_CONTIGUOUS']: 156 | alignment = numpy.ascontiguousarray(alignment) 157 | 158 | n, m = alignment.shape 159 | 160 | dist_mat = numpy.zeros((n, n), dtype=numpy.double) 161 | 162 | lib.k2pDistance(alignment, n, m, pairwise_deletion, dist_mat) 163 | 164 | return dist_mat 165 | -------------------------------------------------------------------------------- /pyckmeans/tests/manual_tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyckmeans import NucleotideAlignment, CKmeans\n", 10 | "from pyckmeans.io.nucleotide_alignment import BASE_ENCODING_INVERSE, BASE_ENCODING\n", 11 | "import numpy as np\n", 12 | "import time" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# import pprofile\n", 24 | "# profiler = pprofile.Profile()\n", 25 | "# with profiler:\n", 26 | "# # aln = NucleotideAlignment.from_file('../../docs/datasets/rhodanthemum_ct85_msl68.snps.phy')\n", 27 | "# aln = NucleotideAlignment.from_file('C:/Users/Tankr/Downloads/leu_reference_msl12.phy')\n", 28 | "# # Process profile content: generate a cachegrind file and send it to user.\n", 29 | "\n", 30 | "# # You can also write the result to the console:\n", 31 | "# profiler.print_stats()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": { 38 | "tags": [] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "# import pprofile\n", 43 | "# profiler = pprofile.Profile()\n", 44 | "# with profiler:\n", 45 | "# # aln = NucleotideAlignment.from_file('../../docs/datasets/rhodanthemum_ct85_msl68.snps.phy', fast_encoding=True)\n", 46 | "# aln2 = NucleotideAlignment.from_file('C:/Users/Tankr/Downloads/leu_reference_msl12.phy', fast_encoding=True)\n", 47 | "# # Process profile content: generate a cachegrind file and send it to user.\n", 48 | "\n", 49 | "# # You can also write the result to the console:\n", 50 | "# profiler.print_stats()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "= 0.6)+1) 56 | ] 57 | assert vectors_1_np.shape == vectors_1_np_expected.shape 58 | assert abs(vectors_1_np - vectors_1_np_expected).sum() < 0.0001 59 | 60 | vectors_1_pd = pcoares_0.get_vectors(filter_by='eigvals_rel_cum', filter_th=0.6, out_format='pd') 61 | assert vectors_1_pd.shape == vectors_1_np_expected.shape 62 | assert abs(vectors_1_pd.values - vectors_1_np_expected).sum() < 0.0001 63 | 64 | 65 | pcoares_1 = pcoa(prepare_distmats[1][0]) 66 | assert all([nm_a == nm_b for nm_a, nm_b in zip(pcoares_1.names, prepare_distmats[1][1])]) 67 | assert pcoares_1.vectors.shape[0] == prepare_distmats[1][0].shape[0] 68 | print('pcoares_0:', pcoares_1) 69 | print('pcoares_0.vectors:', pcoares_1.vectors) 70 | print('pcoares_0.values:', pcoares_1.values) 71 | print('pcoares_0.names:', pcoares_1.names) 72 | 73 | vectors_1_pd = pcoares_1.get_vectors(filter_by='eigvals_rel_cum', filter_th=0.6, out_format='pd') 74 | vectors_1_np_expected = pcoares_1.vectors[ 75 | :, 76 | :(np.argmax(pcoares_1.values['eigvals_rel_cum'].values >= 0.6)+1) 77 | ] 78 | assert vectors_1_pd.shape == vectors_1_np_expected.shape 79 | assert abs(vectors_1_pd.values - 
vectors_1_np_expected).sum() < 0.0001 80 | print(vectors_1_pd.index.values) 81 | assert np.all(vectors_1_pd.index.values == pcoares_1.names) 82 | 83 | x_2, _ = make_blobs(200, 3, centers= 3) 84 | d_2 = squareform(pdist(x_2)) 85 | pcoares_2 = pcoa(d_2) 86 | 87 | with pytest.raises(InvalidCorrectionTypeError): 88 | pcoa(d_2, correction='NONEXISTING_CORRECTION') 89 | 90 | def assert_pcoa_res_are_equal( 91 | a: PCOAResult, 92 | b: PCOAResult, 93 | eps: float=1e-8, 94 | ): 95 | assert (np.abs(a.vectors - b.vectors) < eps).all() 96 | assert (np.abs(a.values - b.values) < eps).values.all() 97 | assert (a.trace - b.trace) < eps or a.trace is b.trace 98 | assert a.trace_corr is b.trace_corr or (a.trace_corr - b.trace_corr) < eps 99 | assert a.correction == b.correction or a.correction is b.correction 100 | assert a.negative_eigvals == b.negative_eigvals 101 | 102 | @pytest.mark.parametrize('correction', [None, 'lingoes', 'cailliez']) 103 | def test_save_load(prepare_distmats, test_dir, correction): 104 | pcoa_res_0 = pcoa(prepare_distmats[0][0], correction=correction) 105 | print('correction:', correction) 106 | 107 | pcoa_res_0_json_file = os.path.join(test_dir, f'{correction}_pcoa_res_0.json') 108 | pcoa_res_0.to_json(pcoa_res_0_json_file) 109 | pcoa_res_0_l = PCOAResult.from_json(pcoa_res_0_json_file) 110 | assert_pcoa_res_are_equal(pcoa_res_0, pcoa_res_0_l) 111 | 112 | pcoa_res_0_dir = os.path.join(test_dir, f'{correction}_pcoa_res_0') 113 | pcoa_res_0.to_dir(pcoa_res_0_dir) 114 | pcoa_res_0_l = PCOAResult.from_dir(pcoa_res_0_dir) 115 | assert_pcoa_res_are_equal(pcoa_res_0, pcoa_res_0_l) 116 | 117 | pcoa_res_1 = pcoa(prepare_distmats[1][0], correction=correction) 118 | print('correction:', correction) 119 | 120 | pcoa_res_1_json_file = os.path.join(test_dir, f'{correction}_pcoa_res_1.json') 121 | pcoa_res_1.to_json(pcoa_res_1_json_file) 122 | pcoa_res_1_l = PCOAResult.from_json(pcoa_res_1_json_file) 123 | assert_pcoa_res_are_equal(pcoa_res_1, pcoa_res_1_l) 124 | 125 | pcoa_res_1_dir = os.path.join(test_dir, f'{correction}_pcoa_res_1') 126 | pcoa_res_1.to_dir(pcoa_res_1_dir) 127 | pcoa_res_1_l = PCOAResult.from_dir(pcoa_res_1_dir) 128 | assert_pcoa_res_are_equal(pcoa_res_1, pcoa_res_1_l) 129 | 130 | assert_pcoa_res_are_equal( 131 | pcoa_res_1, PCOAResult.from_json_str(pcoa_res_1.to_json()) 132 | ) 133 | 134 | with pytest.raises(Exception): 135 | pcoa_res_1.to_dir(pcoa_res_1_dir, force=False) 136 | with pytest.raises(Exception): 137 | PCOAResult.from_dir('NONEXISTING_DIR') 138 | 139 | 140 | def test_pcoa_result(prepare_distmats): 141 | pcoa_res_0 = pcoa(prepare_distmats[0][0]) 142 | 143 | with pytest.raises(InvalidFilterError): 144 | pcoa_res_0.get_vectors(filter_by='NONEXISTING_FILTER', filter_th=0.8) 145 | with pytest.raises(InvalidFilterError): 146 | pcoa_res_0.get_vectors(filter_by='eigvals_rel_cum') 147 | with pytest.raises(InvalidFilterError): 148 | pcoa_res_0.get_vectors(filter_th=0.8) 149 | with pytest.raises(InvalidOutFormatError): 150 | pcoa_res_0.get_vectors(filter_by='eigvals_rel_cum', filter_th=0.8, out_format='NONEXISTING_FORMAT') 151 | -------------------------------------------------------------------------------- /pyckmeans/tests/test_workflow.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pandas as pd 3 | import pytest 4 | import tempfile 5 | import os 6 | 7 | from pyckmeans.io import read_alignment 8 | from pyckmeans.distance import alignment_distance 9 | from pyckmeans.ordination import PCOAResult, pcoa 10 | 
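# A condensed sketch of the workflow exercised below in this test module
# (alignment -> distance -> PCoA -> consensus k-means); 'alignment.phy' is a
# placeholder path.
from pyckmeans.io import read_alignment
from pyckmeans.distance import alignment_distance
from pyckmeans.ordination import pcoa
from pyckmeans.core import CKmeans

aln = read_alignment('alignment.phy')     # PHYLIP or FASTA alignment
dist = alignment_distance(aln, 'p')       # p-distance matrix
pcoa_res = pcoa(dist, 'lingoes')          # PCoA with Lingoes correction
ckm = CKmeans(k=2, n_rep=50, n_init=2)
ckm.fit(pcoa_res)                         # a PCOAResult is accepted directly
ckm_res = ckm.predict(pcoa_res)
ckm_res.sort(in_place=True)
fig = ckm_res.plot()                      # consensus-matrix heatmap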
from pyckmeans.core import CKmeans, MultiCKMeans, WECR 11 | from pyckmeans.utils import plot_ckmeans_result, plot_multickmeans_metrics, MultiCKMeansProgressBars, plot_cmatrix 12 | 13 | import tqdm 14 | 15 | PHYLIP_STR_0 = \ 16 | '''10 14 17 | Sample0 ACTGTCATGAAGGA 18 | Sample1 ACT--CATCAAGGA 19 | Sample2 ACTCTCATGAAGGA 20 | Sample3 AGTCTCTTGAAGGA 21 | Sample4 AGT--CATGAACTG 22 | Sample5 ACTGTCATGAACTG 23 | Sample6 ACTC-CATCAACTG 24 | Sample7 AGGCTCCTGAACTG 25 | Sample8 ACTCTCTTTAACTG 26 | Sample9 TTTCTCACGAACTG 27 | ''' 28 | 29 | @pytest.fixture(scope='session') 30 | def prep_phylip_files(): 31 | with tempfile.TemporaryDirectory() as tempdir: 32 | print(f'Created temporary directory {tempdir}.') 33 | 34 | phylip_file_0 = os.path.join(tempdir, 'phylip_0.phy') 35 | with open(phylip_file_0, 'w') as f: 36 | f.write(PHYLIP_STR_0) 37 | 38 | yield ( 39 | phylip_file_0, 40 | ) 41 | 42 | print(f'Destroyed temporary directory {tempdir}.') 43 | 44 | @pytest.fixture(scope='session') 45 | def prep_pcoa_results(prep_phylip_files): 46 | na_0 = read_alignment(prep_phylip_files[0]) 47 | d_0_p = alignment_distance(na_0, 'p') 48 | pcoares_0 = pcoa(d_0_p, 'lingoes') 49 | 50 | return ( 51 | pcoares_0, 52 | ) 53 | 54 | def test_simple_workflow(prep_phylip_files): 55 | na_0 = read_alignment(prep_phylip_files[0]) 56 | d_0_p = alignment_distance(na_0, 'p') 57 | pcoares_0 = pcoa(d_0_p, 'lingoes') 58 | ckm_0 = CKmeans(k=2, n_rep=50, n_init=2) 59 | ckm_0.fit(pcoares_0.vectors) 60 | ckm_0_res = ckm_0.predict(pcoares_0.vectors) 61 | ckm_0_res.sort(in_place=True) 62 | 63 | print('pcoares_0.vectors', pcoares_0.vectors) 64 | print('ckm_0_res.cl:', ckm_0_res.cl) 65 | 66 | ckm_1 = CKmeans(k=2, n_rep=50, n_init=2) 67 | with tqdm.tqdm(total=ckm_1.n_rep) as pb: 68 | ckm_1.fit(pcoares_0, progress_callback=pb.update) 69 | with tqdm.tqdm(total=ckm_1.n_rep) as pb: 70 | ckm_1_res = ckm_1.predict(pcoares_0, progress_callback=pb.update) 71 | ckm_1_res.sort(in_place=True) 72 | ckm_1_res.recalculate_cluster_memberships(pcoares_0, linkage_type='complete') 73 | 74 | print('ckm_1_res.cl:', ckm_1_res.cl) 75 | print('ckm_1_res.names:', ckm_1_res.names) 76 | 77 | ckm_2 = CKmeans(k=2, n_rep=50, n_init=2) 78 | df = pd.DataFrame(pcoares_0.vectors, pcoares_0.names) 79 | ckm_2.fit(df) 80 | ckm_2_res = ckm_2.predict(df) 81 | ckm_2_res.sort(in_place=True) 82 | print('ckm_2_res.cl:', ckm_2_res.cl) 83 | print('ckm_2_res.names:', ckm_2_res.names) 84 | 85 | ckm_2_res_cls = ckm_2.predict(df, return_cls=True) 86 | assert ckm_2_res_cls.km_cls.shape == (ckm_2.n_rep, df.shape[0]) 87 | ckm_2_res_cls.sort() 88 | 89 | # test copy 90 | ckm_2_res_cls_cp = ckm_2_res_cls.copy() 91 | ckm_2_res_cls_cp.cl[0] = -1000 92 | ckm_2_res_cls_cp.km_cls[0,0] = -1000 93 | assert ckm_2_res_cls_cp.cl[0] != ckm_2_res_cls.cl[0] 94 | assert ckm_2_res_cls_cp.km_cls[0,0] != ckm_2_res_cls.km_cls[0,0] 95 | assert not ckm_2_res_cls_cp is ckm_2_res_cls 96 | 97 | # test recalculate cluster memberships 98 | ckm_2_res_cls_cp_rcm_1 = ckm_2_res_cls_cp.recalculate_cluster_memberships(df, 'average', in_place=False) 99 | ckm_2_res_cls_cp_rcm_2 = ckm_2_res_cls_cp.recalculate_cluster_memberships(df, 'average', in_place=True) 100 | assert ckm_2_res_cls_cp_rcm_2 is ckm_2_res_cls_cp 101 | assert not ckm_2_res_cls_cp_rcm_1 is ckm_2_res_cls_cp 102 | 103 | # test distance 104 | d_0_p_0 = alignment_distance(na_0, 'p') 105 | d_0_p_1 = na_0.distance('p') 106 | assert numpy.abs(d_0_p_0.dist_mat - d_0_p_1.dist_mat).sum() < 0.001 107 | 108 | def test_multi_workflow(prep_pcoa_results): 109 | pcoares_0: 
PCOAResult = prep_pcoa_results[0] 110 | mckm_0 = MultiCKMeans([2,3,3]) 111 | with MultiCKMeansProgressBars(mckm_0) as pb: 112 | mckm_0.fit(pcoares_0, progress_callback=pb.update) 113 | with MultiCKMeansProgressBars(mckm_0) as pb: 114 | mckm_0_res = mckm_0.predict(pcoares_0, progress_callback=pb.update) 115 | 116 | plot_multickmeans_metrics(mckm_0_res) 117 | mckm_0_res.plot_metrics() 118 | 119 | mckm_1 = MultiCKMeans([2,3,3]) 120 | mckm_1.fit(pcoares_0.vectors) 121 | mckm_1_res = mckm_1.predict(pcoares_0.vectors) 122 | plot_multickmeans_metrics(mckm_1_res) 123 | mckm_1_res.plot_metrics() 124 | 125 | mckm_2 = MultiCKMeans([2,3,3], n_rep=100) 126 | df = pd.DataFrame(pcoares_0.vectors, pcoares_0.names) 127 | mckm_2.fit(df) 128 | mckm_2_res = mckm_2.predict(df) 129 | plot_multickmeans_metrics(mckm_2_res) 130 | mckm_2_res.plot_metrics() 131 | 132 | mckm_2_res_cls = mckm_2.predict(df, return_cls=True) 133 | assert mckm_2_res_cls.ckmeans_results[0].km_cls.shape == (mckm_2.n_rep, df.shape[0]) 134 | mckm_2_res_cls.sort(0) 135 | 136 | def test_wecr_workflow(prep_pcoa_results): 137 | pcoares_0: PCOAResult = prep_pcoa_results[0] 138 | wecr_0 = WECR([2, 3]) 139 | 140 | with tqdm.tqdm(total=wecr_0.n_rep) as pb: 141 | wecr_0.fit(pcoares_0, progress_callback=pb.update) 142 | with tqdm.tqdm(total=wecr_0.n_rep) as pb: 143 | wecr_res_0 = wecr_0.predict(pcoares_0, progress_callback=pb.update) 144 | 145 | wecr_res_0.recalculate_cluster_memberships(pcoares_0, 'single') 146 | 147 | def test_plotting(prep_pcoa_results): 148 | pcoares_0 = prep_pcoa_results[0] 149 | ckm_0 = CKmeans(k=2, n_rep=10) 150 | ckm_0.fit(pcoares_0) 151 | ckm_0_res = ckm_0.predict(pcoares_0) 152 | 153 | ckm_0_res.sort() 154 | ord = ckm_0_res.order() 155 | ckm_0_res.reorder(ord) 156 | 157 | plot_ckmeans_result(ckm_0_res) 158 | plot_ckmeans_result(ckm_0_res, order=None) 159 | plot_ckmeans_result(ckm_0_res, order=ord) 160 | 161 | ckm_0_res.plot() 162 | ckm_0_res.plot(order=None) 163 | ckm_0_res.plot(order=ord) 164 | 165 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=ckm_0_res.names, order = None) 166 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=ckm_0_res.names, order = 'GW') 167 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=ckm_0_res.names, order = ord) 168 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=None, order = None) 169 | -------------------------------------------------------------------------------- /pyckmeans/distance/__init__.py: -------------------------------------------------------------------------------- 1 | ''' distance 2 | 3 | Module for distance calculations. 4 | ''' 5 | 6 | from typing import Iterable, Optional, Tuple 7 | 8 | import numpy 9 | 10 | import pyckmeans.io 11 | 12 | from .c_interop import p_distance, jc_distance, k2p_distance 13 | 14 | class IncompatibleNamesError(Exception): 15 | '''IncompatibleNamesError''' 16 | 17 | class DistanceMatrix: 18 | '''__init__ 19 | 20 | Distance Matrix, optionally named. 21 | 22 | Parameters 23 | ---------- 24 | dist_mat : numpy.ndarray 25 | n*n distance matrix. 26 | names : Optional[Iterable[str]] 27 | Names, by default None. 28 | 29 | Raises 30 | ------ 31 | IncompatibleNamesError 32 | Raised if dimension of names and dist_mat are incompatible. 
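# A minimal construction sketch for the class documented here; names are
# optional and must match the matrix dimension.
import numpy
from pyckmeans.distance import DistanceMatrix

dm = DistanceMatrix(
    numpy.array([
        [0.0, 0.4],
        [0.4, 0.0],
    ]),
    names=['sample_a', 'sample_b'],
)
print(dm.shape)   # (2, 2)
print(dm.names)   # ['sample_a' 'sample_b']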
33 | ''' 34 | def __init__(self, dist_mat: numpy.ndarray, names: Optional[Iterable[str]] = None): 35 | self.dist_mat = dist_mat 36 | self.names = None 37 | 38 | if not names is None: 39 | n = dist_mat.shape[0] 40 | if len(names) != n: 41 | msg = f'Expected {n} names for {n}x{n} distance matrix ' +\ 42 | f'but {len(names)} were passed.' 43 | raise IncompatibleNamesError(msg) 44 | 45 | self.names = numpy.array(names) 46 | 47 | def __repr__(self) -> str: 48 | '''__repr__ 49 | 50 | Returns 51 | ------- 52 | str 53 | String representation. 54 | ''' 55 | return f'{repr(self.names)}\n{repr(self.dist_mat)}' 56 | 57 | @property 58 | def shape(self) -> Tuple[int]: 59 | '''shape 60 | 61 | Get matrix shape. 62 | 63 | Returns 64 | ------- 65 | Tuple[int] 66 | Matrix shape. 67 | ''' 68 | return self.dist_mat.shape 69 | 70 | @staticmethod 71 | def from_phylip(file_path: str) -> 'DistanceMatrix': 72 | '''from_phylip 73 | 74 | Read PHYLIP distance matrix. 75 | 76 | Returns 77 | ------- 78 | DistanceMatrix 79 | DistanceMatrix object. 80 | ''' 81 | return pyckmeans.io.phylip.read_phylip_distmat(file_path) 82 | 83 | @staticmethod 84 | def from_csv( # pylint: disable=missing-param-doc 85 | file_path: str, 86 | header: Optional[int] = 0, 87 | index_col: Optional[int] = 0, 88 | sep: str = ',', 89 | **kwargs, 90 | ) -> 'DistanceMatrix': 91 | '''read_csv_distmat 92 | 93 | Read distance matrix from CSV file. 94 | 95 | Parameters 96 | ---------- 97 | file_path : str 98 | Path to CSV file. 99 | header : Optional[int] 100 | Determines the row in the CSV file containing 101 | sample names. Is passed to pandas.read_csv(). By default 0, meaning 102 | the first row. 103 | index_col : Optional[int] 104 | Determines the index column. By default, the first column is expected 105 | to contain sample names. Passed to pandas.read_csv(). 106 | sep : str 107 | Column separator, be default ','. Passed to Passed to pandas.read_csv(). 108 | **kwargs 109 | Additional keyword arguments passed to pandas.read_csv(). 110 | Returns 111 | ------- 112 | pyckmeans.distance.DistanceMatrix 113 | DistanceMatrix object. 114 | ''' 115 | return pyckmeans.io.csv.read_csv_distmat( 116 | file_path=file_path, 117 | header=header, 118 | index_col=index_col, 119 | sep=sep, 120 | **kwargs, 121 | ) 122 | 123 | def to_phylip( 124 | self, 125 | file_path: str, 126 | force: bool = False, 127 | ): 128 | '''to_phylip 129 | 130 | Write distance matrix to file in PHYLIP matrix format. 131 | 132 | Parameters 133 | ---------- 134 | file_path : str 135 | Output file path. 136 | force : bool, optional 137 | Force overwrite if file exists, by default False 138 | ''' 139 | pyckmeans.io.phylip.write_phylip_distmat( 140 | dist=self, 141 | file_path=file_path, 142 | force=force, 143 | ) 144 | 145 | def to_csv( 146 | self, 147 | file_path: str, 148 | force: bool = False, 149 | ): 150 | '''to_csv 151 | 152 | Write DistanceMatrix object to CSV. 153 | 154 | Parameters 155 | ---------- 156 | file_path : str 157 | CSV file path. 
158 | force : bool, optional 159 | Force overwrite if file_path already exists, by default False 160 | ''' 161 | pyckmeans.io.csv.write_csv_distmat( 162 | dist=self, 163 | file_path=file_path, 164 | force=force, 165 | ) 166 | 167 | class InvalidDistanceTypeError(Exception): 168 | '''UnknownDistanceTypeError''' 169 | 170 | def alignment_distance( 171 | alignment: "pyckmeans.io.NucleotideAlignment", 172 | distance_type: str = 'p', 173 | pairwise_deletion: bool = True, 174 | ) -> DistanceMatrix: 175 | '''genetic_distance 176 | 177 | Calculate genetic distance based on a nucleotide alignment. 178 | 179 | Parameters 180 | ---------- 181 | alignment : pyckmeans.io.NucleotideAlignment 182 | Nucleotide alignment. 183 | distance_type : str, optional 184 | Type of genetic distance to calculate, by default 'p'. 185 | Available distance types are p-distances ('p'), 186 | Jukes-Cantor distances ('jc'), and Kimura 2-paramater distances 187 | ('k2p'). 188 | pairwise_deletion : bool 189 | Use pairwise deletion as action to deal with missing data. 190 | If False, complete deletion is applied. 191 | Gaps ("-", "~", " "), "?", and ambiguous bases are treated as 192 | missing data. 193 | Returns 194 | ------- 195 | DistanceMatrix 196 | n*n distance matrix. 197 | 198 | Raises 199 | ------ 200 | InvalidDistanceTypeError 201 | Raised if invalid distance_type is passed. 202 | ''' 203 | distance_type = distance_type.lower() 204 | if distance_type in ['p', 'raw']: 205 | return DistanceMatrix( 206 | p_distance(alignment.sequences, pairwise_deletion), 207 | alignment.names, 208 | ) 209 | elif distance_type in ['jc', 'jc69']: 210 | return DistanceMatrix( 211 | jc_distance(alignment.sequences, pairwise_deletion), 212 | alignment.names, 213 | ) 214 | elif distance_type in ['k2p', 'k80']: 215 | return DistanceMatrix( 216 | k2p_distance(alignment.sequences, pairwise_deletion), 217 | alignment.names, 218 | ) 219 | else: 220 | msg = f'Unknown distance type "{distance_type}".' 
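# A sketch of the accepted distance_type aliases handled above ('p'/'raw',
# 'jc'/'jc69', 'k2p'/'k80'); anything else raises InvalidDistanceTypeError.
# 'alignment.phy' is a placeholder path.
from pyckmeans.io import read_alignment
from pyckmeans.distance import alignment_distance, InvalidDistanceTypeError

aln = read_alignment('alignment.phy')
d_p = alignment_distance(aln, 'p')                   # proportion of differing sites
d_jc = alignment_distance(aln, 'jc69')               # Jukes-Cantor correction
d_k2p = alignment_distance(aln, 'k80', pairwise_deletion=False)
try:
    alignment_distance(aln, 'hamming')
except InvalidDistanceTypeError as err:
    print(err)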
221 | raise InvalidDistanceTypeError(msg) 222 | -------------------------------------------------------------------------------- /pyckmeans/core/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import pytest 3 | import tempfile 4 | import os 5 | 6 | import numpy as np 7 | from sklearn.datasets import make_blobs 8 | 9 | from pyckmeans.core.multickmeans import MultiCKMeans 10 | from pyckmeans.core.ckmeans import CKmeans, CKmeansResult, InvalidClusteringMetric 11 | from pyckmeans.core.wecr import WECR, WECRResult, InvalidConstraintsError, InvalidKError 12 | 13 | @pytest.fixture(scope='session') 14 | def test_dir(): 15 | with tempfile.TemporaryDirectory() as tempdir: 16 | 17 | yield tempdir 18 | 19 | print(f'Deleted temporary directory {tempdir}.') 20 | 21 | def assert_ckm_res_equal(a: CKmeansResult, b: CKmeansResult, eps=1e-8): 22 | assert (np.abs(a.cmatrix - b.cmatrix) < eps).all() 23 | assert (a.cl == b.cl).all() 24 | assert (a.bic is b.bic) or (a.bic - b.bic) < eps 25 | assert (a.db is b.db) or (a.db - b.db) < eps 26 | assert (a.sil is b.sil) or (a.sil - b.sil) < eps 27 | assert (a.ch is b.ch) or (a.ch - b.ch) < eps 28 | assert (a.names == b.names).all() 29 | assert a.km_cls is b.km_cls or \ 30 | (np.abs(a.km_cls - b.km_cls) < eps).all() 31 | 32 | def assert_wecr_res_equal(a: WECRResult, b: WECRResult, eps=1e-8): 33 | assert (np.abs(a.cmatrix - b.cmatrix) < eps).all() 34 | assert (a.cl == b.cl).all() 35 | assert (a.bic is b.bic) or ((a.bic - b.bic) < eps).all() 36 | assert (a.db is b.db) or ((a.db - b.db) < eps).all() 37 | assert (a.sil is b.sil) or ((a.sil - b.sil) < eps).all() 38 | assert (a.ch is b.ch) or ((a.ch - b.ch) < eps).all() 39 | assert (a.names == b.names).all() 40 | assert a.km_cls is b.km_cls or \ 41 | (np.abs(a.km_cls - b.km_cls) < eps).all() 42 | 43 | def test_simple(): 44 | ckm_0 = CKmeans(2) 45 | ckm_1 = CKmeans(np.array(3, dtype=int)) 46 | ckm_2 = CKmeans(np.array(3, dtype=np.int64)) 47 | 48 | def test_ckmeans(): 49 | x_0, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15], shuffle=False) 50 | ckm_0 = CKmeans(3, metrics=['sil', 'bic', 'db', 'ch']) 51 | ckm_0.fit(x_0) 52 | ckm_res_0 = ckm_0.predict(x_0) 53 | 54 | with pytest.raises(InvalidClusteringMetric): 55 | CKmeans(3, metrics=['NONEXISTENT_METRIC']) 56 | 57 | def test_wecr(): 58 | x_0, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15], shuffle=False) 59 | wecr_0 = WECR([2,3,4,5], 100) 60 | wecr_0.fit(x_0) 61 | 62 | wecr_res_0 = wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 99]]) 63 | with pytest.raises(InvalidConstraintsError): 64 | wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 100]]) 65 | with pytest.raises(InvalidConstraintsError): 66 | wecr_0.predict(x_0, must_link=[[0, 1], [101, 2]], must_not_link=[[0, 51], [5, 99]]) 67 | with pytest.raises(InvalidConstraintsError): 68 | wecr_0.predict(x_0, must_link=[['a', 'b'], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 69 | with pytest.raises(InvalidConstraintsError): 70 | wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[['a', 'b'], ['c', 'd']]) 71 | with pytest.raises(InvalidConstraintsError): 72 | wecr_0.predict(x_0, must_link=[[0, 1, 0, 2]], must_not_link=[[0, 51], [5, 99]]) 73 | with pytest.raises(InvalidConstraintsError): 74 | wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51, 5, 99]]) 75 | 76 | x_1 = pandas.DataFrame(x_0) 77 | wecr_0.fit(x_1) 78 | wecr_res_1 = wecr_0.predict(x_1, 
must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 99]]) 79 | wecr_res_1 = wecr_0.predict(x_1, must_link=[['0', '1'], ['0', '2']], must_not_link=[[0, 51], [5, 99]]) 80 | wecr_res_1 = wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[['0', '51'], ['5', '99']]) 81 | with pytest.raises(InvalidConstraintsError): 82 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 100]]) 83 | with pytest.raises(InvalidConstraintsError): 84 | wecr_0.predict(x_1, must_link=[['a', 'b'], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 85 | with pytest.raises(InvalidConstraintsError): 86 | wecr_0.predict(x_1, must_link=[[0, 'b'], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 87 | with pytest.raises(InvalidConstraintsError): 88 | wecr_0.predict(x_1, must_link=[[0, 1], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 89 | with pytest.raises(InvalidConstraintsError): 90 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 'd']], must_not_link=[[0, 51], [5, 99]]) 91 | with pytest.raises(InvalidConstraintsError): 92 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[['a', 'b'], ['c', 'd']]) 93 | with pytest.raises(InvalidConstraintsError): 94 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 'd']]) 95 | with pytest.raises(InvalidConstraintsError): 96 | wecr_0.predict(x_1, must_link=[[0, 1, 0, 2]], must_not_link=[[0, 51], [5, 99]]) 97 | with pytest.raises(InvalidConstraintsError): 98 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51, 5, 99]]) 99 | 100 | wecr_res_1_ro = wecr_res_1.reorder(wecr_res_1.order(linkage_type='single')) 101 | wecr_res_1_sort = wecr_res_1.sort(linkage_type='single') 102 | assert_wecr_res_equal(wecr_res_1_ro, wecr_res_1_sort) 103 | 104 | wecr_res_1.plot(2) 105 | wecr_res_1.plot_metrics() 106 | wecr_res_1.plot_affinity_propagation() 107 | 108 | wecr_res_1_rcm = wecr_res_1.recalculate_cluster_memberships(x_1, 'average', in_place=True) 109 | assert wecr_res_1_rcm is wecr_res_1 110 | 111 | wecr_res_1_rcm = wecr_res_1.recalculate_cluster_memberships(x_1, 'average', in_place=False) 112 | assert not wecr_res_1_rcm is wecr_res_1 113 | 114 | cl = wecr_res_1.get_cl(2, with_names=False) 115 | cl = wecr_res_1.get_cl(2, with_names=True) 116 | cl = wecr_res_1.get_cl_affinity_propagation(with_names=False) 117 | cl = wecr_res_1.get_cl_affinity_propagation(with_names=True) 118 | with pytest.raises(InvalidKError): 119 | wecr_res_1.get_cl(12500) 120 | with pytest.raises(InvalidKError): 121 | wecr_res_1.get_cl(-1) 122 | 123 | def test_multickmeans(): 124 | x_0, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15], shuffle=False) 125 | mckm_0 = MultiCKMeans([2,3,4,5], n_rep=25) 126 | mckm_0.fit(x_0) 127 | mckm_res_0 = mckm_0.predict(x_0) 128 | mckm_res_0_ro = mckm_res_0.reorder(mckm_res_0.order(2), in_place=True) 129 | assert mckm_res_0 is mckm_res_0_ro 130 | 131 | mckm_res_0_ro = mckm_res_0.reorder(mckm_res_0.order(2), in_place=False) 132 | assert not mckm_res_0 is mckm_res_0_ro 133 | 134 | with pytest.raises(InvalidClusteringMetric): 135 | MultiCKMeans([2,3,4,5], n_rep=25, metrics=['NONEXISTENT_METRIC']) 136 | 137 | @pytest.mark.parametrize('return_cls', [True, False]) 138 | def test_save_load_ckm_res(test_dir, return_cls): 139 | x, _ = make_blobs(100, 5, centers=3) 140 | ckm = CKmeans(3, 20) 141 | ckm.fit(x) 142 | ckm_res = ckm.predict(x, return_cls=return_cls) 143 | 144 | if return_cls: 145 | ckm_res_km_cls_file = os.path.join(test_dir, f'{return_cls}_ckm_res_km_cls.txt') 146 | 
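# A sketch of the persistence round trips exercised by this test: JSON string,
# JSON file, and directory form; output paths are placeholders.
from sklearn.datasets import make_blobs
from pyckmeans.core.ckmeans import CKmeans, CKmeansResult

x, _ = make_blobs(100, 5, centers=3)
ckm = CKmeans(3, 20)
ckm.fit(x)
res = ckm.predict(x, return_cls=True)

res.to_json('ckm_res.json')                           # write JSON file
loaded = CKmeansResult.from_json('ckm_res.json')
same = CKmeansResult.from_json_str(res.to_json())     # via in-memory JSON string
res.to_dir('ckm_res_dir')                             # directory form
loaded_dir = CKmeansResult.from_dir('ckm_res_dir')
res.save_km_cls('km_cls.tsv', one_hot=True)           # per-replicate k-means labels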
ckm_res.save_km_cls(ckm_res_km_cls_file, one_hot=False) 147 | ckm_res.save_km_cls(ckm_res_km_cls_file, one_hot=True) 148 | 149 | assert_ckm_res_equal( 150 | ckm_res, CKmeansResult.from_json_str(ckm_res.to_json()) 151 | ) 152 | 153 | ckm_res_json_file = os.path.join(test_dir, f'{return_cls}_ckm_res.json') 154 | ckm_res.to_json(ckm_res_json_file) 155 | ckm_res_l = CKmeansResult.from_json(ckm_res_json_file) 156 | 157 | assert_ckm_res_equal(ckm_res, ckm_res_l) 158 | 159 | ckm_res_dir = os.path.join(test_dir, f'{return_cls}_ckm_res') 160 | ckm_res.to_dir(ckm_res_dir) 161 | ckm_res.to_dir(ckm_res_dir, force=True) 162 | with pytest.raises(Exception): 163 | ckm_res.to_dir(ckm_res_dir, force=False) 164 | ckm_res_l = CKmeansResult.from_dir(ckm_res_dir) 165 | assert_ckm_res_equal(ckm_res, ckm_res_l) 166 | 167 | with pytest.raises(Exception): 168 | CKmeansResult.from_dir('SOME_NONEXISTENT_DIR') 169 | 170 | @pytest.mark.parametrize('return_cls', [True, False]) 171 | def test_save_load_wecr_res(test_dir, return_cls): 172 | x, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15]) 173 | wecr = WECR([2,3,4,5], 100) 174 | wecr.fit(x) 175 | wecr_res = wecr.predict(x, return_cls=return_cls) 176 | 177 | if return_cls: 178 | wecr_res_km_cls_file = os.path.join(test_dir, f'{return_cls}_wecr_res_km_cls.txt') 179 | wecr_res.save_km_cls(wecr_res_km_cls_file, one_hot=False) 180 | wecr_res.save_km_cls(wecr_res_km_cls_file, one_hot=True) 181 | 182 | assert_wecr_res_equal( 183 | wecr_res, WECRResult.from_json_str(wecr_res.to_json()) 184 | ) 185 | 186 | wecr_res_json_file = os.path.join(test_dir, f'{return_cls}_wecr_res.json') 187 | wecr_res.to_json(wecr_res_json_file) 188 | wecr_res_l = WECRResult.from_json(wecr_res_json_file) 189 | assert_wecr_res_equal(wecr_res, wecr_res_l) 190 | 191 | wecr_res_dir = os.path.join(test_dir, f'{return_cls}_wecr_res') 192 | wecr_res.to_dir(wecr_res_dir) 193 | wecr_res.to_dir(wecr_res_dir, force=True) 194 | with pytest.raises(Exception): 195 | wecr_res.to_dir(wecr_res_dir, force=False) 196 | wecr_res_l = WECRResult.from_dir(wecr_res_dir) 197 | assert_wecr_res_equal(wecr_res, wecr_res_l) 198 | 199 | with pytest.raises(Exception): 200 | WECRResult.from_dir('SOME_NONEXISTENT_DIR') 201 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_phylip.py: -------------------------------------------------------------------------------- 1 | from pyckmeans.distance import DistanceMatrix 2 | import pytest 3 | import tempfile 4 | import os 5 | 6 | import numpy as np 7 | 8 | from pyckmeans.io import phylip 9 | from pyckmeans.io.phylip import InvalidPhylipAlignmentError, InvalidPhylipMatrixError, read_phylip_distmat, write_phylip_distmat, IncompatibleNamesError 10 | 11 | 12 | # ==== alignment 13 | 14 | PHYLIP_STR_0 = \ 15 | '''2 9 16 | Sample0 ACTGTCATG 17 | Sample1 ACT--CATC 18 | ''' 19 | 20 | PHYLIP_STR_1 = \ 21 | '''2 9 22 | Sample0 ACTGT CATG 23 | Sample1 ACT-- CATC 24 | ''' 25 | 26 | PHYLIP_STR_2 = \ 27 | '''2 9 28 | Sample0 ACTGT CATG 29 | 30 | Sample1 ACT-- CATC 31 | 32 | ''' 33 | 34 | PHYLIP_STR_3 = \ 35 | '''2 9 3 36 | Sample0 ACTGTCATG 37 | Sample1 ACT--CATC 38 | ''' 39 | 40 | PHYLIP_STR_4 = \ 41 | '''2 8 42 | Sample0 ACTGTCATG 43 | Sample1 ACT--CATC 44 | Sample2 ACTTGCATC 45 | ''' 46 | 47 | PHYLIP_STR_5 = \ 48 | '''1 9 49 | Sample0 ACTGTCATG 50 | ''' 51 | 52 | @pytest.fixture(scope='session') 53 | def prep_phylip_files(): 54 | with tempfile.TemporaryDirectory() as tempdir: 55 | print(f'Created temporary directory {tempdir}.') 56 | 57 | 
phylip_file_0 = os.path.join(tempdir, 'phylip_0.phy') 58 | with open(phylip_file_0, 'w') as f: 59 | f.write(PHYLIP_STR_0) 60 | 61 | phylip_file_1 = os.path.join(tempdir, 'phylip_1.phy') 62 | with open(phylip_file_1, 'w') as f: 63 | f.write(PHYLIP_STR_1) 64 | 65 | phylip_file_2 = os.path.join(tempdir, 'phylip_2.phy') 66 | with open(phylip_file_2, 'w') as f: 67 | f.write(PHYLIP_STR_2) 68 | 69 | phylip_file_3 = os.path.join(tempdir, 'phylip_3.phy') 70 | with open(phylip_file_3, 'w') as f: 71 | f.write(PHYLIP_STR_3) 72 | 73 | phylip_file_4 = os.path.join(tempdir, 'phylip_4.phy') 74 | with open(phylip_file_4, 'w') as f: 75 | f.write(PHYLIP_STR_4) 76 | 77 | phylip_file_5 = os.path.join(tempdir, 'phylip_5.phy') 78 | with open(phylip_file_5, 'w') as f: 79 | f.write(PHYLIP_STR_5) 80 | 81 | yield ( 82 | # should work 83 | phylip_file_0, 84 | phylip_file_1, 85 | phylip_file_2, 86 | 87 | # shouldn't work 88 | phylip_file_3, 89 | phylip_file_4, 90 | phylip_file_5, 91 | ) 92 | 93 | print(f'Deleted temporary directory {tempdir}.') 94 | 95 | def test_read_phylip_alignment(prep_phylip_files): 96 | r_0 = phylip.read_phylip_alignment(prep_phylip_files[0]) 97 | r_1 = phylip.read_phylip_alignment(prep_phylip_files[1]) 98 | r_2 = phylip.read_phylip_alignment(prep_phylip_files[2]) 99 | 100 | print('r_0', r_0) 101 | print('r_1', r_1) 102 | print('r_2', r_2) 103 | 104 | with pytest.raises(InvalidPhylipAlignmentError): 105 | r_3 = phylip.read_phylip_alignment(prep_phylip_files[3]) 106 | with pytest.raises(InvalidPhylipAlignmentError): 107 | r_4 = phylip.read_phylip_alignment(prep_phylip_files[4]) 108 | with pytest.raises(InvalidPhylipAlignmentError): 109 | r_5 = phylip.read_phylip_alignment(prep_phylip_files[5]) 110 | 111 | 112 | 113 | # ==== distance 114 | 115 | PHYLIP_DIST_STR_0 = \ 116 | '''4 117 | Sample0 0.00 0.90 0.80 0.30 118 | Sample1 0.90 0.00 0.40 0.70 119 | Sample2 0.80 0.40 0.00 0.50 120 | Sample3 0.30 0.70 0.50 0.00 121 | ''' 122 | 123 | PHYLIP_DIST_STR_1 = \ 124 | '''4 125 | Sample0 126 | Sample1 0.90 127 | Sample2 0.80 0.40 128 | Sample3 0.30 0.70 0.50 129 | ''' 130 | 131 | PHYLIP_DIST_STR_2 = \ 132 | '''5 133 | Sample0 134 | Sample1 0.90 135 | Sample2 0.80 0.40 136 | Sample3 0.30 0.70 0.50 137 | ''' 138 | 139 | PHYLIP_DIST_STR_3 = \ 140 | '''4 141 | Sample0 142 | Sample1 0.90 143 | Sample2 0.80 144 | Sample3 0.30 0.70 0.50 145 | ''' 146 | 147 | PHYLIP_DIST_STR_4 = \ 148 | '''5 149 | Sample0 0.00 0.90 0.80 0.30 150 | Sample1 0.90 0.00 0.40 0.70 151 | Sample2 0.80 0.40 0.00 0.50 152 | Sample3 0.30 0.70 0.50 0.00 153 | ''' 154 | 155 | PHYLIP_DIST_STR_5 = \ 156 | '''4 157 | Sample0 0.00 0.90 0.80 0.30 158 | Sample1 0.90 0.40 0.70 159 | Sample2 0.80 0.40 0.00 0.50 160 | Sample3 0.30 0.70 0.50 0.00 161 | ''' 162 | 163 | PHYLIP_DIST_STR_6 = \ 164 | '''4 165 | 166 | Sample0 0.00 0.90 0.80 0.30 167 | Sample1 0.90 0.00 0.40 0.70 168 | Sample2 0.80 0.40 0.00 0.50 169 | Sample3 0.30 0.70 0.50 0.00 170 | ''' 171 | 172 | PHYLIP_DIST_STR_7 = \ 173 | '''X 174 | Sample0 0.00 0.90 0.80 0.30 175 | 176 | Sample1 0.90 0.00 0.40 0.70 177 | Sample2 0.80 0.40 0.00 0.50 178 | Sample3 0.30 0.70 0.50 0.00 179 | ''' 180 | 181 | PHYLIP_DIST_STR_8 = \ 182 | '''4 183 | Sample0 0.00 0.90 0.80 0.30 184 | 185 | Sample1 0.90 0.00 0.40 0.70 186 | Sample2 0.80 0.40 0.00 0.50 187 | Sample3 0.30 0.70 0.50 0.00 188 | ''' 189 | 190 | PHYLIP_DIST_STR_9 = \ 191 | '''4 192 | Sample0 193 | 194 | Sample1 0.90 195 | Sample2 0.80 196 | Sample3 0.30 0.70 0.50 197 | ''' 198 | 199 | @pytest.fixture(scope='session') 200 | def 
prep_phylip_dist_files(): 201 | with tempfile.TemporaryDirectory() as tempdir: 202 | print(f'Created temporary directory {tempdir}.') 203 | 204 | phylip_dist_file_0 = os.path.join(tempdir, 'phylip_dist_0.dist') 205 | with open(phylip_dist_file_0, 'w') as f: 206 | f.write(PHYLIP_DIST_STR_0) 207 | 208 | phylip_dist_file_1 = os.path.join(tempdir, 'phylip_dist_1.dist') 209 | with open(phylip_dist_file_1, 'w') as f: 210 | f.write(PHYLIP_DIST_STR_1) 211 | 212 | phylip_dist_file_2 = os.path.join(tempdir, 'phylip_dist_2.dist') 213 | with open(phylip_dist_file_2, 'w') as f: 214 | f.write(PHYLIP_DIST_STR_2) 215 | 216 | phylip_dist_file_3 = os.path.join(tempdir, 'phylip_dist_3.dist') 217 | with open(phylip_dist_file_3, 'w') as f: 218 | f.write(PHYLIP_DIST_STR_3) 219 | 220 | phylip_dist_file_4 = os.path.join(tempdir, 'phylip_dist_4.dist') 221 | with open(phylip_dist_file_4, 'w') as f: 222 | f.write(PHYLIP_DIST_STR_4) 223 | 224 | phylip_dist_file_5 = os.path.join(tempdir, 'phylip_dist_5.dist') 225 | with open(phylip_dist_file_5, 'w') as f: 226 | f.write(PHYLIP_DIST_STR_5) 227 | 228 | phylip_dist_file_6 = os.path.join(tempdir, 'phylip_dist_6.dist') 229 | with open(phylip_dist_file_6, 'w') as f: 230 | f.write(PHYLIP_DIST_STR_6) 231 | 232 | phylip_dist_file_7 = os.path.join(tempdir, 'phylip_dist_7.dist') 233 | with open(phylip_dist_file_7, 'w') as f: 234 | f.write(PHYLIP_DIST_STR_7) 235 | 236 | phylip_dist_file_8 = os.path.join(tempdir, 'phylip_dist_8.dist') 237 | with open(phylip_dist_file_8, 'w') as f: 238 | f.write(PHYLIP_DIST_STR_8) 239 | 240 | phylip_dist_file_9 = os.path.join(tempdir, 'phylip_dist_9.dist') 241 | with open(phylip_dist_file_9, 'w') as f: 242 | f.write(PHYLIP_DIST_STR_9) 243 | 244 | yield ( 245 | # should work 246 | phylip_dist_file_0, 247 | phylip_dist_file_1, 248 | 249 | # shouldn't work 250 | phylip_dist_file_2, 251 | phylip_dist_file_3, 252 | phylip_dist_file_4, 253 | phylip_dist_file_5, 254 | phylip_dist_file_6, 255 | phylip_dist_file_7, 256 | phylip_dist_file_8, 257 | phylip_dist_file_9, 258 | ) 259 | 260 | print(f'Deleted temporary directory {tempdir}.') 261 | 262 | def test_phylip_distance(prep_phylip_dist_files): 263 | eps = 0.0001 264 | 265 | # == reading 266 | d_0 = read_phylip_distmat(prep_phylip_dist_files[0]) 267 | nm_0 = d_0.names 268 | print('d_0:', d_0) 269 | 270 | d_1 = read_phylip_distmat(prep_phylip_dist_files[1]) 271 | nm_1 = d_1.names 272 | print('d_1:', d_1) 273 | 274 | assert np.sum(np.abs(d_0.dist_mat - d_1.dist_mat)) < eps 275 | 276 | 277 | with pytest.raises(InvalidPhylipMatrixError): 278 | phylip.read_phylip_distmat(prep_phylip_dist_files[2]) 279 | with pytest.raises(InvalidPhylipMatrixError): 280 | phylip.read_phylip_distmat(prep_phylip_dist_files[3]) 281 | with pytest.raises(InvalidPhylipMatrixError): 282 | phylip.read_phylip_distmat(prep_phylip_dist_files[4]) 283 | with pytest.raises(InvalidPhylipMatrixError): 284 | phylip.read_phylip_distmat(prep_phylip_dist_files[5]) 285 | with pytest.raises(InvalidPhylipMatrixError): 286 | phylip.read_phylip_distmat(prep_phylip_dist_files[6]) 287 | with pytest.raises(InvalidPhylipMatrixError): 288 | phylip.read_phylip_distmat(prep_phylip_dist_files[7]) 289 | with pytest.raises(InvalidPhylipMatrixError): 290 | phylip.read_phylip_distmat(prep_phylip_dist_files[8]) 291 | with pytest.raises(InvalidPhylipMatrixError): 292 | phylip.read_phylip_distmat(prep_phylip_dist_files[9]) 293 | 294 | # == writing 295 | with tempfile.TemporaryDirectory() as tempdir: 296 | d_file_0 = os.path.join(tempdir, 'd_file_0.dist') 297 | 
write_phylip_distmat(d_0, d_file_0) 298 | with pytest.raises(FileExistsError): 299 | write_phylip_distmat(d_0, d_file_0, force=False) 300 | with pytest.raises(FileExistsError): 301 | d_path = os.path.join(tempdir, 'SOMEDIR') 302 | os.mkdir(d_path) 303 | write_phylip_distmat(d_0, d_path, force=True) 304 | with pytest.raises(IncompatibleNamesError): 305 | d_x = DistanceMatrix(d_0.dist_mat.copy(), d_0.names.copy()) 306 | d_x.names = d_x.names[1:] 307 | d_path = os.path.join(tempdir, 'somefile.dist') 308 | write_phylip_distmat(d_x, d_path) 309 | 310 | d_0_r = read_phylip_distmat(d_file_0) 311 | nm_0_r = d_0_r.names 312 | assert all([a == b for a, b in zip(nm_0, nm_0_r)]) 313 | assert np.sum(np.abs(d_0.dist_mat - d_0_r.dist_mat)) < eps 314 | 315 | d_file_1 = os.path.join(tempdir, 'd_file_1.dist') 316 | write_phylip_distmat(d_1, d_file_1) 317 | d_1_r = read_phylip_distmat(d_file_1) 318 | nm_1_r = d_1_r.names 319 | assert all([a == b for a, b in zip(nm_1, nm_1_r)]) 320 | assert np.sum(np.abs(d_1.dist_mat - d_1_r.dist_mat)) < eps 321 | 322 | 323 | -------------------------------------------------------------------------------- /pyckmeans/io/phylip.py: -------------------------------------------------------------------------------- 1 | ''' fasta 2 | 3 | Module for reading and writing PHYLIP files. 4 | ''' 5 | 6 | import os 7 | import re 8 | from typing import Tuple, Union 9 | 10 | import numpy 11 | 12 | import pyckmeans.distance 13 | 14 | WHITESPACE_RE = re.compile(r'\s+') 15 | 16 | class InvalidPhylipAlignmentError(Exception): 17 | '''InvalidPhylipAlignmentError 18 | ''' 19 | 20 | def read_phylip_alignment( 21 | phylip_file: str, 22 | dtype: Union[str, numpy.dtype] = 'U', 23 | ) -> Tuple[numpy.ndarray, numpy.ndarray]: 24 | '''read_phylip_alignment 25 | 26 | Read phylip alignment file. This function expects the phylip to be a valid alignment, 27 | meaning that it should contain at least 2 sequences of the same length, including 28 | gaps. 29 | 30 | WARNING: whitespace characters in entry names are NOT supported. 31 | 32 | Parameters 33 | ---------- 34 | phylip_file : str 35 | Path to a phylip file. 36 | dtype: Union[str, numpy.dtype] 37 | Data type to use for the sequence array. 38 | 39 | Returns 40 | ------- 41 | Tuple[numpy.ndarray, numpy.ndarray] 42 | Tuple of sequences and names, each as numpy array. 43 | 44 | Raises 45 | ------ 46 | InvalidPhylipAlignmentError 47 | Raised if header is malformed. 48 | InvalidPhylipAlignmentError 49 | Raised if less than 2 entries are present in phylip_file. 50 | InvalidPhylipAlignmentError 51 | Raised if number of entries does not match header. 52 | ''' 53 | 54 | names = [] 55 | seqs = [] 56 | with open(phylip_file) as phylip_f: 57 | # header 58 | header_str = next(phylip_f) 59 | try: 60 | n_entries, n_sites = [int(s) for s in header_str.split()] 61 | except: 62 | raise InvalidPhylipAlignmentError('Malformed header.') 63 | 64 | for line in phylip_f: 65 | _line = re.sub(WHITESPACE_RE, '', line) 66 | if not _line: 67 | continue 68 | l_len = len(_line) 69 | start = l_len-n_sites 70 | name = _line[:start] 71 | seq = _line[start:].upper() 72 | 73 | names.append(name) 74 | seqs.append(list(seq)) 75 | 76 | # check alignment validity 77 | n_seq = len(seqs) 78 | if len(seqs) < 2: 79 | msg = f'Expected at least 2 entries but found only {n_seq}.' 80 | raise InvalidPhylipAlignmentError(msg) 81 | 82 | if n_seq != n_entries: 83 | msg = f'Expected {n_entries} entries but found {n_seq} instead.' 
84 | raise InvalidPhylipAlignmentError(msg) 85 | 86 | # construct output 87 | seqs = numpy.array(seqs, dtype=dtype) 88 | names = numpy.array(names) 89 | 90 | return seqs, names 91 | 92 | 93 | class InvalidPhylipMatrixError(Exception): 94 | '''InvalidPhylipMatrixTypeError 95 | ''' 96 | 97 | def read_phylip_distmat(phylip_file: str) -> 'pyckmeans.distance.DistanceMatrix': 98 | '''read_phylip_distmat 99 | 100 | Read distance matrix in PHYLIP format. 101 | Supports full and lower-triangle matrices. 102 | 103 | Parameters 104 | ---------- 105 | phylip_file : str 106 | Path to distance file in phylip format. 107 | 108 | Returns 109 | ------- 110 | pyckmeans.distance.DistanceMatrix 111 | Distance matrix as pyckmeans.distance DistanceMatrix object. 112 | 113 | Raises 114 | ------ 115 | InvalidPhylipMatrixError 116 | Raised if the header is malformed. 117 | InvalidPhylipMatrixError 118 | Raised if an empty line is encountered as second line. 119 | InvalidPhylipMatrixError 120 | Raised if file format can neither be inferred as full nor 121 | as lower-triangle matrix. 122 | InvalidPhylipMatrixError 123 | Raised if an empty line is encountered. 124 | InvalidPhylipMatrixError 125 | Raised if expecting a full matrix but number of values 126 | does not match the header. 127 | InvalidPhylipMatrixError 128 | Raised if an empty line is encountered. 129 | InvalidPhylipMatrixError 130 | Raised if expecting lower-triangle matrix but number of values 131 | does not match the expected number of values for that entry. 132 | InvalidPhylipMatrixError 133 | Raised if number of names does not match number of entries 134 | stated in the header. 135 | ''' 136 | with open(phylip_file) as phylip_f: 137 | # == header 138 | header_str = next(phylip_f) 139 | try: 140 | n_entries = int(header_str.strip()) 141 | except: 142 | raise InvalidPhylipMatrixError('Malformed header.') 143 | 144 | dist_mat = numpy.zeros((n_entries, n_entries)) 145 | names = [] 146 | 147 | # == detect matrix type (full, lower-triangle) 148 | line = next(phylip_f) 149 | _line = line.strip() 150 | if not _line: 151 | msg = 'Line 2: Empty lines are not allowed.' 152 | raise InvalidPhylipMatrixError(msg) 153 | name, *mat_entries = _line.split() 154 | names.append(name) 155 | 156 | # lower-triangle matrix 157 | if len(mat_entries) == 0: 158 | mat_type = 'lower-triangle' 159 | # full matrix 160 | elif len(mat_entries) == n_entries: 161 | mat_type = 'full' 162 | dist_mat[0,] = numpy.array(mat_entries, dtype=float) 163 | # error 164 | else: 165 | msg = 'Line 2: Expected either 0 values for a lower-triangle ' +\ 166 | f'matrix or {n_entries} values for a full matrix; found ' +\ 167 | f'{len(mat_entries)} values instead.' 168 | raise InvalidPhylipMatrixError(msg) 169 | 170 | # == full matrix 171 | if mat_type == 'full': 172 | for i, line in enumerate(phylip_f): 173 | l_num = i + 3 # 1-based line number: header + first line already read 174 | 175 | _line = line.strip() 176 | if not _line: 177 | # last line can be empty 178 | if i + 2 == n_entries: 179 | continue 180 | msg = f'Line {l_num}: Empty lines are not allowed.' 181 | raise InvalidPhylipMatrixError(msg) 182 | name, *mat_entries = _line.split() 183 | names.append(name) 184 | 185 | # error 186 | if len(mat_entries) != n_entries: 187 | msg = f'Line {l_num}: Expected {n_entries} values for a full matrix but ' +\ 188 | f'found {len(mat_entries)} values instead.' 
189 | raise InvalidPhylipMatrixError(msg) 190 | 191 | dist_mat[i+1,] = numpy.array(mat_entries, dtype=float) 192 | 193 | # == lower-triangle matrix 194 | elif mat_type == 'lower-triangle': 195 | for i, line in enumerate(phylip_f): 196 | l_num = i + 3 # 1-based line number: header + first line already read 197 | 198 | _line = line.strip() 199 | if not _line: 200 | # last line can be empty 201 | if i + 2 == n_entries: 202 | continue 203 | msg = f'Line {l_num}: Empty lines are not allowed.' 204 | raise InvalidPhylipMatrixError(msg) 205 | name, *mat_entries = _line.split() 206 | names.append(name) 207 | 208 | # error 209 | if len(mat_entries) != i+1: 210 | msg = f'Line {l_num}: Expected {i+1} values for a lower-triangle ' +\ 211 | f'matrix but found {len(mat_entries)} values instead.' 212 | raise InvalidPhylipMatrixError(msg) 213 | 214 | dist_mat[i+1, :i+1] = numpy.array(mat_entries, dtype=float) 215 | 216 | # fill upper triangle 217 | dist_mat = dist_mat + dist_mat.T 218 | 219 | # check validity 220 | if len(names) != n_entries: 221 | msg = f'Expected {n_entries} entries but found {len(names)}.' 222 | raise InvalidPhylipMatrixError(msg) 223 | 224 | return pyckmeans.distance.DistanceMatrix(dist_mat, names) 225 | 226 | class IncompatibleNamesError(Exception): 227 | '''IncompatibleNamesError''' 228 | 229 | NAME_PADDING = 64 230 | 231 | def write_phylip_distmat( 232 | dist: 'pyckmeans.distance.DistanceMatrix', 233 | file_path: str, 234 | force: bool = False, 235 | ) -> None: 236 | '''write_phylip_distmat 237 | 238 | Write distance matrix to file in PHYLIP matrix format. 239 | 240 | Parameters 241 | ---------- 242 | dist : pyckmeans.distance.DistanceMatrix 243 | Distance matrix as pyckmeans.distance DistanceMatrix object. 244 | file_path : str 245 | Output file path. 246 | force : bool, optional 247 | Force overwrite if file exists, by default False 248 | 249 | Raises 250 | ------ 251 | FileExistsError 252 | Raised if file at file_path already exists and force is False. 253 | FileExistsError 254 | Raised if file_path points to an existing directory. 255 | IncompatibleNamesError 256 | Raised if names are incompatible with dist_mat. 257 | ''' 258 | if os.path.exists(file_path): 259 | if os.path.isfile(file_path) and not force: 260 | msg = f'File {file_path} already exists. If you want to overwrite ' +\ 261 | 'it run the function with force=True.' 262 | raise FileExistsError(msg) 263 | else: 264 | msg = f'A directory exists at path {file_path}.' 265 | raise FileExistsError(msg) 266 | 267 | dist_mat = dist.dist_mat 268 | names = dist.names 269 | 270 | n_entries = dist_mat.shape[0] 271 | if len(names) != n_entries: 272 | msg = f'Expected {n_entries} names but got {len(names)} instead.' 273 | raise IncompatibleNamesError(msg) 274 | 275 | with open(file_path, 'w') as phylip_f: 276 | # header 277 | phylip_f.write(f'{n_entries}\n') 278 | 279 | # body 280 | for name, dists in zip(names, dist_mat): 281 | nm_str = f'{name: <{NAME_PADDING}}' 282 | dst_str = '\t'.join(dists.astype(str)) 283 | phylip_f.write(f'{nm_str} {dst_str}\n') 284 | -------------------------------------------------------------------------------- /pyckmeans/ordering/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Module for distance matrix ordering. 
2 | ''' 3 | 4 | from typing import Union 5 | import numpy 6 | from scipy.cluster import hierarchy 7 | 8 | import pyckmeans.distance 9 | 10 | class InvalidReorderMethod(Exception): 11 | '''InvalidReorderMethod''' 12 | class InvalidLinkageType(Exception): 13 | '''InvalidLinkageType''' 14 | 15 | REORDER_METHODS = ( 16 | 'GW', 17 | 'OLO', 18 | ) 19 | LINKAGE_TYPES = ( 20 | 'average', 21 | 'complete', 22 | 'single', 23 | 'weighted', 24 | 'centroid', 25 | ) 26 | 27 | 28 | def distance_order( 29 | dist: Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'], 30 | method: str = 'GW', 31 | linkage_type: str = 'average', 32 | ) -> numpy.ndarray: 33 | '''distance_order 34 | 35 | Get optimal distance matrix order. 36 | 37 | Parameters 38 | ---------- 39 | dist : Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'] 40 | A n * n distance matrix as either numpy.ndarray or as 41 | pyckmeans.distance.DistanceMatrix object. 42 | method : str 43 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 44 | scipy.hierarchy.optimal_leaf_ordering. 45 | 46 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 47 | The British Psychological Society 25. 48 | linkage_type : str 49 | Linkage type for the hierarchical clustering. One of 50 | 51 | * 'average' 52 | * 'complete' 53 | * 'single' 54 | * 'weighted' 55 | * 'centroid' 56 | 57 | See scipy.cluster.hierarchy.linkage for details. 58 | 59 | Returns 60 | ------- 61 | numpy.ndarray 62 | Optimal order as vector. 63 | 64 | Raises 65 | ------ 66 | InvalidReorderMethod 67 | Raised if an unknown reordering method is passed. 68 | InvalidLinkageType 69 | Raised if an unknown linakage type is passed. 70 | ''' 71 | 72 | method = method.upper() 73 | if method not in REORDER_METHODS: 74 | msg = f'"{method}" is not a valid reordering method. Available ' +\ 75 | f'methods are {REORDER_METHODS}.' 76 | raise InvalidReorderMethod(msg) 77 | 78 | linkage_type = linkage_type.lower() 79 | if linkage_type not in LINKAGE_TYPES: 80 | msg = f'"{linkage_type}" is not a valid linkage type. Available ' +\ 81 | f'types are {LINKAGE_TYPES}.' 82 | raise InvalidLinkageType(msg) 83 | 84 | is_ndarray = isinstance(dist, numpy.ndarray) 85 | if is_ndarray: 86 | dist_mat = dist 87 | else: 88 | dist_mat = dist.dist_mat 89 | 90 | dist_mat_cond = condensed_form(dist_mat) 91 | linkage_mat = hierarchy.linkage(dist_mat_cond, method=linkage_type) 92 | # cluster distance can become negative due to floating point 93 | # errors. 94 | linkage_mat[numpy.abs(linkage_mat) < 1e-8] = 0 95 | 96 | if method == 'OLO': 97 | linkage_mat = hierarchy.optimal_leaf_ordering(linkage_mat, dist_mat_cond) 98 | elif method == 'GW': 99 | linkage_mat = reorder_linkage_gw(linkage_mat, dist_mat) 100 | 101 | order = hierarchy.leaves_list(linkage_mat) 102 | dist_mat = dist_mat[order, :][:, order] 103 | 104 | return order 105 | 106 | # This function duplicates code from distance_order, but I 107 | # want to keep this duplication for now for more flexibility 108 | def reorder_distance( 109 | dist: Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'], 110 | method: str = 'GW', 111 | linkage_type: str = 'average', 112 | ) -> Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix']: 113 | '''reorder_distance 114 | 115 | Reorder distance matrix using hierarchical clustering. 
116 | 117 | Parameters 118 | ---------- 119 | dist : Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'] 120 | A n * n distance matrix as either numpy.ndarray or as 121 | pyckmeans.distance.DistanceMatrix object. 122 | method : str 123 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 124 | scipy.hierarchy.optimal_leaf_ordering. 125 | 126 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 127 | The British Psychological Society 25. 128 | linkage_type : str 129 | Linkage type for the hierarchical clustering. One of 130 | 131 | * 'average' 132 | * 'complete' 133 | * 'single' 134 | * 'weighted' 135 | * 'centroid' 136 | 137 | See scipy.cluster.hierarchy.linkage for details. 138 | 139 | Returns 140 | ------- 141 | Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'] 142 | The sorted distance matrix as either numpy.ndarray or 143 | pyckmeans.distance.DistanceMatrix, depending on the input. 144 | 145 | Raises 146 | ------ 147 | InvalidReorderMethod 148 | Raised if an unknown reordering method is passed. 149 | InvalidLinkageType 150 | Raised if an unknown linakage type is passed. 151 | ''' 152 | method = method.upper() 153 | if method not in REORDER_METHODS: 154 | msg = f'"{method}" is not a valid reordering method. Available ' +\ 155 | f'methods are {REORDER_METHODS}.' 156 | raise InvalidReorderMethod(msg) 157 | 158 | linkage_type = linkage_type.lower() 159 | if linkage_type not in LINKAGE_TYPES: 160 | msg = f'"{linkage_type}" is not a valid linkage type. Available ' +\ 161 | f'types are {LINKAGE_TYPES}.' 162 | raise InvalidLinkageType(msg) 163 | 164 | is_ndarray = isinstance(dist, numpy.ndarray) 165 | if is_ndarray: 166 | dist_mat = dist 167 | else: 168 | dist_mat = dist.dist_mat 169 | 170 | dist_mat_cond = condensed_form(dist_mat) 171 | linkage_mat = hierarchy.linkage(dist_mat_cond, method=linkage_type) 172 | if method == 'OLO': 173 | linkage_mat = hierarchy.optimal_leaf_ordering(linkage_mat, dist_mat_cond) 174 | elif method == 'GW': 175 | linkage_mat = reorder_linkage_gw(linkage_mat, dist_mat) 176 | 177 | order = hierarchy.leaves_list(linkage_mat) 178 | dist_mat = dist_mat[order, :][:, order] 179 | 180 | if is_ndarray: 181 | return dist_mat 182 | else: 183 | return pyckmeans.distance.DistanceMatrix( 184 | dist_mat, 185 | dist.names[order] if not dist.names is None else None, 186 | ) 187 | 188 | def condensed_form(dist: numpy.ndarray) -> numpy.ndarray: 189 | '''condensed_form 190 | 191 | Convert n*n distance matrix to condensed vector form. 192 | 193 | Parameters 194 | ---------- 195 | dist : numpy.ndarray 196 | n * n distance matrix. 197 | 198 | Returns 199 | ------- 200 | numpy.ndarray 201 | Distance matrix in condensed vector form as expected by 202 | scipy.cluster.hierarchy.linkage. 203 | ''' 204 | 205 | return dist[numpy.triu_indices_from(dist, k=1)] 206 | 207 | def reorder_linkage_gw( 208 | linkage: numpy.ndarray, 209 | dist: numpy.ndarray, 210 | ) -> numpy.ndarray: 211 | '''reorder_linkage_gw 212 | 213 | Reorder linkage matrix using the algorithm described by Gruvaeus & Wainer (1972) [1]_. 214 | 215 | Parameters 216 | ---------- 217 | linkage : numpy.ndarray 218 | Linkage matrix as returned from scipy.cluster.hierarchy.linkage. 219 | dist : numpy.ndarray 220 | n * n distance matrix. 221 | 222 | Returns 223 | ------- 224 | numpy.ndarray 225 | Reordered linkage matrix. 226 | 227 | References 228 | ---------- 229 | .. [1] Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 
230 | The British Psychological Society 25. 231 | ''' 232 | linkage = linkage.copy() 233 | 234 | n = linkage.shape[0] 235 | 236 | # left and right leaves of a cluster 237 | l_r = numpy.zeros((n, 2)) 238 | # matrix determining, whether a cluster (subtree) should be flipped 239 | flip = numpy.full((n, 2), False) 240 | 241 | # find left and right leaves of clusters 242 | # and determine, whether cluster should 243 | # be flipped 244 | for i in range(n): 245 | l, r = linkage[i, [0, 1]].astype(int) 246 | 247 | # l and r are singletons 248 | if l <= n and r <= n: 249 | l_r[i] = (l, r) 250 | # only l is a singleton 251 | elif l <= n: 252 | l_r[i, 0] = l 253 | 254 | # left and right leaves of cluster r 255 | rl, rr = l_r[r - (n + 1)].astype(int) 256 | 257 | if dist[l, rl] < dist[l, rr]: 258 | l_r[i, 1] = rr 259 | else: 260 | l_r[i, 1] = rl 261 | flip[i, 1] = True 262 | # only r is singleton 263 | elif r <= n: 264 | l_r[i, 1] = r 265 | 266 | # left and right leaves of cluster l 267 | ll, lr = l_r[l - (n + 1)].astype(int) 268 | 269 | if dist[r, ll] < dist[r, lr]: 270 | l_r[i, 0] = lr 271 | flip[i, 0] = True 272 | else: 273 | l_r[i, 0] = ll 274 | # none of l and r are singletons 275 | else: 276 | # left and right leaves 277 | ll, lr = l_r[l - (n + 1)].astype(int) 278 | rl, rr = l_r[r - (n + 1)].astype(int) 279 | 280 | d_ll_rl = dist[ll, rl] # 0 281 | d_ll_rr = dist[ll, rr] # 1 282 | d_lr_rl = dist[lr, rl] # 2 283 | d_lr_rr = dist[lr, rr] # 3 284 | 285 | mn_idx = numpy.argmin([d_ll_rl, d_ll_rr, d_lr_rl, d_lr_rr]) 286 | if mn_idx == 0: # d_ll_rl 287 | l_r[i] = (lr, rr) 288 | flip[i, 0] = True 289 | elif mn_idx == 1: # d_ll_rr 290 | l_r[i] = (lr, rl) 291 | flip[i] = (True, True) 292 | elif mn_idx == 2: # d_lr_rl 293 | l_r[i] = (ll, rr) 294 | else: # d_lr_rr 295 | l_r[i] = (ll, rl) 296 | flip[i, 1] = True 297 | 298 | # apply flip 299 | for i in range((n-1), 0, -1): 300 | if flip[i, 0]: 301 | c = linkage[i, 0].astype(int) 302 | # non-singleton cluster 303 | if c > n: 304 | c = c - (n + 1) 305 | linkage[c, [0, 1]] = linkage[c, [1, 0]] 306 | if flip[c, 0] == flip[c, 1]: 307 | flip[c] = ~flip[c] 308 | if flip[i, 1]: 309 | c = linkage[i, 1].astype(int) 310 | if c > n: 311 | c = c - (n + 1) 312 | linkage[c, [0, 1]] = linkage[c, [1, 0]] 313 | if flip[c, 0] == flip[c, 1]: 314 | flip[c] = ~flip[c] 315 | 316 | return linkage 317 | -------------------------------------------------------------------------------- /pyckmeans/distance/src/distance.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #define LIBRARY_API extern "C" __declspec(dllexport) 3 | #else 4 | #define LIBRARY_API extern "C" 5 | #endif 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /* 13 | * Base encoding as used by R package ape. 14 | * See http://ape-package.ird.fr/misc/BitLevelCodingScheme.html 15 | * 16 | * Summary: 17 | * Most significant four bits are base information (A, G, C, T) 18 | * 76543210 19 | * 0b00001000 -> base is known 20 | * 0b00000100 -> gap 21 | * 0b00000010 -> unknown base 22 | * 23 | * bases 24 | * A 0b10001000 25 | * G 0b01001000 26 | * C 0b00101000 27 | * T 0b00011000 28 | * 29 | * wobbles 30 | * R 0b11000000 A|G 31 | * M 0b10100000 A|C 32 | * W 0b10010000 A|T 33 | * S 0b01100000 G|C 34 | * K 0b01010000 G|T 35 | * Y 0b00110000 C|T 36 | * V 0b11100000 A|G|C 37 | * H 0b10110000 A|C|T 38 | * D 0b11010000 A|G|T 39 | * B 0b01110000 G|C|T 40 | * N 0b11110000 A|G|C|T 41 | * 42 | * gap 43 | * - 0b00000100 44 | * 45 | * unknown/missing state 46 | * ? 
0b00000010 47 | * 48 | */ 49 | 50 | // bases 51 | const std::uint8_t A = 0b10001000; // A 52 | const std::uint8_t G = 0b01001000; // G 53 | const std::uint8_t C = 0b00101000; // C 54 | const std::uint8_t T = 0b00011000; // T 55 | // wobbles 56 | const std::uint8_t R = 0b11000000; // A|G 57 | const std::uint8_t M = 0b10100000; // A|C 58 | const std::uint8_t W = 0b10010000; // A|T 59 | const std::uint8_t S = 0b01100000; // G|C 60 | const std::uint8_t K = 0b01010000; // G|T 61 | const std::uint8_t Y = 0b00110000; // C|T 62 | const std::uint8_t V = 0b11100000; // A|G|C 63 | const std::uint8_t H = 0b10110000; // A|C|T 64 | const std::uint8_t D = 0b11010000; // A|G|T 65 | const std::uint8_t B = 0b01110000; // G|C|T 66 | const std::uint8_t N = 0b11110000; // A|G|C|T 67 | // extra 68 | const std::uint8_t KNOWN = 0b00001000; // base is known, i.e. A, G, C, T 69 | const std::uint8_t GAP = 0b00000100; // gap 70 | const std::uint8_t UNKNOWN = 0b00000010; // base is unknown, e.g. missing data 71 | 72 | const std::uint8_t NOT_PURINE = 0b00110111; // not a unabiguous purine 73 | const std::uint8_t NOT_PYRIMIDINE = 0b11000111; // not a unabiguous pyrimidine 74 | 75 | // helper functions 76 | inline bool isA(std::uint8_t base) {return base == A;} 77 | inline bool isG(std::uint8_t base) {return base == G;} 78 | inline bool isC(std::uint8_t base) {return base == C;} 79 | inline bool isT(std::uint8_t base) {return base == T;} 80 | 81 | inline bool isKnown(std::uint8_t base) {return (base & KNOWN) == KNOWN;} 82 | inline bool isUnknown(std::uint8_t base) {return base == UNKNOWN;} 83 | inline bool isGap(std::uint8_t base) {return base == GAP;} 84 | 85 | inline bool isSameBase(std::uint8_t a, std::uint8_t b) {return (a == b) && isKnown(a);} 86 | inline bool isDifferentBase(std::uint8_t a, std::uint8_t b) {return (a & b) < 0b00010000;} 87 | inline bool isMatch(std::uint8_t a, std::uint8_t b) {return (a == b);} 88 | inline bool isAmbiguousMatch(std::uint8_t a, std::uint8_t b) {return (a & b) > 0b00001111;} 89 | 90 | inline bool isPurine(std::uint8_t base) {return (base & NOT_PURINE) == 0;} 91 | inline bool isPyrimidine(std::uint8_t base) {return (base & NOT_PYRIMIDINE) == 0;} 92 | inline bool isTransition(std::uint8_t a, std::uint8_t b) { 93 | return (isPurine(a) && isPurine(b)) 94 | || (isPyrimidine(a) && isPyrimidine(b)); 95 | } 96 | inline bool isTransversion(std::uint8_t a, std::uint8_t b) {return !isTransition(a, b);} 97 | 98 | // == distances 99 | 100 | // helpers 101 | std::vector completeDeletionSites( 102 | std::uint8_t* alignment, 103 | int n, 104 | int m 105 | ) { 106 | std::vector skip(m); 107 | for (size_t i = 0; i < m; ++i) { 108 | skip[i] = false; 109 | for (size_t j = 0; j < n; ++j) { 110 | std::uint8_t base = alignment[j * m + i]; 111 | 112 | // TODO: think about whether it is a good idea to ignore wobbles 113 | if (isGap(base) || !isKnown(base)) { 114 | skip[i] = true; 115 | break; 116 | } 117 | } 118 | } 119 | 120 | return skip; 121 | } 122 | 123 | // p-distance 124 | LIBRARY_API void pDistance( 125 | std::uint8_t* alignment, // nucleotide alignment 126 | int n, // number of entries 127 | int m, // number of sites 128 | bool pairwiseDeletion, // gap handling 129 | double *distMat // (output) distance matrix 130 | ) { 131 | // pairwise deletion 132 | if (pairwiseDeletion) { 133 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 134 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 135 | // double to avoid casting later 136 | double nComp = 0; 137 | double nMatch = 0; 138 | for (size_t j = 0; j < m; 
++j) { 139 | std::uint8_t a = alignment[i_a * m + j]; 140 | std::uint8_t b = alignment[i_b * m + j]; 141 | 142 | // TODO: think about this... This seems to be the same way as in ape 143 | // but I'm not sure that it is a good idea to ignore wobbles. 144 | if (!(isGap(a) || isGap(b)) && isKnown(a) && isKnown(b)) { 145 | nComp += 1; 146 | nMatch += isMatch(a, b); 147 | } 148 | } 149 | 150 | double d = 1.0; 151 | if (nComp > 0) d = 1 - nMatch / nComp; 152 | 153 | distMat[i_a * n + i_b] = d; 154 | distMat[i_b * n + i_a] = d; 155 | } 156 | } 157 | // complete deletion 158 | } else { 159 | // find sites with missing values 160 | std::vector skip = completeDeletionSites(alignment, n, m); 161 | 162 | // p distance calculation 163 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 164 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 165 | // double to avoid casting later 166 | double nComp = 0; 167 | double nMatch = 0; 168 | for (size_t j = 0; j < m; ++j) { 169 | if (skip[j]) continue; // skip if site contains missing value 170 | std::uint8_t a = alignment[i_a * m + j]; 171 | std::uint8_t b = alignment[i_b * m + j]; 172 | 173 | nComp += 1; 174 | nMatch += isMatch(a, b); 175 | } 176 | 177 | double d = 1.0; 178 | if (nComp > 0) d = 1 - nMatch / nComp; 179 | 180 | distMat[i_a * n + i_b] = d; 181 | distMat[i_b * n + i_a] = d; 182 | } 183 | } 184 | } 185 | }; 186 | 187 | // Jukes-Cantor distance 188 | LIBRARY_API void jcDistance( 189 | std::uint8_t* alignment, // nucleotide alignment 190 | int n, // number of entries 191 | int m, // number of sites 192 | bool pairwiseDeletion, // gap handling 193 | double *distMat // (output) distance matrix 194 | ) { 195 | // calculate p 196 | pDistance( 197 | alignment, n, m, 198 | pairwiseDeletion, 199 | distMat 200 | ); 201 | 202 | for (size_t i = 0; i < n; ++i) { 203 | for (size_t j = 0; j < n; ++j) { 204 | double d = abs(- (3.0 / 4.0) * log(1 - (4.0 / 3.0) * distMat[i * n + j])); 205 | if (isnan(d)) d = INFINITY; 206 | distMat[i * n + j] = d; 207 | } 208 | } 209 | } 210 | 211 | // Kimura 2-parameter distance 212 | LIBRARY_API void k2pDistance( 213 | std::uint8_t* alignment, // nucleotide alignment 214 | int n, // number of entries 215 | int m, // number of sites 216 | bool pairwiseDeletion, // gap handling 217 | double *distMat // (output) distance matrix 218 | ) { 219 | // pairwise deletion 220 | if (pairwiseDeletion) { 221 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 222 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 223 | // double to avoid casting later 224 | double nComp = 0; 225 | double nTransitions = 0; 226 | double nTransversions = 0; 227 | for (size_t j = 0; j < m; ++j) { 228 | std::uint8_t a = alignment[i_a * m + j]; 229 | std::uint8_t b = alignment[i_b * m + j]; 230 | 231 | // TODO: think about this... This seems to be the same way as in ape 232 | // but I'm not sure that it is a good idea to ignore wobbles. 
233 | if (!(isGap(a) || isGap(b)) && isKnown(a) && isKnown(b)) { 234 | nComp += 1; 235 | // if bases are the same there is neither transition 236 | // not transversion 237 | if (isMatch(a, b)) continue; 238 | 239 | bool isTs = isTransition(a, b); 240 | nTransitions += isTs; 241 | nTransversions += 1 - isTs; 242 | } 243 | } 244 | 245 | double d = INFINITY; 246 | if (nComp > 0) { 247 | double p = nTransitions / nComp; 248 | double q = nTransversions / nComp; 249 | 250 | d = abs(-(1.0 / 2.0) * log((1 - 2 * p - q) * sqrt(1 - 2 * q))); 251 | if (isnan(d)) d = INFINITY; 252 | } 253 | 254 | distMat[i_a * n + i_b] = d; 255 | distMat[i_b * n + i_a] = d; 256 | } 257 | } 258 | // complete deletion 259 | } else { 260 | // find sites with missing values 261 | std::vector skip = completeDeletionSites(alignment, n, m); 262 | 263 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 264 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 265 | // double to avoid casting later 266 | double nComp = 0; 267 | double nTransitions = 0; 268 | double nTransversions = 0; 269 | for (size_t j = 0; j < m; ++j) { 270 | if (skip[j]) continue; // skip if site contains missing value 271 | std::uint8_t a = alignment[i_a * m + j]; 272 | std::uint8_t b = alignment[i_b * m + j]; 273 | 274 | nComp += 1; 275 | // if bases are the same there is neither transition 276 | // not transversion 277 | if (isMatch(a, b)) continue; 278 | 279 | bool isTs = isTransition(a, b); 280 | nTransitions += isTs; 281 | nTransversions += 1 - isTs; 282 | } 283 | 284 | double d = INFINITY; 285 | if (nComp > 0) { 286 | double p = nTransitions / nComp; 287 | double q = nTransversions / nComp; 288 | 289 | d = abs(-(1.0 / 2.0) * log((1 - 2 * p - q) * sqrt(1 - 2 * q))); 290 | if (isnan(d)) d = INFINITY; 291 | } 292 | 293 | distMat[i_a * n + i_b] = d; 294 | distMat[i_b * n + i_a] = d; 295 | } 296 | } 297 | } 298 | } 299 | -------------------------------------------------------------------------------- /pyckmeans/knee/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Knee and elbow search. 2 | ''' 3 | 4 | from typing import Callable, Iterable 5 | import warnings 6 | 7 | import numpy 8 | 9 | def rel_extrema_idcs( 10 | x: numpy.ndarray, 11 | cmp_fun: Callable[[numpy.ndarray, numpy.ndarray], numpy.ndarray] = numpy.greater, 12 | mode: str = 'clip', 13 | ) -> numpy.ndarray: 14 | '''rel_extrema_idcs 15 | 16 | Find indices of relative extrema. A relative extremum is found if 17 | at an element, if cmp_fun returns true for both of its neighbors. 18 | 19 | Parameters 20 | ---------- 21 | x : numpy.ndarray 22 | Data vector. 23 | cmp_fun : Callable[[numpy.ndarray, numpy.ndarray], numpy.ndarray], optional 24 | Compare function function, by default numpy.greater 25 | mode : str, optional 26 | Specifies how out-of-bounds indices will behave. 27 | 28 | * 'raise' - raise an error (default) 29 | * 'wrap' - wrap around 30 | * 'clip' - clip to the range 31 | 32 | 'clip' mode means that all indices that are too large are replaced 33 | by the index that addresses the last element along that axis. Note 34 | that this disables indexing with negative numbers. 35 | 36 | (mode documentation copied from numpy.take) 37 | 38 | Returns 39 | ------- 40 | numpy.ndarray 41 | Indices of the extrema. 
42 | ''' 43 | idcs = numpy.arange(0, x.shape[0]) 44 | left = x.take(idcs + 1, mode=mode) 45 | right = x.take(idcs - 1, mode=mode) 46 | 47 | return numpy.nonzero(cmp_fun(x, left) & cmp_fun(x, right))[0] 48 | 49 | 50 | # the following code is mostly copied from 51 | # https://github.com/arvkevi/kneed and 52 | # was adapted for compatibility 53 | 54 | VALID_CURVE = ('convex', 'concave') 55 | VALID_DIRECTION = ('increasing', 'decreasing') 56 | 57 | class KneeLocator: 58 | '''KneeLocator 59 | 60 | An implementation of the Kneedle algorithm [1]_. 61 | 62 | Once instantiated, this class attempts to find the point of maximum 63 | curvature on a line. The knee is accessible via the `.knee` attribute. 64 | 65 | Parameters 66 | ---------- 67 | x : numpy.ndarray 68 | x values. 69 | y : numpy.ndarray 70 | y values. 71 | S : float, optional 72 | Sensitivity, original paper suggests default of 1.0, by default 1.0. 73 | curve : str, optional 74 | If 'concave', algorithm will detect knees. If 'convex', it 75 | will detect elbows., by default 'concave'. 76 | direction : str, optional 77 | Curve direction. One of {'increasing', 'decreasing'}, by default 'increasing'. 78 | interp_method : str, optional 79 | Interpolation method. One of 80 | 81 | * 'interp1d' - no interpolation 82 | * 'polynomial' - polynomial interpolation 83 | 84 | By default 'interp1d'. 85 | online : bool, optional 86 | Correct old knee points if True, will return first knee if False, 87 | by default False. 88 | polynomial_degree : int, optional 89 | The degree of the fitting polynomial. Only used when interp_method='polynomial'. 90 | This argument is passed to numpy polyfit `deg` parameter., by default 7. 91 | 92 | Raises 93 | ------ 94 | ValueError 95 | Raised if invalid curve or direction argument passed. 96 | ValueError 97 | Raised if invalid interp_method argument passed. 98 | 99 | References 100 | ---------- 101 | .. [1] Satopaa, V., J., Albrecht, D., Irwin, B., Raghavan. 2011. 102 | "Finding a "Kneedle" in a Haystack: Detecting Knee Points in System Behavior". 103 | 31st International Conference on Distributed Computing Systems Workshops. 104 | doi: 10.1109/ICDCSW.2011.20. 105 | ''' 106 | def __init__( 107 | self, 108 | x: numpy.ndarray, 109 | y: numpy.ndarray, 110 | S: float = 1.0, 111 | curve: str = 'concave', 112 | direction: str = 'increasing', 113 | interp_method: str = 'interp1d', 114 | online: bool = False, 115 | polynomial_degree: int = 7, 116 | ): 117 | # Step 0: Raw Input 118 | self.x = numpy.array(x) 119 | self.y = numpy.array(y) 120 | self.curve = curve 121 | self.direction = direction 122 | self.N = len(self.x) 123 | self.S = S 124 | self.all_knees = set() 125 | self.all_norm_knees = set() 126 | self.all_knees_y = [] 127 | self.all_norm_knees_y = [] 128 | self.online = online 129 | self.polynomial_degree = polynomial_degree 130 | 131 | valid_curve = self.curve in VALID_CURVE 132 | valid_direction = self.direction in VALID_DIRECTION 133 | if not all((valid_curve, valid_direction)): 134 | raise ValueError( 135 | 'Please check that the curve and direction arguments are valid.' 
136 | ) 137 | 138 | # Step 1: fit a smooth line 139 | if interp_method == 'interp1d': 140 | # uspline = interpolate.interp1d(self.x, self.y) 141 | # self.ds_y = uspline(self.x) 142 | self.ds_y = y 143 | elif interp_method == 'polynomial': 144 | p = numpy.poly1d(numpy.polyfit(x, y, self.polynomial_degree)) 145 | self.ds_y = p(x) 146 | else: 147 | msg = f'{interp_method} is an invalid interp_method parameter, ' +\ 148 | 'use either "interp1d" or "polynomial".' 149 | raise ValueError(msg) 150 | 151 | # Step 2: normalize values 152 | self.x_normalized = self._normalize(self.x) 153 | self.y_normalized = self._normalize(self.ds_y) 154 | 155 | # Step 3: Calculate the Difference curve 156 | self.y_normalized = self.transform_y( 157 | self.y_normalized, self.direction, self.curve 158 | ) 159 | # normalized difference curve 160 | self.y_difference = self.y_normalized - self.x_normalized 161 | self.x_difference = self.x_normalized.copy() 162 | 163 | # Step 4: Identify local maxima/minima 164 | # local maxima 165 | self.maxima_indices = rel_extrema_idcs(self.y_difference, numpy.greater_equal) 166 | self.x_difference_maxima = self.x_difference[self.maxima_indices] 167 | self.y_difference_maxima = self.y_difference[self.maxima_indices] 168 | 169 | # local minima 170 | self.minima_indices = rel_extrema_idcs(self.y_difference, numpy.less_equal) 171 | self.x_difference_minima = self.x_difference[self.minima_indices] 172 | self.y_difference_minima = self.y_difference[self.minima_indices] 173 | 174 | # Step 5: Calculate thresholds 175 | self.Tmx = self.y_difference_maxima - ( 176 | self.S * numpy.abs(numpy.diff(self.x_normalized).mean()) 177 | ) 178 | 179 | # Step 6: find knee 180 | self.knee, self.norm_knee = self.find_knee() 181 | 182 | # Step 7: If we have a knee, extract data about it 183 | self.knee_y = self.norm_knee_y = None 184 | if self.knee: 185 | self.knee_y = self.y[self.x == self.knee][0] 186 | self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] 187 | 188 | @staticmethod 189 | def _normalize(x: numpy.ndarray) -> numpy.ndarray: 190 | '''_normalize 191 | 192 | Scale vector values between 0 and 1. 193 | 194 | Parameters 195 | ---------- 196 | x : numpy.ndarray 197 | Vector to scale. 198 | 199 | Returns 200 | ------- 201 | numpy.ndarray 202 | Scaled vector 203 | ''' 204 | return (x - x.min()) / (x.max() - x.min()) 205 | 206 | @staticmethod 207 | def transform_y(y: Iterable[float], direction: str, curve: str) -> float: 208 | '''transform y to concave, increasing based on given direction and curve''' 209 | # convert elbows to knees 210 | if direction == 'decreasing': 211 | if curve == 'concave': 212 | y = numpy.flip(y) 213 | elif curve == 'convex': 214 | y = y.max() - y 215 | elif direction == 'increasing' and curve == 'convex': 216 | y = numpy.flip(y.max() - y) 217 | 218 | return y 219 | 220 | def find_knee(self,): 221 | ''' 222 | This function is called when KneeLocator is instantiated. 223 | It identifies the knee value and sets the instance attributes. 224 | ''' 225 | if not self.maxima_indices.size: 226 | warnings.warn( 227 | 'No local maxima found in the difference curve\n' 228 | 'The line is probably not polynomial.', 229 | RuntimeWarning, 230 | ) 231 | return None, None 232 | # placeholder for which threshold region i is located in. 
233 | maxima_threshold_index = 0 234 | minima_threshold_index = 0 235 | # traverse the difference curve 236 | for i, x in enumerate(self.x_difference): 237 | # skip points on the curve before the the first local maxima 238 | if i < self.maxima_indices[0]: 239 | continue 240 | 241 | j = i + 1 242 | 243 | # reached the end of the curve 244 | if x == 1.0: 245 | break 246 | 247 | # if we're at a local max, increment the maxima threshold index and continue 248 | if (self.maxima_indices == i).any(): 249 | threshold = self.Tmx[maxima_threshold_index] 250 | threshold_index = i 251 | maxima_threshold_index += 1 252 | # values in difference curve are at or after a local minimum 253 | if (self.minima_indices == i).any(): 254 | threshold = 0.0 255 | minima_threshold_index += 1 256 | 257 | if self.y_difference[j] < threshold: 258 | if self.curve == 'convex': 259 | if self.direction == 'decreasing': 260 | knee = self.x[threshold_index] 261 | norm_knee = self.x_normalized[threshold_index] 262 | else: 263 | knee = self.x[-(threshold_index + 1)] 264 | norm_knee = self.x_normalized[threshold_index] 265 | 266 | elif self.curve == 'concave': 267 | if self.direction == 'decreasing': 268 | knee = self.x[-(threshold_index + 1)] 269 | norm_knee = self.x_normalized[threshold_index] 270 | else: 271 | knee = self.x[threshold_index] 272 | norm_knee = self.x_normalized[threshold_index] 273 | 274 | # add the y value at the knee 275 | y_at_knee = self.y[self.x == knee][0] 276 | y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] 277 | if knee not in self.all_knees: 278 | self.all_knees_y.append(y_at_knee) 279 | self.all_norm_knees_y.append(y_norm_at_knee) 280 | 281 | # now add the knee 282 | self.all_knees.add(knee) 283 | self.all_norm_knees.add(norm_knee) 284 | 285 | # if detecting in offline mode, return the first knee found 286 | if self.online is False: 287 | return knee, norm_knee 288 | 289 | if self.all_knees == set(): 290 | warnings.warn('No knee/elbow found') 291 | return None, None 292 | 293 | return knee, norm_knee 294 | 295 | # Niceties for users working with elbows rather than knees 296 | @property 297 | def elbow(self): 298 | return self.knee 299 | 300 | @property 301 | def norm_elbow(self): 302 | return self.norm_knee 303 | 304 | @property 305 | def elbow_y(self): 306 | return self.knee_y 307 | 308 | @property 309 | def norm_elbow_y(self): 310 | return self.norm_knee_y 311 | 312 | @property 313 | def all_elbows(self): 314 | return self.all_knees 315 | 316 | @property 317 | def all_norm_elbows(self): 318 | return self.all_norm_knees 319 | 320 | @property 321 | def all_elbows_y(self): 322 | return self.all_knees_y 323 | 324 | @property 325 | def all_norm_elbows_y(self): 326 | return self.all_norm_knees_y 327 | -------------------------------------------------------------------------------- /pyckmeans/io/nucleotide_alignment.py: -------------------------------------------------------------------------------- 1 | ''' nucleotide_alignment 2 | 3 | Module for the representation of nucleotide alignments. 4 | ''' 5 | 6 | import os 7 | from typing import Iterable, Tuple 8 | 9 | import numpy 10 | import pyckmeans.distance 11 | from .c_interop import encode_nucleotides 12 | 13 | # Base encoding as used by R package ape. 
14 | # See http://ape-package.ird.fr/misc/BitLevelCodingScheme.html 15 | # 16 | # Summary: 17 | # Most significant four bits are base information (A, G, C, T) 18 | # 0b00001000 -> base is known 19 | # 0b00000100 -> gap 20 | # 0b00000010 -> unknown base 21 | BASE_ENCODING = { 22 | # bases 23 | 'A': 0b10001000, 'a': 0b10001000, 24 | 'G': 0b01001000, 'g': 0b01001000, 25 | 'C': 0b00101000, 'c': 0b00101000, 26 | 'T': 0b00011000, 't': 0b00011000, 27 | # wobbles 28 | 'R': 0b11000000, 'r': 0b11000000, # A|G 29 | 'M': 0b10100000, 'm': 0b10100000, # A|C 30 | 'W': 0b10010000, 'w': 0b10010000, # A|T 31 | 'S': 0b01100000, 's': 0b01100000, # G|C 32 | 'K': 0b01010000, 'k': 0b01010000, # G|T 33 | 'Y': 0b00110000, 'y': 0b00110000, # C|T 34 | 'V': 0b11100000, 'v': 0b11100000, # A|G|C 35 | 'H': 0b10110000, 'h': 0b10110000, # A|C|T 36 | 'D': 0b11010000, 'd': 0b11010000, # A|G|T 37 | 'B': 0b01110000, 'b': 0b01110000, # G|C|T 38 | 'N': 0b11110000, 'n': 0b11110000, # A|G|C|T 39 | # gaps 40 | '-': 0b00000100, 41 | '~': 0b00000100, 42 | ' ': 0b00000100, 43 | # unknown/missing state 44 | '?': 0b00000010 45 | } 46 | BASE_ENCODING_INVERSE = { 47 | v:k for k, v in BASE_ENCODING.items() if k.isupper() or k in ('-', '?') 48 | } 49 | 50 | class InvalidAlignmentFileExtensionError(Exception): 51 | '''InvalidAlignmentFileExtensionError''' 52 | 53 | class InvalidAlignmentFileFormatError(Exception): 54 | '''InvalidAlignmentFileFormatError''' 55 | 56 | class InvalidAlignmentCharacterError(Exception): 57 | '''InvalidAlignmentCharacterError''' 58 | 59 | class InvalidSeqIORecordsError(Exception): 60 | '''InvalidSeqIORecordsError''' 61 | 62 | class NucleotideAlignment: 63 | '''NucleotideAlignment 64 | 65 | Class for nucleotide alignments. 66 | 67 | Parameters 68 | ---------- 69 | names : List[str] 70 | Sequence identifiers/names. 71 | sequences : numpy.ndarray 72 | n*m alignment matrix, where n is the number of entries and m 73 | is the number of sites. 74 | copy : bool 75 | If True, sequences will be copied. If false, the NucleotideAlignment 76 | will use the original sequences, potentially modifying them. 77 | fast_encoding : bool 78 | If true, a fast nucleotide encoding method without error checking 79 | will be used. ATTENTION: This will modify sequences in place. 80 | ''' 81 | def __init__( 82 | self, 83 | names: Iterable[str], 84 | sequences: numpy.ndarray, 85 | copy: bool = False, 86 | fast_encoding: bool = False, 87 | ): 88 | # check validity 89 | n_names = len(names) 90 | n_seqs = sequences.shape[0] 91 | if n_names != n_seqs: 92 | msg = f'Number of names ({n_names}) does not match number of sequences ({n_seqs}).' 93 | raise Exception(msg) 94 | self.names = numpy.array(names) 95 | 96 | # encode strings as uint8, see BASE_ENCODING 97 | if sequences.dtype != numpy.uint8: 98 | if fast_encoding: 99 | self.sequences = encode_nucleotides(sequences.copy() if copy else sequences) 100 | else: 101 | try: 102 | self.sequences = numpy.array( 103 | [[BASE_ENCODING[n] for n in row] for row in sequences], 104 | dtype=numpy.uint8, 105 | ) 106 | except KeyError as k_err: 107 | msg = f'Encountered unknown character in alignment: {str(k_err)}' 108 | raise InvalidAlignmentCharacterError(msg) from k_err 109 | else: 110 | self.sequences = sequences.copy() if copy else sequences 111 | 112 | def drop_invariant_sites(self, in_place: bool = False) -> 'NucleotideAlignment': 113 | '''drop_invariant_sites 114 | 115 | Remove invariant sites from alignment. Invariant sites 116 | are sites, where each entry has the same symbol. 
117 | 118 | Parameters 119 | ---------- 120 | in_place : bool, optional 121 | Modify self in place, by default False 122 | 123 | Returns 124 | ------- 125 | NucleotideAlignment 126 | NucleotideAlignment without invariant sites. 127 | If in_place is set to True, self is returned. 128 | ''' 129 | if in_place: 130 | self.sequences = self.sequences[ 131 | :, 132 | ~numpy.all((self.sequences == self.sequences[0,]), axis=0) 133 | ] 134 | return self 135 | else: 136 | return NucleotideAlignment( 137 | self.names.copy(), 138 | self.sequences[ 139 | :, ~numpy.all((self.sequences == self.sequences[0,]), axis=0) 140 | ].copy(), 141 | ) 142 | 143 | def copy(self) -> 'NucleotideAlignment': 144 | '''copy 145 | 146 | Return a copy of the NucleotideAligment object. 147 | 148 | Returns 149 | ------- 150 | NucleotideAlignment 151 | Copy of self. 152 | ''' 153 | return NucleotideAlignment(self.names.copy(), self.sequences.copy()) 154 | 155 | def distance( 156 | self, 157 | distance_type: str = 'p', 158 | pairwise_deletion: bool = True, 159 | ) -> 'pyckmeans.distance.DistanceMatrix': 160 | '''distance 161 | 162 | Calculate genetic distance. 163 | 164 | Parameters 165 | ---------- 166 | distance_type : str, optional 167 | Type of genetic distance to calculate, by default 'p'. 168 | Available distance types are p-distances ('p'), 169 | Jukes-Cantor distances ('jc'), and Kimura 2-paramater distances 170 | ('k2p'). 171 | pairwise_deletion : bool 172 | Use pairwise deletion as action to deal with missing data. 173 | If False, complete deletion is applied. 174 | Gaps ("-", "~", " "), "?", and ambiguous bases are treated as 175 | missing data. 176 | Returns 177 | ------- 178 | pyckmeans.distance.DistanceMatrix 179 | n*n distance matrix. 180 | ''' 181 | 182 | return pyckmeans.distance.alignment_distance( 183 | alignment=self, 184 | distance_type=distance_type, 185 | pairwise_deletion=pairwise_deletion, 186 | ) 187 | 188 | @property 189 | def shape(self) -> Tuple[int, int]: 190 | '''shape 191 | 192 | Get alignment dimensions/shapes. 193 | 194 | Returns 195 | ------- 196 | Tuple[int, int] 197 | Number of samples n, number of sites m 198 | ''' 199 | return self.sequences.shape 200 | 201 | def __getitem__(self, idx): 202 | if isinstance(idx, tuple): 203 | return NucleotideAlignment(self.names[idx[0]], self.sequences[idx]) 204 | else: 205 | return NucleotideAlignment(self.names[idx], self.sequences[idx]) 206 | 207 | def __repr__(self) -> str: 208 | '''__repr__ 209 | 210 | Returns 211 | ------- 212 | str 213 | String representation 214 | ''' 215 | shape = self.shape 216 | return f'' 217 | 218 | @classmethod 219 | def from_bp_seqio_records( 220 | cls, 221 | records: Iterable['Bio.SeqRecord.SeqRecord'], 222 | fast_encoding: bool = False, 223 | ) -> 'NucleotideAlignment': 224 | '''from_bp_seqio_records 225 | 226 | Build NucleotideAlignment from iterable of Bio.SeqRecord.SeqRecord. 227 | Such an iterable is, for example, returned by Bio.SeqIO.parse() or 228 | can be constructed using Bio.Align.MultipleSequenceAlignment(). 229 | 230 | Parameters 231 | ---------- 232 | records: Iterable['Bio.SeqRecord.SeqRecord'] 233 | Iterable of Bio.SeqRecord.SeqRecord. 234 | Such an iterable is, for example, returned by Bio.SeqIO.parse() or 235 | can be constructed using Bio.Align.MultipleSequenceAlignment(). 236 | fast_encoding : bool 237 | If true, a fast nucleotide encoding method without error checking 238 | will be used. 239 | 240 | Returns 241 | ------- 242 | NucleotideAlignment 243 | NucleotideAlignment object. 
244 | 245 | Raises 246 | ------ 247 | InvalidSeqIORecordsError 248 | Raised of sequences have different lengths. 249 | ''' 250 | names = [] 251 | seqs = [] 252 | 253 | for record in records: 254 | names.append(record.id) 255 | seqs.append(list(record.seq)) 256 | 257 | # check if all sequences have same length 258 | seq_len = len(seqs[0]) 259 | for i, seq in enumerate(seqs[1:]): 260 | cur_seq_len = len(seq) 261 | if cur_seq_len != seq_len: 262 | msg = f'Expected all sequences to have length {seq_len}' +\ 263 | f'(length of sequence #0) but sequence #{i+1} has length {cur_seq_len}.' 264 | raise InvalidSeqIORecordsError(msg) 265 | 266 | seqs = numpy.array(seqs) 267 | names = numpy.array(names) 268 | 269 | return cls(names, seqs, copy=False, fast_encoding=fast_encoding) 270 | 271 | @classmethod 272 | def from_file( 273 | cls, 274 | file_path: str, 275 | file_format='auto', 276 | fast_encoding=False, 277 | ) -> 'NucleotideAlignment': 278 | '''from_file 279 | 280 | Read nucleotide alignment from file. 281 | 282 | Parameters 283 | ---------- 284 | file_path: str 285 | Path to alignment file. 286 | file_format: str 287 | Alignment file format. Either "auto", "fasta" or "phylip". 288 | When "auto" the file format will be inferred based on the file extension. 289 | fast_encoding : bool 290 | If true, a fast nucleotide encoding method without error checking 291 | will be used. 292 | 293 | Returns 294 | ------- 295 | Tuple[numpy.ndarray, numpy.ndarray] 296 | Tuple of sequences and names, each as numpy array. 297 | 298 | Raises 299 | ------ 300 | InvalidAlignmentFileExtensionError 301 | Raised if file_format is "auto" and the file extension is not understood. 302 | InvalidAlignmentFileFormatError 303 | Raised if an invalid file_format is passed. 304 | ''' 305 | if file_format == 'auto': 306 | ext = os.path.splitext(file_path)[1].lower() 307 | 308 | if ext in ['.fasta', '.fas', '.fa']: 309 | file_format = 'fasta' 310 | elif ext in ['.phylip', '.phy']: 311 | file_format = 'phylip' 312 | else: 313 | msg = f'Unknown alignment file extension "{ext}". Please set file_format manually.' 314 | raise InvalidAlignmentFileExtensionError(msg) 315 | 316 | if file_format in ['fasta', 'FASTA']: 317 | from .fasta import read_fasta_alignment 318 | 319 | seqs, names = read_fasta_alignment( 320 | file_path, 321 | dtype='S' if fast_encoding else 'U', 322 | ) 323 | 324 | return cls( 325 | names=names, 326 | sequences=seqs, 327 | copy=False, 328 | fast_encoding=fast_encoding, 329 | ) 330 | 331 | elif file_format in ['phylip', 'PHYLIP']: 332 | from .phylip import read_phylip_alignment 333 | 334 | seqs, names = read_phylip_alignment( 335 | file_path, 336 | dtype='S' if fast_encoding else 'U', 337 | ) 338 | 339 | return cls( 340 | names=names, 341 | sequences=seqs, 342 | copy=False, 343 | fast_encoding=fast_encoding, 344 | ) 345 | 346 | else: 347 | msg = f'Unknown aligment file format "{file_format}". ' +\ 348 | 'Supported formats are "fasta" and "phylip".' 349 | raise InvalidAlignmentFileFormatError(msg) 350 | 351 | def read_alignment(file_path: str, file_format: str = 'auto') -> NucleotideAlignment: 352 | '''read_alignment 353 | 354 | Read nucleotide alignment from file. 355 | Alias for NucleotideAlignment.from_file. 356 | 357 | Parameters 358 | ---------- 359 | file_path: str 360 | Path to alignment file. 361 | file_format: str 362 | Alignment file format. Either "auto", "fasta" or "phylip". 363 | When "auto" the file format will be inferred based on the file extension. 
364 | 365 | Returns 366 | ------- 367 | NucleotideAlignment 368 | NucleotideAlignment instance. 369 | 370 | Raises 371 | ------ 372 | InvalidAlignmentFileExtensionError 373 | Raised if file_format is "auto" and the file extension is not understood. 374 | InvalidAlignmentFileFormatError 375 | Raised if an invalid file_format is passed. 376 | ''' 377 | 378 | return NucleotideAlignment.from_file(file_path, file_format) 379 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](/LICENSE) 2 | [![PyPI version](https://img.shields.io/pypi/v/pyckmeans?color=blue)](https://pypi.org/project/pyckmeans/) 3 | [![Anaconda-Server Badge](https://img.shields.io/conda/v/TankredO/pyckmeans?label=conda)](https://anaconda.org/tankredo/pyckmeans) 4 | [![Coverage Status](https://img.shields.io/coveralls/github/TankredO/pyckmeans)](https://coveralls.io/github/TankredO/pyckmeans?branch=main) 5 | 6 | [![DOI](https://zenodo.org/badge/361376094.svg)](https://zenodo.org/badge/latestdoi/361376094) 7 | 8 | # pyckmeans 9 | 10 | pyckmeans is a Python package for [Consensus K-Means](https://doi.org/10.1023/A:1023949509487) and [Weighted Ensemble Consensus of Random (WECR) K-Means](https://doi.org/10.1109/TKDE.2019.2952596) clustering, especially in the context of DNA sequence data. To evaluate the quality of clusterings, pyckmeans implements several internal validation metrics. 11 | 12 | In addition to the clustering functionality, it provides tools for working with DNA sequence data such as reading and writing of DNA alignment files, calculating genetic distances, and Principle Coordinate Analysis (PCoA) for dimensionality reduction. 13 | 14 | ## Consensus K-Means 15 | 16 | [Consensus K-Means](https://doi.org/10.1023/A:1023949509487) is an unsupervised ensemble clustering algorithm, combining multiple K-Means clusterings, where each K-Means is trained on a subset of the data (random subset) and a subset of the the features (random subspace). The predicted cluster memberships of the single clusterings are combined to a consensus (or co-association) matrix, determining the number of times each pair of samples was clustered together over all clusterings. This matrix can be interpreted as similarity matrix and can be used to resolve the final consensus clustering by subjecting it to a last clustering step, e.g. hierarchical, or spectral clustering. 17 | 18 | ## WECR K-Means 19 | 20 | [Weighted Ensemble Consensus of Random (WECR) K-Means](https://doi.org/10.1109/TKDE.2019.2952596) is a semi-supervised ensemble clustering algorithm. Similar to consensus K-Means, it is based on a collection of K-Means clusterings, which are each trained on a random subset of data and a random subspace of features. In addition, for each single clustering the number of clusters _k_ is also randomized. This library of clusterings is subjected to weighting function that integrates user-supplied must-link and must-not-link constraints, as well as an internal cluster validation criterion. The constraints represent the semi-supervised component of WECR K-Means: the user can provide prior knowledge considering the composition of the clusters. Must-link and must-not-link constraints imply that a pair of samples (observations, data points) is expected to be found in the same or different clusters, respectively. 
Based on the clusterings and the calculated weights, a weighted consensus (co-association) matrix is constructed, which is subjected to Cluster-based Similarity Partitioning (CSPA; e.g. hierarchical clustering) or spectral clustering to resolve the consensus clustering. 21 | 22 | ## Documentation 23 | 24 | See pyckmeans' [RTD Documentation](https://pyckmeans.readthedocs.io/) for details. 25 | 26 | ## Installation 27 | 28 | pyckmeans can be installed using pip, Conda, or from source. 29 | 30 | ### pip 31 | 32 | ```bash 33 | pip install pyckmeans 34 | ``` 35 | 36 | ### Conda 37 | 38 | ```bash 39 | conda install pyckmeans -c TankredO 40 | ``` 41 | 42 | ### From Source 43 | 44 | The installation from source requires `git` and a C++ compiler. 45 | 46 | ```bash 47 | git clone https://github.com/TankredO/pyckmeans 48 | cd pyckmeans 49 | pip install . 50 | ``` 51 | 52 | ## Usage 53 | 54 | Examples using the Python API: 55 | 56 | - [Consensus K-Means: Clustering a Data Matrix (Single K)](#ckmeans-data-single) 57 | - [Consensus K-Means: Clustering a Data Matrix (Multi K)](#ckmeans-data-multi) 58 | - [Consensus K-Means: Clustering Sequence Data](#ckmeans-sequence-multi) 59 | - [WECR K-Means: Clustering Sequence Data](#wecr-sequence) 60 | 61 |
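Before the API examples, the following toy sketch illustrates the co-association idea described in the introduction. It is an illustration only, not pyckmeans code: the function name `coassociation` and its parameters are made up for this sketch, it assumes scikit-learn is installed, and it omits the random feature subspaces that `CKmeans` additionally draws.

```python
# Toy sketch of a co-association (consensus) matrix built from repeated
# K-Means runs on random subsamples. Illustration only; CKmeans does this
# internally and more efficiently.
import numpy as np
from sklearn.cluster import KMeans

def coassociation(x, k=3, n_rep=100, p_samp=0.8, seed=0):
    rng = np.random.default_rng(seed)
    n = x.shape[0]
    together = np.zeros((n, n))  # times two samples were clustered together
    drawn = np.zeros((n, n))     # times two samples were drawn together
    for _ in range(n_rep):
        idx = rng.choice(n, size=int(p_samp * n), replace=False)
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(x[idx])
        same = (labels[:, None] == labels[None, :]).astype(float)
        together[np.ix_(idx, idx)] += same
        drawn[np.ix_(idx, idx)] += 1.0
    # fraction of co-draws in which two samples ended up in the same cluster
    return np.divide(together, drawn, out=np.zeros_like(together), where=drawn > 0)
```

The resulting matrix can be treated as a similarity matrix and passed to hierarchical or spectral clustering to obtain the final consensus clustering; the examples below do all of this through the actual pyckmeans API.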

### Consensus K-Means: Clustering a Data Matrix (Single K)

62 | 63 | ```python 64 | from pyckmeans import CKmeans 65 | 66 | # simulate dataset 67 | # 50 samples, 2 features, 3 true clusters 68 | import sklearn.datasets 69 | x, _ = sklearn.datasets.make_blobs(n_samples=50, n_features=2, centers=3, random_state=75) 70 | 71 | # apply Consensus K-Means 72 | # 3 clusters, 100 K-Means runs, 73 | # draw 80% of samples and 50% of features for each single K-Means 74 | ckm = CKmeans(k=3, n_rep=100, p_samp=0.8, p_feat=0.5) 75 | ckm.fit(x) 76 | ckm_res = ckm.predict(x) 77 | 78 | # plot consensus matrix and consensus clustering 79 | fig = ckm_res.plot(figsize=(7,7)) 80 | 81 | # consensus matrix 82 | ckm_res.cmatrix 83 | 84 | # clustering metrics 85 | print('Bayesian Information Criterion:', ckm_res.bic) 86 | print('Davies-Bouldin Index:', ckm_res.db) 87 | print('Silhouette Score:', ckm_res.sil) 88 | print('Calinski-Harabasz Index:', ckm_res.ch) 89 | 90 | # consensus clusters 91 | print('Cluster Membership:', ckm_res.cl) 92 | ``` 93 | 94 | Bayesian Information Criterion: 50.21824821939818 95 | Davies-Bouldin Index: 0.2893792767901513 96 | Silhouette Score: 0.7827738719266039 97 | Calinski-Harabasz Index: 630.8235586596012 98 | Cluster Membership: [0 2 1 0 2 2 1 0 2 1 0 0 2 0 2 2 1 1 1 1 0 1 2 2 2 2 1 0 2 2 1 0 1 1 0 0 0 99 | 1 0 1 2 1 2 2 1 0 0 0 0 1] 100 | 101 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_4_1.png?raw=true) 102 | 103 |
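The reported metrics can be sanity-checked against scikit-learn. This is a sketch, not part of the pyckmeans API: it assumes `x` and `ckm_res` from the example above and uses the consensus labels in `ckm_res.cl`; pyckmeans may compute its metrics on slightly different inputs, so small deviations are possible.

```python
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)

# recompute three of the reported metrics from the consensus labels
print('Silhouette Score:', silhouette_score(x, ckm_res.cl))
print('Calinski-Harabasz Index:', calinski_harabasz_score(x, ckm_res.cl))
print('Davies-Bouldin Index:', davies_bouldin_score(x, ckm_res.cl))
```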

### Consensus K-Means: Clustering a Data Matrix (Multi K)

104 | 105 | The `MultiCKMeans` class allows training multiple `CKmeans` objects at once. 106 | This is, for example, useful for exploring clustering for different values of _k_. 107 | 108 | ```python 109 | from pyckmeans import MultiCKMeans 110 | import sklearn.datasets 111 | 112 | # simulate dataset 113 | # 50 samples, 10 features, 3 true clusters 114 | x, _ = sklearn.datasets.make_blobs(n_samples=50, n_features=10, centers=3, random_state=44) 115 | 116 | # apply multiple Consensus K-Means for 117 | # k = 2, ..., 5 118 | # 100 K-Means runs per Consensus K-Means 119 | # draw 80% of the samples for each single K-Means 120 | # draw 50% of the features for each single K-Means 121 | mckm = MultiCKMeans(k=[2, 3, 4, 5], n_rep=100, p_samp=0.8, p_feat=0.5) 122 | mckm.fit(x) 123 | mckm_res = mckm.predict(x) 124 | 125 | # clustering metrics 126 | print('Metrics:') 127 | print(mckm_res.metrics) 128 | 129 | # plot clustering metrics against k 130 | # BIC, DB: lower is better 131 | # SIL, CH: higher is better 132 | mckm_res.plot_metrics(figsize=(10,5)) 133 | 134 | 135 | # get a single CKmeansResult 0 |1| 2 3 136 | ckm_res_k3 = mckm_res.ckmeans_results[1] # k=[2, 3, 4, 5] 137 | # ... 138 | # see "Clustering a Data Matrix (Single K)" 139 | ``` 140 | 141 | Metrics: 142 | k sil bic db ch 143 | 0 2 0.574369 225.092100 0.646401 59.733498 144 | 1 3 0.788207 126.358519 0.302979 387.409107 145 | 2 4 0.563343 126.979355 1.214520 271.019424 146 | 3 5 0.339466 128.061382 1.698652 211.080143 147 | 148 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_6_1.png?raw=true) 149 | 150 |
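Choosing _k_ from the metric curves can also be automated with the `KneeLocator` class from `pyckmeans.knee`. The sketch below assumes the `mckm_res.metrics` data frame shown above and treats the BIC curve as a convex, decreasing curve so that the detected point is an elbow; with only a few values of _k_ the detection is rough and should be checked against the plots.

```python
from pyckmeans.knee import KneeLocator

metrics = mckm_res.metrics
kl = KneeLocator(
    metrics['k'].values,
    metrics['bic'].values,
    curve='convex',          # detect an elbow rather than a knee
    direction='decreasing',  # BIC drops towards its optimum here
)
print('Suggested k (BIC elbow):', kl.elbow)
```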

### <a name="ckmeans-sequence-multi"></a>Consensus K-Means: Clustering Sequence Data

151 | 152 | ```python 153 | from pyckmeans import MultiCKMeans, NucleotideAlignment, pcoa 154 | from IPython.display import display 155 | # Set random seed for demonstration 156 | import numpy 157 | numpy.random.seed(0) 158 | 159 | # Load nucleotide alignment 160 | # Note: the file is available from 161 | # "https://github.com/TankredO/pyckmeans/tree/main/docs/datasets/rhodanthemum_ct85_msl68.snps.phy" 162 | aln = NucleotideAlignment.from_file('datasets/rhodanthemum_ct85_msl68.snps.phy') 163 | print('Nucleotide alignment:', aln) 164 | 165 | # Calculate Kimura 2-parameter distances 166 | dst = aln.distance(distance_type='k2p') 167 | 168 | # Apply PCoA, including negative Eigenvalue correction 169 | pcoa_res = pcoa(dst, correction='lingoes') 170 | # display Eigenvalues 171 | print('Eigenvalues:') 172 | display(pcoa_res.values) 173 | 174 | # Get Eigenvectors until the cumulative corrected Eigenvalues are >= 0.8 175 | vectors = pcoa_res.get_vectors( 176 | filter_by='eigvals_rel_corrected_cum', 177 | filter_th=0.8, 178 | out_format='pandas' 179 | ) 180 | 181 | # Apply Multi-K Consensus K-Means 182 | mckm = MultiCKMeans( 183 | k=range(2, 20), 184 | n_rep=50, 185 | p_samp=0.8, 186 | p_feat=0.8 187 | ) 188 | mckm.fit(vectors) 189 | mckm_res = mckm.predict(vectors) 190 | mckm_res.plot_metrics(figsize=(12, 7)) 191 | 192 | # Select a 'good' K 193 | # At k values around 7, BIC, DB, and SIL have a (local) optimum 194 | ckm_res_k7 = mckm_res.ckmeans_results[5] # index 5 corresponds to k=7, since k starts at 2 195 | fig = ckm_res_k7.plot(figsize=(14,14)) 196 | ``` 197 | 198 | Nucleotide alignment: 199 | Eigenvalues: 200 | 201 |
| | eigvals | eigvals_rel | eigvals_rel_cum | eigvals_rel_corrected | eigvals_rel_corrected_cum |
|-----|-----------|-------------|-----------------|-----------------------|---------------------------|
| 0 | 0.115972 | 0.471458 | 0.233986 | 0.233986 | 0.233986 |
| 1 | 0.039585 | 0.160924 | 0.317016 | 0.083030 | 0.317016 |
| 2 | 0.035079 | 0.142604 | 0.391140 | 0.074125 | 0.391140 |
| 3 | 0.017383 | 0.070665 | 0.430295 | 0.039154 | 0.430295 |
| 4 | 0.009831 | 0.039965 | 0.454525 | 0.024230 | 0.454525 |
| ... | ... | ... | ... | ... | ... |
| 103 | -0.001325 | -0.005388 | 0.998575 | 0.001457 | 0.998575 |
| 104 | -0.001693 | -0.006881 | 0.999654 | 0.001079 | 0.999654 |
| 105 | -0.001884 | -0.007660 | 1.000000 | 0.000346 | 1.000000 |
| 106 | -0.002255 | -0.009168 | 1.000000 | 0.000000 | 1.000000 |
| 107 | -0.002430 | -0.009880 | 1.000000 | 0.000000 | 1.000000 |

108 rows × 5 columns
306 | 307 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_8_2.png?raw=true) 308 | 309 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_8_3.png?raw=true) 310 | 311 |
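A possible follow-up (not part of the original example): pairing the consensus cluster labels with the sample names. The attributes `cl` and `names` are the ones used in the example above and by pyckmeans' plotting utilities; treat this as a sketch rather than documented API:

```python
import pandas

# ckm_res_k7 as obtained in the example above; cl holds the consensus cluster
# per sample, names the sample names (taken from the index of `vectors`)
membership_k7 = pandas.Series(ckm_res_k7.cl, index=ckm_res_k7.names)
print(membership_k7.head())
```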

### <a name="wecr-sequence"></a>WECR K-Means: Clustering Sequence Data

312 | 313 | ```python 314 | from pyckmeans import WECR, NucleotideAlignment, pcoa 315 | 316 | # Load nucleotide alignment 317 | aln = NucleotideAlignment.from_file('datasets/rhodanthemum_ct85_msl68.snps.phy') 318 | 319 | # Calculate Kimura 2-parameter distances 320 | dst = aln.distance(distance_type='k2p') 321 | 322 | # Apply PCoA, including negative Eigenvalue correction 323 | pcoa_res = pcoa(dst, correction='lingoes') 324 | 325 | # Get Eigenvectors until the cumulative corrected Eigenvalues are >= 0.8 326 | vectors = pcoa_res.get_vectors( 327 | filter_by='eigvals_rel_corrected_cum', 328 | filter_th=0.8, 329 | out_format='pandas' 330 | ) 331 | 332 | # Apply WECR K-Means 333 | wecr = WECR( 334 | k=range(2, 20), 335 | n_rep=1000, 336 | p_samp=0.6, 337 | p_feat=0.6, 338 | ) 339 | wecr.fit(vectors) 340 | wecr_res = wecr.predict(vectors) 341 | 342 | # Plot clustering metrics for each k 343 | wecr_res.plot_metrics(figsize=(12, 7)) 344 | 345 | # Select a 'good' K (e.g., 6, 7, 8) for the consensus clustering 346 | wecr_res.plot(k=6, figsize=(14,14)) 347 | 348 | cluster_membership = wecr_res.get_cl(k=6, with_names=True) 349 | print('cluster_membership:') 350 | print(cluster_membership) 351 | ``` 352 | 353 | cluster_membership: 354 | PP-R002-01 0 355 | PP-R002-01-dupl 0 356 | PP-R017-04 4 357 | PP-R017-04-dupl 4 358 | PP-R019-01 5 359 | .. 360 | R044-02 3 361 | R044-12 3 362 | R045-02 0 363 | R045-06 0 364 | R045-25 0 365 | Length: 108, dtype: int32 366 | 367 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_10_1.png?raw=true) 368 | 369 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_10_2.png?raw=true) 370 | -------------------------------------------------------------------------------- /pyckmeans/core/multickmeans.py: -------------------------------------------------------------------------------- 1 | '''multickmeans module''' 2 | 3 | from typing import List, Optional, Iterable, Dict, Any, Tuple, Union, Callable, TYPE_CHECKING 4 | import numpy 5 | import pandas 6 | 7 | from pyckmeans.ordination import PCOAResult 8 | from .ckmeans import CKmeansResult, CKmeans, InvalidClusteringMetric 9 | 10 | if TYPE_CHECKING: 11 | import matplotlib 12 | import matplotlib.figure 13 | 14 | class MultiCKmeansResult: 15 | '''MultiCKmeansResult 16 | 17 | Result of MultiCKMeans.predict. 18 | 19 | Parameters 20 | ---------- 21 | ckmeans_results: List[CKmeansResult] 22 | List of CKmeansResults. 23 | names: Optional[Iterable[str]] 24 | Sample names.
25 | ''' 26 | def __init__( 27 | self, 28 | ckmeans_results: List[CKmeansResult], 29 | names: Optional[Iterable[str]] = None, 30 | ): 31 | self.ckmeans_results = ckmeans_results 32 | self.names = numpy.arange(ckmeans_results[0].cmatrix.shape[0]).astype(str) \ 33 | if names is None else numpy.array(names).astype(str) 34 | 35 | self.ks = numpy.array([ckm_res.k for ckm_res in ckmeans_results]) 36 | 37 | self.sils = numpy.array([ckm_res.sil for ckm_res in ckmeans_results]) 38 | self.bics = numpy.array([ckm_res.bic for ckm_res in ckmeans_results]) 39 | self.dbs = numpy.array([ckm_res.db for ckm_res in ckmeans_results]) 40 | self.chs = numpy.array([ckm_res.ch for ckm_res in ckmeans_results]) 41 | 42 | self.metrics = pandas.DataFrame({ 43 | 'k': self.ks, 44 | 'sil': self.sils, 45 | 'bic': self.bics, 46 | 'db': self.dbs, 47 | 'ch': self.chs, 48 | }) 49 | 50 | def order( 51 | self, 52 | by: int, 53 | method: str = 'GW', 54 | linkage_type: str = 'average', 55 | ) -> numpy.ndarray: 56 | '''order 57 | 58 | Get optimal sample order according to hierarchical clustering of the 59 | CKmeansResult at index "by". 60 | 61 | Parameters 62 | ---------- 63 | by : int 64 | Index of the CKMeansResult to order by. 65 | method : str 66 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 67 | scipy.hierarchy.optimal_leaf_ordering. 68 | 69 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 70 | The British Psychological Society 25. 71 | linkage_type : str 72 | Linkage type for the hierarchical clustering. One of 73 | 74 | * 'average' 75 | * 'complete' 76 | * 'single' 77 | * 'weighted' 78 | * 'centroid' 79 | 80 | See scipy.cluster.hierarchy.linkage for details. 81 | 82 | Returns 83 | ------- 84 | numpy.ndarray 85 | Optimal sample order. 86 | ''' 87 | ckm_res = self.ckmeans_results[by] 88 | 89 | return ckm_res.order(method=method, linkage_type=linkage_type) 90 | 91 | def sort( 92 | self, 93 | by: int, 94 | method: str = 'GW', 95 | linkage_type: str = 'average', 96 | in_place: bool = False, 97 | ) -> 'MultiCKmeansResult': 98 | '''sort 99 | 100 | Sort samples according to hierarchical clustering of the 101 | CKmeansResult at index "by". 102 | 103 | Parameters 104 | ---------- 105 | by : int 106 | Index of the CKMeansResult to sort by. 107 | method : str 108 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 109 | scipy.hierarchy.optimal_leaf_ordering. 110 | 111 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 112 | The British Psychological Society 25. 113 | linkage_type : str 114 | Linkage type for the hierarchical clustering. One of 115 | 116 | * 'average' 117 | * 'complete' 118 | * 'single' 119 | * 'weighted' 120 | * 'centroid' 121 | 122 | See scipy.cluster.hierarchy.linkage for details. 123 | in_place : bool 124 | If False, a new, sorted MultiCKmeansResult object will be returned. 125 | If True, the object will be sorted in place and self will be returned. 126 | 127 | Returns 128 | ------- 129 | MultiCKmeansResult 130 | Sorted MultiCKmeansResult 131 | ''' 132 | 133 | order = self.order(by=by, method=method, linkage_type=linkage_type) 134 | 135 | return self.reorder(order, in_place=in_place) 136 | 137 | def reorder( 138 | self, 139 | order: numpy.ndarray, 140 | in_place: bool = False, 141 | ) -> 'MultiCKmeansResult': 142 | '''reorder 143 | 144 | Reorder samples in all CKmeansResults according to provided order. 145 | 146 | Parameters 147 | ---------- 148 | order : numpy.ndarray 149 | New sample order. 
150 | in_place : bool 151 | If False, a new, sorted MultiCKmeansResult object will be returned. 152 | If True, the object will be sorted in place and self will be returned. 153 | 154 | Returns 155 | ------- 156 | MultiCKmeansResult 157 | Reordered MultiCKmeansResult 158 | ''' 159 | 160 | if in_place: 161 | mckmres = self 162 | for ckmres in self.ckmeans_results: 163 | ckmres.reorder(order, in_place=True) 164 | else: 165 | ckm_results = [ 166 | ckmres.reorder(order, in_place=False) for ckmres in self.ckmeans_results 167 | ] 168 | names = None if self.names is None else self.names.copy() 169 | mckmres = MultiCKmeansResult(ckm_results, names=names) 170 | 171 | return mckmres 172 | 173 | def plot_metrics( 174 | self, 175 | figsize: Tuple[float, float] = (7, 7), 176 | ) -> 'matplotlib.figure.Figure': 177 | '''plot_metrics 178 | 179 | Plot MultiCKMeansResult metrics. 180 | 181 | Parameters 182 | ---------- 183 | figsize : Tuple[float, float], optional 184 | Figure size for the matplotlib figure, by default (7, 7). 185 | 186 | Returns 187 | ------- 188 | matplotlib.figure.Figure 189 | Matplotlib Figure of the metrics plot. 190 | ''' 191 | 192 | from pyckmeans.utils import plot_multickmeans_metrics 193 | 194 | fig = plot_multickmeans_metrics( 195 | mckm_res=self, 196 | figsize=figsize, 197 | ) 198 | fig.tight_layout() 199 | return fig 200 | 201 | class MultiCKMeans: 202 | '''MultiCKMeans 203 | 204 | Convenience class wrapping Consensus K-Means runs for multiple different numbers of clusters. 205 | 206 | Parameters 207 | ---------- 208 | k : Iterable[int] 209 | List of cluster counts for CKmeans. 210 | n_rep : int, optional 211 | Number of K-Means to fit for each single CKmeans, by default 100 212 | p_samp : float, optional 213 | Proportion of samples (observations) to randomly draw per K-Means run, by default 0.8. 214 | The resulting number of samples will be rounded up. I.e. if number of samples is 10 and 215 | p_samp is 0.75, each K-Means will use 8 randomly drawn samples (0.72 * 10 = 7.2, 7.2 -> 8). 216 | p_feat : float, optional 217 | Proportion of features (predictors) to randomly draw per K-Means run, by default 0.8. 218 | The resulting number of features will be rounded up. I.e. if number of features is 10 and 219 | p_feat is 0.72, each K-Means will use 8 randomly drawn features (0.72 * 10 = 7.5, 7.2 -> 8). 220 | metrics : Iterable[str] 221 | Clustering quality metrics to calculate while training. Available metrics are 222 | * "sil" (Silhouette Index) 223 | * "bic" (Bayesian Information Criterion) 224 | * "db" (Davies-Bouldin Index) 225 | * "ch" (Calinski-Harabasz). 226 | kwargs : Dict[str, Any] 227 | Additional keyword arguments passed to sklearn.cluster.KMeans. 228 | ''' 229 | def __init__( 230 | self, 231 | k: Iterable[int], 232 | n_rep: int = 100, 233 | p_samp: float = 0.8, 234 | p_feat: float = 0.8, 235 | metrics: Iterable[str] = ('sil', 'bic'), 236 | **kwargs: Dict[str, Any], 237 | ): 238 | self.k = k 239 | self.n_rep = n_rep 240 | self.p_samp = p_samp 241 | self.p_feat = p_feat 242 | 243 | for metric in metrics: 244 | if not metric in CKmeans.AVAILABLE_METRICS: 245 | am_str = ", ".join(CKmeans.AVAILABLE_METRICS) 246 | msg = f'Unknown metric "{metric}". Available metrics are {am_str}.' 
247 | raise InvalidClusteringMetric(msg) 248 | 249 | self._metrics = metrics 250 | self._kmeans_kwargs = kwargs 251 | 252 | self.ckmeans: Optional[List[CKmeans]] = None 253 | 254 | def fit( 255 | self, 256 | x: Union[numpy.ndarray, PCOAResult, pandas.DataFrame], 257 | progress_callback: Optional[Callable] = None, 258 | ): 259 | '''fit 260 | 261 | Fit MultiCKmeans. 262 | 263 | Parameters 264 | ---------- 265 | x : Union[numpy.ndarray, PCOAResult] 266 | a n * m matrix (numpy.ndarray) or dataframe (pandas.DataFrame), where n is the number 267 | of samples (observations) and m is the number of features (predictors). 268 | Alternatively a pyckmeans.ordination.PCOAResult as returned from pyckmeans.pcoa. 269 | progress_callback : Optional[Callable] 270 | Optional callback function for progress reporting. 271 | ''' 272 | 273 | if isinstance(x, PCOAResult): 274 | x = x.vectors 275 | elif isinstance(x, pandas.DataFrame): 276 | x = x.values 277 | 278 | # _fit is called here to be able to extend later on. 279 | # The plan is to add a parallel fitting function later on 280 | # e.g. _fit_parallel(x, progress_callback, n_cores) 281 | self._fit(x, progress_callback=progress_callback) 282 | 283 | def predict( 284 | self, 285 | x: Union[numpy.ndarray, PCOAResult, pandas.DataFrame], 286 | linkage_type: str = 'average', 287 | return_cls: bool = False, 288 | progress_callback: Optional[Callable] = None, 289 | ) -> MultiCKmeansResult: 290 | '''predict 291 | 292 | Predict cluster membership of new data from all fitted CKmeans. 293 | 294 | Parameters 295 | ---------- 296 | x : Union[numpy.ndarray, PCOAResult] 297 | a n * m matrix (numpy.ndarray) or dataframe (pandas.DataFrame), where n is the number 298 | of samples (observations) and m is the number of features (predictors). If x is a 299 | dataframe, the index will be used a sample names. 300 | Alternatively a pyckmeans.ordination.PCOAResult as returned from pyckmeans.pcoa. 301 | linkage_type : str 302 | Linkage type of the hierarchical clustering that is used for consensus cluster 303 | calculation. One of 304 | 305 | * 'average' 306 | * 'complete' 307 | * 'single' 308 | * 'weighted' 309 | * 'centroid' 310 | 311 | See scipy.cluster.hierarchy.linkage for details. 312 | return_cls : bool 313 | If True, the cluster memberships of the single K-Means runs will be present 314 | in the output. 315 | progress_callback : Optional[Callable] 316 | Optional callback function for progress reporting. 317 | 318 | Returns 319 | ------- 320 | CKmeansResult 321 | Object comprising a n * n consensus matrix, and a n-length vector of 322 | precited cluster memberships. 323 | ''' 324 | names = None 325 | if isinstance(x, PCOAResult): 326 | names = numpy.array(x.names).astype(str) 327 | elif isinstance(x, pandas.DataFrame): 328 | names = numpy.array(x.index).astype(str) 329 | 330 | ckmeans_results: List[CKmeansResult] = [] 331 | for ckm in self.ckmeans: 332 | ckm_res = ckm.predict( 333 | x=x, 334 | linkage_type=linkage_type, 335 | return_cls=return_cls, 336 | progress_callback=progress_callback, 337 | ) 338 | ckmeans_results.append(ckm_res) 339 | 340 | return MultiCKmeansResult( 341 | ckmeans_results=ckmeans_results, 342 | names=names, 343 | ) 344 | 345 | def _fit( 346 | self, 347 | x: numpy.ndarray, 348 | progress_callback: Optional[Callable] = None, 349 | ): 350 | '''_fit 351 | 352 | Internal sequential fitting function. 
353 | 354 | Parameters 355 | ---------- 356 | x : numpy.ndarray 357 | n * m matrix, where n is the number of samples (observations) and m is 358 | the number of features (predictors). 359 | progress_callback : Optional[Callable] 360 | Optional callback function for progress reporting. 361 | ''' 362 | self._reset() 363 | 364 | self.ckmeans = [] 365 | 366 | for k in self.k: 367 | ckm = CKmeans( 368 | k=k, 369 | n_rep=self.n_rep, 370 | p_samp=self.p_samp, 371 | p_feat=self.p_feat, 372 | metrics=self._metrics, 373 | **self._kmeans_kwargs, 374 | ) 375 | ckm.fit(x=x, progress_callback=progress_callback) 376 | self.ckmeans.append(ckm) 377 | 378 | def _reset(self): 379 | '''_reset 380 | 381 | Reset MultiCKmeans object. 382 | ''' 383 | self.ckmeans = None 384 | -------------------------------------------------------------------------------- /pyckmeans/utils/plotting.py: -------------------------------------------------------------------------------- 1 | ''' Plotting utitlies 2 | ''' 3 | 4 | from typing import Iterable, Optional, Tuple, Union 5 | import numpy 6 | import matplotlib.pyplot as plt 7 | import matplotlib.figure 8 | import matplotlib.colors 9 | import matplotlib.axes 10 | 11 | import pyckmeans.core 12 | from pyckmeans.core import wecr 13 | from pyckmeans.ordering import distance_order 14 | 15 | def plot_ckmeans_result( 16 | ckm_res: pyckmeans.core.CKmeansResult, 17 | names: Optional[Iterable[str]] = None, 18 | order: Optional[Union[str, numpy.ndarray]] = 'GW', 19 | cmap_cm: Union[str, matplotlib.colors.Colormap] = 'Blues', 20 | cmap_clbar: Union[str, matplotlib.colors.Colormap] = 'tab20', 21 | figsize: Tuple[float, float] = (7, 7), 22 | ) -> matplotlib.figure.Figure: 23 | '''plot_ckmeans_result 24 | 25 | Plot pyckmeans result consensus matrix with consensus clusters. 26 | 27 | Parameters 28 | ---------- 29 | ckm_res : CKmeansResult 30 | CKmeansResult as returned from CKmeans.predict. 31 | names : Optional[Iterable[str]] 32 | Sample names to be plotted. 33 | order : Optional[Union[str, numpy.ndarray]] 34 | Sample Plotting order. Either a string, determining the oder method to use 35 | (see CKmeansResult.order), or a numpy.ndarray giving the sample order, 36 | or None to apply no reordering. 37 | cmap_cm : Union[str, matplotlib.colors.Colormap], optional 38 | Colormap for the consensus matrix, by default 'Blues' 39 | cmap_clbar : Union[str, matplotlib.colors.Colormap], optional 40 | Colormap for the cluster bar, by default 'tab20' 41 | figsize : Tuple[float, float], optional 42 | Figure size for the matplotlib figure, by default (7, 7). 43 | 44 | Returns 45 | ------- 46 | matplotlib.figure.Figure 47 | Matplotlib figure. 
48 | ''' 49 | # if order is None do not reorder 50 | if order is None: 51 | order = numpy.arange(ckm_res.cmatrix.shape[0]) 52 | # if order is str use CKMeansResult order 53 | elif isinstance(order, str): 54 | order = ckm_res.order(method=order) 55 | # else order must be numpy.ndarray giving the sample order 56 | 57 | ckm_res = ckm_res.reorder(order=order, in_place=False) 58 | cl = ckm_res.cl 59 | 60 | # if names is passed use names, else try to get names 61 | # from ckm_res, else just use samples indices 62 | if names is None: 63 | if ckm_res.names is not None: 64 | nms = ckm_res.names 65 | else: 66 | nms = order.astype('str') 67 | else: 68 | nms = numpy.array(names)[order] 69 | 70 | # build figure layout 71 | fig = plt.figure(figsize=figsize) 72 | ax_cmat = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 73 | ax_clbar = fig.add_axes([0.05, 0.1, 0.05, 0.8]) 74 | ax_cbar = fig.add_axes([0.925, 0.1, 0.025, 0.8]) 75 | 76 | # = consensus matrix 77 | ax_cmat.imshow(ckm_res.cmatrix, cmap=cmap_cm) 78 | ax_cmat.set_xticks(numpy.arange(len(nms))) 79 | ax_cmat.set_xticklabels(nms) 80 | for tick in ax_cmat.get_xticklabels(): 81 | tick.set_rotation(90) 82 | ax_cmat.set_yticks([]) 83 | ax_cmat.tick_params(left=False) 84 | 85 | # cluster lines 86 | cl_01 = [] 87 | cl_start = 0 88 | for i in range(1, len(cl)): 89 | if cl[i] != cl[cl_start]: 90 | cl_01.append((cl_start, i)) 91 | cl_start = i 92 | cl_01.append((cl_start, len(cl))) 93 | cl_01 = numpy.array(cl_01) 94 | 95 | ax_cmat.hlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 96 | ax_cmat.vlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 97 | 98 | # = cluster membership bar 99 | ax_clbar.imshow(ckm_res.cl.reshape(-1, 1), cmap=cmap_clbar) 100 | ax_clbar.set_xticks([]) 101 | ax_clbar.set_yticks(numpy.arange(len(nms))) 102 | ax_clbar.set_yticklabels(nms) 103 | 104 | # = color bar 105 | ax_cbar.set_xticks([]) 106 | ax_cbar.yaxis.tick_right() 107 | plt.colorbar(plt.cm.ScalarMappable(cmap=cmap_cm), cax=ax_cbar) 108 | 109 | return fig 110 | 111 | def plot_multickmeans_metrics( 112 | mckm_res: pyckmeans.core.MultiCKmeansResult, 113 | figsize: Tuple[float, float] = (7, 7), 114 | ) -> matplotlib.figure.Figure: 115 | '''plot_multickmeans_metrics 116 | 117 | Plot MultiCKMeansResult metrics. 118 | 119 | Parameters 120 | ---------- 121 | mckm_res : MultiCKmeansResult 122 | MultiCKmeansResult object 123 | figsize : Tuple[float, float], optional 124 | Figure size for the matplotlib figure, by default (7, 7). 125 | 126 | Returns 127 | ------- 128 | matplotlib.figure.Figure 129 | Matplotlib Figure of the metrics plot. 
130 | ''' 131 | 132 | fig, axs = plt.subplots(nrows=2, ncols=2, figsize=figsize) 133 | 134 | axs = axs.flatten() 135 | for ax in axs: 136 | ax.grid(axis='x') 137 | ax.set_xticks(mckm_res.ks) 138 | ax.set_xlabel('k') 139 | 140 | axs[0].plot(mckm_res.ks, mckm_res.bics) 141 | axs[0].set_title('BIC') 142 | axs[0].set_ylabel('BIC') 143 | 144 | axs[1].plot(mckm_res.ks, mckm_res.dbs) 145 | axs[1].set_title('DB') 146 | axs[1].set_ylabel('DB') 147 | 148 | axs[2].plot(mckm_res.ks, mckm_res.sils) 149 | axs[2].set_title('SIL') 150 | axs[2].set_ylabel('SIL') 151 | 152 | axs[3].plot(mckm_res.ks, mckm_res.chs) 153 | axs[3].set_title('CH') 154 | axs[3].set_ylabel('CH') 155 | 156 | return fig 157 | 158 | def plot_wecr_result( 159 | wecr_res: pyckmeans.core.WECRResult, 160 | k: int, 161 | names: Optional[Iterable[str]] = None, 162 | order: Optional[Union[str, numpy.ndarray]] = 'GW', 163 | cmap_cm: Union[str, matplotlib.colors.Colormap] = 'Blues', 164 | cmap_clbar: Union[str, matplotlib.colors.Colormap] = 'tab20', 165 | figsize: Tuple[float, float] = (7, 7), 166 | ) -> matplotlib.figure.Figure: 167 | '''plot_wecr_result 168 | 169 | Plot wecr result consensus matrix with consensus clusters. 170 | 171 | Parameters 172 | ---------- 173 | wecr_res : pyckmeans.core.WECRResult 174 | WECRResult as returned from pyckmeans.core.WECR.predict. 175 | k: int 176 | The number of clusters k to use for plotting. 177 | names : Optional[Iterable[str]] 178 | Sample names to be plotted. 179 | order : Optional[Union[str, numpy.ndarray]] 180 | Sample Plotting order. Either a string, determining the oder method to use 181 | (see WECRResult.order), or a numpy.ndarray giving the sample order, 182 | or None to apply no reordering. 183 | cmap_cm : Union[str, matplotlib.colors.Colormap], optional 184 | Colormap for the consensus matrix, by default 'Blues' 185 | cmap_clbar : Union[str, matplotlib.colors.Colormap], optional 186 | Colormap for the cluster bar, by default 'tab20' 187 | figsize : Tuple[float, float], optional 188 | Figure size for the matplotlib figure, by default (7, 7). 189 | 190 | Returns 191 | ------- 192 | matplotlib.figure.Figure 193 | Matplotlib figure. 194 | 195 | Raises 196 | ------ 197 | wecr.InvalidKError 198 | Raised if an invalid k argument is provided. 199 | ''' 200 | # if order is None do not reorder 201 | if order is None: 202 | order = numpy.arange(wecr_res.cmatrix.shape[0]) 203 | # if order is str use WECRResult order 204 | elif isinstance(order, str): 205 | order = wecr_res.order(method=order) 206 | # else order must be numpy.ndarray giving the sample order 207 | 208 | wecr_res = wecr_res.reorder(order=order, in_place=False) 209 | if not k in wecr_res.k: 210 | msg = f'Result for k={k} not found. Available k are {wecr_res.k}.' 
211 | raise wecr.InvalidKError(msg) 212 | cl = wecr_res.cl[numpy.argmax(wecr_res.k == k)] 213 | 214 | # if names is passed use names, else try to get names 215 | # from wecr_res, else just use samples indices 216 | if names is None: 217 | if wecr_res.names is not None: 218 | nms = wecr_res.names 219 | else: 220 | nms = order.astype('str') 221 | else: 222 | nms = numpy.array(names)[order] 223 | 224 | # build figure layout 225 | fig = plt.figure(figsize=figsize) 226 | ax_cmat = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 227 | ax_clbar = fig.add_axes([0.05, 0.1, 0.05, 0.8]) 228 | ax_cbar = fig.add_axes([0.925, 0.1, 0.025, 0.8]) 229 | 230 | # = consensus matrix 231 | ax_cmat.imshow(wecr_res.cmatrix, cmap=cmap_cm) 232 | ax_cmat.set_xticks(numpy.arange(len(nms))) 233 | ax_cmat.set_xticklabels(nms) 234 | for tick in ax_cmat.get_xticklabels(): 235 | tick.set_rotation(90) 236 | ax_cmat.set_yticks([]) 237 | ax_cmat.tick_params(left=False) 238 | 239 | # cluster lines 240 | cl_01 = [] 241 | cl_start = 0 242 | for i in range(1, len(cl)): 243 | if cl[i] != cl[cl_start]: 244 | cl_01.append((cl_start, i)) 245 | cl_start = i 246 | cl_01.append((cl_start, len(cl))) 247 | cl_01 = numpy.array(cl_01) 248 | 249 | ax_cmat.hlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 250 | ax_cmat.vlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 251 | 252 | # = cluster membership bar 253 | ax_clbar.imshow(cl.reshape(-1, 1), cmap=cmap_clbar) 254 | ax_clbar.set_xticks([]) 255 | ax_clbar.set_yticks(numpy.arange(len(nms))) 256 | ax_clbar.set_yticklabels(nms) 257 | 258 | # = color bar 259 | ax_cbar.set_xticks([]) 260 | ax_cbar.yaxis.tick_right() 261 | plt.colorbar(plt.cm.ScalarMappable(cmap=cmap_cm), cax=ax_cbar) 262 | 263 | return fig 264 | 265 | def plot_wecr_result_metrics( 266 | wecr_res: pyckmeans.core.WECRResult, 267 | figsize: Tuple[float, float] = (7, 7), 268 | ) -> matplotlib.figure.Figure: 269 | '''plot_wecr_result_metrics 270 | 271 | Plot WECRResult metrics. 272 | 273 | Parameters 274 | ---------- 275 | wecr_res : WECRResult 276 | WECRResult object 277 | figsize : Tuple[float, float], optional 278 | Figure size for the matplotlib figure, by default (7, 7). 279 | 280 | Returns 281 | ------- 282 | matplotlib.figure.Figure 283 | Matplotlib Figure of the metrics plot. 
284 | ''' 285 | 286 | fig, axs = plt.subplots(nrows=2, ncols=2, figsize=figsize) 287 | 288 | order = numpy.argsort(wecr_res.k) 289 | 290 | axs = axs.flatten() 291 | for ax in axs: 292 | ax.grid(axis='x') 293 | ax.set_xticks(wecr_res.k[order]) 294 | ax.set_xlabel('k') 295 | 296 | axs[0].plot(wecr_res.k[order], wecr_res.bic[order]) 297 | axs[0].set_title('BIC') 298 | axs[0].set_ylabel('BIC') 299 | 300 | axs[1].plot(wecr_res.k[order], wecr_res.db[order]) 301 | axs[1].set_title('DB') 302 | axs[1].set_ylabel('DB') 303 | 304 | axs[2].plot(wecr_res.k[order], wecr_res.sil[order]) 305 | axs[2].set_title('SIL') 306 | axs[2].set_ylabel('SIL') 307 | 308 | axs[3].plot(wecr_res.k[order], wecr_res.ch[order]) 309 | axs[3].set_title('CH') 310 | axs[3].set_ylabel('CH') 311 | 312 | return fig 313 | 314 | def plot_cmatrix( 315 | cmatrix: numpy.ndarray, 316 | cl: numpy.ndarray, 317 | names: Optional[Iterable[str]] = None, 318 | order: Optional[Union[str, numpy.ndarray]] = 'GW', 319 | cmap_cm: Union[str, matplotlib.colors.Colormap] = 'Blues', 320 | cmap_clbar: Union[str, matplotlib.colors.Colormap] = 'tab20', 321 | figsize: Tuple[float, float] = (7, 7), 322 | ) -> Tuple[ 323 | matplotlib.figure.Figure, 324 | matplotlib.axes.Axes, 325 | matplotlib.axes.Axes, 326 | matplotlib.axes.Axes 327 | ]: 328 | '''plot_cmatrix 329 | 330 | Plot consensus matrix and consensus clustering. 331 | 332 | Parameters 333 | ---------- 334 | cmatrix : numpy.ndarray 335 | Consensus matrix. 336 | cl : numpy.ndarray 337 | Cluster membership. 338 | names : Optional[Iterable[str]] 339 | Sample names to be plotted. 340 | order : Optional[Union[str, numpy.ndarray]] 341 | Sample Plotting order. Either a string, or a numpy.ndarray giving the sample order, 342 | or None to apply no reordering. 343 | cmap_cm : Union[str, matplotlib.colors.Colormap], optional 344 | Colormap for the consensus matrix, by default 'Blues' 345 | cmap_clbar : Union[str, matplotlib.colors.Colormap], optional 346 | Colormap for the cluster bar, by default 'tab20' 347 | figsize : Tuple[float, float], optional 348 | Figure size for the matplotlib figure, by default (7, 7). 349 | 350 | Returns 351 | ------- 352 | Tuple[matplotlib.figure.Figure, matplotlib.axes.Axes, matplotlib.axes.Axes, matplotlib.axes.Axes] 353 | Figure, consensus matrix Axes, cluster membership Axes, colorbar Axes. 
354 | ''' 355 | # if order is None do not reorder 356 | if order is None: 357 | order = numpy.arange(cmatrix.shape[0]) 358 | # if order is str use WECRResult order 359 | elif isinstance(order, str): 360 | order = distance_order(1-cmatrix, method=order) 361 | # else order must be numpy.ndarray giving the sample order 362 | 363 | cmatrix = cmatrix[order, :][:, order] 364 | cl = cl[order] 365 | 366 | # if names is passed use names, else try to get names 367 | # from wecr_res, else just use samples indices 368 | if names is None: 369 | nms = order.astype('str') 370 | else: 371 | nms = numpy.array(names)[order] 372 | 373 | # build figure layout 374 | fig = plt.figure(figsize=figsize) 375 | ax_cmat = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 376 | ax_clbar = fig.add_axes([0.05, 0.1, 0.05, 0.8]) 377 | ax_cbar = fig.add_axes([0.925, 0.1, 0.025, 0.8]) 378 | 379 | # = consensus matrix 380 | ax_cmat.imshow(cmatrix, cmap=cmap_cm) 381 | ax_cmat.set_xticks(numpy.arange(len(nms))) 382 | ax_cmat.set_xticklabels(nms) 383 | for tick in ax_cmat.get_xticklabels(): 384 | tick.set_rotation(90) 385 | ax_cmat.set_yticks([]) 386 | ax_cmat.tick_params(left=False) 387 | 388 | # cluster lines 389 | cl_01 = [] 390 | cl_start = 0 391 | for i in range(1, len(cl)): 392 | if cl[i] != cl[cl_start]: 393 | cl_01.append((cl_start, i)) 394 | cl_start = i 395 | cl_01.append((cl_start, len(cl))) 396 | cl_01 = numpy.array(cl_01) 397 | 398 | ax_cmat.hlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 399 | ax_cmat.vlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 400 | 401 | # = cluster membership bar 402 | ax_clbar.imshow(cl.reshape(-1, 1), cmap=cmap_clbar) 403 | ax_clbar.set_xticks([]) 404 | ax_clbar.set_yticks(numpy.arange(len(nms))) 405 | ax_clbar.set_yticklabels(nms) 406 | 407 | # = color bar 408 | ax_cbar.set_xticks([]) 409 | ax_cbar.yaxis.tick_right() 410 | plt.colorbar(plt.cm.ScalarMappable(cmap=cmap_cm), cax=ax_cbar) 411 | 412 | return fig, ax_cmat, ax_clbar, ax_cbar 413 | --------------------------------------------------------------------------------