├── requirements.txt ├── docs ├── images │ ├── output_10_1.png │ ├── output_10_2.png │ ├── output_4_1.png │ ├── output_6_1.png │ ├── output_8_2.png │ └── output_8_3.png ├── modules.rst ├── installation.rst ├── requirements.txt ├── pyckmeans.knee.rst ├── pyckmeans.ordering.rst ├── documentation.rst ├── index.rst ├── pyckmeans.rst ├── pyckmeans.distance.rst ├── pyckmeans.ordination.rst ├── pyckmeans.utils.rst ├── makefile ├── make.bat ├── pyckmeans.core.rst ├── pyckmeans.io.rst └── conf.py ├── .coveragerc ├── .readthedocs.yaml ├── pyckmeans ├── core │ ├── __init__.py │ ├── utils.py │ ├── tests │ │ └── test_core.py │ └── multickmeans.py ├── utils │ ├── __init__.py │ ├── progressbar.py │ └── plotting.py ├── __init__.py ├── ordination │ ├── utils.py │ └── tests │ │ └── test_pcoa.py ├── io │ ├── __init__.py │ ├── c_interop.py │ ├── tests │ │ ├── test_fasta.py │ │ ├── test_csv.py │ │ ├── test_nucleotidealignment.py │ │ └── test_phylip.py │ ├── fasta.py │ ├── src │ │ └── nucencode.cpp │ ├── csv.py │ ├── phylip.py │ └── nucleotide_alignment.py ├── knee │ ├── tests │ │ └── test_knee.py │ └── __init__.py ├── ordering │ ├── tests │ │ └── test_reordering.py │ └── __init__.py ├── distance │ ├── tests │ │ ├── test_distance.py │ │ └── test_c_interop.py │ ├── c_interop.py │ ├── __init__.py │ └── src │ │ └── distance.cpp └── tests │ ├── manual_test_2.py │ ├── manual_test.py │ ├── manual_tests.ipynb │ └── test_workflow.py ├── .github └── workflows │ ├── start_readthedocs_build.yaml │ ├── publish_coverage.yaml │ ├── publish_wheels_pypi.yaml │ └── publish_anaconda.yaml ├── LICENSE ├── conda.recipe └── meta.yaml ├── setup.py ├── .gitignore └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | scikit-learn 5 | matplotlib 6 | tqdm 7 | -------------------------------------------------------------------------------- /docs/images/output_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_10_1.png -------------------------------------------------------------------------------- /docs/images/output_10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_10_2.png -------------------------------------------------------------------------------- /docs/images/output_4_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_4_1.png -------------------------------------------------------------------------------- /docs/images/output_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_6_1.png -------------------------------------------------------------------------------- /docs/images/output_8_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_8_2.png -------------------------------------------------------------------------------- /docs/images/output_8_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TankredO/pyckmeans/HEAD/docs/images/output_8_3.png 
-------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | pyckmeans 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pyckmeans 8 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | Requirements 7 | ------------ 8 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | scikit-learn 5 | matplotlib 6 | tqdm 7 | numpydoc 8 | pyckmeans 9 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = *tests*, setup.py 3 | 4 | [report] 5 | exclude_lines = 6 | if TYPE_CHECKING: 7 | import matplotlib 8 | import matplotlib.figure -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | python: 7 | version: 3.8 8 | install: 9 | - requirements: docs/requirements.txt 10 | -------------------------------------------------------------------------------- /pyckmeans/core/__init__.py: -------------------------------------------------------------------------------- 1 | ''' pyckmeans core module 2 | ''' 3 | 4 | from .ckmeans import CKmeans, CKmeansResult 5 | from .multickmeans import MultiCKMeans, MultiCKmeansResult 6 | from .wecr import WECR, WECRResult 7 | -------------------------------------------------------------------------------- /docs/pyckmeans.knee.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.knee package 2 | ====================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: pyckmeans.knee 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/pyckmeans.ordering.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.ordering package 2 | ========================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: pyckmeans.ordering 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/documentation.rst: -------------------------------------------------------------------------------- 1 | .. _Documentation: 2 | 3 | Documentation 4 | ============= 5 | 6 | .. 
toctree:: 7 | :maxdepth: 3 8 | 9 | pyckmeans 10 | 11 | Indices and tables 12 | ------------------ 13 | 14 | * :ref:`genindex` 15 | * :ref:`modindex` 16 | -------------------------------------------------------------------------------- /pyckmeans/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Utilities module 2 | ''' 3 | 4 | from .progressbar import MultiCKMeansProgressBars 5 | from .plotting import ( 6 | plot_ckmeans_result, plot_multickmeans_metrics, 7 | plot_wecr_result, plot_wecr_result_metrics, 8 | plot_cmatrix, 9 | ) 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _index: 2 | 3 | Welcome to pyckmeans' documentation! 4 | ==================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Contents: 9 | 10 | installation.rst 11 | documentation.rst 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | -------------------------------------------------------------------------------- /docs/pyckmeans.rst: -------------------------------------------------------------------------------- 1 | pyckmeans package 2 | ================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | pyckmeans.core 11 | pyckmeans.distance 12 | pyckmeans.io 13 | pyckmeans.knee 14 | pyckmeans.ordering 15 | pyckmeans.ordination 16 | pyckmeans.utils 17 | 18 | Module contents 19 | --------------- 20 | 21 | .. automodule:: pyckmeans 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | -------------------------------------------------------------------------------- /pyckmeans/__init__.py: -------------------------------------------------------------------------------- 1 | ''' pyckmeans 2 | 3 | pyckmeans, a Python package for Consensus K-Means clustering. 4 | ''' 5 | 6 | __version__ = '0.9.4' 7 | 8 | __all__ = [ 9 | 'CKmeans', 10 | 'MultiCKMeans', 11 | 'WECR', 12 | 'NucleotideAlignment', 13 | 'DistanceMatrix', 14 | 'pcoa', 15 | ] 16 | 17 | from .core import CKmeans, MultiCKMeans, WECR 18 | from .io import NucleotideAlignment 19 | from .distance import DistanceMatrix 20 | from .ordination import pcoa 21 | -------------------------------------------------------------------------------- /docs/pyckmeans.distance.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.distance package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.distance.c\_interop module 8 | ------------------------------------ 9 | 10 | .. automodule:: pyckmeans.distance.c_interop 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: pyckmeans.distance 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/pyckmeans.ordination.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.ordination package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.ordination.utils module 8 | --------------------------------- 9 | 10 | .. automodule:: pyckmeans.ordination.utils 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | ..
automodule:: pyckmeans.ordination 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /.github/workflows/start_readthedocs_build.yaml: -------------------------------------------------------------------------------- 1 | name: Start readthedocs build 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Start readthedocs build tags' 11 | jobs: 12 | start-readthedocs-build: 13 | runs-on: "ubuntu-latest" 14 | steps: 15 | - run: curl -d "token=${{secrets.READTHEDOCS_TOKEN}}" -X POST https://readthedocs.org/api/v2/webhook/pyckmeans/163759/ 16 | -------------------------------------------------------------------------------- /pyckmeans/core/utils.py: -------------------------------------------------------------------------------- 1 | '''core utilities''' 2 | 3 | import json 4 | import numpy 5 | 6 | # source: https://stackoverflow.com/a/49677241 7 | class NumpyEncoder(json.JSONEncoder): 8 | ''' Special json encoder for numpy types ''' 9 | def default(self, obj): 10 | if isinstance(obj, numpy.integer): 11 | return int(obj) 12 | elif isinstance(obj, numpy.floating): 13 | return float(obj) 14 | elif isinstance(obj, numpy.ndarray): 15 | return obj.tolist() 16 | return json.JSONEncoder.default(self, obj) 17 | -------------------------------------------------------------------------------- /pyckmeans/ordination/utils.py: -------------------------------------------------------------------------------- 1 | '''ordination utilities''' 2 | 3 | import json 4 | import numpy 5 | 6 | # source: https://stackoverflow.com/a/49677241 7 | class NumpyEncoder(json.JSONEncoder): 8 | ''' Special json encoder for numpy types ''' 9 | def default(self, obj): 10 | if isinstance(obj, numpy.integer): 11 | return int(obj) 12 | elif isinstance(obj, numpy.floating): 13 | return float(obj) 14 | elif isinstance(obj, numpy.ndarray): 15 | return obj.tolist() 16 | return json.JSONEncoder.default(self, obj) 17 | -------------------------------------------------------------------------------- /pyckmeans/io/__init__.py: -------------------------------------------------------------------------------- 1 | ''' io 2 | 3 | Module containing input and output functionality. 4 | ''' 5 | 6 | from .nucleotide_alignment import \ 7 | NucleotideAlignment, \ 8 | read_alignment, \ 9 | InvalidAlignmentFileExtensionError, \ 10 | InvalidAlignmentFileFormatError 11 | from .phylip import \ 12 | read_phylip_alignment, \ 13 | InvalidPhylipAlignmentError, \ 14 | read_phylip_distmat, \ 15 | InvalidPhylipMatrixError 16 | from .fasta import \ 17 | read_fasta_alignment, \ 18 | InvalidFastaAlignmentError 19 | from .csv import \ 20 | read_csv_distmat 21 | -------------------------------------------------------------------------------- /docs/pyckmeans.utils.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.utils package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.utils.plotting module 8 | ------------------------------- 9 | 10 | .. automodule:: pyckmeans.utils.plotting 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyckmeans.utils.progressbar module 16 | ---------------------------------- 17 | 18 | .. automodule:: pyckmeans.utils.progressbar 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: pyckmeans.utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd -------------------------------------------------------------------------------- /docs/pyckmeans.core.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.core package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.core.ckmeans module 8 | ----------------------------- 9 | 10 | .. automodule:: pyckmeans.core.ckmeans 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyckmeans.core.multickmeans module 16 | ---------------------------------- 17 | 18 | .. automodule:: pyckmeans.core.multickmeans 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pyckmeans.core.utils module 24 | --------------------------- 25 | 26 | .. automodule:: pyckmeans.core.utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pyckmeans.core.wecr module 32 | -------------------------- 33 | 34 | .. automodule:: pyckmeans.core.wecr 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: pyckmeans.core 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Tankred Ott 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /docs/pyckmeans.io.rst: -------------------------------------------------------------------------------- 1 | pyckmeans.io package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyckmeans.io.c\_interop module 8 | ------------------------------ 9 | 10 | .. automodule:: pyckmeans.io.c_interop 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyckmeans.io.csv module 16 | ----------------------- 17 | 18 | .. automodule:: pyckmeans.io.csv 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pyckmeans.io.fasta module 24 | ------------------------- 25 | 26 | .. automodule:: pyckmeans.io.fasta 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pyckmeans.io.nucleotide\_alignment module 32 | ----------------------------------------- 33 | 34 | .. automodule:: pyckmeans.io.nucleotide_alignment 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pyckmeans.io.phylip module 40 | -------------------------- 41 | 42 | .. automodule:: pyckmeans.io.phylip 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. 
automodule:: pyckmeans.io 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /pyckmeans/knee/tests/test_knee.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy 3 | 4 | from pyckmeans.knee import KneeLocator 5 | 6 | @pytest.mark.parametrize('direction', ['increasing', 'decreasing']) 7 | @pytest.mark.parametrize('curve', ['convex', 'concave']) 8 | def test_simple(direction, curve): 9 | x = numpy.array([1.0, 2.0, 3.0 ,4.0, 5.0, 6.0, 7.0, 8.0, 9.0 ]) 10 | y = numpy.array([1.0, 2.2, 3.4, 4.5, 7.0, 10.0, 15.0, 22.0, 30.0]) 11 | 12 | kl_0 = KneeLocator(x, y, curve=curve, direction=direction, interp_method='interp1d') 13 | print('kl_0.knee:', kl_0.knee) 14 | print('kl_0.elbow:', kl_0.elbow) 15 | print('kl_0.norm_elbow:', kl_0.norm_elbow) 16 | print('kl_0.elbow_y:', kl_0.elbow_y) 17 | print('kl_0.norm_elbow_y:', kl_0.norm_elbow_y) 18 | print('kl_0.all_elbows:', kl_0.all_elbows) 19 | print('kl_0.all_norm_elbows:', kl_0.all_norm_elbows) 20 | print('kl_0.all_elbows_y:', kl_0.all_elbows_y) 21 | print('kl_0.all_norm_elbows_y:', kl_0.all_norm_elbows_y) 22 | 23 | kl_1 = KneeLocator(x, y, curve=curve, direction=direction, interp_method='polynomial') 24 | 25 | with pytest.raises(ValueError): 26 | KneeLocator(x, y, curve=curve, direction=direction, interp_method='XYZ') 27 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %} 2 | 3 | package: 4 | name: {{ data.get('name') }} 5 | version: {{ data.get('version') }} 6 | 7 | source: 8 | path: '../' 9 | 10 | build: 11 | # noarch: python 12 | number: 0 13 | script: "{{ PYTHON }} -m pip install --no-deps --ignore-installed -vv ." 14 | 15 | requirements: 16 | build: 17 | - {{ compiler('cxx') }} 18 | host: 19 | - python 20 | - pip 21 | run: 22 | - python 23 | - numpy 24 | - pandas 25 | - scipy 26 | - scikit-learn 27 | - matplotlib 28 | - tqdm 29 | 30 | about: 31 | home: https://github.com/TankredO/pyckmeans 32 | license: MIT 33 | license_family: MIT 34 | license_file: LICENSE 35 | summary: {{ data.get('description') }} 36 | 37 | # The remaining entries in this section are optional, but recommended. 38 | description: | 39 | pyckmeans is a Python package for Consensus K-Means and Weighted Ensemble Consensus of Random (WECR) K-Means clustering, especially in the context of DNA sequence data. 40 | doc_url: https://pyckmeans.readthedocs.io 41 | dev_url: https://github.com/TankredO/pyckmeans 42 | 43 | extra: 44 | recipe-maintainers: 45 | - TankredO 46 | -------------------------------------------------------------------------------- /.github/workflows/publish_coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Publish coverage report to coveralls 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Publish coverage report to coveralls tags' 11 | release: 12 | types: [published] 13 | jobs: 14 | Publish-coverage: 15 | name: Publish coverage 16 | runs-on: "windows-latest" 17 | env: 18 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_TOKEN }} 19 | steps: 20 | - run: echo "This job was triggered by ${{ github.event_name }}." 
21 | - run: echo "This job is now running on ${{ runner.os }}." 22 | - run: echo "The branch name is ${{ github.ref }}" 23 | - name: Check out repository code 24 | uses: actions/checkout@v2 25 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 26 | - name: List files in the repository 27 | run: | 28 | ls ${{ github.workspace }} 29 | - name: Set up Python 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: "3.9" 33 | - name: Which version 34 | run: which python 35 | - name: Install coveralls and pytest 36 | run: python -m pip install coveralls pytest 37 | - name: Install pyckmeans 38 | run: | 39 | python -m pip install -r requirements.txt 40 | python -m pip install biopython 41 | python -m pip install --no-deps --ignore-installed -vv -e . 42 | - name: Generate coverage report 43 | run: python -m coverage run --source=pyckmeans -m pytest .\pyckmeans\ 44 | - name: Publish coverage report to coveralls 45 | run: python -m coveralls 46 | - run: echo "This job's status is ${{ job.status }}." 47 | -------------------------------------------------------------------------------- /.github/workflows/publish_wheels_pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish wheels to PyPi 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Publish wheels to PyPi tags' 11 | release: 12 | types: [published] 13 | jobs: 14 | Publish-to-PyPi: 15 | name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }}) 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | # os: ["ubuntu-latest", "macos-latest", "windows-latest"] 21 | os: ["windows-latest", "macos-latest"] 22 | python-version: ["3.6", "3.7", "3.8", "3.9"] 23 | steps: 24 | - run: echo "This job was triggered by ${{ github.event_name }}." 25 | - run: echo "This job is now running on ${{ runner.os }}." 26 | - run: echo "The branch name is ${{ github.ref }}" 27 | - name: Check out repository code 28 | uses: actions/checkout@v2 29 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 30 | - name: List files in the repository 31 | run: | 32 | ls ${{ github.workspace }} 33 | - name: Set up Python 34 | uses: actions/setup-python@v2 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Which version 38 | run: which python 39 | - name: Install wheel and twine 40 | run: python -m pip install wheel twine 41 | - name: Install pyckmeans 42 | run: | 43 | python -m pip install --no-deps --ignore-installed -vv . 44 | python setup.py bdist_wheel 45 | - name: Publish to PyPI 46 | run: twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} 47 | - run: echo "This job's status is ${{ job.status }}." 
48 | -------------------------------------------------------------------------------- /.github/workflows/publish_anaconda.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to Anaconda Cloud 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | logLevel: 6 | description: 'Log level' 7 | required: true 8 | default: 'warning' 9 | tags: 10 | description: 'Publish to Anaconda Cloud tags' 11 | release: 12 | types: [published] 13 | jobs: 14 | Publish-to-Anaconda-Cloud: 15 | name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }}) 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | #os: ["ubuntu-latest", "macos-latest", "windows-latest"] 21 | os: ["ubuntu-latest", "windows-latest"] 22 | python-version: ["3.6", "3.7", "3.8", "3.9"] 23 | steps: 24 | - run: echo "This job was triggered by ${{ github.event_name }}." 25 | - run: echo "This job is now running on ${{ runner.os }}." 26 | - run: echo "The branch name is ${{ github.ref }}" 27 | - name: Check out repository code 28 | uses: actions/checkout@v2 29 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 30 | - name: List files in the repository 31 | run: | 32 | ls ${{ github.workspace }} 33 | - uses: s-weigand/setup-conda@v1 34 | with: 35 | auto-update-conda: true 36 | python-version: ${{ matrix.python-version }} 37 | - name: Conda info 38 | run: conda info 39 | - name: Conda list 40 | run: conda list 41 | - name: Which version 42 | run: which python 43 | - name: Install conda packages 44 | run: conda install conda-build anaconda-client 45 | - name: Build and publish 46 | run: conda build ${{ github.workspace }}/conda.recipe --user ${{secrets.ANACONDA_USER}} --token ${{secrets.ANACONDA_TOKEN}} 47 | - run: echo "This job's status is ${{ job.status }}." 
48 | -------------------------------------------------------------------------------- /pyckmeans/ordering/tests/test_reordering.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sklearn.datasets import make_blobs 4 | from scipy.spatial.distance import squareform, pdist 5 | 6 | from pyckmeans.ordering import InvalidLinkageType, InvalidReorderMethod, distance_order, reorder_distance 7 | from pyckmeans.distance import DistanceMatrix 8 | 9 | @pytest.fixture(scope='session') 10 | def prepare_distances(): 11 | x0, _ = make_blobs(n_samples=10, n_features=2, centers=2) 12 | d0_np = squareform(pdist(x0)) 13 | d0_dm = DistanceMatrix(d0_np) 14 | 15 | x1, _ = make_blobs(n_samples=50, n_features=3, centers=3) 16 | d1_np = squareform(pdist(x1)) 17 | d1_dm = DistanceMatrix(d1_np) 18 | 19 | return ( 20 | d0_np, 21 | d0_dm, 22 | d1_np, 23 | d1_dm, 24 | ) 25 | 26 | def test_reorder(prepare_distances): 27 | d0, d1, d2, d3 = prepare_distances 28 | 29 | d0_o = reorder_distance(d0) 30 | print('d0_o:', d0_o) 31 | d1_o = reorder_distance(d1) 32 | print('d1_o:', d1_o) 33 | d2_o = reorder_distance(d2) 34 | print('d2_o:', d2_o) 35 | d3_o = reorder_distance(d3) 36 | print('d3_o:', d3_o) 37 | 38 | print('d0_o olo:', reorder_distance(d0, method='OLO')) 39 | 40 | with pytest.raises(InvalidReorderMethod): 41 | reorder_distance(d0, method='XYZ') 42 | with pytest.raises(InvalidLinkageType): 43 | reorder_distance(d0, linkage_type='XYZ') 44 | 45 | def test_order(prepare_distances): 46 | d0, d1, d2, d3 = prepare_distances 47 | 48 | o0 = distance_order(d0) 49 | print('o0:', o0) 50 | o1 = distance_order(d1) 51 | print('o1:', o1) 52 | o2 = distance_order(d2) 53 | print('o2:', o2) 54 | o3 = distance_order(d3) 55 | print('o3:', o3) 56 | 57 | print('o0 olo:', distance_order(d0, method='OLO')) 58 | 59 | with pytest.raises(InvalidReorderMethod): 60 | distance_order(d0, method='XYZ') 61 | with pytest.raises(InvalidLinkageType): 62 | distance_order(d0, linkage_type='XYZ') 63 | 64 | -------------------------------------------------------------------------------- /pyckmeans/distance/tests/test_distance.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | from pyckmeans.io import NucleotideAlignment 6 | from pyckmeans.distance import alignment_distance, p_distance, InvalidDistanceTypeError 7 | 8 | @pytest.fixture() 9 | def prepare_alignments(): 10 | aln_0 = NucleotideAlignment( 11 | ['s0', 's1', 's2', 's3'], 12 | np.array([ 13 | ['A', 'C', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 14 | ['T', 'C', 'T', 'G', 'C', 'C', 'T', 'T', 'G', 'A'], 15 | ['A', 'G', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 16 | ['A', 'C', 'T', 'A', 'A', 'A', 'T', 'A', 'G', 'A'], 17 | ]) 18 | ) 19 | d_0_p = np.array([ 20 | [0.0, 0.2, 0.1, 0.3], 21 | [0.2, 0.0, 0.3, 0.5], 22 | [0.1, 0.3, 0.0, 0.4], 23 | [0.3, 0.5, 0.4, 0.0], 24 | ]) 25 | 26 | return ( 27 | (aln_0, d_0_p), 28 | ) 29 | 30 | def test_p_distance(prepare_alignments): 31 | eps = 0.0001 32 | 33 | d_0 = alignment_distance(prepare_alignments[0][0], 'p') 34 | d_0_expected = prepare_alignments[0][1] 35 | assert np.all(np.abs(d_0.dist_mat - d_0_expected) < eps) 36 | 37 | print('d_0', d_0) 38 | 39 | d_0_p = p_distance(prepare_alignments[0][0].sequences) 40 | assert np.all(np.abs(d_0_p - d_0_expected) < eps) 41 | 42 | def test_distances_simple(prepare_alignments): 43 | alignment_distance(prepare_alignments[0][0], 'p', True) 44 | 
alignment_distance(prepare_alignments[0][0], 'p', False) 45 | alignment_distance(prepare_alignments[0][0], 'jc', True) 46 | alignment_distance(prepare_alignments[0][0], 'jc', False) 47 | alignment_distance(prepare_alignments[0][0], 'k2p', True) 48 | alignment_distance(prepare_alignments[0][0], 'k2p', False) 49 | 50 | with pytest.raises(InvalidDistanceTypeError): 51 | alignment_distance(prepare_alignments[0][0], 'xyz', True) 52 | with pytest.raises(InvalidDistanceTypeError): 53 | alignment_distance(prepare_alignments[0][0], 'xyz', False) 54 | -------------------------------------------------------------------------------- /pyckmeans/tests/manual_test_2.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import time 3 | 4 | import numpy as np 5 | import numpy.random as random 6 | import matplotlib.pyplot as plt 7 | 8 | import pyckmeans 9 | 10 | if __name__ == '__main__': 11 | 12 | path = pathlib.Path(__file__).parent.absolute() 13 | 14 | p = 10 15 | 16 | n0 = 50 17 | x0 = np.random.normal(0, 2, (n0, p)) 18 | n1 = 50 19 | x1 = np.random.normal(-3, 1.5, (n1, p)) 20 | n2 = 50 21 | x2 = np.random.normal(3, 2, (n2, p)) 22 | 23 | x_0 = np.r_[x0, x1, x2] 24 | 25 | k = np.arange(2, 8) 26 | n_rep = 200 27 | p_feat = 0.8 28 | p_samp = 0.8 29 | gamma = 0.5 30 | must_link = np.array([ 31 | [0, 10], 32 | [12, 21], 33 | [52, 56], 34 | [75, 61], 35 | [101, 142], 36 | # [1, 51], 37 | # [2, 51], 38 | # [3, 51], 39 | # [4, 51], 40 | # [5, 51], 41 | # [6, 51], 42 | # [7, 51], 43 | # [8, 51], 44 | # [9, 51], 45 | # [10, 51] 46 | ]) 47 | must_not_link = np.array([ 48 | [0, 64], 49 | [88, 15], 50 | [112, 56], 51 | [140, 1], 52 | # [1, 2], 53 | # [1, 3], 54 | # [1, 4], 55 | # [1, 5], 56 | # [1, 6], 57 | # [1, 7], 58 | ]) 59 | 60 | wecr_0 = pyckmeans.WECR(k=k, n_rep=n_rep, p_samp=p_samp, p_feat=p_feat, gamma=gamma) 61 | 62 | t0 = time.time() 63 | wecr_0.fit(x_0, must_link=must_link, must_not_link=must_not_link) 64 | t1 = time.time() 65 | 66 | 67 | t2 = time.time() 68 | cmatrix = wecr_0.predict(x_0) 69 | t3 = time.time() 70 | 71 | # print(cmatrix) 72 | 73 | print(t1 - t0) 74 | print(t3 - t2) 75 | 76 | fig, ax = plt.subplots(1, 1) 77 | ax.imshow(cmatrix) 78 | fig.savefig(path / 'manual_test_2_img0.png') 79 | 80 | print(cmatrix) 81 | 82 | fig, ax = plt.subplots(1, 1) 83 | ax.scatter(x_0[:, 0], x_0[:, 1]) 84 | fig.savefig(path / 'manual_test_2_img1.png') 85 | 86 | # print(wecr_0.qualities) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | 2 | # Configuration file for the Sphinx documentation builder. 3 | # 4 | # This file only contains a selection of the most common options. For a full 5 | # list see the documentation: 6 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 7 | 8 | # -- Path setup -------------------------------------------------------------- 9 | 10 | # If extensions (or modules to document with autodoc) are in another directory, 11 | # add these directories to sys.path here. If the directory is relative to the 12 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
13 | # 14 | import os 15 | import sys 16 | sys.path.insert(0, os.path.abspath('../pyckmeans')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'pyckmeans' 22 | copyright = '2021, Tankred Ott' 23 | author = 'Tankred Ott' 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx_rtd_theme', 33 | 'sphinx.ext.autodoc', 34 | 'numpydoc', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | html_theme_options = { 41 | "collapse_navigation": False, 42 | } 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_theme = 'sphinx_rtd_theme' 56 | 57 | # Add any paths that contain custom static files (such as style sheets) here, 58 | # relative to this directory. They are copied after the builtin static files, 59 | # so a file named "default.css" will overwrite the builtin "default.css". 60 | html_static_path = ['_static'] 61 | 62 | # numpydoc_show_class_members = False 63 | -------------------------------------------------------------------------------- /pyckmeans/utils/progressbar.py: -------------------------------------------------------------------------------- 1 | ''' Progress bar utilities 2 | ''' 3 | 4 | from typing import Dict, Any 5 | 6 | import tqdm 7 | from pyckmeans.core import MultiCKMeans 8 | 9 | class MultiCKMeansProgressBars: 10 | '''MultiCKMeansProgressBars 11 | 12 | Context Manager for a MultiCKMeans progress bars. 13 | 14 | Parameters 15 | ---------- 16 | mckm : MultiCKMeans 17 | MultiCKMeans object to display progress bars for. 18 | kwargs : Dict[str, Any] 19 | Additional keyword arguments passed to tqdm.tqdm. 20 | ''' 21 | def __init__( 22 | self, 23 | mckm: MultiCKMeans, 24 | **kwargs: Dict[str, Any], 25 | ): 26 | self.mckm = mckm 27 | 28 | self.ks = mckm.k 29 | self.n_rep = mckm.n_rep 30 | 31 | self._ckm_idx = 0 32 | self._iter = 0 33 | self._done = False 34 | 35 | # tqdm options 36 | self._tqdm_kwargs = { 37 | } 38 | self._tqdm_kwargs.update(kwargs) 39 | 40 | # init first progress bar 41 | self._tqdm = tqdm.tqdm( 42 | total=self.n_rep, 43 | mininterval=0.5, 44 | desc=f'k={self.ks[self._ckm_idx]}', 45 | **self._tqdm_kwargs, 46 | ) 47 | 48 | def update( 49 | self, 50 | n: int = 1, 51 | ): 52 | '''update 53 | 54 | Update progress by n iterations. 
55 | 56 | Parameters 57 | ---------- 58 | n : int, optional 59 | Progress increment in iterations, by default 1 60 | ''' 61 | if self._done: 62 | return 63 | 64 | self._iter += n 65 | self._tqdm.update(n) 66 | 67 | if self._iter >= self.n_rep: 68 | self._tqdm.close() 69 | self._iter = 0 70 | self._ckm_idx += 1 71 | if self._ckm_idx >= len(self.ks): 72 | self._done = True 73 | else: 74 | self._tqdm = tqdm.tqdm( 75 | total=self.n_rep, 76 | desc=f'k={self.ks[self._ckm_idx]}', 77 | **self._tqdm_kwargs, 78 | ) 79 | 80 | def __enter__(self): 81 | return self 82 | 83 | def __exit__(self, exc_type, exc_value, exc_traceback): 84 | self._tqdm.close() 85 | return 86 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | ''' setup 2 | ''' 3 | 4 | import re 5 | import io 6 | from distutils.command.build_ext import build_ext as build_ext_orig 7 | from setuptools import setup, find_packages, Extension 8 | 9 | # source: https://stackoverflow.com/a/39671214 10 | __version__ = re.search( 11 | r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 12 | io.open('pyckmeans/__init__.py', encoding='utf_8_sig').read() 13 | ).group(1) 14 | 15 | # ==== ctypes extensions 16 | class CTypesExtension(Extension): 17 | '''CTypesExtension''' 18 | 19 | class build_ext(build_ext_orig): 20 | '''build_ext''' 21 | def build_extension(self, ext): 22 | self._ctypes = isinstance(ext, CTypesExtension) 23 | return super().build_extension(ext) 24 | 25 | def get_export_symbols(self, ext): 26 | if self._ctypes: 27 | return ext.export_symbols 28 | return super().get_export_symbols(ext) 29 | 30 | def get_ext_filename(self, ext_name): 31 | if self._ctypes: 32 | return ext_name + '.so' 33 | return super().get_ext_filename(ext_name) 34 | 35 | distance_module = CTypesExtension( 36 | 'pyckmeans.distance.lib.distance', 37 | sources=['pyckmeans/distance/src/distance.cpp'], 38 | language='c++', 39 | ) 40 | 41 | nucencode_module = CTypesExtension( 42 | 'pyckmeans.io.lib.nucencode', 43 | sources=['pyckmeans/io/src/nucencode.cpp'], 44 | language='c++', 45 | ) 46 | 47 | ext_modules = [ 48 | distance_module, 49 | nucencode_module, 50 | ] 51 | 52 | install_requires = [ 53 | 'numpy', 54 | 'pandas', 55 | 'scipy', 56 | 'scikit-learn', 57 | 'matplotlib', 58 | 'tqdm', 59 | ] 60 | 61 | # ==== 62 | description = 'A consensus K-Means implementation.' 
63 | 64 | long_description = io.open('README.md').read() 65 | long_description_content_type = 'text/markdown' 66 | # ==== 67 | setup( 68 | name='pyckmeans', 69 | version=__version__, 70 | packages=find_packages(), 71 | description=description, 72 | long_description=long_description, 73 | long_description_content_type=long_description_content_type, 74 | author='Tankred Ott', 75 | platforms=['any'], 76 | python_requires='>=3.6', 77 | install_requires=install_requires, 78 | cmdclass={'build_ext': build_ext}, 79 | ext_modules=ext_modules, 80 | url='https://github.com/TankredO/pyckmeans', 81 | ) 82 | -------------------------------------------------------------------------------- /pyckmeans/io/c_interop.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | from os import error 3 | import pathlib 4 | 5 | import numpy 6 | 7 | # load the shared library 8 | libfile = pathlib.Path(__file__).parent / 'lib' / 'nucencode.so' 9 | lib = ctypes.CDLL(str(libfile)) 10 | 11 | # == nucleotide encoding 12 | lib.encodeNucleotides.restype = None 13 | lib.encodeNucleotides.argtypes = [ 14 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 15 | dtype=numpy.uint8, 16 | ndim=2, 17 | flags='C_CONTIGUOUS', 18 | ), 19 | ctypes.c_int, # n: number of entries 20 | ctypes.c_int, # m: number of sites 21 | ] 22 | 23 | lib.encodeNucleotides_uint32.restype = None 24 | lib.encodeNucleotides_uint32.argtypes = [ 25 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 26 | dtype=numpy.uint32, 27 | ndim=2, 28 | flags='C_CONTIGUOUS', 29 | ), 30 | ctypes.c_int, # n: number of entries 31 | ctypes.c_int, # m: number of sites 32 | numpy.ctypeslib.ndpointer( # encodedAlignment: n * m matrix 33 | dtype=numpy.uint8, 34 | ndim=2, 35 | flags='C_CONTIGUOUS', 36 | ), 37 | ] 38 | 39 | def encode_nucleotides( 40 | alignment: numpy.ndarray, 41 | ) -> numpy.ndarray: 42 | '''encode_nucleotides 43 | 44 | Encode nucleotide alignment INPLACE. 45 | 46 | Parameters 47 | ---------- 48 | alignment : numpy.ndarray 49 | n*m numpy alignment, where n is the number of entries and m is 50 | the number of sites. Dtype must be 'U1' or 'S'. 51 | 52 | Returns 53 | ------- 54 | numpy.ndarray 55 | The encoded alignment. 56 | 57 | Raises 58 | ------ 59 | Exception 60 | Raised if alignment has invalid dtype. 61 | ''' 62 | if not alignment.flags['C_CONTIGUOUS']: 63 | alignment = numpy.ascontiguousarray(alignment) 64 | 65 | n, m = alignment.shape 66 | 67 | # ASCII encoding? 1 byte per character 68 | if alignment.dtype.type == numpy.dtype('S'): 69 | lib.encodeNucleotides(alignment.view(numpy.uint8), n, m) 70 | return alignment.view(numpy.uint8) 71 | # Unicode encoding. Expecting 4 bytes per character 72 | elif alignment.dtype.type == numpy.dtype('U'): 73 | alignment_encoded = numpy.zeros_like(alignment, dtype=numpy.uint8) 74 | lib.encodeNucleotides_uint32(alignment.view(numpy.uint32), n, m, alignment_encoded) 75 | return alignment_encoded 76 | else: 77 | msg = f'Can not encode sequences with dtype {alignment.dtype}.' 
78 | raise Exception(msg) 79 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_fasta.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | import os 4 | 5 | from pyckmeans.io import fasta 6 | from pyckmeans.io.fasta import InvalidFastaAlignmentError 7 | 8 | FASTA_STR_0 = \ 9 | ''' 10 | >Sample 0 11 | ACTGTCATG 12 | >Sample 1 13 | ACT--CATC 14 | ''' 15 | 16 | FASTA_STR_1 = \ 17 | ''' 18 | >Sample 0 19 | ACT GTC ATG 20 | >Sample 1 21 | ACT --C ATC 22 | ''' 23 | 24 | FASTA_STR_2 = \ 25 | ''' 26 | >Sample 0 27 | ACT 28 | GTC 29 | ATG 30 | >Sample 1 31 | ACT 32 | --C 33 | ATC 34 | ''' 35 | 36 | FASTA_STR_3 = \ 37 | ''' 38 | >Sample 0 39 | ACTGTCAT 40 | >Sample 1 41 | ACT--CATC 42 | ''' 43 | 44 | FASTA_STR_4 = \ 45 | ''' 46 | >Sample 0 47 | ACTGTCATA 48 | >Sample 1 49 | ACT--CAT 50 | ''' 51 | 52 | @pytest.fixture(scope='session') 53 | def prep_fasta_files(): 54 | with tempfile.TemporaryDirectory() as tempdir: 55 | print(f'Created temporary directory {tempdir}.') 56 | 57 | fasta_file_0 = os.path.join(tempdir, 'fasta_0.fasta') 58 | with open(fasta_file_0, 'w') as f: 59 | f.write(FASTA_STR_0) 60 | 61 | fasta_file_1 = os.path.join(tempdir, 'fasta_1.fasta') 62 | with open(fasta_file_1, 'w') as f: 63 | f.write(FASTA_STR_1) 64 | 65 | fasta_file_2 = os.path.join(tempdir, 'fasta_2.fasta') 66 | with open(fasta_file_2, 'w') as f: 67 | f.write(FASTA_STR_2) 68 | 69 | fasta_file_3 = os.path.join(tempdir, 'fasta_3.fasta') 70 | with open(fasta_file_3, 'w') as f: 71 | f.write(FASTA_STR_3) 72 | 73 | fasta_file_4 = os.path.join(tempdir, 'fasta_4.fasta') 74 | with open(fasta_file_4, 'w') as f: 75 | f.write(FASTA_STR_4) 76 | 77 | yield ( 78 | # should work 79 | fasta_file_0, 80 | fasta_file_1, 81 | fasta_file_2, 82 | 83 | # shouldn't work 84 | fasta_file_3, 85 | fasta_file_4, 86 | ) 87 | 88 | print(f'Deleted temporary directory {tempdir}.') 89 | 90 | def test_read_fasta_alignment(prep_fasta_files): 91 | r_0 = fasta.read_fasta_alignment(prep_fasta_files[0]) 92 | r_1 = fasta.read_fasta_alignment(prep_fasta_files[1]) 93 | r_2 = fasta.read_fasta_alignment(prep_fasta_files[2]) 94 | 95 | print('r_0', r_0) 96 | print('r_1', r_1) 97 | print('r_2', r_2) 98 | 99 | with pytest.raises(InvalidFastaAlignmentError): 100 | r_3 = fasta.read_fasta_alignment(prep_fasta_files[3]) 101 | with pytest.raises(InvalidFastaAlignmentError): 102 | r_4 = fasta.read_fasta_alignment(prep_fasta_files[4]) 103 | -------------------------------------------------------------------------------- /pyckmeans/io/fasta.py: -------------------------------------------------------------------------------- 1 | ''' fasta 2 | 3 | Module for reading and writing FASTA files. 4 | ''' 5 | 6 | 7 | import itertools 8 | import re 9 | from typing import Tuple, Union 10 | 11 | import numpy 12 | 13 | class InvalidFastaAlignmentError(Exception): 14 | '''InvalidFastaAlignmentError 15 | ''' 16 | 17 | WHITESPACE_RE = re.compile(r'\s+') 18 | 19 | def read_fasta_alignment( 20 | fasta_file: str, 21 | dtype: Union[str, numpy.dtype] = 'U', 22 | ) -> Tuple[numpy.ndarray, numpy.ndarray]: 23 | '''read_fasta_alignment 24 | 25 | Read fasta alignment file. This function expects the fasta to be a valid alignment, 26 | meaning that it should contain at least 2 sequences of the same length, including 27 | gaps. 28 | 29 | Parameters 30 | ---------- 31 | fasta_file : str 32 | Path to a fasta file. 
33 | dtype: Union[str, numpy.dtype] 34 | Data type to use for the sequence array. 35 | 36 | Returns 37 | ------- 38 | Tuple[numpy.ndarray, numpy.ndarray] 39 | Tuple of sequences and names, each as numpy array. 40 | 41 | Raises 42 | ------ 43 | InvalidFastaAlignmentError 44 | Raised if less than 2 sequences are present in fasta_file. 45 | InvalidFastaAlignmentError 46 | Raised if the sequences have different lengths. 47 | ''' 48 | 49 | names = [] 50 | seqs = [] 51 | first = True 52 | with open(fasta_file) as fasta_f: 53 | seq_buffer = [] 54 | for line in fasta_f: 55 | _line = line.strip() 56 | 57 | # empty line 58 | if not _line: 59 | continue 60 | 61 | # name line 62 | if _line[0] == '>': 63 | names.append(_line[1:]) 64 | if not first: 65 | seqs.append(list(itertools.chain(*seq_buffer))) 66 | seq_buffer = [] 67 | else: 68 | first = False 69 | # sequence line 70 | else: 71 | seq_buffer.append(re.sub(WHITESPACE_RE, '', _line).upper()) 72 | 73 | seqs.append(list(itertools.chain(*seq_buffer))) 74 | 75 | # check alignment validity 76 | n_seq = len(seqs) 77 | if len(seqs) < 2: 78 | msg = f'Expected at least 2 entries but found only {n_seq}.' 79 | raise InvalidFastaAlignmentError(msg) 80 | 81 | seq_len = len(seqs[0]) 82 | for i, seq in enumerate(seqs[1:]): 83 | cur_seq_len = len(seq) 84 | if cur_seq_len != seq_len: 85 | msg = f'Expected all sequences to have length {seq_len}' +\ 86 | f'(length of sequence #0) but sequence #{i+1} has length {cur_seq_len}.' 87 | raise InvalidFastaAlignmentError(msg) 88 | 89 | seqs = numpy.array(seqs, dtype=dtype) 90 | names = numpy.array(names) 91 | 92 | return seqs, names 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # vscode 141 | .vscode 142 | 143 | # test data 144 | *test*.png 145 | pyckmeans/tests/*.pickle 146 | pyckmeans/tests/*.csv 147 | pyckmeans/tests/*.tsv 148 | pyckmeans/tests/*.json 149 | -------------------------------------------------------------------------------- /pyckmeans/io/src/nucencode.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #define LIBRARY_API extern "C" __declspec(dllexport) 3 | #else 4 | #define LIBRARY_API extern "C" 5 | #endif 6 | 7 | #include <cstdint> // std::uint8_t, std::uint32_t 8 | #include <cstddef> // size_t 9 | 10 | /* 11 | * Base encoding as used by R package ape. 12 | * See http://ape-package.ird.fr/misc/BitLevelCodingScheme.html 13 | * 14 | * Summary: 15 | * Most significant four bits are base information (A, G, C, T) 16 | * 76543210 17 | * 0b00001000 -> base is known 18 | * 0b00000100 -> gap 19 | * 0b00000010 -> unknown base 20 | * 21 | * bases 22 | * A 0b10001000 23 | * G 0b01001000 24 | * C 0b00101000 25 | * T 0b00011000 26 | * 27 | * wobbles 28 | * R 0b11000000 A|G 29 | * M 0b10100000 A|C 30 | * W 0b10010000 A|T 31 | * S 0b01100000 G|C 32 | * K 0b01010000 G|T 33 | * Y 0b00110000 C|T 34 | * V 0b11100000 A|G|C 35 | * H 0b10110000 A|C|T 36 | * D 0b11010000 A|G|T 37 | * B 0b01110000 G|C|T 38 | * N 0b11110000 A|G|C|T 39 | * 40 | * gap 41 | * - 0b00000100 42 | * 43 | * unknown/missing state 44 | * ?
0b00000010 45 | * 46 | */ 47 | 48 | // ASCII code to nucleotide encoding map 49 | const std::uint8_t asciiToEncoding[128] = { 50 | // 0 1 2 3 4 5 6 7 8 9 51 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 000 52 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 010 53 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 020 54 | 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, // 030 55 | 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, // 040 56 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 050 57 | 0, 0, 0, 2, 0, 136, 112, 40, 208, 0, // 060 58 | 0, 72, 176, 0, 0, 80, 0, 160, 240, 0, // 070 59 | 0, 0, 192, 96, 24, 0, 224, 144, 0, 48, // 080 60 | 0, 0, 0, 0, 0, 0, 0, 136, 112, 40, // 090 61 | 208, 0, 0, 72, 176, 0, 0, 80, 0, 160, // 100 62 | 240, 0, 0, 0, 192, 96, 24, 0, 224, 144, // 110 63 | 0, 48, 0, 0, 0, 0, 4, 0 // 120 64 | }; 65 | 66 | 67 | // encode nucleotides in place 68 | LIBRARY_API void encodeNucleotides( 69 | std::uint8_t* alignment, // nucleotide alignment 70 | int n, // number of entries 71 | int m // number of sites 72 | ) { 73 | for (size_t i = 0; i < n; i++) { 74 | for (size_t j = 0; j < m; j++) { 75 | alignment[i * m + j] = asciiToEncoding[alignment[i * m + j]]; 76 | } 77 | } 78 | } 79 | 80 | LIBRARY_API void encodeNucleotides_uint32( 81 | std::uint32_t* alignment, // nucleotide alignment 82 | int n, // number of entries 83 | int m, // number of sites 84 | std::uint8_t* alignmentEncoded 85 | ) { 86 | for (size_t i = 0; i < n; i++) { 87 | for (size_t j = 0; j < m; j++) { 88 | alignmentEncoded[i * m + j] = asciiToEncoding[alignment[i * m + j]]; 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /pyckmeans/distance/tests/test_c_interop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | from pyckmeans.io import NucleotideAlignment 6 | from pyckmeans.distance.c_interop import \ 7 | p_distance,\ 8 | jc_distance,\ 9 | k2p_distance 10 | 11 | @pytest.fixture(scope='session') 12 | def prepare_alignments(): 13 | aln_0 = NucleotideAlignment( 14 | ['s0', 's1', 's2', 's3'], 15 | np.array([ 16 | ['A', 'C', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 17 | ['T', 'C', '-', 'G', 'C', 'C', 'T', 'T', 'G', 'A'], 18 | ['A', 'G', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'A'], 19 | ['A', 'C', 'T', 'A', 'A', 'A', 'T', 'A', 'G', 'A'], 20 | ]) 21 | ) 22 | p_d_0_pd = np.array([ 23 | [0.0000000, 0.2222222, 0.1000000, 0.3000000], 24 | [0.2222222, 0.0000000, 0.3333333, 0.5555556], 25 | [0.1000000, 0.3333333, 0.0000000, 0.4000000], 26 | [0.3000000, 0.5555556, 0.4000000, 0.0000000] 27 | ]) 28 | p_d_0_cd = np.array([ 29 | [0.0000000, 0.2222222, 0.1111111, 0.3333333], 30 | [0.2222222, 0.0000000, 0.3333333, 0.5555556], 31 | [0.1111111, 0.3333333, 0.0000000, 0.4444444], 32 | [0.3333333, 0.5555556, 0.4444444, 0.0000000] 33 | ]) 34 | jc_d_0_pd = np.array([ 35 | [0.0000000, 0.2635484, 0.1073256, 0.3831192], 36 | [0.2635484, 0.0000000, 0.4408400, 1.0124450], 37 | [0.1073256, 0.4408400, 0.0000000, 0.5716050], 38 | [0.3831192, 1.0124450, 0.5716050, 0.0000000] 39 | ]) 40 | jc_d_0_cd = np.array([ 41 | [0.0000000, 0.2635484, 0.1202570, 0.4408400], 42 | [0.2635484, 0.0000000, 0.4408400, 1.0124450], 43 | [0.1202570, 0.4408400, 0.0000000, 0.6734562], 44 | [0.4408400, 1.0124450, 0.6734562, 0.0000000] 45 | ]) 46 | k2p_d_0_pd = np.array([ 47 | [0.0000000, 0.2726039, 0.1084661, 0.3831192], 48 | [0.2726039, 0.0000000, 0.4773856, 1.0986123], 49 | [0.1084661, 0.4773856, 0.0000000, 0.5756463], 50 | [0.3831192, 1.0986123, 0.5756463, 0.0000000] 51 | ]) 52 | k2p_d_0_cd = np.array([ 53 | 
[0.0000000,0.2726039,0.1217201,0.4408400], 54 | [0.2726039,0.0000000,0.4773856,1.0986123], 55 | [0.1217201,0.4773856,0.0000000,0.6801182], 56 | [0.4408400,1.0986123,0.6801182,0.0000000], 57 | ]) 58 | 59 | 60 | return ( 61 | ( 62 | aln_0, 63 | { 64 | 'p_pd': p_d_0_pd, 'p_cd': p_d_0_cd, 65 | 'jc_pd': jc_d_0_pd, 'jc_cd': jc_d_0_cd, 66 | 'k2p_pd': k2p_d_0_pd, 'k2p_cd': k2p_d_0_cd, 67 | } 68 | ), 69 | ) 70 | 71 | def test_p_distance(prepare_alignments): 72 | eps = 0.001 73 | 74 | aln_0, d_expected_0 = prepare_alignments[0] 75 | p_d_0_pd_expected = d_expected_0['p_pd'] 76 | p_d_0_cd_expected = d_expected_0['p_cd'] 77 | 78 | print(aln_0.sequences) 79 | 80 | p_d_0_pd = p_distance(aln_0.sequences, True) 81 | print('p_d_0_pd:', p_d_0_pd) 82 | assert np.max(np.abs(p_d_0_pd - p_d_0_pd_expected)) < eps 83 | p_d_0_cd = p_distance(aln_0.sequences, False) 84 | print('p_d_0_cd:', p_d_0_cd) 85 | assert np.max(np.abs(p_d_0_cd - p_d_0_cd_expected)) < eps 86 | 87 | def test_jc_distance(prepare_alignments): 88 | eps = 0.001 89 | 90 | aln_0, d_expected_0 = prepare_alignments[0] 91 | jc_d_0_pd_expected = d_expected_0['jc_pd'] 92 | jc_d_0_cd_expected = d_expected_0['jc_cd'] 93 | 94 | print(aln_0.sequences) 95 | 96 | jc_d_0_pd = jc_distance(aln_0.sequences, True) 97 | print('jc_d_0_pd:', jc_d_0_pd) 98 | assert np.max(np.abs(jc_d_0_pd - jc_d_0_pd_expected)) < eps 99 | jc_d_0_cd = jc_distance(aln_0.sequences, False) 100 | print('jc_d_0_cd:', jc_d_0_cd) 101 | assert np.max(np.abs(jc_d_0_cd - jc_d_0_cd_expected)) < eps 102 | 103 | def test_k2p_distance(prepare_alignments): 104 | eps = 0.001 105 | 106 | aln_0, d_expected_0 = prepare_alignments[0] 107 | k2p_d_0_pd_expected = d_expected_0['k2p_pd'] 108 | k2p_d_0_cd_expected = d_expected_0['k2p_cd'] 109 | 110 | print(aln_0.sequences) 111 | 112 | k2p_d_0_pd = k2p_distance(aln_0.sequences, True) 113 | print('k2p_d_0_pd:', k2p_d_0_pd) 114 | assert np.max(np.abs(k2p_d_0_pd - k2p_d_0_pd_expected)) < eps 115 | k2p_d_0_cd = k2p_distance(aln_0.sequences, False) 116 | print('k2p_d_0_cd:', k2p_d_0_cd) 117 | assert np.max(np.abs(k2p_d_0_cd - k2p_d_0_cd_expected)) < eps 118 | -------------------------------------------------------------------------------- /pyckmeans/tests/manual_test.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import time 3 | import pickle 4 | 5 | import numpy as np 6 | import numpy.random as random 7 | import matplotlib.pyplot as plt 8 | try: 9 | import tqdm 10 | except: 11 | tqdm = None 12 | 13 | import pyckmeans 14 | import pyckmeans.utils 15 | 16 | if __name__ == '__main__': 17 | path = pathlib.Path(__file__).parent.absolute() 18 | 19 | p = 10 20 | 21 | n0 = 50 22 | x0 = np.random.normal(0, 2, (n0, p)) 23 | n1 = 50 24 | x1 = np.random.normal(-5, 1.5, (n1, p)) 25 | n2 = 50 26 | x2 = np.random.normal(5, 2, (n2, p)) 27 | 28 | x_0 = np.r_[x0, x1, x2] 29 | 30 | k = 3 31 | n_rep = 100 32 | p_feat = 0.5 33 | p_samp = 0.5 34 | 35 | 36 | ckm_0 = pyckmeans.CKmeans( 37 | k=k, 38 | n_rep=n_rep, 39 | p_samp=p_samp, 40 | p_feat=p_feat, 41 | metrics=[ 42 | 'sil', 43 | 'bic', 44 | 'db', 45 | 'ch', 46 | ], 47 | n_init=5, 48 | ) 49 | 50 | print('fitting ...') 51 | if tqdm: 52 | with tqdm.tqdm(total=n_rep) as bar: 53 | t0 = time.time() 54 | ckm_0.fit(x_0, progress_callback=bar.update) 55 | t1 = time.time() 56 | else: 57 | t0 = time.time() 58 | ckm_0.fit(x_0) 59 | t1 = time.time() 60 | 61 | print('predicting ...') 62 | if tqdm: 63 | with tqdm.tqdm(total=n_rep) as bar: 64 | t2 = time.time() 65 | ckm_0_res = 
ckm_0.predict(x_0, progress_callback=bar.update, return_cls=True) 66 | t3 = time.time() 67 | else: 68 | t2 = time.time() 69 | ckm_0_res = ckm_0.predict(x_0, return_cls=True) 70 | t3 = time.time() 71 | 72 | print(ckm_0_res.cmatrix) 73 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_00.tsv', one_hot=False, row_names=False, col_names=False) 74 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_10.tsv', one_hot=False, row_names=True, col_names=False) 75 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_01.tsv', one_hot=False, row_names=False, col_names=True) 76 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_11.tsv', one_hot=False, row_names=True, col_names=True) 77 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_00.tsv', one_hot=True, row_names=False, col_names=False) 78 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_10.tsv', one_hot=True, row_names=True, col_names=False) 79 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_01.tsv', one_hot=True, row_names=False, col_names=True) 80 | ckm_0_res.save_km_cls(path / 'ckm_0_res_km_cls_oh_11.tsv', one_hot=True, row_names=True, col_names=True) 81 | 82 | print(t1 - t0) 83 | print(t3 - t2) 84 | 85 | # fig, ax = plt.subplots(1,1) 86 | # ax.imshow(ckm_0_res.sort().cmatrix) 87 | # fig.savefig(path / 'manual_test_img0.png') 88 | 89 | fig = ckm_0_res.plot(figsize=(10, 10)) 90 | fig.savefig(path / 'manual_test_img0.png') 91 | 92 | fig = ckm_0_res.plot( 93 | names=np.arange(x_0.shape[0]).astype('str'), 94 | figsize=(10, 10), 95 | ) 96 | fig.savefig(path / 'manual_test_img1.png') 97 | 98 | fig = ckm_0_res.plot( 99 | names=np.arange(x_0.shape[0]).astype('str'), 100 | figsize=(10, 10), 101 | order=None, 102 | ) 103 | fig.savefig(path / 'manual_test_img2.png') 104 | 105 | print('sils:', ckm_0.sils) 106 | print('bics:', ckm_0.bics) 107 | print('dbs:', ckm_0.dbs) 108 | print('chs:', ckm_0.chs) 109 | 110 | ks = [2,3,4,5,6,7,8,9,10] 111 | n_rep = 100 112 | mckm_0 = pyckmeans.MultiCKMeans(k=ks, n_rep=n_rep) 113 | 114 | print('fitting multi ...') 115 | with pyckmeans.utils.MultiCKMeansProgressBars(mckm_0) as pb: 116 | mckm_0.fit(x_0, pb.update) 117 | 118 | with open(path / 'mckm_0.pickle', 'wb') as f: 119 | pickle.dump(mckm_0, f) 120 | 121 | print('predicting multi ...') 122 | with pyckmeans.utils.MultiCKMeansProgressBars(mckm_0) as pb: 123 | mckm_0_res = mckm_0.predict(x_0, progress_callback=pb.update) 124 | 125 | print('sils:', mckm_0_res.sils) 126 | print('bics:', mckm_0_res.bics) 127 | print('dbs:', mckm_0_res.dbs) 128 | print('chs:', mckm_0_res.chs) 129 | 130 | fig = mckm_0_res.plot_metrics(figsize=(10, 10)) 131 | fig.savefig(path / f'manual_test_img_metrics0.png') 132 | 133 | for k, ckm_res in zip(ks, mckm_0_res.ckmeans_results): 134 | fig = ckm_res.plot(figsize=(10, 10)) 135 | fig.savefig(path / f'manual_test_img_k-{k}.png') 136 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_csv.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from pyckmeans.io.csv import InvalidMatrixShapeError, read_csv_distmat, write_csv_distmat 9 | 10 | d_0 = np.array([ 11 | [0.0, 1.0, 2.0], 12 | [1.0, 0.0, 3.0], 13 | [2.0, 3.0, 0.0], 14 | ]) 15 | nm_0 = ['a', 'b', 'c'] 16 | df_0 = pd.DataFrame(d_0, columns=nm_0, index=nm_0) 17 | 18 | d_1 = np.array([ 19 | [0.0, 1.0, 2.0, 0.5], 20 | [1.0, 0.0, 3.0, 2.4], 21 | [2.0, 3.0, 0.0, 1.5], 22 | [0.5, 2.4, 1.5, 0.0] 23 | ]) 24 | nm_1 = 
['a', 'b', 'c', 'd'] 25 | df_1 = pd.DataFrame(d_1, columns=nm_1, index=nm_1) 26 | 27 | # invalid input 28 | d_2 = np.array([ 29 | [0.0, 1.0, 2.0], 30 | [1.0, 0.0, 3.0], 31 | [2.0, 3.0, 0.0], 32 | [0.5, 2.4, 1.5] 33 | ]) 34 | nm_2 = ['a', 'b', 'c'] 35 | df_2 = pd.DataFrame(d_2, columns=nm_2, index=['c', 'b', 'a', '0']) 36 | 37 | @pytest.fixture(scope='session') 38 | def prep_csv_files(): 39 | with tempfile.TemporaryDirectory() as tempdir: 40 | print(f'Created temporary directory {tempdir}.') 41 | 42 | csv_file_0 = os.path.join(tempdir, 'dist_0.csv') 43 | df_0.to_csv(csv_file_0) 44 | 45 | csv_file_1 = os.path.join(tempdir, 'dist_1.csv') 46 | df_1.to_csv(csv_file_1, index=None) 47 | 48 | csv_file_2 = os.path.join(tempdir, 'dist_2.csv') 49 | df_0.to_csv(csv_file_2, header=None) 50 | 51 | csv_file_3 = os.path.join(tempdir, 'dist_3.csv') 52 | df_0.to_csv(csv_file_3, index=None, header=None) 53 | 54 | csv_file_4 = os.path.join(tempdir, 'dist_4.csv') 55 | df_2.to_csv(csv_file_4, index=None, header=None) 56 | 57 | yield ( 58 | # should work 59 | (csv_file_0, d_0, nm_0), 60 | (csv_file_1, d_1, nm_1), 61 | (csv_file_2, d_0, nm_0), 62 | (csv_file_3, d_0, None), 63 | (csv_file_4, d_0, None), 64 | ) 65 | 66 | print(f'Deleted temporary directory {tempdir}.') 67 | 68 | @pytest.fixture(scope='session') 69 | def prep_outdir(): 70 | with tempfile.TemporaryDirectory() as tempdir: 71 | print(f'Created temporary directory {tempdir}.') 72 | 73 | yield tempdir 74 | 75 | print(f'Deleted temporary directory {tempdir}.') 76 | 77 | def test_csv(prep_csv_files, prep_outdir): 78 | eps = 0.00001 79 | 80 | csv_f_0, d_0_expected, nm_0_expected = prep_csv_files[0] 81 | dm_0 = read_csv_distmat(csv_f_0, 0, 0, ',') 82 | assert np.max(np.abs(dm_0.dist_mat - d_0_expected)) < eps 83 | assert all([a == b for a, b in zip(dm_0.names, nm_0_expected)]) 84 | csv_of_0 = os.path.join(prep_outdir, 'dist_0.csv') 85 | write_csv_distmat(dm_0, csv_of_0) 86 | dm_0_r = read_csv_distmat(csv_of_0, 0, 0, ',') 87 | assert np.max(np.abs(dm_0.dist_mat - dm_0_r.dist_mat)) < eps 88 | assert all([a == b for a, b in zip(dm_0.names, dm_0_r.names)]) 89 | 90 | csv_f_1, d_1_expected, nm_1_expected = prep_csv_files[1] 91 | dm_1 = read_csv_distmat(csv_f_1, 0, None, ',') 92 | assert np.max(np.abs(dm_1.dist_mat - d_1_expected)) < eps 93 | assert all([a == b for a, b in zip(dm_1.names, nm_1_expected)]) 94 | csv_of_1 = os.path.join(prep_outdir, 'dist_1.csv') 95 | write_csv_distmat(dm_1, csv_of_1) 96 | dm_1_r = read_csv_distmat(csv_of_1, 0, 0, ',') 97 | assert np.max(np.abs(dm_1.dist_mat - dm_1_r.dist_mat)) < eps 98 | assert all([a == b for a, b in zip(dm_1.names, dm_1_r.names)]) 99 | 100 | csv_f_2, d_2_expected, nm_2_expected = prep_csv_files[2] 101 | dm_2 = read_csv_distmat(csv_f_2, None, 0, ',') 102 | assert np.max(np.abs(dm_2.dist_mat - d_2_expected)) < eps 103 | assert all([a == b for a, b in zip(dm_2.names, nm_2_expected)]) 104 | 105 | csv_f_3, d_3_expected, nm_3_expected = prep_csv_files[3] 106 | dm_3 = read_csv_distmat(csv_f_3, None, None, ',') 107 | assert np.max(np.abs(dm_3.dist_mat - d_3_expected)) < eps 108 | assert dm_3.names == nm_3_expected 109 | csv_of_3 = os.path.join(prep_outdir, 'dist_3.csv') 110 | write_csv_distmat(dm_3, csv_of_3) 111 | dm_3_r = read_csv_distmat(csv_of_3) 112 | assert np.max(np.abs(dm_3.dist_mat - dm_3_r.dist_mat)) < eps 113 | 114 | with pytest.raises(FileExistsError): 115 | write_csv_distmat(dm_3, os.path.join(prep_outdir, 'dist_3.csv')) 116 | with pytest.raises(FileExistsError): 117 | d_path = os.path.join(prep_outdir, 
'SOMEDIR') 118 | os.mkdir(d_path) 119 | write_csv_distmat(dm_3, d_path) 120 | 121 | csv_f_4, d_4_expected, nm_4_expected = prep_csv_files[4] 122 | with pytest.raises(InvalidMatrixShapeError): 123 | read_csv_distmat(csv_f_4, None, None, ',') -------------------------------------------------------------------------------- /pyckmeans/io/csv.py: -------------------------------------------------------------------------------- 1 | ''' csv 2 | 3 | Comma Separated Value (CSV) input and output. 4 | ''' 5 | import os 6 | from typing import Optional 7 | 8 | import pandas 9 | 10 | import pyckmeans.distance 11 | 12 | class InvalidMatrixShapeError(Exception): 13 | '''InvalidMatrixShapeError''' 14 | 15 | class IncompatibleNamesError(Exception): 16 | '''IncompatibleNamesError''' 17 | 18 | def read_csv_distmat( # pylint: disable=missing-param-doc 19 | file_path: str, 20 | header: Optional[int] = 0, 21 | index_col: Optional[int] = 0, 22 | sep: str = ',', 23 | **kwargs, 24 | ) -> 'pyckmeans.distance.DistanceMatrix': 25 | '''read_csv_distmat 26 | 27 | Read distance matrix from CSV file. 28 | 29 | Parameters 30 | ---------- 31 | file_path : str 32 | Path to CSV file. 33 | header : Optional[int] 34 | Determines the row in the CSV file containing 35 | sample names. Is passed to pandas.read_csv(). By default 0, meaning 36 | the first row. 37 | index_col : Optional[int] 38 | Determines the index column. By default, the first column is expected 39 | to contain sample names. Passed to pandas.read_csv(). 40 | sep : str 41 | Column separator, be default ','. Passed to Passed to pandas.read_csv(). 42 | **kwargs 43 | Additional keyword arguments passed to pandas.read_csv(). 44 | Returns 45 | ------- 46 | pyckmeans.distance.DistanceMatrix 47 | DistanceMatrix object. 48 | 49 | Raises 50 | ------ 51 | InvalidMatrixShapeError 52 | Raised if matrix is not square. 53 | IncompatibleNamesError 54 | Raised if column and row names do not match. 55 | ''' 56 | dist_df = pandas.read_csv( 57 | file_path, 58 | header=header, 59 | index_col=index_col, 60 | sep=sep, 61 | **kwargs 62 | ) 63 | 64 | dist_mat = dist_df.values 65 | 66 | # distance matrix must be a square matrix 67 | if dist_mat.shape[0] != dist_mat.shape[1]: 68 | msg = 'Expected a square matrix but matrix has dimensions '+ \ 69 | f'{dist_mat.shape[0]}x{dist_mat.shape[1]}.' 70 | raise InvalidMatrixShapeError(msg) 71 | 72 | names = None 73 | # names are present in file 74 | if (not header is None) or (not index_col is None): 75 | # row and column names are present 76 | if (not header is None) and (not index_col is None): 77 | names_a = [nm.strip() for nm in dist_df.index.astype(str)] 78 | names_b = [nm.strip() for nm in dist_df.columns.astype(str)] 79 | 80 | # if row names and column names do not match, something 81 | # is probably wrong 82 | if not all([a == b for a, b in zip(names_a, names_b)]): 83 | raise IncompatibleNamesError('Column and row names do not match.') 84 | 85 | names = names_a 86 | # column names are present 87 | elif not header is None: 88 | names = [nm.strip() for nm in dist_df.columns.astype(str)] 89 | # row names are present 90 | elif not index_col is None: 91 | names = [nm.strip() for nm in dist_df.index.astype(str)] 92 | 93 | 94 | return pyckmeans.distance.DistanceMatrix( 95 | dist_mat, 96 | names, 97 | ) 98 | 99 | def write_csv_distmat( 100 | dist: 'pyckmeans.distance.DistanceMatrix', 101 | file_path: str, 102 | force: bool = False, 103 | ) -> None: 104 | '''write_csv_distmat 105 | 106 | Write DistanceMatrix object to CSV. 
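# A minimal round-trip sketch for the CSV helpers in this module, following the
# usage shown in the tests above; 'dist.csv' is a placeholder path.
import numpy as np
from pyckmeans.distance import DistanceMatrix
from pyckmeans.io.csv import read_csv_distmat, write_csv_distmat

dm = DistanceMatrix(
    np.array([
        [0.0, 1.0, 2.0],
        [1.0, 0.0, 3.0],
        [2.0, 3.0, 0.0],
    ]),
    names=['a', 'b', 'c'],
)
write_csv_distmat(dm, 'dist.csv')          # refuses to overwrite unless force=True
dm_loaded = read_csv_distmat('dist.csv')   # defaults: header=0, index_col=0, sep=','
assert np.allclose(dm.dist_mat, dm_loaded.dist_mat)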
107 | 108 | Parameters 109 | ---------- 110 | dist : pyckmeans.distance.DistanceMatrix 111 | DistanceMatrix object. 112 | file_path : str 113 | CSV file path. 114 | force : bool, optional 115 | Force overwrite if file_path already exists, by default False 116 | 117 | Raises 118 | ------ 119 | FileExistsError 120 | Raised if file at file_path already exists and force is False. 121 | FileExistsError 122 | Raised if file_path points to an existing directory. 123 | ''' 124 | if os.path.exists(file_path): 125 | if os.path.isfile(file_path) and not force: 126 | msg = f'File {file_path} already exists. If you want to overwrite ' +\ 127 | 'it run the function with force=True.' 128 | raise FileExistsError(msg) 129 | elif os.path.isdir(file_path): 130 | msg = f'A directory exists at path {file_path}.' 131 | raise FileExistsError(msg) 132 | 133 | dist_df = pandas.DataFrame( 134 | dist.dist_mat, 135 | columns=dist.names, 136 | index=dist.names, 137 | ) 138 | 139 | dist_df.to_csv(file_path, index_label='sample') 140 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_nucleotidealignment.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from pyckmeans.io.nucleotide_alignment import InvalidAlignmentCharacterError 4 | import numpy 5 | import pytest 6 | 7 | from pyckmeans.io import \ 8 | NucleotideAlignment, \ 9 | read_alignment, \ 10 | InvalidAlignmentFileExtensionError, \ 11 | InvalidAlignmentFileFormatError 12 | 13 | from test_fasta import prep_fasta_files 14 | from test_phylip import prep_phylip_files 15 | 16 | Bio = None 17 | try: 18 | import Bio 19 | from Bio import SeqIO, AlignIO 20 | except: 21 | warnings.warn('Could not test Biopython since it is not installed.') 22 | 23 | 24 | def test_simple(prep_fasta_files, prep_phylip_files): 25 | na_fa_0 = NucleotideAlignment.from_file(prep_fasta_files[0]) 26 | na_fa_1 = NucleotideAlignment.from_file(prep_fasta_files[1]) 27 | na_fa_2 = NucleotideAlignment.from_file(prep_fasta_files[2], 'fasta') 28 | na_fa_3 = NucleotideAlignment.from_file(prep_fasta_files[2], 'fasta', fast_encoding=True) 29 | 30 | assert (na_fa_2.sequences == na_fa_3.sequences).all() 31 | 32 | print('na_fa_0:', na_fa_0) 33 | print('na_fa_1:', na_fa_1) 34 | print('na_fa_2:', na_fa_2) 35 | 36 | na_phy_0 = NucleotideAlignment.from_file(prep_phylip_files[0]) 37 | na_phy_1 = NucleotideAlignment.from_file(prep_phylip_files[1]) 38 | na_phy_2 = NucleotideAlignment.from_file(prep_phylip_files[2], 'phylip') 39 | na_phy_3 = NucleotideAlignment.from_file(prep_phylip_files[2], 'phylip', fast_encoding=True) 40 | 41 | assert (na_phy_2.sequences == na_phy_3.sequences).all() 42 | 43 | print('na_phy_0:', na_phy_0) 44 | print('na_phy_1:', na_phy_1) 45 | print('na_phy_2:', na_phy_2) 46 | 47 | with pytest.raises(InvalidAlignmentFileFormatError): 48 | NucleotideAlignment.from_file(prep_fasta_files[0], 'xyz') 49 | with pytest.raises(InvalidAlignmentFileExtensionError): 50 | NucleotideAlignment.from_file('test.png', 'auto') 51 | 52 | na_phy_0_di = na_phy_0.drop_invariant_sites(in_place=False) 53 | assert not na_phy_0_di is na_phy_0 54 | na_phy_0_di = na_phy_0.drop_invariant_sites(in_place=True) 55 | assert na_phy_0_di is na_phy_0 56 | 57 | na_phy_0_cp = na_phy_0.copy() 58 | assert (na_phy_0_cp.names == na_phy_0.names).all() 59 | assert (na_phy_0_cp.sequences == na_phy_0.sequences).all() 60 | 61 | with pytest.raises(Exception): 62 | NucleotideAlignment( 63 | ['a', 'b'], 64 | numpy.array([ 65 | ['A', 
'C'], 66 | ['A', 'T'], 67 | ['A', 'T'] 68 | ]), 69 | fast_encoding=False 70 | ) 71 | 72 | with pytest.raises(InvalidAlignmentCharacterError): 73 | NucleotideAlignment( 74 | ['a', 'b', 'C'], 75 | numpy.array([ 76 | ['A', '3'], 77 | ['A', 'T'], 78 | ['A', 'T'] 79 | ]), 80 | fast_encoding=False, 81 | ) 82 | 83 | if not Bio is None: 84 | bio_aln = AlignIO.read(prep_fasta_files[0], format='fasta') 85 | aln_b = NucleotideAlignment.from_bp_seqio_records(bio_aln) 86 | print('aln_b:', aln_b) 87 | 88 | 89 | def test_read_alignment(prep_fasta_files, prep_phylip_files): 90 | na_fa_0 = read_alignment(prep_fasta_files[0]) 91 | na_fa_1 = read_alignment(prep_fasta_files[1]) 92 | na_fa_2 = read_alignment(prep_fasta_files[2], 'fasta') 93 | 94 | print('na_fa_0:', na_fa_0) 95 | print('na_fa_1:', na_fa_1) 96 | print('na_fa_2:', na_fa_2) 97 | 98 | na_phy_0 = read_alignment(prep_phylip_files[0]) 99 | na_phy_1 = read_alignment(prep_phylip_files[1]) 100 | na_phy_2 = read_alignment(prep_phylip_files[2], 'phylip') 101 | 102 | print('na_phy_0:', na_phy_0) 103 | print('na_phy_1:', na_phy_1) 104 | print('na_phy_2:', na_phy_2) 105 | 106 | with pytest.raises(InvalidAlignmentFileFormatError): 107 | read_alignment(prep_fasta_files[0], 'xyz') 108 | with pytest.raises(InvalidAlignmentFileExtensionError): 109 | read_alignment('test.png', 'auto') 110 | 111 | def test_utils(): 112 | na_0 = NucleotideAlignment( 113 | ['a', 'b', 'c', 'd', 'e'], 114 | numpy.array([ 115 | ['a', 't', 'a', 't', 't', 'g', 'c'], 116 | ['a', 'a', '-', 't', 't', 'g', 'c'], 117 | ['a', 'a', '-', 't', 't', 'g', 'c'], 118 | ['a', 't', 'a', 't', 'g', 'g', 'c'], 119 | ['a', 't', 'a', 't', 'g', 'g', 'c'], 120 | ]), 121 | ) 122 | na_0_0 = na_0[:2] 123 | assert na_0_0.shape == (2, na_0.shape[1]) 124 | assert (na_0_0.names == na_0.names[:2]).all() 125 | assert (na_0_0.sequences == na_0.sequences[:2]).all() 126 | 127 | na_0_1 = na_0[::2] 128 | assert na_0_1.shape == (3, na_0.shape[1]) 129 | assert (na_0_1.names == na_0.names[::2]).all() 130 | assert (na_0_1.sequences == na_0.sequences[::2]).all() 131 | 132 | na_0_2 = na_0[:4, :3] 133 | assert na_0_2.shape == (4, 3) 134 | assert (na_0_2.names == na_0.names[:4]).all() 135 | assert (na_0_2.sequences == na_0.sequences[:4, :3]).all() 136 | 137 | assert na_0.drop_invariant_sites().shape == (5, 3) 138 | -------------------------------------------------------------------------------- /pyckmeans/distance/c_interop.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pathlib 3 | 4 | import numpy 5 | 6 | # load the shared library 7 | libfile = pathlib.Path(__file__).parent / 'lib' / 'distance.so' 8 | lib = ctypes.CDLL(str(libfile)) 9 | 10 | # == p distance 11 | lib.pDistance.restype = None 12 | lib.pDistance.argtypes = [ 13 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 14 | dtype=numpy.uint8, 15 | ndim=2, 16 | flags='C_CONTIGUOUS', 17 | ), 18 | ctypes.c_int, # n: number of entries 19 | ctypes.c_int, # m: number of sites 20 | ctypes.c_int, # pairwiseDeletion 21 | numpy.ctypeslib.ndpointer( # (output) distMat: n * n distance matrixmatrix 22 | dtype=numpy.double, 23 | ndim=2, 24 | flags='C_CONTIGUOUS', 25 | ), 26 | ] 27 | 28 | def p_distance( 29 | alignment: numpy.ndarray, 30 | pairwise_deletion: bool = True, 31 | ) -> numpy.ndarray: 32 | '''p_distance 33 | 34 | Calculate p-distance for a nucleotide alignment. 
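# A short sketch of how p_distance is driven in the tests above: the raw
# characters are wrapped in a NucleotideAlignment (which encodes the bases),
# and its .sequences array is passed in together with the pairwise-deletion flag.
import numpy as np
from pyckmeans.io import NucleotideAlignment
from pyckmeans.distance.c_interop import p_distance

aln = NucleotideAlignment(
    ['s0', 's1'],
    np.array([
        ['A', 'C', 'T', 'G'],
        ['A', 'C', '-', 'G'],
    ]),
)
d_pairwise = p_distance(aln.sequences, True)    # pairwise deletion
d_complete = p_distance(aln.sequences, False)   # complete deletion
print(d_pairwise.shape)                         # (2, 2)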
35 | 36 | Parameters 37 | ---------- 38 | alignment : numpy.ndarray 39 | n*m numpy alignment, where n is the number of entries and m is 40 | the number of sites. Bases must be encoded in the format of 41 | pyckmeans.io.NucleotideAlignment. 42 | pairwise_deletion : bool, optional 43 | Calculate distances with pairwise-deletion in case of missing 44 | data, by default True 45 | 46 | Returns 47 | ------- 48 | numpy.ndarray 49 | n*n distance matrix. 50 | ''' 51 | if not alignment.flags['C_CONTIGUOUS']: 52 | alignment = numpy.ascontiguousarray(alignment) 53 | 54 | n, m = alignment.shape 55 | 56 | dist_mat = numpy.zeros((n, n), dtype=numpy.double) 57 | 58 | lib.pDistance(alignment, n, m, pairwise_deletion, dist_mat) 59 | 60 | return dist_mat 61 | 62 | # == Jukes-Cantor distance 63 | lib.jcDistance.restype = None 64 | lib.jcDistance.argtypes = [ 65 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 66 | dtype=numpy.uint8, 67 | ndim=2, 68 | flags='C_CONTIGUOUS', 69 | ), 70 | ctypes.c_int, # n: number of entries 71 | ctypes.c_int, # m: number of sites 72 | ctypes.c_int, # pairwiseDeletion 73 | numpy.ctypeslib.ndpointer( # (output) distMat: n * n distance matrixmatrix 74 | dtype=numpy.double, 75 | ndim=2, 76 | flags='C_CONTIGUOUS', 77 | ), 78 | ] 79 | 80 | def jc_distance( 81 | alignment: numpy.ndarray, 82 | pairwise_deletion: bool = True, 83 | ) -> numpy.ndarray: 84 | '''jc_distance 85 | 86 | Calculate Jukes-Cantor distance for a nucleotide alignment. 87 | 88 | Parameters 89 | ---------- 90 | alignment : numpy.ndarray 91 | n*m numpy alignment, where n is the number of entries and m is 92 | the number of sites. Bases must be encoded in the format of 93 | pyckmeans.io.NucleotideAlignment. 94 | pairwise_deletion : bool, optional 95 | Calculate distances with pairwise-deletion in case of missing 96 | data, by default True 97 | 98 | Returns 99 | ------- 100 | numpy.ndarray 101 | n*n distance matrix. 102 | ''' 103 | if not alignment.flags['C_CONTIGUOUS']: 104 | alignment = numpy.ascontiguousarray(alignment) 105 | 106 | n, m = alignment.shape 107 | 108 | dist_mat = numpy.zeros((n, n), dtype=numpy.double) 109 | 110 | lib.jcDistance(alignment, n, m, pairwise_deletion, dist_mat) 111 | 112 | return dist_mat 113 | 114 | # == Kimura 2-parameter distance 115 | lib.k2pDistance.restype = None 116 | lib.k2pDistance.argtypes = [ 117 | numpy.ctypeslib.ndpointer( # alignment: n * m matrix 118 | dtype=numpy.uint8, 119 | ndim=2, 120 | flags='C_CONTIGUOUS', 121 | ), 122 | ctypes.c_int, # n: number of entries 123 | ctypes.c_int, # m: number of sites 124 | ctypes.c_int, # pairwiseDeletion 125 | numpy.ctypeslib.ndpointer( # (output) distMat: n * n distance matrixmatrix 126 | dtype=numpy.double, 127 | ndim=2, 128 | flags='C_CONTIGUOUS', 129 | ), 130 | ] 131 | 132 | def k2p_distance( 133 | alignment: numpy.ndarray, 134 | pairwise_deletion: bool = True, 135 | ) -> numpy.ndarray: 136 | '''jc_distance 137 | 138 | Calculate Kimura 2-parameter distance for a nucleotide alignment. 139 | 140 | Parameters 141 | ---------- 142 | alignment : numpy.ndarray 143 | n*m numpy alignment, where n is the number of entries and m is 144 | the number of sites. Bases must be encoded in the format of 145 | pyckmeans.io.NucleotideAlignment. 146 | pairwise_deletion : bool, optional 147 | Calculate distances with pairwise-deletion in case of missing 148 | data, by default True 149 | 150 | Returns 151 | ------- 152 | numpy.ndarray 153 | n*n distance matrix. 
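# The Jukes-Cantor and Kimura 2-parameter variants share the calling convention
# of p_distance above; a small self-contained sketch:
import numpy as np
from pyckmeans.io import NucleotideAlignment
from pyckmeans.distance.c_interop import jc_distance, k2p_distance

aln = NucleotideAlignment(
    ['s0', 's1', 's2'],
    np.array([
        ['A', 'C', 'T', 'G', 'A'],
        ['A', 'C', '-', 'G', 'T'],
        ['A', 'C', 'T', 'G', 'T'],
    ]),
)
d_jc = jc_distance(aln.sequences, pairwise_deletion=True)
d_k2p = k2p_distance(aln.sequences, pairwise_deletion=False)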
154 | ''' 155 | if not alignment.flags['C_CONTIGUOUS']: 156 | alignment = numpy.ascontiguousarray(alignment) 157 | 158 | n, m = alignment.shape 159 | 160 | dist_mat = numpy.zeros((n, n), dtype=numpy.double) 161 | 162 | lib.k2pDistance(alignment, n, m, pairwise_deletion, dist_mat) 163 | 164 | return dist_mat 165 | -------------------------------------------------------------------------------- /pyckmeans/tests/manual_tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyckmeans import NucleotideAlignment, CKmeans\n", 10 | "from pyckmeans.io.nucleotide_alignment import BASE_ENCODING_INVERSE, BASE_ENCODING\n", 11 | "import numpy as np\n", 12 | "import time" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# import pprofile\n", 24 | "# profiler = pprofile.Profile()\n", 25 | "# with profiler:\n", 26 | "# # aln = NucleotideAlignment.from_file('../../docs/datasets/rhodanthemum_ct85_msl68.snps.phy')\n", 27 | "# aln = NucleotideAlignment.from_file('C:/Users/Tankr/Downloads/leu_reference_msl12.phy')\n", 28 | "# # Process profile content: generate a cachegrind file and send it to user.\n", 29 | "\n", 30 | "# # You can also write the result to the console:\n", 31 | "# profiler.print_stats()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": { 38 | "tags": [] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "# import pprofile\n", 43 | "# profiler = pprofile.Profile()\n", 44 | "# with profiler:\n", 45 | "# # aln = NucleotideAlignment.from_file('../../docs/datasets/rhodanthemum_ct85_msl68.snps.phy', fast_encoding=True)\n", 46 | "# aln2 = NucleotideAlignment.from_file('C:/Users/Tankr/Downloads/leu_reference_msl12.phy', fast_encoding=True)\n", 47 | "# # Process profile content: generate a cachegrind file and send it to user.\n", 48 | "\n", 49 | "# # You can also write the result to the console:\n", 50 | "# profiler.print_stats()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "= 0.6)+1) 56 | ] 57 | assert vectors_1_np.shape == vectors_1_np_expected.shape 58 | assert abs(vectors_1_np - vectors_1_np_expected).sum() < 0.0001 59 | 60 | vectors_1_pd = pcoares_0.get_vectors(filter_by='eigvals_rel_cum', filter_th=0.6, out_format='pd') 61 | assert vectors_1_pd.shape == vectors_1_np_expected.shape 62 | assert abs(vectors_1_pd.values - vectors_1_np_expected).sum() < 0.0001 63 | 64 | 65 | pcoares_1 = pcoa(prepare_distmats[1][0]) 66 | assert all([nm_a == nm_b for nm_a, nm_b in zip(pcoares_1.names, prepare_distmats[1][1])]) 67 | assert pcoares_1.vectors.shape[0] == prepare_distmats[1][0].shape[0] 68 | print('pcoares_0:', pcoares_1) 69 | print('pcoares_0.vectors:', pcoares_1.vectors) 70 | print('pcoares_0.values:', pcoares_1.values) 71 | print('pcoares_0.names:', pcoares_1.names) 72 | 73 | vectors_1_pd = pcoares_1.get_vectors(filter_by='eigvals_rel_cum', filter_th=0.6, out_format='pd') 74 | vectors_1_np_expected = pcoares_1.vectors[ 75 | :, 76 | :(np.argmax(pcoares_1.values['eigvals_rel_cum'].values >= 0.6)+1) 77 | ] 78 | assert vectors_1_pd.shape == vectors_1_np_expected.shape 79 | assert abs(vectors_1_pd.values - 
vectors_1_np_expected).sum() < 0.0001 80 | print(vectors_1_pd.index.values) 81 | assert np.all(vectors_1_pd.index.values == pcoares_1.names) 82 | 83 | x_2, _ = make_blobs(200, 3, centers= 3) 84 | d_2 = squareform(pdist(x_2)) 85 | pcoares_2 = pcoa(d_2) 86 | 87 | with pytest.raises(InvalidCorrectionTypeError): 88 | pcoa(d_2, correction='NONEXISTING_CORRECTION') 89 | 90 | def assert_pcoa_res_are_equal( 91 | a: PCOAResult, 92 | b: PCOAResult, 93 | eps: float=1e-8, 94 | ): 95 | assert (np.abs(a.vectors - b.vectors) < eps).all() 96 | assert (np.abs(a.values - b.values) < eps).values.all() 97 | assert (a.trace - b.trace) < eps or a.trace is b.trace 98 | assert a.trace_corr is b.trace_corr or (a.trace_corr - b.trace_corr) < eps 99 | assert a.correction == b.correction or a.correction is b.correction 100 | assert a.negative_eigvals == b.negative_eigvals 101 | 102 | @pytest.mark.parametrize('correction', [None, 'lingoes', 'cailliez']) 103 | def test_save_load(prepare_distmats, test_dir, correction): 104 | pcoa_res_0 = pcoa(prepare_distmats[0][0], correction=correction) 105 | print('correction:', correction) 106 | 107 | pcoa_res_0_json_file = os.path.join(test_dir, f'{correction}_pcoa_res_0.json') 108 | pcoa_res_0.to_json(pcoa_res_0_json_file) 109 | pcoa_res_0_l = PCOAResult.from_json(pcoa_res_0_json_file) 110 | assert_pcoa_res_are_equal(pcoa_res_0, pcoa_res_0_l) 111 | 112 | pcoa_res_0_dir = os.path.join(test_dir, f'{correction}_pcoa_res_0') 113 | pcoa_res_0.to_dir(pcoa_res_0_dir) 114 | pcoa_res_0_l = PCOAResult.from_dir(pcoa_res_0_dir) 115 | assert_pcoa_res_are_equal(pcoa_res_0, pcoa_res_0_l) 116 | 117 | pcoa_res_1 = pcoa(prepare_distmats[1][0], correction=correction) 118 | print('correction:', correction) 119 | 120 | pcoa_res_1_json_file = os.path.join(test_dir, f'{correction}_pcoa_res_1.json') 121 | pcoa_res_1.to_json(pcoa_res_1_json_file) 122 | pcoa_res_1_l = PCOAResult.from_json(pcoa_res_1_json_file) 123 | assert_pcoa_res_are_equal(pcoa_res_1, pcoa_res_1_l) 124 | 125 | pcoa_res_1_dir = os.path.join(test_dir, f'{correction}_pcoa_res_1') 126 | pcoa_res_1.to_dir(pcoa_res_1_dir) 127 | pcoa_res_1_l = PCOAResult.from_dir(pcoa_res_1_dir) 128 | assert_pcoa_res_are_equal(pcoa_res_1, pcoa_res_1_l) 129 | 130 | assert_pcoa_res_are_equal( 131 | pcoa_res_1, PCOAResult.from_json_str(pcoa_res_1.to_json()) 132 | ) 133 | 134 | with pytest.raises(Exception): 135 | pcoa_res_1.to_dir(pcoa_res_1_dir, force=False) 136 | with pytest.raises(Exception): 137 | PCOAResult.from_dir('NONEXISTING_DIR') 138 | 139 | 140 | def test_pcoa_result(prepare_distmats): 141 | pcoa_res_0 = pcoa(prepare_distmats[0][0]) 142 | 143 | with pytest.raises(InvalidFilterError): 144 | pcoa_res_0.get_vectors(filter_by='NONEXISTING_FILTER', filter_th=0.8) 145 | with pytest.raises(InvalidFilterError): 146 | pcoa_res_0.get_vectors(filter_by='eigvals_rel_cum') 147 | with pytest.raises(InvalidFilterError): 148 | pcoa_res_0.get_vectors(filter_th=0.8) 149 | with pytest.raises(InvalidOutFormatError): 150 | pcoa_res_0.get_vectors(filter_by='eigvals_rel_cum', filter_th=0.8, out_format='NONEXISTING_FORMAT') 151 | -------------------------------------------------------------------------------- /pyckmeans/tests/test_workflow.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pandas as pd 3 | import pytest 4 | import tempfile 5 | import os 6 | 7 | from pyckmeans.io import read_alignment 8 | from pyckmeans.distance import alignment_distance 9 | from pyckmeans.ordination import PCOAResult, pcoa 10 | 
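# A condensed sketch of the workflow exercised below in this test module
# (alignment -> distance -> PCoA -> consensus k-means); 'alignment.phy' is a
# placeholder path.
from pyckmeans.io import read_alignment
from pyckmeans.distance import alignment_distance
from pyckmeans.ordination import pcoa
from pyckmeans.core import CKmeans

aln = read_alignment('alignment.phy')     # PHYLIP or FASTA alignment
dist = alignment_distance(aln, 'p')       # p-distance matrix
pcoa_res = pcoa(dist, 'lingoes')          # PCoA with Lingoes correction
ckm = CKmeans(k=2, n_rep=50, n_init=2)
ckm.fit(pcoa_res)                         # a PCOAResult is accepted directly
ckm_res = ckm.predict(pcoa_res)
ckm_res.sort(in_place=True)
fig = ckm_res.plot()                      # consensus-matrix heatmap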
from pyckmeans.core import CKmeans, MultiCKMeans, WECR 11 | from pyckmeans.utils import plot_ckmeans_result, plot_multickmeans_metrics, MultiCKMeansProgressBars, plot_cmatrix 12 | 13 | import tqdm 14 | 15 | PHYLIP_STR_0 = \ 16 | '''10 14 17 | Sample0 ACTGTCATGAAGGA 18 | Sample1 ACT--CATCAAGGA 19 | Sample2 ACTCTCATGAAGGA 20 | Sample3 AGTCTCTTGAAGGA 21 | Sample4 AGT--CATGAACTG 22 | Sample5 ACTGTCATGAACTG 23 | Sample6 ACTC-CATCAACTG 24 | Sample7 AGGCTCCTGAACTG 25 | Sample8 ACTCTCTTTAACTG 26 | Sample9 TTTCTCACGAACTG 27 | ''' 28 | 29 | @pytest.fixture(scope='session') 30 | def prep_phylip_files(): 31 | with tempfile.TemporaryDirectory() as tempdir: 32 | print(f'Created temporary directory {tempdir}.') 33 | 34 | phylip_file_0 = os.path.join(tempdir, 'phylip_0.phy') 35 | with open(phylip_file_0, 'w') as f: 36 | f.write(PHYLIP_STR_0) 37 | 38 | yield ( 39 | phylip_file_0, 40 | ) 41 | 42 | print(f'Destroyed temporary directory {tempdir}.') 43 | 44 | @pytest.fixture(scope='session') 45 | def prep_pcoa_results(prep_phylip_files): 46 | na_0 = read_alignment(prep_phylip_files[0]) 47 | d_0_p = alignment_distance(na_0, 'p') 48 | pcoares_0 = pcoa(d_0_p, 'lingoes') 49 | 50 | return ( 51 | pcoares_0, 52 | ) 53 | 54 | def test_simple_workflow(prep_phylip_files): 55 | na_0 = read_alignment(prep_phylip_files[0]) 56 | d_0_p = alignment_distance(na_0, 'p') 57 | pcoares_0 = pcoa(d_0_p, 'lingoes') 58 | ckm_0 = CKmeans(k=2, n_rep=50, n_init=2) 59 | ckm_0.fit(pcoares_0.vectors) 60 | ckm_0_res = ckm_0.predict(pcoares_0.vectors) 61 | ckm_0_res.sort(in_place=True) 62 | 63 | print('pcoares_0.vectors', pcoares_0.vectors) 64 | print('ckm_0_res.cl:', ckm_0_res.cl) 65 | 66 | ckm_1 = CKmeans(k=2, n_rep=50, n_init=2) 67 | with tqdm.tqdm(total=ckm_1.n_rep) as pb: 68 | ckm_1.fit(pcoares_0, progress_callback=pb.update) 69 | with tqdm.tqdm(total=ckm_1.n_rep) as pb: 70 | ckm_1_res = ckm_1.predict(pcoares_0, progress_callback=pb.update) 71 | ckm_1_res.sort(in_place=True) 72 | ckm_1_res.recalculate_cluster_memberships(pcoares_0, linkage_type='complete') 73 | 74 | print('ckm_1_res.cl:', ckm_1_res.cl) 75 | print('ckm_1_res.names:', ckm_1_res.names) 76 | 77 | ckm_2 = CKmeans(k=2, n_rep=50, n_init=2) 78 | df = pd.DataFrame(pcoares_0.vectors, pcoares_0.names) 79 | ckm_2.fit(df) 80 | ckm_2_res = ckm_2.predict(df) 81 | ckm_2_res.sort(in_place=True) 82 | print('ckm_2_res.cl:', ckm_2_res.cl) 83 | print('ckm_2_res.names:', ckm_2_res.names) 84 | 85 | ckm_2_res_cls = ckm_2.predict(df, return_cls=True) 86 | assert ckm_2_res_cls.km_cls.shape == (ckm_2.n_rep, df.shape[0]) 87 | ckm_2_res_cls.sort() 88 | 89 | # test copy 90 | ckm_2_res_cls_cp = ckm_2_res_cls.copy() 91 | ckm_2_res_cls_cp.cl[0] = -1000 92 | ckm_2_res_cls_cp.km_cls[0,0] = -1000 93 | assert ckm_2_res_cls_cp.cl[0] != ckm_2_res_cls.cl[0] 94 | assert ckm_2_res_cls_cp.km_cls[0,0] != ckm_2_res_cls.km_cls[0,0] 95 | assert not ckm_2_res_cls_cp is ckm_2_res_cls 96 | 97 | # test recalculate cluster memberships 98 | ckm_2_res_cls_cp_rcm_1 = ckm_2_res_cls_cp.recalculate_cluster_memberships(df, 'average', in_place=False) 99 | ckm_2_res_cls_cp_rcm_2 = ckm_2_res_cls_cp.recalculate_cluster_memberships(df, 'average', in_place=True) 100 | assert ckm_2_res_cls_cp_rcm_2 is ckm_2_res_cls_cp 101 | assert not ckm_2_res_cls_cp_rcm_1 is ckm_2_res_cls_cp 102 | 103 | # test distance 104 | d_0_p_0 = alignment_distance(na_0, 'p') 105 | d_0_p_1 = na_0.distance('p') 106 | assert numpy.abs(d_0_p_0.dist_mat - d_0_p_1.dist_mat).sum() < 0.001 107 | 108 | def test_multi_workflow(prep_pcoa_results): 109 | pcoares_0: 
PCOAResult = prep_pcoa_results[0] 110 | mckm_0 = MultiCKMeans([2,3,3]) 111 | with MultiCKMeansProgressBars(mckm_0) as pb: 112 | mckm_0.fit(pcoares_0, progress_callback=pb.update) 113 | with MultiCKMeansProgressBars(mckm_0) as pb: 114 | mckm_0_res = mckm_0.predict(pcoares_0, progress_callback=pb.update) 115 | 116 | plot_multickmeans_metrics(mckm_0_res) 117 | mckm_0_res.plot_metrics() 118 | 119 | mckm_1 = MultiCKMeans([2,3,3]) 120 | mckm_1.fit(pcoares_0.vectors) 121 | mckm_1_res = mckm_1.predict(pcoares_0.vectors) 122 | plot_multickmeans_metrics(mckm_1_res) 123 | mckm_1_res.plot_metrics() 124 | 125 | mckm_2 = MultiCKMeans([2,3,3], n_rep=100) 126 | df = pd.DataFrame(pcoares_0.vectors, pcoares_0.names) 127 | mckm_2.fit(df) 128 | mckm_2_res = mckm_2.predict(df) 129 | plot_multickmeans_metrics(mckm_2_res) 130 | mckm_2_res.plot_metrics() 131 | 132 | mckm_2_res_cls = mckm_2.predict(df, return_cls=True) 133 | assert mckm_2_res_cls.ckmeans_results[0].km_cls.shape == (mckm_2.n_rep, df.shape[0]) 134 | mckm_2_res_cls.sort(0) 135 | 136 | def test_wecr_workflow(prep_pcoa_results): 137 | pcoares_0: PCOAResult = prep_pcoa_results[0] 138 | wecr_0 = WECR([2, 3]) 139 | 140 | with tqdm.tqdm(total=wecr_0.n_rep) as pb: 141 | wecr_0.fit(pcoares_0, progress_callback=pb.update) 142 | with tqdm.tqdm(total=wecr_0.n_rep) as pb: 143 | wecr_res_0 = wecr_0.predict(pcoares_0, progress_callback=pb.update) 144 | 145 | wecr_res_0.recalculate_cluster_memberships(pcoares_0, 'single') 146 | 147 | def test_plotting(prep_pcoa_results): 148 | pcoares_0 = prep_pcoa_results[0] 149 | ckm_0 = CKmeans(k=2, n_rep=10) 150 | ckm_0.fit(pcoares_0) 151 | ckm_0_res = ckm_0.predict(pcoares_0) 152 | 153 | ckm_0_res.sort() 154 | ord = ckm_0_res.order() 155 | ckm_0_res.reorder(ord) 156 | 157 | plot_ckmeans_result(ckm_0_res) 158 | plot_ckmeans_result(ckm_0_res, order=None) 159 | plot_ckmeans_result(ckm_0_res, order=ord) 160 | 161 | ckm_0_res.plot() 162 | ckm_0_res.plot(order=None) 163 | ckm_0_res.plot(order=ord) 164 | 165 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=ckm_0_res.names, order = None) 166 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=ckm_0_res.names, order = 'GW') 167 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=ckm_0_res.names, order = ord) 168 | plot_cmatrix(ckm_0_res.cmatrix, ckm_0_res.cl, names=None, order = None) 169 | -------------------------------------------------------------------------------- /pyckmeans/distance/__init__.py: -------------------------------------------------------------------------------- 1 | ''' distance 2 | 3 | Module for distance calculations. 4 | ''' 5 | 6 | from typing import Iterable, Optional, Tuple 7 | 8 | import numpy 9 | 10 | import pyckmeans.io 11 | 12 | from .c_interop import p_distance, jc_distance, k2p_distance 13 | 14 | class IncompatibleNamesError(Exception): 15 | '''IncompatibleNamesError''' 16 | 17 | class DistanceMatrix: 18 | '''__init__ 19 | 20 | Distance Matrix, optionally named. 21 | 22 | Parameters 23 | ---------- 24 | dist_mat : numpy.ndarray 25 | n*n distance matrix. 26 | names : Optional[Iterable[str]] 27 | Names, by default None. 28 | 29 | Raises 30 | ------ 31 | IncompatibleNamesError 32 | Raised if dimension of names and dist_mat are incompatible. 
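# A minimal construction sketch for the class documented here; names are
# optional and must match the matrix dimension.
import numpy
from pyckmeans.distance import DistanceMatrix

dm = DistanceMatrix(
    numpy.array([
        [0.0, 0.4],
        [0.4, 0.0],
    ]),
    names=['sample_a', 'sample_b'],
)
print(dm.shape)   # (2, 2)
print(dm.names)   # ['sample_a' 'sample_b']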
33 | ''' 34 | def __init__(self, dist_mat: numpy.ndarray, names: Optional[Iterable[str]] = None): 35 | self.dist_mat = dist_mat 36 | self.names = None 37 | 38 | if not names is None: 39 | n = dist_mat.shape[0] 40 | if len(names) != n: 41 | msg = f'Expected {n} names for {n}x{n} distance matrix ' +\ 42 | f'but {len(names)} were passed.' 43 | raise IncompatibleNamesError(msg) 44 | 45 | self.names = numpy.array(names) 46 | 47 | def __repr__(self) -> str: 48 | '''__repr__ 49 | 50 | Returns 51 | ------- 52 | str 53 | String representation. 54 | ''' 55 | return f'{repr(self.names)}\n{repr(self.dist_mat)}' 56 | 57 | @property 58 | def shape(self) -> Tuple[int]: 59 | '''shape 60 | 61 | Get matrix shape. 62 | 63 | Returns 64 | ------- 65 | Tuple[int] 66 | Matrix shape. 67 | ''' 68 | return self.dist_mat.shape 69 | 70 | @staticmethod 71 | def from_phylip(file_path: str) -> 'DistanceMatrix': 72 | '''from_phylip 73 | 74 | Read PHYLIP distance matrix. 75 | 76 | Returns 77 | ------- 78 | DistanceMatrix 79 | DistanceMatrix object. 80 | ''' 81 | return pyckmeans.io.phylip.read_phylip_distmat(file_path) 82 | 83 | @staticmethod 84 | def from_csv( # pylint: disable=missing-param-doc 85 | file_path: str, 86 | header: Optional[int] = 0, 87 | index_col: Optional[int] = 0, 88 | sep: str = ',', 89 | **kwargs, 90 | ) -> 'DistanceMatrix': 91 | '''read_csv_distmat 92 | 93 | Read distance matrix from CSV file. 94 | 95 | Parameters 96 | ---------- 97 | file_path : str 98 | Path to CSV file. 99 | header : Optional[int] 100 | Determines the row in the CSV file containing 101 | sample names. Is passed to pandas.read_csv(). By default 0, meaning 102 | the first row. 103 | index_col : Optional[int] 104 | Determines the index column. By default, the first column is expected 105 | to contain sample names. Passed to pandas.read_csv(). 106 | sep : str 107 | Column separator, be default ','. Passed to Passed to pandas.read_csv(). 108 | **kwargs 109 | Additional keyword arguments passed to pandas.read_csv(). 110 | Returns 111 | ------- 112 | pyckmeans.distance.DistanceMatrix 113 | DistanceMatrix object. 114 | ''' 115 | return pyckmeans.io.csv.read_csv_distmat( 116 | file_path=file_path, 117 | header=header, 118 | index_col=index_col, 119 | sep=sep, 120 | **kwargs, 121 | ) 122 | 123 | def to_phylip( 124 | self, 125 | file_path: str, 126 | force: bool = False, 127 | ): 128 | '''to_phylip 129 | 130 | Write distance matrix to file in PHYLIP matrix format. 131 | 132 | Parameters 133 | ---------- 134 | file_path : str 135 | Output file path. 136 | force : bool, optional 137 | Force overwrite if file exists, by default False 138 | ''' 139 | pyckmeans.io.phylip.write_phylip_distmat( 140 | dist=self, 141 | file_path=file_path, 142 | force=force, 143 | ) 144 | 145 | def to_csv( 146 | self, 147 | file_path: str, 148 | force: bool = False, 149 | ): 150 | '''to_csv 151 | 152 | Write DistanceMatrix object to CSV. 153 | 154 | Parameters 155 | ---------- 156 | file_path : str 157 | CSV file path. 
158 | force : bool, optional 159 | Force overwrite if file_path already exists, by default False 160 | ''' 161 | pyckmeans.io.csv.write_csv_distmat( 162 | dist=self, 163 | file_path=file_path, 164 | force=force, 165 | ) 166 | 167 | class InvalidDistanceTypeError(Exception): 168 | '''UnknownDistanceTypeError''' 169 | 170 | def alignment_distance( 171 | alignment: "pyckmeans.io.NucleotideAlignment", 172 | distance_type: str = 'p', 173 | pairwise_deletion: bool = True, 174 | ) -> DistanceMatrix: 175 | '''genetic_distance 176 | 177 | Calculate genetic distance based on a nucleotide alignment. 178 | 179 | Parameters 180 | ---------- 181 | alignment : pyckmeans.io.NucleotideAlignment 182 | Nucleotide alignment. 183 | distance_type : str, optional 184 | Type of genetic distance to calculate, by default 'p'. 185 | Available distance types are p-distances ('p'), 186 | Jukes-Cantor distances ('jc'), and Kimura 2-paramater distances 187 | ('k2p'). 188 | pairwise_deletion : bool 189 | Use pairwise deletion as action to deal with missing data. 190 | If False, complete deletion is applied. 191 | Gaps ("-", "~", " "), "?", and ambiguous bases are treated as 192 | missing data. 193 | Returns 194 | ------- 195 | DistanceMatrix 196 | n*n distance matrix. 197 | 198 | Raises 199 | ------ 200 | InvalidDistanceTypeError 201 | Raised if invalid distance_type is passed. 202 | ''' 203 | distance_type = distance_type.lower() 204 | if distance_type in ['p', 'raw']: 205 | return DistanceMatrix( 206 | p_distance(alignment.sequences, pairwise_deletion), 207 | alignment.names, 208 | ) 209 | elif distance_type in ['jc', 'jc69']: 210 | return DistanceMatrix( 211 | jc_distance(alignment.sequences, pairwise_deletion), 212 | alignment.names, 213 | ) 214 | elif distance_type in ['k2p', 'k80']: 215 | return DistanceMatrix( 216 | k2p_distance(alignment.sequences, pairwise_deletion), 217 | alignment.names, 218 | ) 219 | else: 220 | msg = f'Unknown distance type "{distance_type}".' 
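# A sketch of the accepted distance_type aliases handled above ('p'/'raw',
# 'jc'/'jc69', 'k2p'/'k80'); anything else raises InvalidDistanceTypeError.
# 'alignment.phy' is a placeholder path.
from pyckmeans.io import read_alignment
from pyckmeans.distance import alignment_distance, InvalidDistanceTypeError

aln = read_alignment('alignment.phy')
d_p = alignment_distance(aln, 'p')                   # proportion of differing sites
d_jc = alignment_distance(aln, 'jc69')               # Jukes-Cantor correction
d_k2p = alignment_distance(aln, 'k80', pairwise_deletion=False)
try:
    alignment_distance(aln, 'hamming')
except InvalidDistanceTypeError as err:
    print(err)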
221 | raise InvalidDistanceTypeError(msg) 222 | -------------------------------------------------------------------------------- /pyckmeans/core/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import pytest 3 | import tempfile 4 | import os 5 | 6 | import numpy as np 7 | from sklearn.datasets import make_blobs 8 | 9 | from pyckmeans.core.multickmeans import MultiCKMeans 10 | from pyckmeans.core.ckmeans import CKmeans, CKmeansResult, InvalidClusteringMetric 11 | from pyckmeans.core.wecr import WECR, WECRResult, InvalidConstraintsError, InvalidKError 12 | 13 | @pytest.fixture(scope='session') 14 | def test_dir(): 15 | with tempfile.TemporaryDirectory() as tempdir: 16 | 17 | yield tempdir 18 | 19 | print(f'Deleted temporary directory {tempdir}.') 20 | 21 | def assert_ckm_res_equal(a: CKmeansResult, b: CKmeansResult, eps=1e-8): 22 | assert (np.abs(a.cmatrix - b.cmatrix) < eps).all() 23 | assert (a.cl == b.cl).all() 24 | assert (a.bic is b.bic) or (a.bic - b.bic) < eps 25 | assert (a.db is b.db) or (a.db - b.db) < eps 26 | assert (a.sil is b.sil) or (a.sil - b.sil) < eps 27 | assert (a.ch is b.ch) or (a.ch - b.ch) < eps 28 | assert (a.names == b.names).all() 29 | assert a.km_cls is b.km_cls or \ 30 | (np.abs(a.km_cls - b.km_cls) < eps).all() 31 | 32 | def assert_wecr_res_equal(a: WECRResult, b: WECRResult, eps=1e-8): 33 | assert (np.abs(a.cmatrix - b.cmatrix) < eps).all() 34 | assert (a.cl == b.cl).all() 35 | assert (a.bic is b.bic) or ((a.bic - b.bic) < eps).all() 36 | assert (a.db is b.db) or ((a.db - b.db) < eps).all() 37 | assert (a.sil is b.sil) or ((a.sil - b.sil) < eps).all() 38 | assert (a.ch is b.ch) or ((a.ch - b.ch) < eps).all() 39 | assert (a.names == b.names).all() 40 | assert a.km_cls is b.km_cls or \ 41 | (np.abs(a.km_cls - b.km_cls) < eps).all() 42 | 43 | def test_simple(): 44 | ckm_0 = CKmeans(2) 45 | ckm_1 = CKmeans(np.array(3, dtype=int)) 46 | ckm_2 = CKmeans(np.array(3, dtype=np.int64)) 47 | 48 | def test_ckmeans(): 49 | x_0, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15], shuffle=False) 50 | ckm_0 = CKmeans(3, metrics=['sil', 'bic', 'db', 'ch']) 51 | ckm_0.fit(x_0) 52 | ckm_res_0 = ckm_0.predict(x_0) 53 | 54 | with pytest.raises(InvalidClusteringMetric): 55 | CKmeans(3, metrics=['NONEXISTENT_METRIC']) 56 | 57 | def test_wecr(): 58 | x_0, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15], shuffle=False) 59 | wecr_0 = WECR([2,3,4,5], 100) 60 | wecr_0.fit(x_0) 61 | 62 | wecr_res_0 = wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 99]]) 63 | with pytest.raises(InvalidConstraintsError): 64 | wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 100]]) 65 | with pytest.raises(InvalidConstraintsError): 66 | wecr_0.predict(x_0, must_link=[[0, 1], [101, 2]], must_not_link=[[0, 51], [5, 99]]) 67 | with pytest.raises(InvalidConstraintsError): 68 | wecr_0.predict(x_0, must_link=[['a', 'b'], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 69 | with pytest.raises(InvalidConstraintsError): 70 | wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[['a', 'b'], ['c', 'd']]) 71 | with pytest.raises(InvalidConstraintsError): 72 | wecr_0.predict(x_0, must_link=[[0, 1, 0, 2]], must_not_link=[[0, 51], [5, 99]]) 73 | with pytest.raises(InvalidConstraintsError): 74 | wecr_0.predict(x_0, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51, 5, 99]]) 75 | 76 | x_1 = pandas.DataFrame(x_0) 77 | wecr_0.fit(x_1) 78 | wecr_res_1 = wecr_0.predict(x_1, 
must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 99]]) 79 | wecr_res_1 = wecr_0.predict(x_1, must_link=[['0', '1'], ['0', '2']], must_not_link=[[0, 51], [5, 99]]) 80 | wecr_res_1 = wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[['0', '51'], ['5', '99']]) 81 | with pytest.raises(InvalidConstraintsError): 82 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 100]]) 83 | with pytest.raises(InvalidConstraintsError): 84 | wecr_0.predict(x_1, must_link=[['a', 'b'], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 85 | with pytest.raises(InvalidConstraintsError): 86 | wecr_0.predict(x_1, must_link=[[0, 'b'], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 87 | with pytest.raises(InvalidConstraintsError): 88 | wecr_0.predict(x_1, must_link=[[0, 1], ['c', 'd']], must_not_link=[[0, 51], [5, 99]]) 89 | with pytest.raises(InvalidConstraintsError): 90 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 'd']], must_not_link=[[0, 51], [5, 99]]) 91 | with pytest.raises(InvalidConstraintsError): 92 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[['a', 'b'], ['c', 'd']]) 93 | with pytest.raises(InvalidConstraintsError): 94 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51], [5, 'd']]) 95 | with pytest.raises(InvalidConstraintsError): 96 | wecr_0.predict(x_1, must_link=[[0, 1, 0, 2]], must_not_link=[[0, 51], [5, 99]]) 97 | with pytest.raises(InvalidConstraintsError): 98 | wecr_0.predict(x_1, must_link=[[0, 1], [0, 2]], must_not_link=[[0, 51, 5, 99]]) 99 | 100 | wecr_res_1_ro = wecr_res_1.reorder(wecr_res_1.order(linkage_type='single')) 101 | wecr_res_1_sort = wecr_res_1.sort(linkage_type='single') 102 | assert_wecr_res_equal(wecr_res_1_ro, wecr_res_1_sort) 103 | 104 | wecr_res_1.plot(2) 105 | wecr_res_1.plot_metrics() 106 | wecr_res_1.plot_affinity_propagation() 107 | 108 | wecr_res_1_rcm = wecr_res_1.recalculate_cluster_memberships(x_1, 'average', in_place=True) 109 | assert wecr_res_1_rcm is wecr_res_1 110 | 111 | wecr_res_1_rcm = wecr_res_1.recalculate_cluster_memberships(x_1, 'average', in_place=False) 112 | assert not wecr_res_1_rcm is wecr_res_1 113 | 114 | cl = wecr_res_1.get_cl(2, with_names=False) 115 | cl = wecr_res_1.get_cl(2, with_names=True) 116 | cl = wecr_res_1.get_cl_affinity_propagation(with_names=False) 117 | cl = wecr_res_1.get_cl_affinity_propagation(with_names=True) 118 | with pytest.raises(InvalidKError): 119 | wecr_res_1.get_cl(12500) 120 | with pytest.raises(InvalidKError): 121 | wecr_res_1.get_cl(-1) 122 | 123 | def test_multickmeans(): 124 | x_0, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15], shuffle=False) 125 | mckm_0 = MultiCKMeans([2,3,4,5], n_rep=25) 126 | mckm_0.fit(x_0) 127 | mckm_res_0 = mckm_0.predict(x_0) 128 | mckm_res_0_ro = mckm_res_0.reorder(mckm_res_0.order(2), in_place=True) 129 | assert mckm_res_0 is mckm_res_0_ro 130 | 131 | mckm_res_0_ro = mckm_res_0.reorder(mckm_res_0.order(2), in_place=False) 132 | assert not mckm_res_0 is mckm_res_0_ro 133 | 134 | with pytest.raises(InvalidClusteringMetric): 135 | MultiCKMeans([2,3,4,5], n_rep=25, metrics=['NONEXISTENT_METRIC']) 136 | 137 | @pytest.mark.parametrize('return_cls', [True, False]) 138 | def test_save_load_ckm_res(test_dir, return_cls): 139 | x, _ = make_blobs(100, 5, centers=3) 140 | ckm = CKmeans(3, 20) 141 | ckm.fit(x) 142 | ckm_res = ckm.predict(x, return_cls=return_cls) 143 | 144 | if return_cls: 145 | ckm_res_km_cls_file = os.path.join(test_dir, f'{return_cls}_ckm_res_km_cls.txt') 146 | 
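# A sketch of the persistence round trips exercised by this test: JSON string,
# JSON file, and directory form; output paths are placeholders.
from sklearn.datasets import make_blobs
from pyckmeans.core.ckmeans import CKmeans, CKmeansResult

x, _ = make_blobs(100, 5, centers=3)
ckm = CKmeans(3, 20)
ckm.fit(x)
res = ckm.predict(x, return_cls=True)

res.to_json('ckm_res.json')                           # write JSON file
loaded = CKmeansResult.from_json('ckm_res.json')
same = CKmeansResult.from_json_str(res.to_json())     # via in-memory JSON string
res.to_dir('ckm_res_dir')                             # directory form
loaded_dir = CKmeansResult.from_dir('ckm_res_dir')
res.save_km_cls('km_cls.tsv', one_hot=True)           # per-replicate k-means labels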
ckm_res.save_km_cls(ckm_res_km_cls_file, one_hot=False) 147 | ckm_res.save_km_cls(ckm_res_km_cls_file, one_hot=True) 148 | 149 | assert_ckm_res_equal( 150 | ckm_res, CKmeansResult.from_json_str(ckm_res.to_json()) 151 | ) 152 | 153 | ckm_res_json_file = os.path.join(test_dir, f'{return_cls}_ckm_res.json') 154 | ckm_res.to_json(ckm_res_json_file) 155 | ckm_res_l = CKmeansResult.from_json(ckm_res_json_file) 156 | 157 | assert_ckm_res_equal(ckm_res, ckm_res_l) 158 | 159 | ckm_res_dir = os.path.join(test_dir, f'{return_cls}_ckm_res') 160 | ckm_res.to_dir(ckm_res_dir) 161 | ckm_res.to_dir(ckm_res_dir, force=True) 162 | with pytest.raises(Exception): 163 | ckm_res.to_dir(ckm_res_dir, force=False) 164 | ckm_res_l = CKmeansResult.from_dir(ckm_res_dir) 165 | assert_ckm_res_equal(ckm_res, ckm_res_l) 166 | 167 | with pytest.raises(Exception): 168 | CKmeansResult.from_dir('SOME_NONEXISTENT_DIR') 169 | 170 | @pytest.mark.parametrize('return_cls', [True, False]) 171 | def test_save_load_wecr_res(test_dir, return_cls): 172 | x, _ = make_blobs(100, 5, centers=3, center_box=[-15, 15]) 173 | wecr = WECR([2,3,4,5], 100) 174 | wecr.fit(x) 175 | wecr_res = wecr.predict(x, return_cls=return_cls) 176 | 177 | if return_cls: 178 | wecr_res_km_cls_file = os.path.join(test_dir, f'{return_cls}_wecr_res_km_cls.txt') 179 | wecr_res.save_km_cls(wecr_res_km_cls_file, one_hot=False) 180 | wecr_res.save_km_cls(wecr_res_km_cls_file, one_hot=True) 181 | 182 | assert_wecr_res_equal( 183 | wecr_res, WECRResult.from_json_str(wecr_res.to_json()) 184 | ) 185 | 186 | wecr_res_json_file = os.path.join(test_dir, f'{return_cls}_wecr_res.json') 187 | wecr_res.to_json(wecr_res_json_file) 188 | wecr_res_l = WECRResult.from_json(wecr_res_json_file) 189 | assert_wecr_res_equal(wecr_res, wecr_res_l) 190 | 191 | wecr_res_dir = os.path.join(test_dir, f'{return_cls}_wecr_res') 192 | wecr_res.to_dir(wecr_res_dir) 193 | wecr_res.to_dir(wecr_res_dir, force=True) 194 | with pytest.raises(Exception): 195 | wecr_res.to_dir(wecr_res_dir, force=False) 196 | wecr_res_l = WECRResult.from_dir(wecr_res_dir) 197 | assert_wecr_res_equal(wecr_res, wecr_res_l) 198 | 199 | with pytest.raises(Exception): 200 | WECRResult.from_dir('SOME_NONEXISTENT_DIR') 201 | -------------------------------------------------------------------------------- /pyckmeans/io/tests/test_phylip.py: -------------------------------------------------------------------------------- 1 | from pyckmeans.distance import DistanceMatrix 2 | import pytest 3 | import tempfile 4 | import os 5 | 6 | import numpy as np 7 | 8 | from pyckmeans.io import phylip 9 | from pyckmeans.io.phylip import InvalidPhylipAlignmentError, InvalidPhylipMatrixError, read_phylip_distmat, write_phylip_distmat, IncompatibleNamesError 10 | 11 | 12 | # ==== alignment 13 | 14 | PHYLIP_STR_0 = \ 15 | '''2 9 16 | Sample0 ACTGTCATG 17 | Sample1 ACT--CATC 18 | ''' 19 | 20 | PHYLIP_STR_1 = \ 21 | '''2 9 22 | Sample0 ACTGT CATG 23 | Sample1 ACT-- CATC 24 | ''' 25 | 26 | PHYLIP_STR_2 = \ 27 | '''2 9 28 | Sample0 ACTGT CATG 29 | 30 | Sample1 ACT-- CATC 31 | 32 | ''' 33 | 34 | PHYLIP_STR_3 = \ 35 | '''2 9 3 36 | Sample0 ACTGTCATG 37 | Sample1 ACT--CATC 38 | ''' 39 | 40 | PHYLIP_STR_4 = \ 41 | '''2 8 42 | Sample0 ACTGTCATG 43 | Sample1 ACT--CATC 44 | Sample2 ACTTGCATC 45 | ''' 46 | 47 | PHYLIP_STR_5 = \ 48 | '''1 9 49 | Sample0 ACTGTCATG 50 | ''' 51 | 52 | @pytest.fixture(scope='session') 53 | def prep_phylip_files(): 54 | with tempfile.TemporaryDirectory() as tempdir: 55 | print(f'Created temporary directory {tempdir}.') 56 | 57 | 
phylip_file_0 = os.path.join(tempdir, 'phylip_0.phy') 58 | with open(phylip_file_0, 'w') as f: 59 | f.write(PHYLIP_STR_0) 60 | 61 | phylip_file_1 = os.path.join(tempdir, 'phylip_1.phy') 62 | with open(phylip_file_1, 'w') as f: 63 | f.write(PHYLIP_STR_1) 64 | 65 | phylip_file_2 = os.path.join(tempdir, 'phylip_2.phy') 66 | with open(phylip_file_2, 'w') as f: 67 | f.write(PHYLIP_STR_2) 68 | 69 | phylip_file_3 = os.path.join(tempdir, 'phylip_3.phy') 70 | with open(phylip_file_3, 'w') as f: 71 | f.write(PHYLIP_STR_3) 72 | 73 | phylip_file_4 = os.path.join(tempdir, 'phylip_4.phy') 74 | with open(phylip_file_4, 'w') as f: 75 | f.write(PHYLIP_STR_4) 76 | 77 | phylip_file_5 = os.path.join(tempdir, 'phylip_5.phy') 78 | with open(phylip_file_5, 'w') as f: 79 | f.write(PHYLIP_STR_5) 80 | 81 | yield ( 82 | # should work 83 | phylip_file_0, 84 | phylip_file_1, 85 | phylip_file_2, 86 | 87 | # shouldn't work 88 | phylip_file_3, 89 | phylip_file_4, 90 | phylip_file_5, 91 | ) 92 | 93 | print(f'Deleted temporary directory {tempdir}.') 94 | 95 | def test_read_phylip_alignment(prep_phylip_files): 96 | r_0 = phylip.read_phylip_alignment(prep_phylip_files[0]) 97 | r_1 = phylip.read_phylip_alignment(prep_phylip_files[1]) 98 | r_2 = phylip.read_phylip_alignment(prep_phylip_files[2]) 99 | 100 | print('r_0', r_0) 101 | print('r_1', r_1) 102 | print('r_2', r_2) 103 | 104 | with pytest.raises(InvalidPhylipAlignmentError): 105 | r_3 = phylip.read_phylip_alignment(prep_phylip_files[3]) 106 | with pytest.raises(InvalidPhylipAlignmentError): 107 | r_4 = phylip.read_phylip_alignment(prep_phylip_files[4]) 108 | with pytest.raises(InvalidPhylipAlignmentError): 109 | r_5 = phylip.read_phylip_alignment(prep_phylip_files[5]) 110 | 111 | 112 | 113 | # ==== distance 114 | 115 | PHYLIP_DIST_STR_0 = \ 116 | '''4 117 | Sample0 0.00 0.90 0.80 0.30 118 | Sample1 0.90 0.00 0.40 0.70 119 | Sample2 0.80 0.40 0.00 0.50 120 | Sample3 0.30 0.70 0.50 0.00 121 | ''' 122 | 123 | PHYLIP_DIST_STR_1 = \ 124 | '''4 125 | Sample0 126 | Sample1 0.90 127 | Sample2 0.80 0.40 128 | Sample3 0.30 0.70 0.50 129 | ''' 130 | 131 | PHYLIP_DIST_STR_2 = \ 132 | '''5 133 | Sample0 134 | Sample1 0.90 135 | Sample2 0.80 0.40 136 | Sample3 0.30 0.70 0.50 137 | ''' 138 | 139 | PHYLIP_DIST_STR_3 = \ 140 | '''4 141 | Sample0 142 | Sample1 0.90 143 | Sample2 0.80 144 | Sample3 0.30 0.70 0.50 145 | ''' 146 | 147 | PHYLIP_DIST_STR_4 = \ 148 | '''5 149 | Sample0 0.00 0.90 0.80 0.30 150 | Sample1 0.90 0.00 0.40 0.70 151 | Sample2 0.80 0.40 0.00 0.50 152 | Sample3 0.30 0.70 0.50 0.00 153 | ''' 154 | 155 | PHYLIP_DIST_STR_5 = \ 156 | '''4 157 | Sample0 0.00 0.90 0.80 0.30 158 | Sample1 0.90 0.40 0.70 159 | Sample2 0.80 0.40 0.00 0.50 160 | Sample3 0.30 0.70 0.50 0.00 161 | ''' 162 | 163 | PHYLIP_DIST_STR_6 = \ 164 | '''4 165 | 166 | Sample0 0.00 0.90 0.80 0.30 167 | Sample1 0.90 0.00 0.40 0.70 168 | Sample2 0.80 0.40 0.00 0.50 169 | Sample3 0.30 0.70 0.50 0.00 170 | ''' 171 | 172 | PHYLIP_DIST_STR_7 = \ 173 | '''X 174 | Sample0 0.00 0.90 0.80 0.30 175 | 176 | Sample1 0.90 0.00 0.40 0.70 177 | Sample2 0.80 0.40 0.00 0.50 178 | Sample3 0.30 0.70 0.50 0.00 179 | ''' 180 | 181 | PHYLIP_DIST_STR_8 = \ 182 | '''4 183 | Sample0 0.00 0.90 0.80 0.30 184 | 185 | Sample1 0.90 0.00 0.40 0.70 186 | Sample2 0.80 0.40 0.00 0.50 187 | Sample3 0.30 0.70 0.50 0.00 188 | ''' 189 | 190 | PHYLIP_DIST_STR_9 = \ 191 | '''4 192 | Sample0 193 | 194 | Sample1 0.90 195 | Sample2 0.80 196 | Sample3 0.30 0.70 0.50 197 | ''' 198 | 199 | @pytest.fixture(scope='session') 200 | def 
prep_phylip_dist_files(): 201 | with tempfile.TemporaryDirectory() as tempdir: 202 | print(f'Created temporary directory {tempdir}.') 203 | 204 | phylip_dist_file_0 = os.path.join(tempdir, 'phylip_dist_0.dist') 205 | with open(phylip_dist_file_0, 'w') as f: 206 | f.write(PHYLIP_DIST_STR_0) 207 | 208 | phylip_dist_file_1 = os.path.join(tempdir, 'phylip_dist_1.dist') 209 | with open(phylip_dist_file_1, 'w') as f: 210 | f.write(PHYLIP_DIST_STR_1) 211 | 212 | phylip_dist_file_2 = os.path.join(tempdir, 'phylip_dist_2.dist') 213 | with open(phylip_dist_file_2, 'w') as f: 214 | f.write(PHYLIP_DIST_STR_2) 215 | 216 | phylip_dist_file_3 = os.path.join(tempdir, 'phylip_dist_3.dist') 217 | with open(phylip_dist_file_3, 'w') as f: 218 | f.write(PHYLIP_DIST_STR_3) 219 | 220 | phylip_dist_file_4 = os.path.join(tempdir, 'phylip_dist_4.dist') 221 | with open(phylip_dist_file_4, 'w') as f: 222 | f.write(PHYLIP_DIST_STR_4) 223 | 224 | phylip_dist_file_5 = os.path.join(tempdir, 'phylip_dist_5.dist') 225 | with open(phylip_dist_file_5, 'w') as f: 226 | f.write(PHYLIP_DIST_STR_5) 227 | 228 | phylip_dist_file_6 = os.path.join(tempdir, 'phylip_dist_6.dist') 229 | with open(phylip_dist_file_6, 'w') as f: 230 | f.write(PHYLIP_DIST_STR_6) 231 | 232 | phylip_dist_file_7 = os.path.join(tempdir, 'phylip_dist_7.dist') 233 | with open(phylip_dist_file_7, 'w') as f: 234 | f.write(PHYLIP_DIST_STR_7) 235 | 236 | phylip_dist_file_8 = os.path.join(tempdir, 'phylip_dist_8.dist') 237 | with open(phylip_dist_file_8, 'w') as f: 238 | f.write(PHYLIP_DIST_STR_8) 239 | 240 | phylip_dist_file_9 = os.path.join(tempdir, 'phylip_dist_9.dist') 241 | with open(phylip_dist_file_9, 'w') as f: 242 | f.write(PHYLIP_DIST_STR_9) 243 | 244 | yield ( 245 | # should work 246 | phylip_dist_file_0, 247 | phylip_dist_file_1, 248 | 249 | # shouldn't work 250 | phylip_dist_file_2, 251 | phylip_dist_file_3, 252 | phylip_dist_file_4, 253 | phylip_dist_file_5, 254 | phylip_dist_file_6, 255 | phylip_dist_file_7, 256 | phylip_dist_file_8, 257 | phylip_dist_file_9, 258 | ) 259 | 260 | print(f'Deleted temporary directory {tempdir}.') 261 | 262 | def test_phylip_distance(prep_phylip_dist_files): 263 | eps = 0.0001 264 | 265 | # == reading 266 | d_0 = read_phylip_distmat(prep_phylip_dist_files[0]) 267 | nm_0 = d_0.names 268 | print('d_0:', d_0) 269 | 270 | d_1 = read_phylip_distmat(prep_phylip_dist_files[1]) 271 | nm_1 = d_1.names 272 | print('d_1:', d_1) 273 | 274 | assert np.sum(np.abs(d_0.dist_mat - d_1.dist_mat)) < eps 275 | 276 | 277 | with pytest.raises(InvalidPhylipMatrixError): 278 | phylip.read_phylip_distmat(prep_phylip_dist_files[2]) 279 | with pytest.raises(InvalidPhylipMatrixError): 280 | phylip.read_phylip_distmat(prep_phylip_dist_files[3]) 281 | with pytest.raises(InvalidPhylipMatrixError): 282 | phylip.read_phylip_distmat(prep_phylip_dist_files[4]) 283 | with pytest.raises(InvalidPhylipMatrixError): 284 | phylip.read_phylip_distmat(prep_phylip_dist_files[5]) 285 | with pytest.raises(InvalidPhylipMatrixError): 286 | phylip.read_phylip_distmat(prep_phylip_dist_files[6]) 287 | with pytest.raises(InvalidPhylipMatrixError): 288 | phylip.read_phylip_distmat(prep_phylip_dist_files[7]) 289 | with pytest.raises(InvalidPhylipMatrixError): 290 | phylip.read_phylip_distmat(prep_phylip_dist_files[8]) 291 | with pytest.raises(InvalidPhylipMatrixError): 292 | phylip.read_phylip_distmat(prep_phylip_dist_files[9]) 293 | 294 | # == writing 295 | with tempfile.TemporaryDirectory() as tempdir: 296 | d_file_0 = os.path.join(tempdir, 'd_file_0.dist') 297 | 
write_phylip_distmat(d_0, d_file_0) 298 | with pytest.raises(FileExistsError): 299 | write_phylip_distmat(d_0, d_file_0, force=False) 300 | with pytest.raises(FileExistsError): 301 | d_path = os.path.join(tempdir, 'SOMEDIR') 302 | os.mkdir(d_path) 303 | write_phylip_distmat(d_0, d_path, force=True) 304 | with pytest.raises(IncompatibleNamesError): 305 | d_x = DistanceMatrix(d_0.dist_mat.copy(), d_0.names.copy()) 306 | d_x.names = d_x.names[1:] 307 | d_path = os.path.join(tempdir, 'somefile.dist') 308 | write_phylip_distmat(d_x, d_path) 309 | 310 | d_0_r = read_phylip_distmat(d_file_0) 311 | nm_0_r = d_0_r.names 312 | assert all([a == b for a, b in zip(nm_0, nm_0_r)]) 313 | assert np.sum(np.abs(d_0.dist_mat - d_0_r.dist_mat)) < eps 314 | 315 | d_file_1 = os.path.join(tempdir, 'd_file_1.dist') 316 | write_phylip_distmat(d_1, d_file_1) 317 | d_1_r = read_phylip_distmat(d_file_1) 318 | nm_1_r = d_1_r.names 319 | assert all([a == b for a, b in zip(nm_1, nm_1_r)]) 320 | assert np.sum(np.abs(d_1.dist_mat - d_1_r.dist_mat)) < eps 321 | 322 | 323 | -------------------------------------------------------------------------------- /pyckmeans/io/phylip.py: -------------------------------------------------------------------------------- 1 | ''' fasta 2 | 3 | Module for reading and writing PHYLIP files. 4 | ''' 5 | 6 | import os 7 | import re 8 | from typing import Tuple, Union 9 | 10 | import numpy 11 | 12 | import pyckmeans.distance 13 | 14 | WHITESPACE_RE = re.compile(r'\s+') 15 | 16 | class InvalidPhylipAlignmentError(Exception): 17 | '''InvalidPhylipAlignmentError 18 | ''' 19 | 20 | def read_phylip_alignment( 21 | phylip_file: str, 22 | dtype: Union[str, numpy.dtype] = 'U', 23 | ) -> Tuple[numpy.ndarray, numpy.ndarray]: 24 | '''read_phylip_alignment 25 | 26 | Read phylip alignment file. This function expects the phylip to be a valid alignment, 27 | meaning that it should contain at least 2 sequences of the same length, including 28 | gaps. 29 | 30 | WARNING: whitespace characters in entry names are NOT supported. 31 | 32 | Parameters 33 | ---------- 34 | phylip_file : str 35 | Path to a phylip file. 36 | dtype: Union[str, numpy.dtype] 37 | Data type to use for the sequence array. 38 | 39 | Returns 40 | ------- 41 | Tuple[numpy.ndarray, numpy.ndarray] 42 | Tuple of sequences and names, each as numpy array. 43 | 44 | Raises 45 | ------ 46 | InvalidPhylipAlignmentError 47 | Raised if header is malformed. 48 | InvalidPhylipAlignmentError 49 | Raised if less than 2 entries are present in phylip_file. 50 | InvalidPhylipAlignmentError 51 | Raised if number of entries does not match header. 52 | ''' 53 | 54 | names = [] 55 | seqs = [] 56 | with open(phylip_file) as phylip_f: 57 | # header 58 | header_str = next(phylip_f) 59 | try: 60 | n_entries, n_sites = [int(s) for s in header_str.split()] 61 | except: 62 | raise InvalidPhylipAlignmentError('Malformed header.') 63 | 64 | for line in phylip_f: 65 | _line = re.sub(WHITESPACE_RE, '', line) 66 | if not _line: 67 | continue 68 | l_len = len(_line) 69 | start = l_len-n_sites 70 | name = _line[:start] 71 | seq = _line[start:].upper() 72 | 73 | names.append(name) 74 | seqs.append(list(seq)) 75 | 76 | # check alignment validity 77 | n_seq = len(seqs) 78 | if len(seqs) < 2: 79 | msg = f'Expected at least 2 entries but found only {n_seq}.' 80 | raise InvalidPhylipAlignmentError(msg) 81 | 82 | if n_seq != n_entries: 83 | msg = f'Expected {n_entries} entries but found {n_seq} instead.' 
84 | raise InvalidPhylipAlignmentError(msg) 85 | 86 | # construct output 87 | seqs = numpy.array(seqs, dtype=dtype) 88 | names = numpy.array(names) 89 | 90 | return seqs, names 91 | 92 | 93 | class InvalidPhylipMatrixError(Exception): 94 | '''InvalidPhylipMatrixTypeError 95 | ''' 96 | 97 | def read_phylip_distmat(phylip_file: str) -> 'pyckmeans.distance.DistanceMatrix': 98 | '''read_phylip_distmat 99 | 100 | Read distance matrix in PHYLIP format. 101 | Supports full and lower-triangle matrices. 102 | 103 | Parameters 104 | ---------- 105 | phylip_file : str 106 | Path to distance file in phylip format. 107 | 108 | Returns 109 | ------- 110 | pyckmeans.distance.DistanceMatrix 111 | Distance matrix as pyckmeans.distance DistanceMatrix object. 112 | 113 | Raises 114 | ------ 115 | InvalidPhylipMatrixError 116 | Raised if the header is malformed. 117 | InvalidPhylipMatrixError 118 | Raised if an empty line is encountered as second line. 119 | InvalidPhylipMatrixError 120 | Raised if file format can neither be inferred as full nor 121 | as lower-triangle matrix. 122 | InvalidPhylipMatrixError 123 | Raised if an empty line is encountered. 124 | InvalidPhylipMatrixError 125 | Raised if expecting a full matrix but number of values 126 | does not match the header. 127 | InvalidPhylipMatrixError 128 | Raised if an empty line is encountered. 129 | InvalidPhylipMatrixError 130 | Raised if expecting lower-triangle matrix but number of values 131 | does not match the expected number of values for that entry. 132 | InvalidPhylipMatrixError 133 | Raised if number of names does not match number of entries 134 | stated in the header. 135 | ''' 136 | with open(phylip_file) as phylip_f: 137 | # == header 138 | header_str = next(phylip_f) 139 | try: 140 | n_entries = int(header_str.strip()) 141 | except: 142 | raise InvalidPhylipMatrixError('Malformed header.') 143 | 144 | dist_mat = numpy.zeros((n_entries, n_entries)) 145 | names = [] 146 | 147 | # == detect matrix type (full, lower-triangle) 148 | line = next(phylip_f) 149 | _line = line.strip() 150 | if not _line: 151 | msg = 'Line 2: Empty lines are not allowed.' 152 | raise InvalidPhylipMatrixError(msg) 153 | name, *mat_entries = _line.split() 154 | names.append(name) 155 | 156 | # lower-triangle matrix 157 | if len(mat_entries) == 0: 158 | mat_type = 'lower-triangle' 159 | # full matrix 160 | elif len(mat_entries) == n_entries: 161 | mat_type = 'full' 162 | dist_mat[0,] = numpy.array(mat_entries, dtype=float) 163 | # error 164 | else: 165 | msg = 'Line 2: Expected either 0 values for a lower-triangle ' +\ 166 | f'matrix or {n_entries} values for a full matrix; found ' +\ 167 | f'{len(mat_entries)} values instead.' 168 | raise InvalidPhylipMatrixError(msg) 169 | 170 | # == full matrix 171 | if mat_type == 'full': 172 | for i, line in enumerate(phylip_f): 173 | l_num = i + 3 # 1-based line number: header + first line already read 174 | 175 | _line = line.strip() 176 | if not _line: 177 | # last line can be empty 178 | if i + 2 == n_entries: 179 | continue 180 | msg = f'Line {l_num}: Empty lines are not allowed.' 181 | raise InvalidPhylipMatrixError(msg) 182 | name, *mat_entries = _line.split() 183 | names.append(name) 184 | 185 | # error 186 | if len(mat_entries) != n_entries: 187 | msg = f'Line {l_num}: Expected {n_entries} values for a full matrix but ' +\ 188 | f'found {len(mat_entries)} values instead.' 
189 | raise InvalidPhylipMatrixError(msg) 190 | 191 | dist_mat[i+1,] = numpy.array(mat_entries, dtype=float) 192 | 193 | # == lower-triangle matrix 194 | elif mat_type == 'lower-triangle': 195 | for i, line in enumerate(phylip_f): 196 | l_num = i + 3 # 1-based line number: header + first line already read 197 | 198 | _line = line.strip() 199 | if not _line: 200 | # last line can be empty 201 | if i + 2 == n_entries: 202 | continue 203 | msg = f'Line {l_num}: Empty lines are not allowed.' 204 | raise InvalidPhylipMatrixError(msg) 205 | name, *mat_entries = _line.split() 206 | names.append(name) 207 | 208 | # error 209 | if len(mat_entries) != i+1: 210 | msg = f'Line {l_num}: Expected {i+1} values for a lower-triangle ' +\ 211 | f'matrix but found {len(mat_entries)} values instead.' 212 | raise InvalidPhylipMatrixError(msg) 213 | 214 | dist_mat[i+1, :i+1] = numpy.array(mat_entries, dtype=float) 215 | 216 | # fill upper triangle 217 | dist_mat = dist_mat + dist_mat.T 218 | 219 | # check validity 220 | if len(names) != n_entries: 221 | msg = f'Expected {n_entries} entries but found {len(names)}.' 222 | raise InvalidPhylipMatrixError(msg) 223 | 224 | return pyckmeans.distance.DistanceMatrix(dist_mat, names) 225 | 226 | class IncompatibleNamesError(Exception): 227 | '''IncompatibleNamesError''' 228 | 229 | NAME_PADDING = 64 230 | 231 | def write_phylip_distmat( 232 | dist: 'pyckmeans.distance.DistanceMatrix', 233 | file_path: str, 234 | force: bool = False, 235 | ) -> None: 236 | '''write_phylip_distmat 237 | 238 | Write distance matrix to file in PHYLIP matrix format. 239 | 240 | Parameters 241 | ---------- 242 | dist : pyckmeans.distance.DistanceMatrix 243 | Distance matrix as pyckmeans.distance DistanceMatrix object. 244 | file_path : str 245 | Output file path. 246 | force : bool, optional 247 | Force overwrite if file exists, by default False 248 | 249 | Raises 250 | ------ 251 | FileExistsError 252 | Raised if file at file_path already exists and force is False. 253 | FileExistsError 254 | Raised if file_path points to an existing directory. 255 | IncompatibleNamesError 256 | Raised if names are incompatible with dist_mat. 257 | ''' 258 | if os.path.exists(file_path): 259 | if os.path.isfile(file_path) and not force: 260 | msg = f'File {file_path} already exists. If you want to overwrite ' +\ 261 | 'it run the function with force=True.' 262 | raise FileExistsError(msg) 263 | else: 264 | msg = f'A directory exists at path {file_path}.' 265 | raise FileExistsError(msg) 266 | 267 | dist_mat = dist.dist_mat 268 | names = dist.names 269 | 270 | n_entries = dist_mat.shape[0] 271 | if len(names) != n_entries: 272 | msg = f'Expected {n_entries} names but got {len(names)} instead.' 273 | raise IncompatibleNamesError(msg) 274 | 275 | with open(file_path, 'w') as phylip_f: 276 | # header 277 | phylip_f.write(f'{n_entries}\n') 278 | 279 | # body 280 | for name, dists in zip(names, dist_mat): 281 | nm_str = f'{name: <{NAME_PADDING}}' 282 | dst_str = '\t'.join(dists.astype(str)) 283 | phylip_f.write(f'{nm_str} {dst_str}\n') 284 | -------------------------------------------------------------------------------- /pyckmeans/ordering/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Module for distance matrix ordering. 
2 | ''' 3 | 4 | from typing import Union 5 | import numpy 6 | from scipy.cluster import hierarchy 7 | 8 | import pyckmeans.distance 9 | 10 | class InvalidReorderMethod(Exception): 11 | '''InvalidReorderMethod''' 12 | class InvalidLinkageType(Exception): 13 | '''InvalidLinkageType''' 14 | 15 | REORDER_METHODS = ( 16 | 'GW', 17 | 'OLO', 18 | ) 19 | LINKAGE_TYPES = ( 20 | 'average', 21 | 'complete', 22 | 'single', 23 | 'weighted', 24 | 'centroid', 25 | ) 26 | 27 | 28 | def distance_order( 29 | dist: Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'], 30 | method: str = 'GW', 31 | linkage_type: str = 'average', 32 | ) -> numpy.ndarray: 33 | '''distance_order 34 | 35 | Get optimal distance matrix order. 36 | 37 | Parameters 38 | ---------- 39 | dist : Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'] 40 | A n * n distance matrix as either numpy.ndarray or as 41 | pyckmeans.distance.DistanceMatrix object. 42 | method : str 43 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 44 | scipy.hierarchy.optimal_leaf_ordering. 45 | 46 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 47 | The British Psychological Society 25. 48 | linkage_type : str 49 | Linkage type for the hierarchical clustering. One of 50 | 51 | * 'average' 52 | * 'complete' 53 | * 'single' 54 | * 'weighted' 55 | * 'centroid' 56 | 57 | See scipy.cluster.hierarchy.linkage for details. 58 | 59 | Returns 60 | ------- 61 | numpy.ndarray 62 | Optimal order as vector. 63 | 64 | Raises 65 | ------ 66 | InvalidReorderMethod 67 | Raised if an unknown reordering method is passed. 68 | InvalidLinkageType 69 | Raised if an unknown linakage type is passed. 70 | ''' 71 | 72 | method = method.upper() 73 | if method not in REORDER_METHODS: 74 | msg = f'"{method}" is not a valid reordering method. Available ' +\ 75 | f'methods are {REORDER_METHODS}.' 76 | raise InvalidReorderMethod(msg) 77 | 78 | linkage_type = linkage_type.lower() 79 | if linkage_type not in LINKAGE_TYPES: 80 | msg = f'"{linkage_type}" is not a valid linkage type. Available ' +\ 81 | f'types are {LINKAGE_TYPES}.' 82 | raise InvalidLinkageType(msg) 83 | 84 | is_ndarray = isinstance(dist, numpy.ndarray) 85 | if is_ndarray: 86 | dist_mat = dist 87 | else: 88 | dist_mat = dist.dist_mat 89 | 90 | dist_mat_cond = condensed_form(dist_mat) 91 | linkage_mat = hierarchy.linkage(dist_mat_cond, method=linkage_type) 92 | # cluster distance can become negative due to floating point 93 | # errors. 94 | linkage_mat[numpy.abs(linkage_mat) < 1e-8] = 0 95 | 96 | if method == 'OLO': 97 | linkage_mat = hierarchy.optimal_leaf_ordering(linkage_mat, dist_mat_cond) 98 | elif method == 'GW': 99 | linkage_mat = reorder_linkage_gw(linkage_mat, dist_mat) 100 | 101 | order = hierarchy.leaves_list(linkage_mat) 102 | dist_mat = dist_mat[order, :][:, order] 103 | 104 | return order 105 | 106 | # This function duplicates code from distance_order, but I 107 | # want to keep this duplication for now for more flexibility 108 | def reorder_distance( 109 | dist: Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'], 110 | method: str = 'GW', 111 | linkage_type: str = 'average', 112 | ) -> Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix']: 113 | '''reorder_distance 114 | 115 | Reorder distance matrix using hierarchical clustering. 
116 | 117 | Parameters 118 | ---------- 119 | dist : Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'] 120 | A n * n distance matrix as either numpy.ndarray or as 121 | pyckmeans.distance.DistanceMatrix object. 122 | method : str 123 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 124 | scipy.hierarchy.optimal_leaf_ordering. 125 | 126 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 127 | The British Psychological Society 25. 128 | linkage_type : str 129 | Linkage type for the hierarchical clustering. One of 130 | 131 | * 'average' 132 | * 'complete' 133 | * 'single' 134 | * 'weighted' 135 | * 'centroid' 136 | 137 | See scipy.cluster.hierarchy.linkage for details. 138 | 139 | Returns 140 | ------- 141 | Union[numpy.ndarray, 'pyckmeans.distance.DistanceMatrix'] 142 | The sorted distance matrix as either numpy.ndarray or 143 | pyckmeans.distance.DistanceMatrix, depending on the input. 144 | 145 | Raises 146 | ------ 147 | InvalidReorderMethod 148 | Raised if an unknown reordering method is passed. 149 | InvalidLinkageType 150 | Raised if an unknown linakage type is passed. 151 | ''' 152 | method = method.upper() 153 | if method not in REORDER_METHODS: 154 | msg = f'"{method}" is not a valid reordering method. Available ' +\ 155 | f'methods are {REORDER_METHODS}.' 156 | raise InvalidReorderMethod(msg) 157 | 158 | linkage_type = linkage_type.lower() 159 | if linkage_type not in LINKAGE_TYPES: 160 | msg = f'"{linkage_type}" is not a valid linkage type. Available ' +\ 161 | f'types are {LINKAGE_TYPES}.' 162 | raise InvalidLinkageType(msg) 163 | 164 | is_ndarray = isinstance(dist, numpy.ndarray) 165 | if is_ndarray: 166 | dist_mat = dist 167 | else: 168 | dist_mat = dist.dist_mat 169 | 170 | dist_mat_cond = condensed_form(dist_mat) 171 | linkage_mat = hierarchy.linkage(dist_mat_cond, method=linkage_type) 172 | if method == 'OLO': 173 | linkage_mat = hierarchy.optimal_leaf_ordering(linkage_mat, dist_mat_cond) 174 | elif method == 'GW': 175 | linkage_mat = reorder_linkage_gw(linkage_mat, dist_mat) 176 | 177 | order = hierarchy.leaves_list(linkage_mat) 178 | dist_mat = dist_mat[order, :][:, order] 179 | 180 | if is_ndarray: 181 | return dist_mat 182 | else: 183 | return pyckmeans.distance.DistanceMatrix( 184 | dist_mat, 185 | dist.names[order] if not dist.names is None else None, 186 | ) 187 | 188 | def condensed_form(dist: numpy.ndarray) -> numpy.ndarray: 189 | '''condensed_form 190 | 191 | Convert n*n distance matrix to condensed vector form. 192 | 193 | Parameters 194 | ---------- 195 | dist : numpy.ndarray 196 | n * n distance matrix. 197 | 198 | Returns 199 | ------- 200 | numpy.ndarray 201 | Distance matrix in condensed vector form as expected by 202 | scipy.cluster.hierarchy.linkage. 203 | ''' 204 | 205 | return dist[numpy.triu_indices_from(dist, k=1)] 206 | 207 | def reorder_linkage_gw( 208 | linkage: numpy.ndarray, 209 | dist: numpy.ndarray, 210 | ) -> numpy.ndarray: 211 | '''reorder_linkage_gw 212 | 213 | Reorder linkage matrix using the algorithm described by Gruvaeus & Wainer (1972) [1]_. 214 | 215 | Parameters 216 | ---------- 217 | linkage : numpy.ndarray 218 | Linkage matrix as returned from scipy.cluster.hierarchy.linkage. 219 | dist : numpy.ndarray 220 | n * n distance matrix. 221 | 222 | Returns 223 | ------- 224 | numpy.ndarray 225 | Reordered linkage matrix. 226 | 227 | References 228 | ---------- 229 | .. [1] Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 
230 | The British Psychological Society 25. 231 | ''' 232 | linkage = linkage.copy() 233 | 234 | n = linkage.shape[0] 235 | 236 | # left and right leaves of a cluster 237 | l_r = numpy.zeros((n, 2)) 238 | # matrix determining, whether a cluster (subtree) should be flipped 239 | flip = numpy.full((n, 2), False) 240 | 241 | # find left and right leaves of clusters 242 | # and determine, whether cluster should 243 | # be flipped 244 | for i in range(n): 245 | l, r = linkage[i, [0, 1]].astype(int) 246 | 247 | # l and r are singletons 248 | if l <= n and r <= n: 249 | l_r[i] = (l, r) 250 | # only l is a singleton 251 | elif l <= n: 252 | l_r[i, 0] = l 253 | 254 | # left and right leaves of cluster r 255 | rl, rr = l_r[r - (n + 1)].astype(int) 256 | 257 | if dist[l, rl] < dist[l, rr]: 258 | l_r[i, 1] = rr 259 | else: 260 | l_r[i, 1] = rl 261 | flip[i, 1] = True 262 | # only r is singleton 263 | elif r <= n: 264 | l_r[i, 1] = r 265 | 266 | # left and right leaves of cluster l 267 | ll, lr = l_r[l - (n + 1)].astype(int) 268 | 269 | if dist[r, ll] < dist[r, lr]: 270 | l_r[i, 0] = lr 271 | flip[i, 0] = True 272 | else: 273 | l_r[i, 0] = ll 274 | # none of l and r are singletons 275 | else: 276 | # left and right leaves 277 | ll, lr = l_r[l - (n + 1)].astype(int) 278 | rl, rr = l_r[r - (n + 1)].astype(int) 279 | 280 | d_ll_rl = dist[ll, rl] # 0 281 | d_ll_rr = dist[ll, rr] # 1 282 | d_lr_rl = dist[lr, rl] # 2 283 | d_lr_rr = dist[lr, rr] # 3 284 | 285 | mn_idx = numpy.argmin([d_ll_rl, d_ll_rr, d_lr_rl, d_lr_rr]) 286 | if mn_idx == 0: # d_ll_rl 287 | l_r[i] = (lr, rr) 288 | flip[i, 0] = True 289 | elif mn_idx == 1: # d_ll_rr 290 | l_r[i] = (lr, rl) 291 | flip[i] = (True, True) 292 | elif mn_idx == 2: # d_lr_rl 293 | l_r[i] = (ll, rr) 294 | else: # d_lr_rr 295 | l_r[i] = (ll, rl) 296 | flip[i, 1] = True 297 | 298 | # apply flip 299 | for i in range((n-1), 0, -1): 300 | if flip[i, 0]: 301 | c = linkage[i, 0].astype(int) 302 | # non-singleton cluster 303 | if c > n: 304 | c = c - (n + 1) 305 | linkage[c, [0, 1]] = linkage[c, [1, 0]] 306 | if flip[c, 0] == flip[c, 1]: 307 | flip[c] = ~flip[c] 308 | if flip[i, 1]: 309 | c = linkage[i, 1].astype(int) 310 | if c > n: 311 | c = c - (n + 1) 312 | linkage[c, [0, 1]] = linkage[c, [1, 0]] 313 | if flip[c, 0] == flip[c, 1]: 314 | flip[c] = ~flip[c] 315 | 316 | return linkage 317 | -------------------------------------------------------------------------------- /pyckmeans/distance/src/distance.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #define LIBRARY_API extern "C" __declspec(dllexport) 3 | #else 4 | #define LIBRARY_API extern "C" 5 | #endif 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /* 13 | * Base encoding as used by R package ape. 14 | * See http://ape-package.ird.fr/misc/BitLevelCodingScheme.html 15 | * 16 | * Summary: 17 | * Most significant four bits are base information (A, G, C, T) 18 | * 76543210 19 | * 0b00001000 -> base is known 20 | * 0b00000100 -> gap 21 | * 0b00000010 -> unknown base 22 | * 23 | * bases 24 | * A 0b10001000 25 | * G 0b01001000 26 | * C 0b00101000 27 | * T 0b00011000 28 | * 29 | * wobbles 30 | * R 0b11000000 A|G 31 | * M 0b10100000 A|C 32 | * W 0b10010000 A|T 33 | * S 0b01100000 G|C 34 | * K 0b01010000 G|T 35 | * Y 0b00110000 C|T 36 | * V 0b11100000 A|G|C 37 | * H 0b10110000 A|C|T 38 | * D 0b11010000 A|G|T 39 | * B 0b01110000 G|C|T 40 | * N 0b11110000 A|G|C|T 41 | * 42 | * gap 43 | * - 0b00000100 44 | * 45 | * unknown/missing state 46 | * ? 
0b00000010 47 | * 48 | */ 49 | 50 | // bases 51 | const std::uint8_t A = 0b10001000; // A 52 | const std::uint8_t G = 0b01001000; // G 53 | const std::uint8_t C = 0b00101000; // C 54 | const std::uint8_t T = 0b00011000; // T 55 | // wobbles 56 | const std::uint8_t R = 0b11000000; // A|G 57 | const std::uint8_t M = 0b10100000; // A|C 58 | const std::uint8_t W = 0b10010000; // A|T 59 | const std::uint8_t S = 0b01100000; // G|C 60 | const std::uint8_t K = 0b01010000; // G|T 61 | const std::uint8_t Y = 0b00110000; // C|T 62 | const std::uint8_t V = 0b11100000; // A|G|C 63 | const std::uint8_t H = 0b10110000; // A|C|T 64 | const std::uint8_t D = 0b11010000; // A|G|T 65 | const std::uint8_t B = 0b01110000; // G|C|T 66 | const std::uint8_t N = 0b11110000; // A|G|C|T 67 | // extra 68 | const std::uint8_t KNOWN = 0b00001000; // base is known, i.e. A, G, C, T 69 | const std::uint8_t GAP = 0b00000100; // gap 70 | const std::uint8_t UNKNOWN = 0b00000010; // base is unknown, e.g. missing data 71 | 72 | const std::uint8_t NOT_PURINE = 0b00110111; // not a unabiguous purine 73 | const std::uint8_t NOT_PYRIMIDINE = 0b11000111; // not a unabiguous pyrimidine 74 | 75 | // helper functions 76 | inline bool isA(std::uint8_t base) {return base == A;} 77 | inline bool isG(std::uint8_t base) {return base == G;} 78 | inline bool isC(std::uint8_t base) {return base == C;} 79 | inline bool isT(std::uint8_t base) {return base == T;} 80 | 81 | inline bool isKnown(std::uint8_t base) {return (base & KNOWN) == KNOWN;} 82 | inline bool isUnknown(std::uint8_t base) {return base == UNKNOWN;} 83 | inline bool isGap(std::uint8_t base) {return base == GAP;} 84 | 85 | inline bool isSameBase(std::uint8_t a, std::uint8_t b) {return (a == b) && isKnown(a);} 86 | inline bool isDifferentBase(std::uint8_t a, std::uint8_t b) {return (a & b) < 0b00010000;} 87 | inline bool isMatch(std::uint8_t a, std::uint8_t b) {return (a == b);} 88 | inline bool isAmbiguousMatch(std::uint8_t a, std::uint8_t b) {return (a & b) > 0b00001111;} 89 | 90 | inline bool isPurine(std::uint8_t base) {return (base & NOT_PURINE) == 0;} 91 | inline bool isPyrimidine(std::uint8_t base) {return (base & NOT_PYRIMIDINE) == 0;} 92 | inline bool isTransition(std::uint8_t a, std::uint8_t b) { 93 | return (isPurine(a) && isPurine(b)) 94 | || (isPyrimidine(a) && isPyrimidine(b)); 95 | } 96 | inline bool isTransversion(std::uint8_t a, std::uint8_t b) {return !isTransition(a, b);} 97 | 98 | // == distances 99 | 100 | // helpers 101 | std::vector completeDeletionSites( 102 | std::uint8_t* alignment, 103 | int n, 104 | int m 105 | ) { 106 | std::vector skip(m); 107 | for (size_t i = 0; i < m; ++i) { 108 | skip[i] = false; 109 | for (size_t j = 0; j < n; ++j) { 110 | std::uint8_t base = alignment[j * m + i]; 111 | 112 | // TODO: think about whether it is a good idea to ignore wobbles 113 | if (isGap(base) || !isKnown(base)) { 114 | skip[i] = true; 115 | break; 116 | } 117 | } 118 | } 119 | 120 | return skip; 121 | } 122 | 123 | // p-distance 124 | LIBRARY_API void pDistance( 125 | std::uint8_t* alignment, // nucleotide alignment 126 | int n, // number of entries 127 | int m, // number of sites 128 | bool pairwiseDeletion, // gap handling 129 | double *distMat // (output) distance matrix 130 | ) { 131 | // pairwise deletion 132 | if (pairwiseDeletion) { 133 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 134 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 135 | // double to avoid casting later 136 | double nComp = 0; 137 | double nMatch = 0; 138 | for (size_t j = 0; j < m; 
++j) { 139 | std::uint8_t a = alignment[i_a * m + j]; 140 | std::uint8_t b = alignment[i_b * m + j]; 141 | 142 | // TODO: think about this... This seems to be the same way as in ape 143 | // but I'm not sure that it is a good idea to ignore wobbles. 144 | if (!(isGap(a) || isGap(b)) && isKnown(a) && isKnown(b)) { 145 | nComp += 1; 146 | nMatch += isMatch(a, b); 147 | } 148 | } 149 | 150 | double d = 1.0; 151 | if (nComp > 0) d = 1 - nMatch / nComp; 152 | 153 | distMat[i_a * n + i_b] = d; 154 | distMat[i_b * n + i_a] = d; 155 | } 156 | } 157 | // complete deletion 158 | } else { 159 | // find sites with missing values 160 | std::vector skip = completeDeletionSites(alignment, n, m); 161 | 162 | // p distance calculation 163 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 164 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 165 | // double to avoid casting later 166 | double nComp = 0; 167 | double nMatch = 0; 168 | for (size_t j = 0; j < m; ++j) { 169 | if (skip[j]) continue; // skip if site contains missing value 170 | std::uint8_t a = alignment[i_a * m + j]; 171 | std::uint8_t b = alignment[i_b * m + j]; 172 | 173 | nComp += 1; 174 | nMatch += isMatch(a, b); 175 | } 176 | 177 | double d = 1.0; 178 | if (nComp > 0) d = 1 - nMatch / nComp; 179 | 180 | distMat[i_a * n + i_b] = d; 181 | distMat[i_b * n + i_a] = d; 182 | } 183 | } 184 | } 185 | }; 186 | 187 | // Jukes-Cantor distance 188 | LIBRARY_API void jcDistance( 189 | std::uint8_t* alignment, // nucleotide alignment 190 | int n, // number of entries 191 | int m, // number of sites 192 | bool pairwiseDeletion, // gap handling 193 | double *distMat // (output) distance matrix 194 | ) { 195 | // calculate p 196 | pDistance( 197 | alignment, n, m, 198 | pairwiseDeletion, 199 | distMat 200 | ); 201 | 202 | for (size_t i = 0; i < n; ++i) { 203 | for (size_t j = 0; j < n; ++j) { 204 | double d = abs(- (3.0 / 4.0) * log(1 - (4.0 / 3.0) * distMat[i * n + j])); 205 | if (isnan(d)) d = INFINITY; 206 | distMat[i * n + j] = d; 207 | } 208 | } 209 | } 210 | 211 | // Kimura 2-parameter distance 212 | LIBRARY_API void k2pDistance( 213 | std::uint8_t* alignment, // nucleotide alignment 214 | int n, // number of entries 215 | int m, // number of sites 216 | bool pairwiseDeletion, // gap handling 217 | double *distMat // (output) distance matrix 218 | ) { 219 | // pairwise deletion 220 | if (pairwiseDeletion) { 221 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 222 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 223 | // double to avoid casting later 224 | double nComp = 0; 225 | double nTransitions = 0; 226 | double nTransversions = 0; 227 | for (size_t j = 0; j < m; ++j) { 228 | std::uint8_t a = alignment[i_a * m + j]; 229 | std::uint8_t b = alignment[i_b * m + j]; 230 | 231 | // TODO: think about this... This seems to be the same way as in ape 232 | // but I'm not sure that it is a good idea to ignore wobbles. 
233 | if (!(isGap(a) || isGap(b)) && isKnown(a) && isKnown(b)) { 234 | nComp += 1; 235 | // if bases are the same there is neither transition 236 | // not transversion 237 | if (isMatch(a, b)) continue; 238 | 239 | bool isTs = isTransition(a, b); 240 | nTransitions += isTs; 241 | nTransversions += 1 - isTs; 242 | } 243 | } 244 | 245 | double d = INFINITY; 246 | if (nComp > 0) { 247 | double p = nTransitions / nComp; 248 | double q = nTransversions / nComp; 249 | 250 | d = abs(-(1.0 / 2.0) * log((1 - 2 * p - q) * sqrt(1 - 2 * q))); 251 | if (isnan(d)) d = INFINITY; 252 | } 253 | 254 | distMat[i_a * n + i_b] = d; 255 | distMat[i_b * n + i_a] = d; 256 | } 257 | } 258 | // complete deletion 259 | } else { 260 | // find sites with missing values 261 | std::vector skip = completeDeletionSites(alignment, n, m); 262 | 263 | for (size_t i_a = 0; i_a < (n - 1); ++i_a) { 264 | for (size_t i_b = (i_a + 1); i_b < n; ++i_b) { 265 | // double to avoid casting later 266 | double nComp = 0; 267 | double nTransitions = 0; 268 | double nTransversions = 0; 269 | for (size_t j = 0; j < m; ++j) { 270 | if (skip[j]) continue; // skip if site contains missing value 271 | std::uint8_t a = alignment[i_a * m + j]; 272 | std::uint8_t b = alignment[i_b * m + j]; 273 | 274 | nComp += 1; 275 | // if bases are the same there is neither transition 276 | // not transversion 277 | if (isMatch(a, b)) continue; 278 | 279 | bool isTs = isTransition(a, b); 280 | nTransitions += isTs; 281 | nTransversions += 1 - isTs; 282 | } 283 | 284 | double d = INFINITY; 285 | if (nComp > 0) { 286 | double p = nTransitions / nComp; 287 | double q = nTransversions / nComp; 288 | 289 | d = abs(-(1.0 / 2.0) * log((1 - 2 * p - q) * sqrt(1 - 2 * q))); 290 | if (isnan(d)) d = INFINITY; 291 | } 292 | 293 | distMat[i_a * n + i_b] = d; 294 | distMat[i_b * n + i_a] = d; 295 | } 296 | } 297 | } 298 | } 299 | -------------------------------------------------------------------------------- /pyckmeans/knee/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Knee and elbow search. 2 | ''' 3 | 4 | from typing import Callable, Iterable 5 | import warnings 6 | 7 | import numpy 8 | 9 | def rel_extrema_idcs( 10 | x: numpy.ndarray, 11 | cmp_fun: Callable[[numpy.ndarray, numpy.ndarray], numpy.ndarray] = numpy.greater, 12 | mode: str = 'clip', 13 | ) -> numpy.ndarray: 14 | '''rel_extrema_idcs 15 | 16 | Find indices of relative extrema. A relative extremum is found if 17 | at an element, if cmp_fun returns true for both of its neighbors. 18 | 19 | Parameters 20 | ---------- 21 | x : numpy.ndarray 22 | Data vector. 23 | cmp_fun : Callable[[numpy.ndarray, numpy.ndarray], numpy.ndarray], optional 24 | Compare function function, by default numpy.greater 25 | mode : str, optional 26 | Specifies how out-of-bounds indices will behave. 27 | 28 | * 'raise' - raise an error (default) 29 | * 'wrap' - wrap around 30 | * 'clip' - clip to the range 31 | 32 | 'clip' mode means that all indices that are too large are replaced 33 | by the index that addresses the last element along that axis. Note 34 | that this disables indexing with negative numbers. 35 | 36 | (mode documentation copied from numpy.take) 37 | 38 | Returns 39 | ------- 40 | numpy.ndarray 41 | Indices of the extrema. 
42 | ''' 43 | idcs = numpy.arange(0, x.shape[0]) 44 | left = x.take(idcs + 1, mode=mode) 45 | right = x.take(idcs - 1, mode=mode) 46 | 47 | return numpy.nonzero(cmp_fun(x, left) & cmp_fun(x, right))[0] 48 | 49 | 50 | # the following code is mostly copied from 51 | # https://github.com/arvkevi/kneed and 52 | # was adapted for compatibility 53 | 54 | VALID_CURVE = ('convex', 'concave') 55 | VALID_DIRECTION = ('increasing', 'decreasing') 56 | 57 | class KneeLocator: 58 | '''KneeLocator 59 | 60 | An implementation of the Kneedle algorithm [1]_. 61 | 62 | Once instantiated, this class attempts to find the point of maximum 63 | curvature on a line. The knee is accessible via the `.knee` attribute. 64 | 65 | Parameters 66 | ---------- 67 | x : numpy.ndarray 68 | x values. 69 | y : numpy.ndarray 70 | y values. 71 | S : float, optional 72 | Sensitivity, original paper suggests default of 1.0, by default 1.0. 73 | curve : str, optional 74 | If 'concave', algorithm will detect knees. If 'convex', it 75 | will detect elbows., by default 'concave'. 76 | direction : str, optional 77 | Curve direction. One of {'increasing', 'decreasing'}, by default 'increasing'. 78 | interp_method : str, optional 79 | Interpolation method. One of 80 | 81 | * 'interp1d' - no interpolation 82 | * 'polynomial' - polynomial interpolation 83 | 84 | By default 'interp1d'. 85 | online : bool, optional 86 | Correct old knee points if True, will return first knee if False, 87 | by default False. 88 | polynomial_degree : int, optional 89 | The degree of the fitting polynomial. Only used when interp_method='polynomial'. 90 | This argument is passed to numpy polyfit `deg` parameter., by default 7. 91 | 92 | Raises 93 | ------ 94 | ValueError 95 | Raised if invalid curve or direction argument passed. 96 | ValueError 97 | Raised if invalid interp_method argument passed. 98 | 99 | References 100 | ---------- 101 | .. [1] Satopaa, V., J., Albrecht, D., Irwin, B., Raghavan. 2011. 102 | "Finding a "Kneedle" in a Haystack: Detecting Knee Points in System Behavior". 103 | 31st International Conference on Distributed Computing Systems Workshops. 104 | doi: 10.1109/ICDCSW.2011.20. 105 | ''' 106 | def __init__( 107 | self, 108 | x: numpy.ndarray, 109 | y: numpy.ndarray, 110 | S: float = 1.0, 111 | curve: str = 'concave', 112 | direction: str = 'increasing', 113 | interp_method: str = 'interp1d', 114 | online: bool = False, 115 | polynomial_degree: int = 7, 116 | ): 117 | # Step 0: Raw Input 118 | self.x = numpy.array(x) 119 | self.y = numpy.array(y) 120 | self.curve = curve 121 | self.direction = direction 122 | self.N = len(self.x) 123 | self.S = S 124 | self.all_knees = set() 125 | self.all_norm_knees = set() 126 | self.all_knees_y = [] 127 | self.all_norm_knees_y = [] 128 | self.online = online 129 | self.polynomial_degree = polynomial_degree 130 | 131 | valid_curve = self.curve in VALID_CURVE 132 | valid_direction = self.direction in VALID_DIRECTION 133 | if not all((valid_curve, valid_direction)): 134 | raise ValueError( 135 | 'Please check that the curve and direction arguments are valid.' 
136 | ) 137 | 138 | # Step 1: fit a smooth line 139 | if interp_method == 'interp1d': 140 | # uspline = interpolate.interp1d(self.x, self.y) 141 | # self.ds_y = uspline(self.x) 142 | self.ds_y = y 143 | elif interp_method == 'polynomial': 144 | p = numpy.poly1d(numpy.polyfit(x, y, self.polynomial_degree)) 145 | self.ds_y = p(x) 146 | else: 147 | msg = f'{interp_method} is an invalid interp_method parameter, ' +\ 148 | 'use either "interp1d" or "polynomial".' 149 | raise ValueError(msg) 150 | 151 | # Step 2: normalize values 152 | self.x_normalized = self._normalize(self.x) 153 | self.y_normalized = self._normalize(self.ds_y) 154 | 155 | # Step 3: Calculate the Difference curve 156 | self.y_normalized = self.transform_y( 157 | self.y_normalized, self.direction, self.curve 158 | ) 159 | # normalized difference curve 160 | self.y_difference = self.y_normalized - self.x_normalized 161 | self.x_difference = self.x_normalized.copy() 162 | 163 | # Step 4: Identify local maxima/minima 164 | # local maxima 165 | self.maxima_indices = rel_extrema_idcs(self.y_difference, numpy.greater_equal) 166 | self.x_difference_maxima = self.x_difference[self.maxima_indices] 167 | self.y_difference_maxima = self.y_difference[self.maxima_indices] 168 | 169 | # local minima 170 | self.minima_indices = rel_extrema_idcs(self.y_difference, numpy.less_equal) 171 | self.x_difference_minima = self.x_difference[self.minima_indices] 172 | self.y_difference_minima = self.y_difference[self.minima_indices] 173 | 174 | # Step 5: Calculate thresholds 175 | self.Tmx = self.y_difference_maxima - ( 176 | self.S * numpy.abs(numpy.diff(self.x_normalized).mean()) 177 | ) 178 | 179 | # Step 6: find knee 180 | self.knee, self.norm_knee = self.find_knee() 181 | 182 | # Step 7: If we have a knee, extract data about it 183 | self.knee_y = self.norm_knee_y = None 184 | if self.knee: 185 | self.knee_y = self.y[self.x == self.knee][0] 186 | self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] 187 | 188 | @staticmethod 189 | def _normalize(x: numpy.ndarray) -> numpy.ndarray: 190 | '''_normalize 191 | 192 | Scale vector values between 0 and 1. 193 | 194 | Parameters 195 | ---------- 196 | x : numpy.ndarray 197 | Vector to scale. 198 | 199 | Returns 200 | ------- 201 | numpy.ndarray 202 | Scaled vector 203 | ''' 204 | return (x - x.min()) / (x.max() - x.min()) 205 | 206 | @staticmethod 207 | def transform_y(y: Iterable[float], direction: str, curve: str) -> float: 208 | '''transform y to concave, increasing based on given direction and curve''' 209 | # convert elbows to knees 210 | if direction == 'decreasing': 211 | if curve == 'concave': 212 | y = numpy.flip(y) 213 | elif curve == 'convex': 214 | y = y.max() - y 215 | elif direction == 'increasing' and curve == 'convex': 216 | y = numpy.flip(y.max() - y) 217 | 218 | return y 219 | 220 | def find_knee(self,): 221 | ''' 222 | This function is called when KneeLocator is instantiated. 223 | It identifies the knee value and sets the instance attributes. 224 | ''' 225 | if not self.maxima_indices.size: 226 | warnings.warn( 227 | 'No local maxima found in the difference curve\n' 228 | 'The line is probably not polynomial.', 229 | RuntimeWarning, 230 | ) 231 | return None, None 232 | # placeholder for which threshold region i is located in. 
233 | maxima_threshold_index = 0 234 | minima_threshold_index = 0 235 | # traverse the difference curve 236 | for i, x in enumerate(self.x_difference): 237 | # skip points on the curve before the the first local maxima 238 | if i < self.maxima_indices[0]: 239 | continue 240 | 241 | j = i + 1 242 | 243 | # reached the end of the curve 244 | if x == 1.0: 245 | break 246 | 247 | # if we're at a local max, increment the maxima threshold index and continue 248 | if (self.maxima_indices == i).any(): 249 | threshold = self.Tmx[maxima_threshold_index] 250 | threshold_index = i 251 | maxima_threshold_index += 1 252 | # values in difference curve are at or after a local minimum 253 | if (self.minima_indices == i).any(): 254 | threshold = 0.0 255 | minima_threshold_index += 1 256 | 257 | if self.y_difference[j] < threshold: 258 | if self.curve == 'convex': 259 | if self.direction == 'decreasing': 260 | knee = self.x[threshold_index] 261 | norm_knee = self.x_normalized[threshold_index] 262 | else: 263 | knee = self.x[-(threshold_index + 1)] 264 | norm_knee = self.x_normalized[threshold_index] 265 | 266 | elif self.curve == 'concave': 267 | if self.direction == 'decreasing': 268 | knee = self.x[-(threshold_index + 1)] 269 | norm_knee = self.x_normalized[threshold_index] 270 | else: 271 | knee = self.x[threshold_index] 272 | norm_knee = self.x_normalized[threshold_index] 273 | 274 | # add the y value at the knee 275 | y_at_knee = self.y[self.x == knee][0] 276 | y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] 277 | if knee not in self.all_knees: 278 | self.all_knees_y.append(y_at_knee) 279 | self.all_norm_knees_y.append(y_norm_at_knee) 280 | 281 | # now add the knee 282 | self.all_knees.add(knee) 283 | self.all_norm_knees.add(norm_knee) 284 | 285 | # if detecting in offline mode, return the first knee found 286 | if self.online is False: 287 | return knee, norm_knee 288 | 289 | if self.all_knees == set(): 290 | warnings.warn('No knee/elbow found') 291 | return None, None 292 | 293 | return knee, norm_knee 294 | 295 | # Niceties for users working with elbows rather than knees 296 | @property 297 | def elbow(self): 298 | return self.knee 299 | 300 | @property 301 | def norm_elbow(self): 302 | return self.norm_knee 303 | 304 | @property 305 | def elbow_y(self): 306 | return self.knee_y 307 | 308 | @property 309 | def norm_elbow_y(self): 310 | return self.norm_knee_y 311 | 312 | @property 313 | def all_elbows(self): 314 | return self.all_knees 315 | 316 | @property 317 | def all_norm_elbows(self): 318 | return self.all_norm_knees 319 | 320 | @property 321 | def all_elbows_y(self): 322 | return self.all_knees_y 323 | 324 | @property 325 | def all_norm_elbows_y(self): 326 | return self.all_norm_knees_y 327 | -------------------------------------------------------------------------------- /pyckmeans/io/nucleotide_alignment.py: -------------------------------------------------------------------------------- 1 | ''' nucleotide_alignment 2 | 3 | Module for the representation of nucleotide alignments. 4 | ''' 5 | 6 | import os 7 | from typing import Iterable, Tuple 8 | 9 | import numpy 10 | import pyckmeans.distance 11 | from .c_interop import encode_nucleotides 12 | 13 | # Base encoding as used by R package ape. 
14 | # See http://ape-package.ird.fr/misc/BitLevelCodingScheme.html 15 | # 16 | # Summary: 17 | # Most significant four bits are base information (A, G, C, T) 18 | # 0b00001000 -> base is known 19 | # 0b00000100 -> gap 20 | # 0b00000010 -> unknown base 21 | BASE_ENCODING = { 22 | # bases 23 | 'A': 0b10001000, 'a': 0b10001000, 24 | 'G': 0b01001000, 'g': 0b01001000, 25 | 'C': 0b00101000, 'c': 0b00101000, 26 | 'T': 0b00011000, 't': 0b00011000, 27 | # wobbles 28 | 'R': 0b11000000, 'r': 0b11000000, # A|G 29 | 'M': 0b10100000, 'm': 0b10100000, # A|C 30 | 'W': 0b10010000, 'w': 0b10010000, # A|T 31 | 'S': 0b01100000, 's': 0b01100000, # G|C 32 | 'K': 0b01010000, 'k': 0b01010000, # G|T 33 | 'Y': 0b00110000, 'y': 0b00110000, # C|T 34 | 'V': 0b11100000, 'v': 0b11100000, # A|G|C 35 | 'H': 0b10110000, 'h': 0b10110000, # A|C|T 36 | 'D': 0b11010000, 'd': 0b11010000, # A|G|T 37 | 'B': 0b01110000, 'b': 0b01110000, # G|C|T 38 | 'N': 0b11110000, 'n': 0b11110000, # A|G|C|T 39 | # gaps 40 | '-': 0b00000100, 41 | '~': 0b00000100, 42 | ' ': 0b00000100, 43 | # unknown/missing state 44 | '?': 0b00000010 45 | } 46 | BASE_ENCODING_INVERSE = { 47 | v:k for k, v in BASE_ENCODING.items() if k.isupper() or k in ('-', '?') 48 | } 49 | 50 | class InvalidAlignmentFileExtensionError(Exception): 51 | '''InvalidAlignmentFileExtensionError''' 52 | 53 | class InvalidAlignmentFileFormatError(Exception): 54 | '''InvalidAlignmentFileFormatError''' 55 | 56 | class InvalidAlignmentCharacterError(Exception): 57 | '''InvalidAlignmentCharacterError''' 58 | 59 | class InvalidSeqIORecordsError(Exception): 60 | '''InvalidSeqIORecordsError''' 61 | 62 | class NucleotideAlignment: 63 | '''NucleotideAlignment 64 | 65 | Class for nucleotide alignments. 66 | 67 | Parameters 68 | ---------- 69 | names : List[str] 70 | Sequence identifiers/names. 71 | sequences : numpy.ndarray 72 | n*m alignment matrix, where n is the number of entries and m 73 | is the number of sites. 74 | copy : bool 75 | If True, sequences will be copied. If false, the NucleotideAlignment 76 | will use the original sequences, potentially modifying them. 77 | fast_encoding : bool 78 | If true, a fast nucleotide encoding method without error checking 79 | will be used. ATTENTION: This will modify sequences in place. 80 | ''' 81 | def __init__( 82 | self, 83 | names: Iterable[str], 84 | sequences: numpy.ndarray, 85 | copy: bool = False, 86 | fast_encoding: bool = False, 87 | ): 88 | # check validity 89 | n_names = len(names) 90 | n_seqs = sequences.shape[0] 91 | if n_names != n_seqs: 92 | msg = f'Number of names ({n_names}) does not match number of sequences ({n_seqs}).' 93 | raise Exception(msg) 94 | self.names = numpy.array(names) 95 | 96 | # encode strings as uint8, see BASE_ENCODING 97 | if sequences.dtype != numpy.uint8: 98 | if fast_encoding: 99 | self.sequences = encode_nucleotides(sequences.copy() if copy else sequences) 100 | else: 101 | try: 102 | self.sequences = numpy.array( 103 | [[BASE_ENCODING[n] for n in row] for row in sequences], 104 | dtype=numpy.uint8, 105 | ) 106 | except KeyError as k_err: 107 | msg = f'Encountered unknown character in alignment: {str(k_err)}' 108 | raise InvalidAlignmentCharacterError(msg) from k_err 109 | else: 110 | self.sequences = sequences.copy() if copy else sequences 111 | 112 | def drop_invariant_sites(self, in_place: bool = False) -> 'NucleotideAlignment': 113 | '''drop_invariant_sites 114 | 115 | Remove invariant sites from alignment. Invariant sites 116 | are sites, where each entry has the same symbol. 
117 | 118 | Parameters 119 | ---------- 120 | in_place : bool, optional 121 | Modify self in place, by default False 122 | 123 | Returns 124 | ------- 125 | NucleotideAlignment 126 | NucleotideAlignment without invariant sites. 127 | If in_place is set to True, self is returned. 128 | ''' 129 | if in_place: 130 | self.sequences = self.sequences[ 131 | :, 132 | ~numpy.all((self.sequences == self.sequences[0,]), axis=0) 133 | ] 134 | return self 135 | else: 136 | return NucleotideAlignment( 137 | self.names.copy(), 138 | self.sequences[ 139 | :, ~numpy.all((self.sequences == self.sequences[0,]), axis=0) 140 | ].copy(), 141 | ) 142 | 143 | def copy(self) -> 'NucleotideAlignment': 144 | '''copy 145 | 146 | Return a copy of the NucleotideAligment object. 147 | 148 | Returns 149 | ------- 150 | NucleotideAlignment 151 | Copy of self. 152 | ''' 153 | return NucleotideAlignment(self.names.copy(), self.sequences.copy()) 154 | 155 | def distance( 156 | self, 157 | distance_type: str = 'p', 158 | pairwise_deletion: bool = True, 159 | ) -> 'pyckmeans.distance.DistanceMatrix': 160 | '''distance 161 | 162 | Calculate genetic distance. 163 | 164 | Parameters 165 | ---------- 166 | distance_type : str, optional 167 | Type of genetic distance to calculate, by default 'p'. 168 | Available distance types are p-distances ('p'), 169 | Jukes-Cantor distances ('jc'), and Kimura 2-paramater distances 170 | ('k2p'). 171 | pairwise_deletion : bool 172 | Use pairwise deletion as action to deal with missing data. 173 | If False, complete deletion is applied. 174 | Gaps ("-", "~", " "), "?", and ambiguous bases are treated as 175 | missing data. 176 | Returns 177 | ------- 178 | pyckmeans.distance.DistanceMatrix 179 | n*n distance matrix. 180 | ''' 181 | 182 | return pyckmeans.distance.alignment_distance( 183 | alignment=self, 184 | distance_type=distance_type, 185 | pairwise_deletion=pairwise_deletion, 186 | ) 187 | 188 | @property 189 | def shape(self) -> Tuple[int, int]: 190 | '''shape 191 | 192 | Get alignment dimensions/shapes. 193 | 194 | Returns 195 | ------- 196 | Tuple[int, int] 197 | Number of samples n, number of sites m 198 | ''' 199 | return self.sequences.shape 200 | 201 | def __getitem__(self, idx): 202 | if isinstance(idx, tuple): 203 | return NucleotideAlignment(self.names[idx[0]], self.sequences[idx]) 204 | else: 205 | return NucleotideAlignment(self.names[idx], self.sequences[idx]) 206 | 207 | def __repr__(self) -> str: 208 | '''__repr__ 209 | 210 | Returns 211 | ------- 212 | str 213 | String representation 214 | ''' 215 | shape = self.shape 216 | return f'' 217 | 218 | @classmethod 219 | def from_bp_seqio_records( 220 | cls, 221 | records: Iterable['Bio.SeqRecord.SeqRecord'], 222 | fast_encoding: bool = False, 223 | ) -> 'NucleotideAlignment': 224 | '''from_bp_seqio_records 225 | 226 | Build NucleotideAlignment from iterable of Bio.SeqRecord.SeqRecord. 227 | Such an iterable is, for example, returned by Bio.SeqIO.parse() or 228 | can be constructed using Bio.Align.MultipleSequenceAlignment(). 229 | 230 | Parameters 231 | ---------- 232 | records: Iterable['Bio.SeqRecord.SeqRecord'] 233 | Iterable of Bio.SeqRecord.SeqRecord. 234 | Such an iterable is, for example, returned by Bio.SeqIO.parse() or 235 | can be constructed using Bio.Align.MultipleSequenceAlignment(). 236 | fast_encoding : bool 237 | If true, a fast nucleotide encoding method without error checking 238 | will be used. 239 | 240 | Returns 241 | ------- 242 | NucleotideAlignment 243 | NucleotideAlignment object. 
244 | 245 | Raises 246 | ------ 247 | InvalidSeqIORecordsError 248 | Raised of sequences have different lengths. 249 | ''' 250 | names = [] 251 | seqs = [] 252 | 253 | for record in records: 254 | names.append(record.id) 255 | seqs.append(list(record.seq)) 256 | 257 | # check if all sequences have same length 258 | seq_len = len(seqs[0]) 259 | for i, seq in enumerate(seqs[1:]): 260 | cur_seq_len = len(seq) 261 | if cur_seq_len != seq_len: 262 | msg = f'Expected all sequences to have length {seq_len}' +\ 263 | f'(length of sequence #0) but sequence #{i+1} has length {cur_seq_len}.' 264 | raise InvalidSeqIORecordsError(msg) 265 | 266 | seqs = numpy.array(seqs) 267 | names = numpy.array(names) 268 | 269 | return cls(names, seqs, copy=False, fast_encoding=fast_encoding) 270 | 271 | @classmethod 272 | def from_file( 273 | cls, 274 | file_path: str, 275 | file_format='auto', 276 | fast_encoding=False, 277 | ) -> 'NucleotideAlignment': 278 | '''from_file 279 | 280 | Read nucleotide alignment from file. 281 | 282 | Parameters 283 | ---------- 284 | file_path: str 285 | Path to alignment file. 286 | file_format: str 287 | Alignment file format. Either "auto", "fasta" or "phylip". 288 | When "auto" the file format will be inferred based on the file extension. 289 | fast_encoding : bool 290 | If true, a fast nucleotide encoding method without error checking 291 | will be used. 292 | 293 | Returns 294 | ------- 295 | Tuple[numpy.ndarray, numpy.ndarray] 296 | Tuple of sequences and names, each as numpy array. 297 | 298 | Raises 299 | ------ 300 | InvalidAlignmentFileExtensionError 301 | Raised if file_format is "auto" and the file extension is not understood. 302 | InvalidAlignmentFileFormatError 303 | Raised if an invalid file_format is passed. 304 | ''' 305 | if file_format == 'auto': 306 | ext = os.path.splitext(file_path)[1].lower() 307 | 308 | if ext in ['.fasta', '.fas', '.fa']: 309 | file_format = 'fasta' 310 | elif ext in ['.phylip', '.phy']: 311 | file_format = 'phylip' 312 | else: 313 | msg = f'Unknown alignment file extension "{ext}". Please set file_format manually.' 314 | raise InvalidAlignmentFileExtensionError(msg) 315 | 316 | if file_format in ['fasta', 'FASTA']: 317 | from .fasta import read_fasta_alignment 318 | 319 | seqs, names = read_fasta_alignment( 320 | file_path, 321 | dtype='S' if fast_encoding else 'U', 322 | ) 323 | 324 | return cls( 325 | names=names, 326 | sequences=seqs, 327 | copy=False, 328 | fast_encoding=fast_encoding, 329 | ) 330 | 331 | elif file_format in ['phylip', 'PHYLIP']: 332 | from .phylip import read_phylip_alignment 333 | 334 | seqs, names = read_phylip_alignment( 335 | file_path, 336 | dtype='S' if fast_encoding else 'U', 337 | ) 338 | 339 | return cls( 340 | names=names, 341 | sequences=seqs, 342 | copy=False, 343 | fast_encoding=fast_encoding, 344 | ) 345 | 346 | else: 347 | msg = f'Unknown aligment file format "{file_format}". ' +\ 348 | 'Supported formats are "fasta" and "phylip".' 349 | raise InvalidAlignmentFileFormatError(msg) 350 | 351 | def read_alignment(file_path: str, file_format: str = 'auto') -> NucleotideAlignment: 352 | '''read_alignment 353 | 354 | Read nucleotide alignment from file. 355 | Alias for NucleotideAlignment.from_file. 356 | 357 | Parameters 358 | ---------- 359 | file_path: str 360 | Path to alignment file. 361 | file_format: str 362 | Alignment file format. Either "auto", "fasta" or "phylip". 363 | When "auto" the file format will be inferred based on the file extension. 
364 | 365 | Returns 366 | ------- 367 | NucleotideAlignment 368 | NucleotideAlignment instance. 369 | 370 | Raises 371 | ------ 372 | InvalidAlignmentFileExtensionError 373 | Raised if file_format is "auto" and the file extension is not understood. 374 | InvalidAlignmentFileFormatError 375 | Raised if an invalid file_format is passed. 376 | ''' 377 | 378 | return NucleotideAlignment.from_file(file_path, file_format) 379 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](/LICENSE) 2 | [![PyPI version](https://img.shields.io/pypi/v/pyckmeans?color=blue)](https://pypi.org/project/pyckmeans/) 3 | [![Anaconda-Server Badge](https://img.shields.io/conda/v/TankredO/pyckmeans?label=conda)](https://anaconda.org/tankredo/pyckmeans) 4 | [![Coverage Status](https://img.shields.io/coveralls/github/TankredO/pyckmeans)](https://coveralls.io/github/TankredO/pyckmeans?branch=main) 5 | 6 | [![DOI](https://zenodo.org/badge/361376094.svg)](https://zenodo.org/badge/latestdoi/361376094) 7 | 8 | # pyckmeans 9 | 10 | pyckmeans is a Python package for [Consensus K-Means](https://doi.org/10.1023/A:1023949509487) and [Weighted Ensemble Consensus of Random (WECR) K-Means](https://doi.org/10.1109/TKDE.2019.2952596) clustering, especially in the context of DNA sequence data. To evaluate the quality of clusterings, pyckmeans implements several internal validation metrics. 11 | 12 | In addition to the clustering functionality, it provides tools for working with DNA sequence data such as reading and writing of DNA alignment files, calculating genetic distances, and Principle Coordinate Analysis (PCoA) for dimensionality reduction. 13 | 14 | ## Consensus K-Means 15 | 16 | [Consensus K-Means](https://doi.org/10.1023/A:1023949509487) is an unsupervised ensemble clustering algorithm, combining multiple K-Means clusterings, where each K-Means is trained on a subset of the data (random subset) and a subset of the the features (random subspace). The predicted cluster memberships of the single clusterings are combined to a consensus (or co-association) matrix, determining the number of times each pair of samples was clustered together over all clusterings. This matrix can be interpreted as similarity matrix and can be used to resolve the final consensus clustering by subjecting it to a last clustering step, e.g. hierarchical, or spectral clustering. 17 | 18 | ## WECR K-Means 19 | 20 | [Weighted Ensemble Consensus of Random (WECR) K-Means](https://doi.org/10.1109/TKDE.2019.2952596) is a semi-supervised ensemble clustering algorithm. Similar to consensus K-Means, it is based on a collection of K-Means clusterings, which are each trained on a random subset of data and a random subspace of features. In addition, for each single clustering the number of clusters _k_ is also randomized. This library of clusterings is subjected to weighting function that integrates user-supplied must-link and must-not-link constraints, as well as an internal cluster validation criterion. The constraints represent the semi-supervised component of WECR K-Means: the user can provide prior knowledge considering the composition of the clusters. Must-link and must-not-link constraints imply that a pair of samples (observations, data points) is expected to be found in the same or different clusters, respectively. 
Based on the clusterings and the calculated weights, a weighted consensus (co-association) matrix is constructed, which is subjected to Cluster-based Similarity Partitioning (CSPA; e.g. hierarchical clustering) or spectral clustering to resolve the consensus clustering. 21 | 22 | ## Documentation 23 | 24 | See pyckmeans' [RTD Documentation](https://pyckmeans.readthedocs.io/) for details. 25 | 26 | ## Installation 27 | 28 | pyckmeans can be installed using pip, Conda, or from source. 29 | 30 | ### pip 31 | 32 | ```bash 33 | pip install pyckmeans 34 | ``` 35 | 36 | ### Conda 37 | 38 | ```bash 39 | conda install pyckmeans -c TankredO 40 | ``` 41 | 42 | ### From Source 43 | 44 | The installation from source requires `git` and a C++ compiler. 45 | 46 | ```bash 47 | git clone https://github.com/TankredO/pyckmeans 48 | cd pyckmeans 49 | pip install . 50 | ``` 51 | 52 | ## Usage 53 | 54 | Examples using the Python API: 55 | 56 | - [Consensus K-Means: Clustering a Data Matrix (Single K)](#ckmeans-data-single) 57 | - [Consensus K-Means: Clustering a Data Matrix (Multi K)](#ckmeans-data-multi) 58 | - [Consensus K-Means: Clustering Sequence Data](#ckmeans-sequence-multi) 59 | - [WECR K-Means: Clustering Sequence Data](#wecr-sequence) 60 | 61 |
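Before the API examples, the following toy sketch illustrates the co-association idea described in the introduction. It is an illustration only, not pyckmeans code: the function name `coassociation` and its parameters are made up for this sketch, it assumes scikit-learn is installed, and it omits the random feature subspaces that `CKmeans` additionally draws.

```python
# Toy sketch of a co-association (consensus) matrix built from repeated
# K-Means runs on random subsamples. Illustration only; CKmeans does this
# internally and more efficiently.
import numpy as np
from sklearn.cluster import KMeans

def coassociation(x, k=3, n_rep=100, p_samp=0.8, seed=0):
    rng = np.random.default_rng(seed)
    n = x.shape[0]
    together = np.zeros((n, n))  # times two samples were clustered together
    drawn = np.zeros((n, n))     # times two samples were drawn together
    for _ in range(n_rep):
        idx = rng.choice(n, size=int(p_samp * n), replace=False)
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(x[idx])
        same = (labels[:, None] == labels[None, :]).astype(float)
        together[np.ix_(idx, idx)] += same
        drawn[np.ix_(idx, idx)] += 1.0
    # fraction of co-draws in which two samples ended up in the same cluster
    return np.divide(together, drawn, out=np.zeros_like(together), where=drawn > 0)
```

The resulting matrix can be treated as a similarity matrix and passed to hierarchical or spectral clustering to obtain the final consensus clustering; the examples below do all of this through the actual pyckmeans API.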

### Consensus K-Means: Clustering a Data Matrix (Single K)

62 | 63 | ```python 64 | from pyckmeans import CKmeans 65 | 66 | # simulate dataset 67 | # 50 samples, 2 features, 3 true clusters 68 | import sklearn.datasets 69 | x, _ = sklearn.datasets.make_blobs(n_samples=50, n_features=2, centers=3, random_state=75) 70 | 71 | # apply Consensus K-Means 72 | # 3 clusters, 100 K-Means runs, 73 | # draw 80% of samples and 50% of features for each single K-Means 74 | ckm = CKmeans(k=3, n_rep=100, p_samp=0.8, p_feat=0.5) 75 | ckm.fit(x) 76 | ckm_res = ckm.predict(x) 77 | 78 | # plot consensus matrix and consensus clustering 79 | fig = ckm_res.plot(figsize=(7,7)) 80 | 81 | # consensus matrix 82 | ckm_res.cmatrix 83 | 84 | # clustering metrics 85 | print('Bayesian Information Criterion:', ckm_res.bic) 86 | print('Davies-Bouldin Index:', ckm_res.db) 87 | print('Silhouette Score:', ckm_res.sil) 88 | print('Calinski-Harabasz Index:', ckm_res.ch) 89 | 90 | # consensus clusters 91 | print('Cluster Membership:', ckm_res.cl) 92 | ``` 93 | 94 | Bayesian Information Criterion: 50.21824821939818 95 | Davies-Bouldin Index: 0.2893792767901513 96 | Silhouette Score: 0.7827738719266039 97 | Calinski-Harabasz Index: 630.8235586596012 98 | Cluster Membership: [0 2 1 0 2 2 1 0 2 1 0 0 2 0 2 2 1 1 1 1 0 1 2 2 2 2 1 0 2 2 1 0 1 1 0 0 0 99 | 1 0 1 2 1 2 2 1 0 0 0 0 1] 100 | 101 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_4_1.png?raw=true) 102 | 103 |
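The reported metrics can be sanity-checked against scikit-learn. This is a sketch, not part of the pyckmeans API: it assumes `x` and `ckm_res` from the example above and uses the consensus labels in `ckm_res.cl`; pyckmeans may compute its metrics on slightly different inputs, so small deviations are possible.

```python
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)

# recompute three of the reported metrics from the consensus labels
print('Silhouette Score:', silhouette_score(x, ckm_res.cl))
print('Calinski-Harabasz Index:', calinski_harabasz_score(x, ckm_res.cl))
print('Davies-Bouldin Index:', davies_bouldin_score(x, ckm_res.cl))
```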

### Consensus K-Means: Clustering a Data Matrix (Multi K)

104 | 105 | The `MultiCKMeans` class allows training multiple `CKmeans` objects at once. 106 | This is, for example, useful for exploring clustering for different values of _k_. 107 | 108 | ```python 109 | from pyckmeans import MultiCKMeans 110 | import sklearn.datasets 111 | 112 | # simulate dataset 113 | # 50 samples, 10 features, 3 true clusters 114 | x, _ = sklearn.datasets.make_blobs(n_samples=50, n_features=10, centers=3, random_state=44) 115 | 116 | # apply multiple Consensus K-Means for 117 | # k = 2, ..., 5 118 | # 100 K-Means runs per Consensus K-Means 119 | # draw 80% of the samples for each single K-Means 120 | # draw 50% of the features for each single K-Means 121 | mckm = MultiCKMeans(k=[2, 3, 4, 5], n_rep=100, p_samp=0.8, p_feat=0.5) 122 | mckm.fit(x) 123 | mckm_res = mckm.predict(x) 124 | 125 | # clustering metrics 126 | print('Metrics:') 127 | print(mckm_res.metrics) 128 | 129 | # plot clustering metrics against k 130 | # BIC, DB: lower is better 131 | # SIL, CH: higher is better 132 | mckm_res.plot_metrics(figsize=(10,5)) 133 | 134 | 135 | # get a single CKmeansResult 0 |1| 2 3 136 | ckm_res_k3 = mckm_res.ckmeans_results[1] # k=[2, 3, 4, 5] 137 | # ... 138 | # see "Clustering a Data Matrix (Single K)" 139 | ``` 140 | 141 | Metrics: 142 | k sil bic db ch 143 | 0 2 0.574369 225.092100 0.646401 59.733498 144 | 1 3 0.788207 126.358519 0.302979 387.409107 145 | 2 4 0.563343 126.979355 1.214520 271.019424 146 | 3 5 0.339466 128.061382 1.698652 211.080143 147 | 148 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_6_1.png?raw=true) 149 | 150 |
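Choosing _k_ from the metric curves can also be automated with the `KneeLocator` class from `pyckmeans.knee`. The sketch below assumes the `mckm_res.metrics` data frame shown above and treats the BIC curve as a convex, decreasing curve so that the detected point is an elbow; with only a few values of _k_ the detection is rough and should be checked against the plots.

```python
from pyckmeans.knee import KneeLocator

metrics = mckm_res.metrics
kl = KneeLocator(
    metrics['k'].values,
    metrics['bic'].values,
    curve='convex',          # detect an elbow rather than a knee
    direction='decreasing',  # BIC drops towards its optimum here
)
print('Suggested k (BIC elbow):', kl.elbow)
```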

### <a name="ckmeans-sequence-multi"></a>Consensus K-Means: Clustering Sequence Data

151 | 152 | ```python 153 | from pyckmeans import MultiCKMeans, NucleotideAlignment, pcoa 154 | from IPython.display import display 155 | # Set random seed for demonstration 156 | import numpy 157 | numpy.random.seed(0) 158 | 159 | # Load nucleotide alignment 160 | # Note: the file is available from 161 | # "https://github.com/TankredO/pyckmeans/tree/main/docs/datasets/rhodanthemum_ct85_msl68.snps.phy" 162 | aln = NucleotideAlignment.from_file('datasets/rhodanthemum_ct85_msl68.snps.phy') 163 | print('Nucleotide alignment:', aln) 164 | 165 | # Calculate Kimura 2-parameter distances 166 | dst = aln.distance(distance_type='k2p') 167 | 168 | # Apply PCoA, including negative Eigenvalue correction 169 | pcoa_res = pcoa(dst, correction='lingoes') 170 | # display Eigenvalues 171 | print('Eigenvalues:') 172 | display(pcoa_res.values) 173 | 174 | # Get Eigenvectors until the cumulative corrected Eigenvalues are >= 0.8 175 | vectors = pcoa_res.get_vectors( 176 | filter_by='eigvals_rel_corrected_cum', 177 | filter_th=0.8, 178 | out_format='pandas' 179 | ) 180 | 181 | # Apply Multi-K Consensus K-Means 182 | mckm = MultiCKMeans( 183 | k=range(2, 20), 184 | n_rep=50, 185 | p_samp=0.8, 186 | p_feat=0.8 187 | ) 188 | mckm.fit(vectors) 189 | mckm_res = mckm.predict(vectors) 190 | mckm_res.plot_metrics(figsize=(12, 7)) 191 | 192 | # Select a 'good' K 193 | # At k values around 7, BIC, DB, and SIL have a (local) optimum 194 | ckm_res_k7 = mckm_res.ckmeans_results[5] # index 5 corresponds to k=7, since k starts at 2 195 | fig = ckm_res_k7.plot(figsize=(14,14)) 196 | ``` 197 | 198 | Nucleotide alignment: 199 | Eigenvalues: 200 | 201 |
| | eigvals | eigvals_rel | eigvals_rel_cum | eigvals_rel_corrected | eigvals_rel_corrected_cum |
|-----|-----------|-------------|-----------------|-----------------------|---------------------------|
| 0 | 0.115972 | 0.471458 | 0.233986 | 0.233986 | 0.233986 |
| 1 | 0.039585 | 0.160924 | 0.317016 | 0.083030 | 0.317016 |
| 2 | 0.035079 | 0.142604 | 0.391140 | 0.074125 | 0.391140 |
| 3 | 0.017383 | 0.070665 | 0.430295 | 0.039154 | 0.430295 |
| 4 | 0.009831 | 0.039965 | 0.454525 | 0.024230 | 0.454525 |
| ... | ... | ... | ... | ... | ... |
| 103 | -0.001325 | -0.005388 | 0.998575 | 0.001457 | 0.998575 |
| 104 | -0.001693 | -0.006881 | 0.999654 | 0.001079 | 0.999654 |
| 105 | -0.001884 | -0.007660 | 1.000000 | 0.000346 | 1.000000 |
| 106 | -0.002255 | -0.009168 | 1.000000 | 0.000000 | 1.000000 |
| 107 | -0.002430 | -0.009880 | 1.000000 | 0.000000 | 1.000000 |

108 rows × 5 columns
306 | 307 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_8_2.png?raw=true) 308 | 309 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_8_3.png?raw=true) 310 | 311 |
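A possible follow-up (not part of the original example): pairing the consensus cluster labels with the sample names. The attributes `cl` and `names` are the ones used in the example above and by pyckmeans' plotting utilities; treat this as a sketch rather than documented API:

```python
import pandas

# ckm_res_k7 as obtained in the example above; cl holds the consensus cluster
# per sample, names the sample names (taken from the index of `vectors`)
membership_k7 = pandas.Series(ckm_res_k7.cl, index=ckm_res_k7.names)
print(membership_k7.head())
```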

### <a name="wecr-sequence"></a>WECR K-Means: Clustering Sequence Data

312 | 313 | ```python 314 | from pyckmeans import WECR, NucleotideAlignment, pcoa 315 | 316 | # Load nucleotide alignment 317 | aln = NucleotideAlignment.from_file('datasets/rhodanthemum_ct85_msl68.snps.phy') 318 | 319 | # Calculate Kimura 2-parameter distances 320 | dst = aln.distance(distance_type='k2p') 321 | 322 | # Apply PCoA, including negative Eigenvalue correction 323 | pcoa_res = pcoa(dst, correction='lingoes') 324 | 325 | # Get Eigenvectors until the cumulative corrected Eigenvalues are >= 0.8 326 | vectors = pcoa_res.get_vectors( 327 | filter_by='eigvals_rel_corrected_cum', 328 | filter_th=0.8, 329 | out_format='pandas' 330 | ) 331 | 332 | # Apply WECR K-Means 333 | wecr = WECR( 334 | k=range(2, 20), 335 | n_rep=1000, 336 | p_samp=0.6, 337 | p_feat=0.6, 338 | ) 339 | wecr.fit(vectors) 340 | wecr_res = wecr.predict(vectors) 341 | 342 | # Plot clustering metrics for each k 343 | wecr_res.plot_metrics(figsize=(12, 7)) 344 | 345 | # Select a 'good' K (e.g., 6, 7, 8) for the consensus clustering 346 | wecr_res.plot(k=6, figsize=(14,14)) 347 | 348 | cluster_membership = wecr_res.get_cl(k=6, with_names=True) 349 | print('cluster_membership:') 350 | print(cluster_membership) 351 | ``` 352 | 353 | cluster_membership: 354 | PP-R002-01 0 355 | PP-R002-01-dupl 0 356 | PP-R017-04 4 357 | PP-R017-04-dupl 4 358 | PP-R019-01 5 359 | .. 360 | R044-02 3 361 | R044-12 3 362 | R045-02 0 363 | R045-06 0 364 | R045-25 0 365 | Length: 108, dtype: int32 366 | 367 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_10_1.png?raw=true) 368 | 369 | ![png](https://github.com/TankredO/pyckmeans/blob/main/docs/images/output_10_2.png?raw=true) 370 | -------------------------------------------------------------------------------- /pyckmeans/core/multickmeans.py: -------------------------------------------------------------------------------- 1 | '''multickmeans module''' 2 | 3 | from typing import List, Optional, Iterable, Dict, Any, Tuple, Union, Callable, TYPE_CHECKING 4 | import numpy 5 | import pandas 6 | 7 | from pyckmeans.ordination import PCOAResult 8 | from .ckmeans import CKmeansResult, CKmeans, InvalidClusteringMetric 9 | 10 | if TYPE_CHECKING: 11 | import matplotlib 12 | import matplotlib.figure 13 | 14 | class MultiCKmeansResult: 15 | '''MultiCKmeansResult 16 | 17 | Result of MultiCKMeans.predict. 18 | 19 | Parameters 20 | ---------- 21 | ckmeans_results: List[CKmeansResult] 22 | List of CKmeansResults. 23 | names: Optional[Iterable[str]] 24 | Sample names.
25 | ''' 26 | def __init__( 27 | self, 28 | ckmeans_results: List[CKmeansResult], 29 | names: Optional[Iterable[str]] = None, 30 | ): 31 | self.ckmeans_results = ckmeans_results 32 | self.names = numpy.arange(ckmeans_results[0].cmatrix.shape[0]).astype(str) \ 33 | if names is None else numpy.array(names).astype(str) 34 | 35 | self.ks = numpy.array([ckm_res.k for ckm_res in ckmeans_results]) 36 | 37 | self.sils = numpy.array([ckm_res.sil for ckm_res in ckmeans_results]) 38 | self.bics = numpy.array([ckm_res.bic for ckm_res in ckmeans_results]) 39 | self.dbs = numpy.array([ckm_res.db for ckm_res in ckmeans_results]) 40 | self.chs = numpy.array([ckm_res.ch for ckm_res in ckmeans_results]) 41 | 42 | self.metrics = pandas.DataFrame({ 43 | 'k': self.ks, 44 | 'sil': self.sils, 45 | 'bic': self.bics, 46 | 'db': self.dbs, 47 | 'ch': self.chs, 48 | }) 49 | 50 | def order( 51 | self, 52 | by: int, 53 | method: str = 'GW', 54 | linkage_type: str = 'average', 55 | ) -> numpy.ndarray: 56 | '''order 57 | 58 | Get optimal sample order according to hierarchical clustering of the 59 | CKmeansResult at index "by". 60 | 61 | Parameters 62 | ---------- 63 | by : int 64 | Index of the CKMeansResult to order by. 65 | method : str 66 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 67 | scipy.hierarchy.optimal_leaf_ordering. 68 | 69 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 70 | The British Psychological Society 25. 71 | linkage_type : str 72 | Linkage type for the hierarchical clustering. One of 73 | 74 | * 'average' 75 | * 'complete' 76 | * 'single' 77 | * 'weighted' 78 | * 'centroid' 79 | 80 | See scipy.cluster.hierarchy.linkage for details. 81 | 82 | Returns 83 | ------- 84 | numpy.ndarray 85 | Optimal sample order. 86 | ''' 87 | ckm_res = self.ckmeans_results[by] 88 | 89 | return ckm_res.order(method=method, linkage_type=linkage_type) 90 | 91 | def sort( 92 | self, 93 | by: int, 94 | method: str = 'GW', 95 | linkage_type: str = 'average', 96 | in_place: bool = False, 97 | ) -> 'MultiCKmeansResult': 98 | '''sort 99 | 100 | Sort samples according to hierarchical clustering of the 101 | CKmeansResult at index "by". 102 | 103 | Parameters 104 | ---------- 105 | by : int 106 | Index of the CKMeansResult to sort by. 107 | method : str 108 | Reordering method. Either 'GW' (Gruvaeus & Wainer, 1972) or 'OLO' for 109 | scipy.hierarchy.optimal_leaf_ordering. 110 | 111 | Gruvaeus, G., H., Wainer. 1972. Two Additions to Hierarchical Cluster Analysis. 112 | The British Psychological Society 25. 113 | linkage_type : str 114 | Linkage type for the hierarchical clustering. One of 115 | 116 | * 'average' 117 | * 'complete' 118 | * 'single' 119 | * 'weighted' 120 | * 'centroid' 121 | 122 | See scipy.cluster.hierarchy.linkage for details. 123 | in_place : bool 124 | If False, a new, sorted MultiCKmeansResult object will be returned. 125 | If True, the object will be sorted in place and self will be returned. 126 | 127 | Returns 128 | ------- 129 | MultiCKmeansResult 130 | Sorted MultiCKmeansResult 131 | ''' 132 | 133 | order = self.order(by=by, method=method, linkage_type=linkage_type) 134 | 135 | return self.reorder(order, in_place=in_place) 136 | 137 | def reorder( 138 | self, 139 | order: numpy.ndarray, 140 | in_place: bool = False, 141 | ) -> 'MultiCKmeansResult': 142 | '''reorder 143 | 144 | Reorder samples in all CKmeansResults according to provided order. 145 | 146 | Parameters 147 | ---------- 148 | order : numpy.ndarray 149 | New sample order. 
150 | in_place : bool 151 | If False, a new, sorted MultiCKmeansResult object will be returned. 152 | If True, the object will be sorted in place and self will be returned. 153 | 154 | Returns 155 | ------- 156 | MultiCKmeansResult 157 | Reordered MultiCKmeansResult 158 | ''' 159 | 160 | if in_place: 161 | mckmres = self 162 | for ckmres in self.ckmeans_results: 163 | ckmres.reorder(order, in_place=True) 164 | else: 165 | ckm_results = [ 166 | ckmres.reorder(order, in_place=False) for ckmres in self.ckmeans_results 167 | ] 168 | names = None if self.names is None else self.names.copy() 169 | mckmres = MultiCKmeansResult(ckm_results, names=names) 170 | 171 | return mckmres 172 | 173 | def plot_metrics( 174 | self, 175 | figsize: Tuple[float, float] = (7, 7), 176 | ) -> 'matplotlib.figure.Figure': 177 | '''plot_metrics 178 | 179 | Plot MultiCKMeansResult metrics. 180 | 181 | Parameters 182 | ---------- 183 | figsize : Tuple[float, float], optional 184 | Figure size for the matplotlib figure, by default (7, 7). 185 | 186 | Returns 187 | ------- 188 | matplotlib.figure.Figure 189 | Matplotlib Figure of the metrics plot. 190 | ''' 191 | 192 | from pyckmeans.utils import plot_multickmeans_metrics 193 | 194 | fig = plot_multickmeans_metrics( 195 | mckm_res=self, 196 | figsize=figsize, 197 | ) 198 | fig.tight_layout() 199 | return fig 200 | 201 | class MultiCKMeans: 202 | '''MultiCKMeans 203 | 204 | Convenience class wrapping Consensus K-Means runs for multiple different numbers of clusters. 205 | 206 | Parameters 207 | ---------- 208 | k : Iterable[int] 209 | List of cluster counts for CKmeans. 210 | n_rep : int, optional 211 | Number of K-Means to fit for each single CKmeans, by default 100 212 | p_samp : float, optional 213 | Proportion of samples (observations) to randomly draw per K-Means run, by default 0.8. 214 | The resulting number of samples will be rounded up. I.e. if number of samples is 10 and 215 | p_samp is 0.75, each K-Means will use 8 randomly drawn samples (0.72 * 10 = 7.2, 7.2 -> 8). 216 | p_feat : float, optional 217 | Proportion of features (predictors) to randomly draw per K-Means run, by default 0.8. 218 | The resulting number of features will be rounded up. I.e. if number of features is 10 and 219 | p_feat is 0.72, each K-Means will use 8 randomly drawn features (0.72 * 10 = 7.5, 7.2 -> 8). 220 | metrics : Iterable[str] 221 | Clustering quality metrics to calculate while training. Available metrics are 222 | * "sil" (Silhouette Index) 223 | * "bic" (Bayesian Information Criterion) 224 | * "db" (Davies-Bouldin Index) 225 | * "ch" (Calinski-Harabasz). 226 | kwargs : Dict[str, Any] 227 | Additional keyword arguments passed to sklearn.cluster.KMeans. 228 | ''' 229 | def __init__( 230 | self, 231 | k: Iterable[int], 232 | n_rep: int = 100, 233 | p_samp: float = 0.8, 234 | p_feat: float = 0.8, 235 | metrics: Iterable[str] = ('sil', 'bic'), 236 | **kwargs: Dict[str, Any], 237 | ): 238 | self.k = k 239 | self.n_rep = n_rep 240 | self.p_samp = p_samp 241 | self.p_feat = p_feat 242 | 243 | for metric in metrics: 244 | if not metric in CKmeans.AVAILABLE_METRICS: 245 | am_str = ", ".join(CKmeans.AVAILABLE_METRICS) 246 | msg = f'Unknown metric "{metric}". Available metrics are {am_str}.' 
247 | raise InvalidClusteringMetric(msg) 248 | 249 | self._metrics = metrics 250 | self._kmeans_kwargs = kwargs 251 | 252 | self.ckmeans: Optional[List[CKmeans]] = None 253 | 254 | def fit( 255 | self, 256 | x: Union[numpy.ndarray, PCOAResult, pandas.DataFrame], 257 | progress_callback: Optional[Callable] = None, 258 | ): 259 | '''fit 260 | 261 | Fit MultiCKmeans. 262 | 263 | Parameters 264 | ---------- 265 | x : Union[numpy.ndarray, PCOAResult] 266 | a n * m matrix (numpy.ndarray) or dataframe (pandas.DataFrame), where n is the number 267 | of samples (observations) and m is the number of features (predictors). 268 | Alternatively a pyckmeans.ordination.PCOAResult as returned from pyckmeans.pcoa. 269 | progress_callback : Optional[Callable] 270 | Optional callback function for progress reporting. 271 | ''' 272 | 273 | if isinstance(x, PCOAResult): 274 | x = x.vectors 275 | elif isinstance(x, pandas.DataFrame): 276 | x = x.values 277 | 278 | # _fit is called here to be able to extend later on. 279 | # The plan is to add a parallel fitting function later on 280 | # e.g. _fit_parallel(x, progress_callback, n_cores) 281 | self._fit(x, progress_callback=progress_callback) 282 | 283 | def predict( 284 | self, 285 | x: Union[numpy.ndarray, PCOAResult, pandas.DataFrame], 286 | linkage_type: str = 'average', 287 | return_cls: bool = False, 288 | progress_callback: Optional[Callable] = None, 289 | ) -> MultiCKmeansResult: 290 | '''predict 291 | 292 | Predict cluster membership of new data from all fitted CKmeans. 293 | 294 | Parameters 295 | ---------- 296 | x : Union[numpy.ndarray, PCOAResult] 297 | a n * m matrix (numpy.ndarray) or dataframe (pandas.DataFrame), where n is the number 298 | of samples (observations) and m is the number of features (predictors). If x is a 299 | dataframe, the index will be used a sample names. 300 | Alternatively a pyckmeans.ordination.PCOAResult as returned from pyckmeans.pcoa. 301 | linkage_type : str 302 | Linkage type of the hierarchical clustering that is used for consensus cluster 303 | calculation. One of 304 | 305 | * 'average' 306 | * 'complete' 307 | * 'single' 308 | * 'weighted' 309 | * 'centroid' 310 | 311 | See scipy.cluster.hierarchy.linkage for details. 312 | return_cls : bool 313 | If True, the cluster memberships of the single K-Means runs will be present 314 | in the output. 315 | progress_callback : Optional[Callable] 316 | Optional callback function for progress reporting. 317 | 318 | Returns 319 | ------- 320 | CKmeansResult 321 | Object comprising a n * n consensus matrix, and a n-length vector of 322 | precited cluster memberships. 323 | ''' 324 | names = None 325 | if isinstance(x, PCOAResult): 326 | names = numpy.array(x.names).astype(str) 327 | elif isinstance(x, pandas.DataFrame): 328 | names = numpy.array(x.index).astype(str) 329 | 330 | ckmeans_results: List[CKmeansResult] = [] 331 | for ckm in self.ckmeans: 332 | ckm_res = ckm.predict( 333 | x=x, 334 | linkage_type=linkage_type, 335 | return_cls=return_cls, 336 | progress_callback=progress_callback, 337 | ) 338 | ckmeans_results.append(ckm_res) 339 | 340 | return MultiCKmeansResult( 341 | ckmeans_results=ckmeans_results, 342 | names=names, 343 | ) 344 | 345 | def _fit( 346 | self, 347 | x: numpy.ndarray, 348 | progress_callback: Optional[Callable] = None, 349 | ): 350 | '''_fit 351 | 352 | Internal sequential fitting function. 
353 | 354 | Parameters 355 | ---------- 356 | x : numpy.ndarray 357 | n * m matrix, where n is the number of samples (observations) and m is 358 | the number of features (predictors). 359 | progress_callback : Optional[Callable] 360 | Optional callback function for progress reporting. 361 | ''' 362 | self._reset() 363 | 364 | self.ckmeans = [] 365 | 366 | for k in self.k: 367 | ckm = CKmeans( 368 | k=k, 369 | n_rep=self.n_rep, 370 | p_samp=self.p_samp, 371 | p_feat=self.p_feat, 372 | metrics=self._metrics, 373 | **self._kmeans_kwargs, 374 | ) 375 | ckm.fit(x=x, progress_callback=progress_callback) 376 | self.ckmeans.append(ckm) 377 | 378 | def _reset(self): 379 | '''_reset 380 | 381 | Reset MultiCKmeans object. 382 | ''' 383 | self.ckmeans = None 384 | -------------------------------------------------------------------------------- /pyckmeans/utils/plotting.py: -------------------------------------------------------------------------------- 1 | ''' Plotting utitlies 2 | ''' 3 | 4 | from typing import Iterable, Optional, Tuple, Union 5 | import numpy 6 | import matplotlib.pyplot as plt 7 | import matplotlib.figure 8 | import matplotlib.colors 9 | import matplotlib.axes 10 | 11 | import pyckmeans.core 12 | from pyckmeans.core import wecr 13 | from pyckmeans.ordering import distance_order 14 | 15 | def plot_ckmeans_result( 16 | ckm_res: pyckmeans.core.CKmeansResult, 17 | names: Optional[Iterable[str]] = None, 18 | order: Optional[Union[str, numpy.ndarray]] = 'GW', 19 | cmap_cm: Union[str, matplotlib.colors.Colormap] = 'Blues', 20 | cmap_clbar: Union[str, matplotlib.colors.Colormap] = 'tab20', 21 | figsize: Tuple[float, float] = (7, 7), 22 | ) -> matplotlib.figure.Figure: 23 | '''plot_ckmeans_result 24 | 25 | Plot pyckmeans result consensus matrix with consensus clusters. 26 | 27 | Parameters 28 | ---------- 29 | ckm_res : CKmeansResult 30 | CKmeansResult as returned from CKmeans.predict. 31 | names : Optional[Iterable[str]] 32 | Sample names to be plotted. 33 | order : Optional[Union[str, numpy.ndarray]] 34 | Sample Plotting order. Either a string, determining the oder method to use 35 | (see CKmeansResult.order), or a numpy.ndarray giving the sample order, 36 | or None to apply no reordering. 37 | cmap_cm : Union[str, matplotlib.colors.Colormap], optional 38 | Colormap for the consensus matrix, by default 'Blues' 39 | cmap_clbar : Union[str, matplotlib.colors.Colormap], optional 40 | Colormap for the cluster bar, by default 'tab20' 41 | figsize : Tuple[float, float], optional 42 | Figure size for the matplotlib figure, by default (7, 7). 43 | 44 | Returns 45 | ------- 46 | matplotlib.figure.Figure 47 | Matplotlib figure. 
48 | ''' 49 | # if order is None do not reorder 50 | if order is None: 51 | order = numpy.arange(ckm_res.cmatrix.shape[0]) 52 | # if order is str use CKMeansResult order 53 | elif isinstance(order, str): 54 | order = ckm_res.order(method=order) 55 | # else order must be numpy.ndarray giving the sample order 56 | 57 | ckm_res = ckm_res.reorder(order=order, in_place=False) 58 | cl = ckm_res.cl 59 | 60 | # if names is passed use names, else try to get names 61 | # from ckm_res, else just use samples indices 62 | if names is None: 63 | if ckm_res.names is not None: 64 | nms = ckm_res.names 65 | else: 66 | nms = order.astype('str') 67 | else: 68 | nms = numpy.array(names)[order] 69 | 70 | # build figure layout 71 | fig = plt.figure(figsize=figsize) 72 | ax_cmat = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 73 | ax_clbar = fig.add_axes([0.05, 0.1, 0.05, 0.8]) 74 | ax_cbar = fig.add_axes([0.925, 0.1, 0.025, 0.8]) 75 | 76 | # = consensus matrix 77 | ax_cmat.imshow(ckm_res.cmatrix, cmap=cmap_cm) 78 | ax_cmat.set_xticks(numpy.arange(len(nms))) 79 | ax_cmat.set_xticklabels(nms) 80 | for tick in ax_cmat.get_xticklabels(): 81 | tick.set_rotation(90) 82 | ax_cmat.set_yticks([]) 83 | ax_cmat.tick_params(left=False) 84 | 85 | # cluster lines 86 | cl_01 = [] 87 | cl_start = 0 88 | for i in range(1, len(cl)): 89 | if cl[i] != cl[cl_start]: 90 | cl_01.append((cl_start, i)) 91 | cl_start = i 92 | cl_01.append((cl_start, len(cl))) 93 | cl_01 = numpy.array(cl_01) 94 | 95 | ax_cmat.hlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 96 | ax_cmat.vlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 97 | 98 | # = cluster membership bar 99 | ax_clbar.imshow(ckm_res.cl.reshape(-1, 1), cmap=cmap_clbar) 100 | ax_clbar.set_xticks([]) 101 | ax_clbar.set_yticks(numpy.arange(len(nms))) 102 | ax_clbar.set_yticklabels(nms) 103 | 104 | # = color bar 105 | ax_cbar.set_xticks([]) 106 | ax_cbar.yaxis.tick_right() 107 | plt.colorbar(plt.cm.ScalarMappable(cmap=cmap_cm), cax=ax_cbar) 108 | 109 | return fig 110 | 111 | def plot_multickmeans_metrics( 112 | mckm_res: pyckmeans.core.MultiCKmeansResult, 113 | figsize: Tuple[float, float] = (7, 7), 114 | ) -> matplotlib.figure.Figure: 115 | '''plot_multickmeans_metrics 116 | 117 | Plot MultiCKMeansResult metrics. 118 | 119 | Parameters 120 | ---------- 121 | mckm_res : MultiCKmeansResult 122 | MultiCKmeansResult object 123 | figsize : Tuple[float, float], optional 124 | Figure size for the matplotlib figure, by default (7, 7). 125 | 126 | Returns 127 | ------- 128 | matplotlib.figure.Figure 129 | Matplotlib Figure of the metrics plot. 
130 | ''' 131 | 132 | fig, axs = plt.subplots(nrows=2, ncols=2, figsize=figsize) 133 | 134 | axs = axs.flatten() 135 | for ax in axs: 136 | ax.grid(axis='x') 137 | ax.set_xticks(mckm_res.ks) 138 | ax.set_xlabel('k') 139 | 140 | axs[0].plot(mckm_res.ks, mckm_res.bics) 141 | axs[0].set_title('BIC') 142 | axs[0].set_ylabel('BIC') 143 | 144 | axs[1].plot(mckm_res.ks, mckm_res.dbs) 145 | axs[1].set_title('DB') 146 | axs[1].set_ylabel('DB') 147 | 148 | axs[2].plot(mckm_res.ks, mckm_res.sils) 149 | axs[2].set_title('SIL') 150 | axs[2].set_ylabel('SIL') 151 | 152 | axs[3].plot(mckm_res.ks, mckm_res.chs) 153 | axs[3].set_title('CH') 154 | axs[3].set_ylabel('CH') 155 | 156 | return fig 157 | 158 | def plot_wecr_result( 159 | wecr_res: pyckmeans.core.WECRResult, 160 | k: int, 161 | names: Optional[Iterable[str]] = None, 162 | order: Optional[Union[str, numpy.ndarray]] = 'GW', 163 | cmap_cm: Union[str, matplotlib.colors.Colormap] = 'Blues', 164 | cmap_clbar: Union[str, matplotlib.colors.Colormap] = 'tab20', 165 | figsize: Tuple[float, float] = (7, 7), 166 | ) -> matplotlib.figure.Figure: 167 | '''plot_wecr_result 168 | 169 | Plot wecr result consensus matrix with consensus clusters. 170 | 171 | Parameters 172 | ---------- 173 | wecr_res : pyckmeans.core.WECRResult 174 | WECRResult as returned from pyckmeans.core.WECR.predict. 175 | k: int 176 | The number of clusters k to use for plotting. 177 | names : Optional[Iterable[str]] 178 | Sample names to be plotted. 179 | order : Optional[Union[str, numpy.ndarray]] 180 | Sample Plotting order. Either a string, determining the oder method to use 181 | (see WECRResult.order), or a numpy.ndarray giving the sample order, 182 | or None to apply no reordering. 183 | cmap_cm : Union[str, matplotlib.colors.Colormap], optional 184 | Colormap for the consensus matrix, by default 'Blues' 185 | cmap_clbar : Union[str, matplotlib.colors.Colormap], optional 186 | Colormap for the cluster bar, by default 'tab20' 187 | figsize : Tuple[float, float], optional 188 | Figure size for the matplotlib figure, by default (7, 7). 189 | 190 | Returns 191 | ------- 192 | matplotlib.figure.Figure 193 | Matplotlib figure. 194 | 195 | Raises 196 | ------ 197 | wecr.InvalidKError 198 | Raised if an invalid k argument is provided. 199 | ''' 200 | # if order is None do not reorder 201 | if order is None: 202 | order = numpy.arange(wecr_res.cmatrix.shape[0]) 203 | # if order is str use WECRResult order 204 | elif isinstance(order, str): 205 | order = wecr_res.order(method=order) 206 | # else order must be numpy.ndarray giving the sample order 207 | 208 | wecr_res = wecr_res.reorder(order=order, in_place=False) 209 | if not k in wecr_res.k: 210 | msg = f'Result for k={k} not found. Available k are {wecr_res.k}.' 
211 | raise wecr.InvalidKError(msg) 212 | cl = wecr_res.cl[numpy.argmax(wecr_res.k == k)] 213 | 214 | # if names is passed use names, else try to get names 215 | # from wecr_res, else just use samples indices 216 | if names is None: 217 | if wecr_res.names is not None: 218 | nms = wecr_res.names 219 | else: 220 | nms = order.astype('str') 221 | else: 222 | nms = numpy.array(names)[order] 223 | 224 | # build figure layout 225 | fig = plt.figure(figsize=figsize) 226 | ax_cmat = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 227 | ax_clbar = fig.add_axes([0.05, 0.1, 0.05, 0.8]) 228 | ax_cbar = fig.add_axes([0.925, 0.1, 0.025, 0.8]) 229 | 230 | # = consensus matrix 231 | ax_cmat.imshow(wecr_res.cmatrix, cmap=cmap_cm) 232 | ax_cmat.set_xticks(numpy.arange(len(nms))) 233 | ax_cmat.set_xticklabels(nms) 234 | for tick in ax_cmat.get_xticklabels(): 235 | tick.set_rotation(90) 236 | ax_cmat.set_yticks([]) 237 | ax_cmat.tick_params(left=False) 238 | 239 | # cluster lines 240 | cl_01 = [] 241 | cl_start = 0 242 | for i in range(1, len(cl)): 243 | if cl[i] != cl[cl_start]: 244 | cl_01.append((cl_start, i)) 245 | cl_start = i 246 | cl_01.append((cl_start, len(cl))) 247 | cl_01 = numpy.array(cl_01) 248 | 249 | ax_cmat.hlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 250 | ax_cmat.vlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 251 | 252 | # = cluster membership bar 253 | ax_clbar.imshow(cl.reshape(-1, 1), cmap=cmap_clbar) 254 | ax_clbar.set_xticks([]) 255 | ax_clbar.set_yticks(numpy.arange(len(nms))) 256 | ax_clbar.set_yticklabels(nms) 257 | 258 | # = color bar 259 | ax_cbar.set_xticks([]) 260 | ax_cbar.yaxis.tick_right() 261 | plt.colorbar(plt.cm.ScalarMappable(cmap=cmap_cm), cax=ax_cbar) 262 | 263 | return fig 264 | 265 | def plot_wecr_result_metrics( 266 | wecr_res: pyckmeans.core.WECRResult, 267 | figsize: Tuple[float, float] = (7, 7), 268 | ) -> matplotlib.figure.Figure: 269 | '''plot_wecr_result_metrics 270 | 271 | Plot WECRResult metrics. 272 | 273 | Parameters 274 | ---------- 275 | wecr_res : WECRResult 276 | WECRResult object 277 | figsize : Tuple[float, float], optional 278 | Figure size for the matplotlib figure, by default (7, 7). 279 | 280 | Returns 281 | ------- 282 | matplotlib.figure.Figure 283 | Matplotlib Figure of the metrics plot. 
284 | ''' 285 | 286 | fig, axs = plt.subplots(nrows=2, ncols=2, figsize=figsize) 287 | 288 | order = numpy.argsort(wecr_res.k) 289 | 290 | axs = axs.flatten() 291 | for ax in axs: 292 | ax.grid(axis='x') 293 | ax.set_xticks(wecr_res.k[order]) 294 | ax.set_xlabel('k') 295 | 296 | axs[0].plot(wecr_res.k[order], wecr_res.bic[order]) 297 | axs[0].set_title('BIC') 298 | axs[0].set_ylabel('BIC') 299 | 300 | axs[1].plot(wecr_res.k[order], wecr_res.db[order]) 301 | axs[1].set_title('DB') 302 | axs[1].set_ylabel('DB') 303 | 304 | axs[2].plot(wecr_res.k[order], wecr_res.sil[order]) 305 | axs[2].set_title('SIL') 306 | axs[2].set_ylabel('SIL') 307 | 308 | axs[3].plot(wecr_res.k[order], wecr_res.ch[order]) 309 | axs[3].set_title('CH') 310 | axs[3].set_ylabel('CH') 311 | 312 | return fig 313 | 314 | def plot_cmatrix( 315 | cmatrix: numpy.ndarray, 316 | cl: numpy.ndarray, 317 | names: Optional[Iterable[str]] = None, 318 | order: Optional[Union[str, numpy.ndarray]] = 'GW', 319 | cmap_cm: Union[str, matplotlib.colors.Colormap] = 'Blues', 320 | cmap_clbar: Union[str, matplotlib.colors.Colormap] = 'tab20', 321 | figsize: Tuple[float, float] = (7, 7), 322 | ) -> Tuple[ 323 | matplotlib.figure.Figure, 324 | matplotlib.axes.Axes, 325 | matplotlib.axes.Axes, 326 | matplotlib.axes.Axes 327 | ]: 328 | '''plot_cmatrix 329 | 330 | Plot consensus matrix and consensus clustering. 331 | 332 | Parameters 333 | ---------- 334 | cmatrix : numpy.ndarray 335 | Consensus matrix. 336 | cl : numpy.ndarray 337 | Cluster membership. 338 | names : Optional[Iterable[str]] 339 | Sample names to be plotted. 340 | order : Optional[Union[str, numpy.ndarray]] 341 | Sample Plotting order. Either a string, or a numpy.ndarray giving the sample order, 342 | or None to apply no reordering. 343 | cmap_cm : Union[str, matplotlib.colors.Colormap], optional 344 | Colormap for the consensus matrix, by default 'Blues' 345 | cmap_clbar : Union[str, matplotlib.colors.Colormap], optional 346 | Colormap for the cluster bar, by default 'tab20' 347 | figsize : Tuple[float, float], optional 348 | Figure size for the matplotlib figure, by default (7, 7). 349 | 350 | Returns 351 | ------- 352 | Tuple[matplotlib.figure.Figure, matplotlib.axes.Axes, matplotlib.axes.Axes, matplotlib.axes.Axes] 353 | Figure, consensus matrix Axes, cluster membership Axes, colorbar Axes. 
354 | ''' 355 | # if order is None do not reorder 356 | if order is None: 357 | order = numpy.arange(cmatrix.shape[0]) 358 | # if order is str use WECRResult order 359 | elif isinstance(order, str): 360 | order = distance_order(1-cmatrix, method=order) 361 | # else order must be numpy.ndarray giving the sample order 362 | 363 | cmatrix = cmatrix[order, :][:, order] 364 | cl = cl[order] 365 | 366 | # if names is passed use names, else try to get names 367 | # from wecr_res, else just use samples indices 368 | if names is None: 369 | nms = order.astype('str') 370 | else: 371 | nms = numpy.array(names)[order] 372 | 373 | # build figure layout 374 | fig = plt.figure(figsize=figsize) 375 | ax_cmat = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 376 | ax_clbar = fig.add_axes([0.05, 0.1, 0.05, 0.8]) 377 | ax_cbar = fig.add_axes([0.925, 0.1, 0.025, 0.8]) 378 | 379 | # = consensus matrix 380 | ax_cmat.imshow(cmatrix, cmap=cmap_cm) 381 | ax_cmat.set_xticks(numpy.arange(len(nms))) 382 | ax_cmat.set_xticklabels(nms) 383 | for tick in ax_cmat.get_xticklabels(): 384 | tick.set_rotation(90) 385 | ax_cmat.set_yticks([]) 386 | ax_cmat.tick_params(left=False) 387 | 388 | # cluster lines 389 | cl_01 = [] 390 | cl_start = 0 391 | for i in range(1, len(cl)): 392 | if cl[i] != cl[cl_start]: 393 | cl_01.append((cl_start, i)) 394 | cl_start = i 395 | cl_01.append((cl_start, len(cl))) 396 | cl_01 = numpy.array(cl_01) 397 | 398 | ax_cmat.hlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 399 | ax_cmat.vlines(cl_01[:, 0] + 0.5 - 1, -0.5, len(nms) - 0.5, color='white', linewidth=2) 400 | 401 | # = cluster membership bar 402 | ax_clbar.imshow(cl.reshape(-1, 1), cmap=cmap_clbar) 403 | ax_clbar.set_xticks([]) 404 | ax_clbar.set_yticks(numpy.arange(len(nms))) 405 | ax_clbar.set_yticklabels(nms) 406 | 407 | # = color bar 408 | ax_cbar.set_xticks([]) 409 | ax_cbar.yaxis.tick_right() 410 | plt.colorbar(plt.cm.ScalarMappable(cmap=cmap_cm), cax=ax_cbar) 411 | 412 | return fig, ax_cmat, ax_clbar, ax_cbar 413 | --------------------------------------------------------------------------------