├── MANIFEST.in
├── paper
│   └── paper.pdf
├── .github
│   ├── assets
│   │   ├── logo.png
│   │   ├── rr2.png
│   │   ├── cmatrix.jpg
│   │   ├── logo2.png
│   │   ├── logo3.png
│   │   ├── clusterCrit.pdf
│   │   ├── R-R2-Rsquared.docx
│   │   └── metric_names.xlsx
│   └── workflows
│       └── draft-pdf.yml
├── docs
│   ├── source
│   │   ├── _static
│   │   │   ├── images
│   │   │   │   ├── DI.png
│   │   │   │   ├── HI.png
│   │   │   │   ├── CHI.png
│   │   │   │   ├── EVS.png
│   │   │   │   └── class_score_1.png
│   │   │   └── notes
│   │   │       ├── KGE.docx
│   │   │       ├── general_table.xlsx
│   │   │       └── table_of_metrics.xlsx
│   │   ├── pages
│   │   │   ├── permetrics.utils.rst
│   │   │   ├── permetrics.rst
│   │   │   ├── clustering
│   │   │   │   ├── CHI.rst
│   │   │   │   ├── SSEI.rst
│   │   │   │   ├── DI.rst
│   │   │   │   ├── PuS.rst
│   │   │   │   ├── HI.rst
│   │   │   │   ├── ES.rst
│   │   │   │   ├── DBCVI.rst
│   │   │   │   ├── RSI.rst
│   │   │   │   ├── BI.rst
│   │   │   │   ├── BHI.rst
│   │   │   │   ├── DHI.rst
│   │   │   │   └── TS.rst
│   │   │   ├── classification
│   │   │   │   ├── LS.rst
│   │   │   │   ├── MCC.rst
│   │   │   │   ├── HS.rst
│   │   │   │   ├── F2S.rst
│   │   │   │   ├── RS.rst
│   │   │   │   ├── PS.rst
│   │   │   │   ├── NPV.rst
│   │   │   │   ├── FBS.rst
│   │   │   │   ├── SS.rst
│   │   │   │   └── F1S.rst
│   │   │   ├── regression
│   │   │   │   ├── AE.rst
│   │   │   │   ├── MRE.rst
│   │   │   │   ├── MPE.rst
│   │   │   │   ├── DRV.rst
│   │   │   │   ├── MSE.rst
│   │   │   │   ├── RE.rst
│   │   │   │   ├── SE.rst
│   │   │   │   ├── OI.rst
│   │   │   │   ├── CE.rst
│   │   │   │   ├── MASE.rst
│   │   │   │   ├── ME.rst
│   │   │   │   ├── MedAE.rst
│   │   │   │   ├── NRMSE.rst
│   │   │   │   ├── SLE.rst
│   │   │   │   ├── RSE.rst
│   │   │   │   ├── AR.rst
│   │   │   │   ├── NSE.rst
│   │   │   │   ├── VAF.rst
│   │   │   │   ├── CRM.rst
│   │   │   │   ├── PCD.rst
│   │   │   │   ├── COV.rst
│   │   │   │   ├── CI.rst
│   │   │   │   ├── RAE.rst
│   │   │   │   ├── EC.rst
│   │   │   │   ├── A30.rst
│   │   │   │   ├── A20.rst
│   │   │   │   ├── EVS.rst
│   │   │   │   ├── MAE.rst
│   │   │   │   ├── A10.rst
│   │   │   │   ├── MAPE.rst
│   │   │   │   ├── AR2.rst
│   │   │   │   ├── NNSE.rst
│   │   │   │   ├── COR.rst
│   │   │   │   ├── R2s.rst
│   │   │   │   ├── SMAPE.rst
│   │   │   │   ├── WI.rst
│   │   │   │   ├── MBE.rst
│   │   │   │   ├── R2.rst
│   │   │   │   ├── KLD.rst
│   │   │   │   └── MAAPE.rst
│   │   │   ├── quick_start.rst
│   │   │   └── examples
│   │   │       └── classification.rst
│   │   └── conf.py
│   ├── requirements.txt
│   ├── Makefile
│   └── make.bat
├── requirements.txt
├── .flake8
├── examples
│   ├── __init__.py
│   ├── regression
│   │   ├── __init__.py
│   │   └── 05_get_supported_metrics.py
│   ├── clustering
│   │   ├── __init__.py
│   │   ├── 05_get_supported_metrics.py
│   │   ├── 01_internal_metrics.py
│   │   ├── 06_speed_up_BI.py
│   │   ├── 06_speed_up_DI.py
│   │   ├── 06_speed_up_BHI.py
│   │   ├── 06_speed_up_DRI.py
│   │   ├── 06_speed_up_RSI.py
│   │   ├── 06_speed_up_KDI.py
│   │   ├── 06_speed_up_BRI.py
│   │   ├── 06_speed_up_LDRI.py
│   │   ├── 06_speed_up_ES.py
│   │   ├── 06_speed_up_PuS.py
│   │   ├── 06_speed_up_ReS.py
│   │   ├── 06_speed_up_PrS.py
│   │   ├── 06_speed_up_FMeasureS.py
│   │   ├── 06_speed_up_DBCVI.py
│   │   ├── 06_speed_up_RaS.py
│   │   ├── 06_speed_up_VMS.py
│   │   ├── 06_speed_up_HS.py
│   │   ├── 06_speed_up_CS.py
│   │   ├── 06_speed_up_MIS.py
│   │   ├── 06_speed_up_ARS.py
│   │   ├── 06_speed_up_FMS.py
│   │   ├── 02_external_metrics.py
│   │   ├── 06_speed_up_NMIS.py
│   │   ├── 03_exam_cases_indexes.py
│   │   ├── 06_speed_up_SSEI.py
│   │   ├── 06_speed_up_XBI.py
│   │   ├── 00_all_metrics.py
│   │   ├── 06_speed_up_TS-GAS-GPS.py
│   │   └── 04_exam_cases_scores.py
│   └── classification
│       ├── __init__.py
│       ├── 07_get_supported_metrics.py
│       ├── 10_hinge_loss.py
│       ├── 08_roc_auc_score.py
│       ├── 05_check_ranges_2labels.py
│       ├── 06_check_ranges_4labels.py
│       ├── 04_confusion_matrix.py
│       ├── 09_crossentropy_loss.py
│       ├── 00_all_metrics.py
│       ├── 12_brier_score_loss.py
│       ├── 02_oop_style_metric.py
│       ├── 01_functional_style_metric.py
│       ├── 03_multiple_metrics.py
│       └── 11_kld_loss.py
├── tests
│   ├── __init__.py
│   └── test_comparisons
│       └── test_sklearn_regression.py
├── permetrics
│   ├── utils
│   │   ├── __init__.py
│   │   ├── constant.py
│   │   ├── encoder.py
│   │   └── regressor_util.py
│   └── __init__.py
├── .readthedocs.yaml
├── CITATION.cff
└── .gitignore
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE CODE_OF_CONDUCT.md ChangeLog.md R-R2-Rsquared.docx
--------------------------------------------------------------------------------
/paper/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/paper/paper.pdf
--------------------------------------------------------------------------------
/.github/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/logo.png
--------------------------------------------------------------------------------
/.github/assets/rr2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/rr2.png
--------------------------------------------------------------------------------
/.github/assets/cmatrix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/cmatrix.jpg
--------------------------------------------------------------------------------
/.github/assets/logo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/logo2.png
--------------------------------------------------------------------------------
/.github/assets/logo3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/logo3.png
--------------------------------------------------------------------------------
/.github/assets/clusterCrit.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/clusterCrit.pdf
--------------------------------------------------------------------------------
/.github/assets/R-R2-Rsquared.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/R-R2-Rsquared.docx
--------------------------------------------------------------------------------
/.github/assets/metric_names.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/.github/assets/metric_names.xlsx
--------------------------------------------------------------------------------
/docs/source/_static/images/DI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/images/DI.png
--------------------------------------------------------------------------------
/docs/source/_static/images/HI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/images/HI.png
--------------------------------------------------------------------------------
/docs/source/_static/images/CHI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/images/CHI.png
--------------------------------------------------------------------------------
/docs/source/_static/images/EVS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/images/EVS.png
--------------------------------------------------------------------------------
/docs/source/_static/notes/KGE.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/notes/KGE.docx
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.17.1
2 | scipy>=1.7.1
3 | pytest==7.1.2
4 | pytest-cov==4.0.0
5 | flake8>=4.0.1
6 | scikit-learn>=1.0.1
7 |
--------------------------------------------------------------------------------
/docs/source/_static/images/class_score_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/images/class_score_1.png
--------------------------------------------------------------------------------
/docs/source/_static/notes/general_table.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/notes/general_table.xlsx
--------------------------------------------------------------------------------
/docs/source/_static/notes/table_of_metrics.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thieu1995/permetrics/HEAD/docs/source/_static/notes/table_of_metrics.xlsx
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = __pycache__, built, build, mytest, htmlcov, drafts
3 | ignore = E203, E266, W291, W503
4 | max-line-length = 180
5 | max-complexity = 18
6 | select = B,C,E,F,W,T4,B9
7 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Defining the exact version will make sure things don't break
2 | sphinx==4.4.0
3 | sphinx_rtd_theme==1.0.0
4 | readthedocs-sphinx-search==0.3.2
5 | sphinxcontrib-bibtex==2.5.0
6 | numpy>=1.17.1
7 | scipy>=1.7.1
8 |
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 16:54, 21/04/2020 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
--------------------------------------------------------------------------------
/examples/regression/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 09:52, 23/09/2020 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 09:58, 27/07/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
--------------------------------------------------------------------------------
/permetrics/utils/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 12:11, 19/05/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
--------------------------------------------------------------------------------
/examples/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 16:03, 25/07/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
--------------------------------------------------------------------------------
/examples/classification/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 08:11, 18/05/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | # Required
5 | version: 2
6 |
7 | # Build all formats
8 | formats: all
9 |
10 | build:
11 | os: "ubuntu-20.04"
12 | tools:
13 | python: "3.8"
14 |
15 | # Build documentation in the docs/ directory with Sphinx
16 | sphinx:
17 | configuration: docs/source/conf.py
18 |
19 | python:
20 | install:
21 | - requirements: docs/requirements.txt
22 |
23 | submodules:
24 | include: all
--------------------------------------------------------------------------------
/permetrics/utils/constant.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 23:22, 11/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 |
9 |
10 | SUPPORTED_LIST = (list, tuple, np.ndarray)
11 | EPSILON = 1e-10
12 |
--------------------------------------------------------------------------------
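
The constants above are shared across the metric implementations. Below is a minimal sketch, for illustration only, of how constants like these are typically consumed; the helper ``relative_error_safe`` is hypothetical and not part of the library::

    import numpy as np
    from permetrics.utils.constant import SUPPORTED_LIST, EPSILON

    def relative_error_safe(y_true, y_pred):
        """Hypothetical helper: validate input types and guard the division with EPSILON."""
        if not isinstance(y_true, SUPPORTED_LIST) or not isinstance(y_pred, SUPPORTED_LIST):
            raise TypeError("Inputs must be list, tuple, or np.ndarray")
        y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
        return np.abs(y_true - y_pred) / (np.abs(y_true) + EPSILON)

    print(relative_error_safe([3, -0.5, 2, 7], [2.5, 0.0, 2, 8]))
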
/examples/clustering/05_get_supported_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 09:23, 04/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | from permetrics import ClusteringMetric
8 |
9 | bhi = ClusteringMetric.get_support(name="BHI", verbose=True)
10 | print(bhi)
11 |
12 | all_metrics = ClusteringMetric.get_support(name="all", verbose=True)
13 | print(all_metrics)
14 |
--------------------------------------------------------------------------------
/examples/regression/05_get_supported_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 09:40, 04/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | from permetrics import RegressionMetric
8 |
9 | rmse = RegressionMetric.get_support(name="RMSE", verbose=True)
10 | print(rmse)
11 |
12 | all_metrics = RegressionMetric.get_support(name="all", verbose=True)
13 | print(all_metrics)
14 |
--------------------------------------------------------------------------------
/examples/classification/07_get_supported_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 09:34, 04/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | from permetrics import ClassificationMetric
8 |
9 | ascore = ClassificationMetric.get_support(name="AS", verbose=True)
10 | print(ascore)
11 |
12 | all_metrics = ClassificationMetric.get_support(name="all", verbose=True)
13 | print(all_metrics)
14 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.https://www.sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: "1.2.0"
2 | authors:
3 | - family-names: Thieu
4 | given-names: Nguyen Van
5 | orcid: "https://orcid.org/0000-0001-9994-8747"
6 | doi: 10.5281/zenodo.3951205
7 | message: If you use this software, please cite our article in the
8 | Journal of Open Source Software.
9 | preferred-citation:
10 | authors:
11 | - family-names: Thieu
12 | given-names: Nguyen Van
13 | orcid: "https://orcid.org/0000-0001-9994-8747"
14 | date-published: 2024-03-09
15 | doi: 10.21105/joss.06143
16 | issn: 2475-9066
17 | issue: 95
18 | journal: Journal of Open Source Software
19 | publisher:
20 | name: Open Journals
21 | start: 6143
22 | title: "PerMetrics: A Framework of Performance Metrics for Machine
23 | Learning Models"
24 | type: article
25 | url: "https://joss.theoj.org/papers/10.21105/joss.06143"
26 | volume: 9
27 | title: "PerMetrics: A Framework of Performance Metrics for Machine
28 | Learning Models"
29 |
--------------------------------------------------------------------------------
/.github/workflows/draft-pdf.yml:
--------------------------------------------------------------------------------
1 | name: Draft PDF for JOSS
2 |
3 | on:
4 | push:
5 | branches:
6 | - "joss-paper"
7 |
8 | jobs:
9 | paper:
10 | runs-on: ubuntu-latest
11 | name: Paper Draft
12 | steps:
13 | - name: Checkout
14 | uses: actions/checkout@v3
15 | - name: Build draft PDF
16 | uses: openjournals/openjournals-draft-action@master
17 | with:
18 | journal: joss
19 | # This should be the path to the paper within your repo.
20 | paper-path: paper/paper.md
21 | - name: Upload
22 | uses: actions/upload-artifact@v1
23 | with:
24 | name: paper
25 | # This is the output path where Pandoc will write the compiled
26 | # PDF. Note, this should be the same directory as the input
27 | # paper.md
28 | path: paper/paper.pdf
29 | - name: save pdf to repo
30 | uses: stefanzweifel/git-auto-commit-action@v4
31 | with:
32 | commit_message: Saved new PDF of paper
33 |
--------------------------------------------------------------------------------
/permetrics/utils/encoder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 06:18, 26/07/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 |
9 |
10 | class LabelEncoder:
11 | def __init__(self):
12 | self.classes_ = None
13 | self.encoded_classes_ = None
14 |
15 | def fit(self, y):
16 | self.classes_, indices = np.unique(y, return_inverse=True)
17 | self.encoded_classes_ = np.arange(len(self.classes_))
18 | return self
19 |
20 | def transform(self, y):
21 | return np.searchsorted(self.classes_, y)
22 |
23 | def inverse_transform(self, y):
24 | return self.classes_[y]
25 |
26 | def fit_transform(self, y):
27 | self.fit(y)
28 | return self.transform(y)
29 |
--------------------------------------------------------------------------------
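
A short usage sketch for the ``LabelEncoder`` defined above; the labels are arbitrary example values::

    import numpy as np
    from permetrics.utils.encoder import LabelEncoder

    y = np.array(["cat", "ant", "cat", "bird", "ant"])

    le = LabelEncoder()
    encoded = le.fit_transform(y)            # classes_ is sorted: ['ant', 'bird', 'cat']
    print(encoded)                           # [2 0 2 1 0]
    print(le.inverse_transform(encoded))     # ['cat' 'ant' 'cat' 'bird' 'ant']
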
/docs/source/pages/permetrics.utils.rst:
--------------------------------------------------------------------------------
1 |
2 | permetrics.utils.classifier\_util module
3 | ----------------------------------------
4 |
5 | .. automodule:: permetrics.utils.classifier_util
6 | :members:
7 | :undoc-members:
8 | :show-inheritance:
9 |
10 | permetrics.utils.cluster\_util module
11 | -------------------------------------
12 |
13 | .. automodule:: permetrics.utils.cluster_util
14 | :members:
15 | :undoc-members:
16 | :show-inheritance:
17 |
18 | permetrics.utils.data\_util module
19 | ----------------------------------
20 |
21 | .. automodule:: permetrics.utils.data_util
22 | :members:
23 | :undoc-members:
24 | :show-inheritance:
25 |
26 | permetrics.utils.encoder module
27 | -------------------------------
28 |
29 | .. automodule:: permetrics.utils.encoder
30 | :members:
31 | :undoc-members:
32 | :show-inheritance:
33 |
34 | permetrics.utils.regressor\_util module
35 | ---------------------------------------
36 |
37 | .. automodule:: permetrics.utils.regressor_util
38 | :members:
39 | :undoc-members:
40 | :show-inheritance:
41 |
--------------------------------------------------------------------------------
/docs/source/pages/permetrics.rst:
--------------------------------------------------------------------------------
1 | PERMETRICS Library
2 | ==================
3 |
4 | permetrics.utils package
5 | ------------------------
6 |
7 | .. toctree::
8 | :maxdepth: 4
9 |
10 | permetrics.utils
11 |
12 |
13 | permetrics.evaluator module
14 | ---------------------------
15 |
16 | .. automodule:: permetrics.evaluator
17 | :members:
18 | :undoc-members:
19 | :show-inheritance:
20 |
21 | permetrics.regression module
22 | ----------------------------
23 |
24 | .. automodule:: permetrics.regression
25 | :members:
26 | :undoc-members:
27 | :show-inheritance:
28 |
29 | permetrics.classification module
30 | --------------------------------
31 |
32 | .. automodule:: permetrics.classification
33 | :members:
34 | :undoc-members:
35 | :show-inheritance:
36 |
37 | permetrics.clustering module
38 | ----------------------------
39 |
40 | .. automodule:: permetrics.clustering
41 | :members:
42 | :undoc-members:
43 | :show-inheritance:
44 |
45 |
46 | .. toctree::
47 | :maxdepth: 3
48 |
49 | .. toctree::
50 | :maxdepth: 3
51 |
52 | .. toctree::
53 | :maxdepth: 3
54 |
55 | .. toctree::
56 | :maxdepth: 3
57 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/CHI.rst:
--------------------------------------------------------------------------------
1 | Calinski-Harabasz Index
2 | =======================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Calinski-Harabasz Index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Calinski-Harabasz Index is a clustering evaluation metric used to measure the quality of clusters obtained from clustering algorithms. It aims to quantify the separation between clusters and the compactness within clusters.
18 |
19 | .. image:: /_static/images/CHI.png
20 |
21 | In practice, you can use the Calinski-Harabasz Index along with other clustering evaluation metrics to assess the performance of clustering algorithms and select the best number of clusters for your dataset.
22 |
23 |
24 | Example:
25 |
26 | .. code-block:: python
27 |
28 | import numpy as np
29 | from permetrics import ClusteringMetric
30 |
31 | ## For integer labels or categorical labels
32 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
33 | y_pred = np.array([0, 0, 1, 1, 1])
34 |
35 | cm = ClusteringMetric(X=data, y_pred=y_pred)
36 |
37 | print(cm.calinski_harabasz_index())
38 | print(cm.CHI())
39 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/LS.rst:
--------------------------------------------------------------------------------
1 | Lift Score (LS)
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Lift Score (LS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | In the multi-class and multi-label case, this is the average of the LS score of each class with weighting depending on the average parameter.
19 |
20 | + Higher is better (No best value), Range = [0, +inf)
21 | + http://rasbt.github.io/mlxtend/user_guide/evaluate/lift_score/
22 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
23 |
24 |
25 | Example:
26 |
27 | .. code-block:: python
28 | :emphasize-lines: 11,13-16
29 |
30 | from numpy import array
31 | from permetrics.classification import ClassificationMetric
32 |
33 | ## For integer labels or categorical labels
34 | y_true = [0, 1, 0, 0, 1, 0]
35 | y_pred = [0, 1, 0, 0, 0, 1]
36 |
37 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
38 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
39 |
40 | cm = ClassificationMetric(y_true, y_pred)
41 |
42 | print(cm.lift_score(average=None))
43 | print(cm.LS(average="micro"))
44 | print(cm.LS(average="macro"))
45 | print(cm.LS(average="weighted"))
46 |
47 |
--------------------------------------------------------------------------------
/examples/classification/10_hinge_loss.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 17:46, 12/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics import ClassificationMetric
9 |
10 |
11 | # Example usage
12 | y_true_binary = np.array([0, 1, 0, 1, 1])
13 | y_pred_binary = np.array([0.2, 0.7, 0.3, 0.8, 0.4])
14 |
15 | cu = ClassificationMetric()
16 | print(cu.hinge_loss(y_true_binary, y_pred_binary))
17 |
18 | # Example usage
19 | y_true_multiclass = np.array([[1, 0, 0],
20 | [0, 1, 0],
21 | [0, 0, 1],
22 | [0, 1, 0],
23 | [0, 0, 1]])
24 | y_pred_multiclass = np.array([[0.2, 0.6, 0.2],
25 | [0.7, 0.1, 0.2],
26 | [0.3, 0.4, 0.3],
27 | [0.8, 0.1, 0.1],
28 | [0.4, 0.2, 0.4]])
29 |
30 | print(cu.hinge_loss(y_true_multiclass, y_pred_multiclass))
31 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/AE.rst:
--------------------------------------------------------------------------------
1 | AE - Absolute Error
2 | ===================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: AE - Absolute Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{AE}(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^{n} | \hat{y}_i - y_i |
21 |
22 | Latex equation code::
23 |
24 | \text{AE}(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^{n} | \hat{y}_i - y_i |
25 |
26 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
27 | + Computes the absolute error between two numbers, or element-wise between a pair of lists, tuples, or numpy arrays.
28 |
29 |
30 | Example to use AE metric:
31 |
32 | .. code-block:: python
33 | :emphasize-lines: 8-9,15-16
34 |
35 | from numpy import array
36 | from permetrics.regression import RegressionMetric
37 |
38 | ## For 1-D array
39 | y_true = array([3, -0.5, 2, 7])
40 | y_pred = array([2.5, 0.0, 2, 8])
41 |
42 | evaluator = RegressionMetric(y_true, y_pred)
43 | print(evaluator.single_absolute_error())
44 |
45 | ## For > 1-D array
46 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
47 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
48 |
49 | evaluator = RegressionMetric(y_true, y_pred)
50 | print(evaluator.AE())
51 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MRE.rst:
--------------------------------------------------------------------------------
1 | MRE - Mean Relative Error
2 | =========================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MRE - Mean Relative Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{MRE}(y, \hat{y}) = \frac{1}{N} \sum_{i=0}^{N - 1} \frac{|y_i - \hat{y}_i|}{|y_i|}
21 |
22 | Latex equation code::
23 |
24 | \text{MRE}(y, \hat{y}) = \frac{1}{N} \sum_{i=0}^{N - 1} \frac{|y_i - \hat{y}_i|}{|y_i|}
25 |
26 |
27 | + Mean Relative Error (MRE) or Mean Relative Bias (MRB)
28 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
29 |
30 |
31 | Example to use MRE metric:
32 |
33 | .. code-block:: python
34 | :emphasize-lines: 8-9,15-16
35 |
36 | from numpy import array
37 | from permetrics.regression import RegressionMetric
38 |
39 | ## For 1-D array
40 | y_true = array([3, -0.5, 2, 7])
41 | y_pred = array([2.5, 0.0, 2, 8])
42 |
43 | evaluator = RegressionMetric(y_true, y_pred)
44 | print(evaluator.mean_relative_error())
45 |
46 | ## For > 1-D array
47 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
48 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
49 |
50 | evaluator = RegressionMetric(y_true, y_pred)
51 | print(evaluator.MRE(multi_output="raw_values"))
52 |
--------------------------------------------------------------------------------
/examples/clustering/01_internal_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 06:25, 27/07/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## It is better to use the OOP style to call all of the available metrics
8 | ## Internal metrics: require X and y_pred, and their method names carry the suffix "index"
9 |
10 | import numpy as np
11 | from permetrics import ClusteringMetric
12 | from sklearn.datasets import make_blobs
13 |
14 | # generate sample data
15 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
16 | y_pred = np.random.randint(0, 4, size=300)
17 |
18 | evaluator = ClusteringMetric(y_pred=y_pred, X=X)
19 |
20 | print(evaluator.get_metrics_by_list_names(["BHI", "CHI", "XBI", "BRI", "DBI", "DRI", "DI", "KDI", "LDRI", "LSRI", "SI"]))
21 |
22 | # BHI = ball_hall_index
23 | # CHI = calinski_harabasz_index
24 | # XBI = xie_beni_index
25 | # BRI = banfeld_raftery_index
26 | # DBI = davies_bouldin_index
27 | # DRI = det_ratio_index
28 | # DI = dunn_index
29 | # KDI = ksq_detw_index
30 | # LDRI = log_det_ratio_index
31 | # LSRI = log_ss_ratio_index
32 | # SI = silhouette_index
33 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MPE.rst:
--------------------------------------------------------------------------------
1 | MPE - Mean Percentage Error
2 | ===========================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MPE - Mean Percentage Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{MPE}(y, \hat{y}) = \frac{100\%}{N} \sum_{i=0}^{N - 1} \frac{y_i - \hat{y}_i}{y_i}.
21 |
22 | Latex equation code::
23 |
24 | \text{MPE}(y, \hat{y}) = \frac{100\%}{N} \sum_{i=0}^{N - 1} \frac{y_i - \hat{y}_i}{y_i}.
25 |
26 | + Mean Percentage Error (MPE): Best possible score is 0.0. Range = (-inf, +inf)
27 | + `Link to equation `_
28 |
29 |
30 | Example to use MPE metric:
31 |
32 | .. code-block:: python
33 | :emphasize-lines: 8-9,15-16
34 |
35 | from numpy import array
36 | from permetrics.regression import RegressionMetric
37 |
38 | ## For 1-D array
39 | y_true = array([3, -0.5, 2, 7])
40 | y_pred = array([2.5, 0.0, 2, 8])
41 |
42 | evaluator = RegressionMetric(y_true, y_pred)
43 | print(evaluator.mean_percentage_error())
44 |
45 | ## For > 1-D array
46 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
47 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
48 |
49 | evaluator = RegressionMetric(y_true, y_pred)
50 | print(evaluator.MPE(multi_output="raw_values"))
51 |
--------------------------------------------------------------------------------
/examples/classification/08_roc_auc_score.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 15:30, 12/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from sklearn.metrics import roc_auc_score
9 | from permetrics import ClassificationMetric
10 |
11 | # Example usage
12 | y_true_binary = np.array([0, 1, 0, 1, 1])
13 | y_pred_binary = np.array([0.9, 0.71, 0.6, 0.8, 0.7])
14 |
15 | cu = ClassificationMetric()
16 | print(cu.roc_auc_score(y_true_binary, y_pred_binary, average="weighted"))
17 | print(roc_auc_score(y_true_binary, y_pred_binary))
18 |
19 |
20 | # Example usage
21 | y_true_multiclass = np.array([0, 1, 2, 1, 2])
22 | y_pred_multiclass = np.array([[0.6, 0.2, 0.2],
23 | [0.1, 0.7, 0.2],
24 | [0.3, 0.4, 0.3],
25 | [0.8, 0.1, 0.1],
26 | [0.2, 0.6, 0.2]])
27 |
28 | cu = ClassificationMetric()
29 | print(cu.roc_auc_score(y_true_multiclass, y_pred_multiclass, average="macro"))
30 | print(roc_auc_score(y_true_multiclass, y_pred_multiclass, multi_class="ovr"))
31 |
32 |
33 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/DRV.rst:
--------------------------------------------------------------------------------
1 | DRV - Deviation of Runoff Volume
2 | ================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: DRV - Deviation of Runoff Volume
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{DRV}(y, \hat{y}) = \frac{ \sum_{i=0}^{N - 1} y_i }{ \sum_{i=0}^{N - 1} \hat{y_i} }
21 |
22 | Latex equation code::
23 |
24 | \text{DRV}(y, \hat{y}) = \frac{ \sum_{i=0}^{N - 1} y_i }{ \sum_{i=0}^{N - 1} \hat{y_i} }
25 |
26 | + Best possible score is 1.0, smaller value is better. Range = [1, +inf)
27 | + `Link to equation `_
28 |
29 |
30 | Example to use DRV metric:
31 |
32 | .. code-block:: python
33 | :emphasize-lines: 8-9,15-16
34 |
35 | from numpy import array
36 | from permetrics.regression import RegressionMetric
37 |
38 | ## For 1-D array
39 | y_true = array([3, -0.5, 2, 7])
40 | y_pred = array([2.5, 0.0, 2, 8])
41 |
42 | evaluator = RegressionMetric(y_true, y_pred)
43 | print(evaluator.deviation_of_runoff_volume())
44 |
45 | ## For > 1-D array
46 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
47 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
48 |
49 | evaluator = RegressionMetric(y_true, y_pred)
50 | print(evaluator.DRV(multi_output="raw_values"))
51 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MSE.rst:
--------------------------------------------------------------------------------
1 | MSE - Mean Squared Error
2 | ========================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MSE - Mean Squared Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{MSE}(y, \hat{y}) = \frac{\sum_{i=0}^{N - 1} (y_i - \hat{y}_i)^2}{N}
21 |
22 | Latex equation code::
23 |
24 | \text{MSE}(y, \hat{y}) = \frac{\sum_{i=0}^{N - 1} (y_i - \hat{y}_i)^2}{N}
25 |
26 |
27 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
28 | + MSE: a risk metric corresponding to the expected value of the squared (quadratic) error or loss.
29 | + `Link to equation `_
30 |
31 |
32 | Example to use MSE metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.mean_squared_error())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.MSE(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/MCC.rst:
--------------------------------------------------------------------------------
1 | Matthews Correlation Coefficient (MCC)
2 | ======================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Matthews Correlation Coefficient (MCC)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | In the multi-class and multi-label case, this is the average of the MCC score of each class with weighting depending on the average parameter.
19 |
20 | + Best possible score is 1.0, higher value is better. Range = [-1, +1]
21 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
22 | + https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html#sklearn.metrics.matthews_corrcoef
23 |
24 |
25 | Example:
26 |
27 | .. code-block:: python
28 | :emphasize-lines: 11,13-16
29 |
30 | from numpy import array
31 | from permetrics.classification import ClassificationMetric
32 |
33 | ## For integer labels or categorical labels
34 | y_true = [0, 1, 0, 0, 1, 0]
35 | y_pred = [0, 1, 0, 0, 0, 1]
36 |
37 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
38 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
39 |
40 | cm = ClassificationMetric(y_true, y_pred)
41 |
42 | print(cm.mcc(average=None))
43 | print(cm.MCC(average="micro"))
44 | print(cm.MCC(average="macro"))
45 | print(cm.MCC(average="weighted"))
46 |
47 |
--------------------------------------------------------------------------------
/docs/source/pages/quick_start.rst:
--------------------------------------------------------------------------------
1 |
2 | ============
3 | Installation
4 | ============
5 |
6 | * Install the `current PyPI release `_::
7 |
8 | $ pip install permetrics==2.0.0
9 |
10 |
11 | * Install directly from source code::
12 |
13 | $ git clone https://github.com/thieu1995/permetrics.git
14 | $ cd permetrics
15 | $ python setup.py install
16 |
17 | * In case you want to install the development version from GitHub::
18 |
19 | $ pip install git+https://github.com/thieu1995/permetrics
20 |
21 |
22 | After installation, you can import Permetrics as any other Python module::
23 |
24 | $ python
25 | >>> import permetrics
26 | >>> permetrics.__version__
27 |
28 |
29 | Let's go through some examples.
30 |
31 |
32 | ========
33 | Examples
34 | ========
35 |
36 | There are several ways you can use a performance metric in this library, but the two most common are the functional style and the object-oriented
37 | style. We will go through the details of how to use the 3 main types of metrics (regression, classification, and clustering) with these two approaches.
38 |
39 |
40 | .. include:: examples/regression.rst
41 | .. include:: examples/classification.rst
42 | .. include:: examples/clustering.rst
43 |
44 |
45 | .. toctree::
46 | :maxdepth: 3
47 |
48 | .. toctree::
49 | :maxdepth: 3
50 |
51 | .. toctree::
52 | :maxdepth: 3
53 |
54 | .. toctree::
55 | :maxdepth: 3
56 |
--------------------------------------------------------------------------------
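
As a companion to the quick-start page above, here is a minimal sketch contrasting the two calling styles it mentions, based on the usage patterns shown in the example scripts of this repository: the OOP style binds the data at construction, while the functional style creates an empty evaluator and passes the data to each call::

    import numpy as np
    from permetrics import RegressionMetric

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])

    # Object-oriented style: bind the data once, then call any metric method.
    evaluator = RegressionMetric(y_true, y_pred)
    print(evaluator.RMSE())
    print(evaluator.MAE())

    # Functional style: create an empty evaluator and pass the data to each call.
    evaluator = RegressionMetric()
    print(evaluator.RMSE(y_true, y_pred))
    print(evaluator.MAE(y_true, y_pred))
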
/docs/source/pages/classification/HS.rst:
--------------------------------------------------------------------------------
1 | Hamming Score (HS)
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Hamming Score (HS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | The Hamming score is 1 - the fraction of labels that are incorrectly predicted.
19 |
20 | In the multi-class and multi-label case, this is the average of the HS score of each class with weighting depending on the average parameter.
21 |
22 | + Higher is better (Best = 1), Range = [0, 1]
23 | + Slightly different from the hamming_loss function in the scikit-learn library.
24 | + https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html#sklearn.metrics.hamming_loss
25 |
26 |
27 | Example:
28 |
29 | .. code-block:: python
30 | :emphasize-lines: 11,13-16
31 |
32 | from numpy import array
33 | from permetrics.classification import ClassificationMetric
34 |
35 | ## For integer labels or categorical labels
36 | y_true = [0, 1, 0, 0, 1, 0]
37 | y_pred = [0, 1, 0, 0, 0, 1]
38 |
39 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
40 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
41 |
42 | cm = ClassificationMetric(y_true, y_pred)
43 |
44 | print(cm.hamming_score(average=None))
45 | print(cm.HS(average="micro"))
46 | print(cm.HS(average="macro"))
47 | print(cm.HS(average="weighted"))
48 |
49 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/RE.rst:
--------------------------------------------------------------------------------
1 | RE - Relative Error
2 | ===================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: RE - Relative Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{RE}(y, \hat{y}) = \frac{|y_i - \hat{y}_i|}{|y_i|}
21 |
22 | Latex equation code::
23 |
24 | \text{RE}(y, \hat{y}) = \frac{|y_i - \hat{y}_i|}{|y_i|}
25 |
26 | + Relative Error (RE): Best possible score is 0.0, smaller value is better. Range = [0, +inf)
27 | + Note: Computes the relative error between two numbers, or element-wise between a pair of lists, tuples, or numpy arrays.
28 | + The Relative Error (RE) is a metric used to evaluate the accuracy of a regression model by measuring the ratio of the absolute error to the actual value.
29 |
30 |
31 | Example to use RE metric:
32 |
33 | .. code-block:: python
34 | :emphasize-lines: 8-9,15-16
35 |
36 | from numpy import array
37 | from permetrics.regression import RegressionMetric
38 |
39 | ## For 1-D array
40 | y_true = array([3, -0.5, 2, 7])
41 | y_pred = array([2.5, 0.0, 2, 8])
42 |
43 | evaluator = RegressionMetric(y_true, y_pred)
44 | print(evaluator.single_relative_error())
45 |
46 | ## For > 1-D array
47 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
48 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
49 |
50 | evaluator = RegressionMetric(y_true, y_pred)
51 | print(evaluator.RE())
52 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/SE.rst:
--------------------------------------------------------------------------------
1 | SE - Squared Error
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: SE - Squared Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{SE}(y, f_i) = \frac{1}{n}\sum_{i=1}^{n}(y_i - f_i)^2
21 |
22 | Latex equation code::
23 |
24 | \text{SE}(y, f_i) = \frac{1}{n}\sum_{i=1}^{n}(y_i - f_i)^2
25 |
26 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
27 | + Note: Computes the squared error between two numbers, or element-wise between a pair of lists, tuples, or numpy arrays.
28 | + The Squared Error (SE) is a metric used to evaluate the accuracy of a regression model by measuring the average of the squared differences between the
29 | predicted and actual values.
30 |
31 |
32 | Example to use SE metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.single_squared_error())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.SE())
53 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_BI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | res = cm.beale_index()
32 | print("res: ", res, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_beale_index(data, labels)
36 | print("res: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_DI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 100000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 |
30 | time02 = time.perf_counter()
31 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
32 | sse02 = cm.dunn_index()
33 | print("DI: ", sse02, time.perf_counter() - time02 )
34 |
35 | time03 = time.perf_counter()
36 | s3 = cut.calculate_dunn_index(data, labels)
37 | print("DI: ", s3, time.perf_counter() - time03)
38 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_BHI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | res = cm.ball_hall_index()
32 | print("BHI: ", res, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_ball_hall_index(data, labels)
36 | print("BHI: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_DRI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | res = cm.det_ratio_index()
32 | print("res: ", res, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_det_ratio_index(data, labels)
36 | print("res: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_RSI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | res = cm.r_squared_index()
32 | print("res: ", res, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_r_squared_index(data, labels)
36 | print("res: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/examples/classification/05_check_ranges_2labels.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 14:13, 03/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics.classification import ClassificationMetric
9 |
10 | y_true = np.array([0, 1, 0, 0, 1, 0, 0, 1, 0])
11 |
12 | t1 = [
13 | y_true.copy(),
14 | 1 - y_true,
15 | np.zeros(len(y_true)),
16 | np.ones(len(y_true)),
17 | np.random.randint(0, 2, len(y_true))
18 | ]
19 | for idx in range(len(t1)):
20 | evaluator = ClassificationMetric(y_true, t1[idx])
21 | print(evaluator.gini_index())
22 |
23 | # CM = confusion_matrix
24 | # PS = precision_score
25 | # NPV = negative_predictive_value
26 | # RS = recall_score
27 | # AS = accuracy_score
28 | # F1S = f1_score
29 | # F2S = f2_score
30 | # FBS = fbeta_score
31 | # SS = specificity_score
32 | # MCC = matthews_correlation_coefficient
33 | # HS = hamming_score
34 | # LS = lift_score
35 | # CKS = cohen_kappa_score
36 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
37 | # GMS = g_mean_score
38 | # GINI = gini_index
39 | # ROC = AUC = RAS = roc_auc_score
40 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/OI.rst:
--------------------------------------------------------------------------------
1 | OI - Overall Index
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: OI - Overall Index
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 | \text{OI}(y, \hat{y}) = \frac{1}{2} \biggr[ 1 - \frac{RMSE}{y_{max} - y_{min}} + EC \biggr]
22 |
23 | Latex equation code::
24 |
25 | \text{OI}(y, \hat{y}) = \frac{1}{2} \biggr[ 1 - \frac{RMSE}{y_{max} - y_{min}} + EC \biggr]
26 |
27 |
28 | The Overall Index (OI) :cite:`almodfer2022modeling` is a composite measure used to evaluate the accuracy of a forecasting model. It combines the normalized Root Mean
29 | Squared Error (RMSE) with the Efficiency Coefficient (EC).
30 | + Best possible value = 1, bigger value is better. Range = [-1, +1)
31 |
32 | Example to use OI metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.overall_index())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.OI(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_KDI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | sse02 = cm.ksq_detw_index()
32 | print("res 2: ", sse02, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_ksq_detw_index(data, labels)
36 | print("res 3: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_BRI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | sse02 = cm.banfeld_raftery_index()
32 | print("BRI 2: ", sse02, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_banfeld_raftery_index(data, labels)
36 | print("BRI 3: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_LDRI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | num_samples = 10000000
23 | num_features = 2
24 | num_clusters = 5
25 | cluster_std = 0.5
26 |
27 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
28 |
29 | time02 = time.perf_counter()
30 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
31 | sse02 = cm.log_det_ratio_index()
32 | print("result: ", sse02, time.perf_counter() - time02 )
33 |
34 | time03 = time.perf_counter()
35 | s3 = cut.calculate_log_det_ratio_index(data, labels)
36 | print("result: ", s3, time.perf_counter() - time03)
37 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/CE.rst:
--------------------------------------------------------------------------------
1 | CE - Cross Entropy
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: CE - Cross Entropy
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{CE}(y, \hat{y}) = -\frac{1}{n}\sum_{i=1}^{n} \left[y_i\log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i)\right]
21 |
22 | Latex equation code::
23 |
24 | \text{CE}(y, \hat{y}) = -\frac{1}{n}\sum_{i=1}^{n} \left[y_i\log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i)\right]
25 |
26 | + Range = (-inf, 0]. It is difficult to state a single best value for this metric.
27 | + The greater the entropy, the greater the uncertainty of the probability distribution; the smaller the value, the less the uncertainty.
28 | + `Link to equation `_
29 |
30 |
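A minimal NumPy sketch of the formula above, with predictions clipped away from 0 and 1 to avoid log(0) (a common safeguard; the library may handle edge cases differently):

.. code-block:: python

    import numpy as np

    def cross_entropy_sketch(y_true, y_pred, eps=1e-15):
        # Clip predictions so the logarithms stay finite
        y_pred = np.clip(y_pred, eps, 1 - eps)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    print(cross_entropy_sketch(np.array([1, 0, 1, 1]), np.array([0.9, 0.2, 0.8, 0.6])))
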
31 | Example to use CE metric:
32 |
33 | .. code-block:: python
34 | :emphasize-lines: 8-9,15-16
35 |
36 | from numpy import array
37 | from permetrics.regression import RegressionMetric
38 |
39 | ## For 1-D array
40 | y_true = array([3, -0.5, 2, 7])
41 | y_pred = array([2.5, 0.0, 2, 8])
42 |
43 | evaluator = RegressionMetric(y_true, y_pred)
44 | print(evaluator.cross_entropy())
45 |
46 | ## For > 1-D array
47 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
48 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
49 |
50 | evaluator = RegressionMetric(y_true, y_pred)
51 | print(evaluator.CE(multi_output="raw_values"))
52 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MASE.rst:
--------------------------------------------------------------------------------
1 | MASE - Mean Absolute Scaled Error
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MASE - Mean Absolute Scaled Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 |     \text{MASE}(y, \hat{y}) = \frac{ \frac{1}{N} \sum_{i=0}^{N-1} |y_i - \hat{y_i}| }{ \frac{1}{N-1} \sum_{i=1}^{N-1} |y_i - y_{i-1}| }
21 |
22 | Latex equation code::
23 |
24 |     \text{MASE}(y, \hat{y}) = \frac{ \frac{1}{N} \sum_{i=0}^{N-1} |y_i - \hat{y_i}| }{ \frac{1}{N-1} \sum_{i=1}^{N-1} |y_i - y_{i-1}| }
25 |
26 |
27 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
28 | + m = 1 for non-seasonal data, m > 1 for seasonal data
29 | + `Link to equation `_
30 |
31 |
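A minimal NumPy sketch of the non-seasonal (m = 1) form of the formula above, scaling the mean absolute error by the in-sample naive one-step forecast error (a sketch only, not the library implementation):

.. code-block:: python

    import numpy as np

    def mase_sketch(y_true, y_pred):
        mae = np.mean(np.abs(y_true - y_pred))
        # Mean absolute error of the naive forecast y_hat_i = y_{i-1}
        naive_mae = np.mean(np.abs(np.diff(y_true)))
        return mae / naive_mae

    print(mase_sketch(np.array([3.0, -0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
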
32 | Example to use MASE metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.mean_absolute_scaled_error())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.MASE(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/ME.rst:
--------------------------------------------------------------------------------
1 | ME - Max Error
2 | ==============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: ME - Max Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{ME}(y, \hat{y}) = max(| y_i - \hat{y}_i |)
21 |
22 | Latex equation code::
23 |
24 | \text{ME}(y, \hat{y}) = max(| y_i - \hat{y}_i |)
25 |
26 | The `max_error` function computes the maximum residual error, a metric that captures the worst-case error between the predicted value and the true value. In a
27 | perfectly fitted single-output regression model, `max_error` would be 0 on the training set; although this is highly unlikely in the real world, this
28 | metric shows the extent of error that the model had when it was fitted.
29 |
30 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
31 |
32 |
33 | Example to use ME metric:
34 |
35 | .. code-block:: python
36 | :emphasize-lines: 8-9,15-16
37 |
38 | from numpy import array
39 | from permetrics.regression import RegressionMetric
40 |
41 | ## For 1-D array
42 | y_true = array([3, -0.5, 2, 7])
43 | y_pred = array([2.5, 0.0, 2, 8])
44 |
45 | evaluator = RegressionMetric(y_true, y_pred)
46 | print(evaluator.max_error())
47 |
48 | ## For > 1-D array
49 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
50 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.ME(multi_output="raw_values"))
54 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_ES.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
17 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
18 | cluster_std=1.0, random_state=random_state, return_centers=True)
19 | y_pred = np.random.randint(0, n_clusters, n_samples)
20 | return X, y_true, y_pred, centers
21 |
22 |
23 | num_samples = 10000000
24 | num_features = 2
25 | num_clusters = 7
26 | cluster_std = 0.5
27 |
28 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
29 |
30 |
31 | time02 = time.perf_counter()
32 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
33 | res = cm.entropy_score()
34 | print("res: ", res, time.perf_counter() - time02 )
35 |
36 | time03 = time.perf_counter()
37 | s3 = cut.calculate_entropy_score(y_true, y_pred)
38 | print("res: ", s3, time.perf_counter() - time03)
39 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_PuS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
17 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
18 | cluster_std=1.0, random_state=random_state, return_centers=True)
19 | y_pred = np.random.randint(0, n_clusters, n_samples)
20 | return X, y_true, y_pred, centers
21 |
22 |
23 | num_samples = 10000000
24 | num_features = 2
25 | num_clusters = 7
26 | cluster_std = 0.5
27 |
28 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
29 |
30 |
31 | time02 = time.perf_counter()
32 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
33 | res = cm.purity_score()
34 | print("res: ", res, time.perf_counter() - time02 )
35 |
36 | time03 = time.perf_counter()
37 | s3 = cut.calculate_purity_score(y_true, y_pred)
38 | print("res: ", s3, time.perf_counter() - time03)
39 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_ReS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
17 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
18 | cluster_std=1.0, random_state=random_state, return_centers=True)
19 | y_pred = np.random.randint(0, n_clusters, n_samples)
20 | return X, y_true, y_pred, centers
21 |
22 |
23 | num_samples = 100000
24 | num_features = 2
25 | num_clusters = 7
26 | cluster_std = 0.5
27 |
28 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
29 |
30 |
31 | time02 = time.perf_counter()
32 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
33 | res = cm.recall_score()
34 | print("res: ", res, time.perf_counter() - time02 )
35 |
36 | time03 = time.perf_counter()
37 | s3 = cut.calculate_recall_score(y_true, y_pred)
38 | print("res: ", s3, time.perf_counter() - time03)
39 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MedAE.rst:
--------------------------------------------------------------------------------
1 | MedAE - Median Absolute Error
2 | =============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MedAE - Median Absolute Error
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 | \text{MedAE}(y, \hat{y}) = \text{median}(\mid y_1 - \hat{y}_1 \mid, \ldots, \mid y_n - \hat{y}_n \mid)
22 |
23 | Latex equation code::
24 |
25 | \text{MedAE}(y, \hat{y}) = \text{median}(\mid y_1 - \hat{y}_1 \mid, \ldots, \mid y_n - \hat{y}_n \mid)
26 |
27 |
28 | The Median Absolute Error (MedAE) :cite:`nguyen2022improved` is particularly interesting because it is robust to outliers. The loss is calculated by taking
29 | the median of all absolute differences between the target and the prediction.
30 |
31 | + Best possible score is 0.0, smaller value is better. Range = [0, +inf)
32 |
33 |
34 | Example to use MedAE metric:
35 |
36 | .. code-block:: python
37 | :emphasize-lines: 8-9,15-16
38 |
39 | from numpy import array
40 | from permetrics.regression import RegressionMetric
41 |
42 | ## For 1-D array
43 | y_true = array([3, -0.5, 2, 7])
44 | y_pred = array([2.5, 0.0, 2, 8])
45 |
46 | evaluator = RegressionMetric(y_true, y_pred)
47 |     print(evaluator.median_absolute_error())
48 |
49 | ## For > 1-D array
50 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
51 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
52 |
53 | evaluator = RegressionMetric(y_true, y_pred)
54 |     print(evaluator.MedAE(multi_output="raw_values"))
55 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_PrS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
17 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
18 | cluster_std=1.0, random_state=random_state, return_centers=True)
19 | y_pred = np.random.randint(0, n_clusters, n_samples)
20 | return X, y_true, y_pred, centers
21 |
22 |
23 | num_samples = 100000
24 | num_features = 2
25 | num_clusters = 7
26 | cluster_std = 0.5
27 |
28 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
29 |
30 |
31 | time02 = time.perf_counter()
32 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
33 | res = cm.precision_score()
34 | print("res: ", res, time.perf_counter() - time02 )
35 |
36 | time03 = time.perf_counter()
37 | s3 = cut.calculate_precision_score(y_true, y_pred)
38 | print("res: ", s3, time.perf_counter() - time03)
39 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_FMeasureS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
17 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
18 | cluster_std=1.0, random_state=random_state, return_centers=True)
19 | y_pred = np.random.randint(0, n_clusters, n_samples)
20 | return X, y_true, y_pred, centers
21 |
22 |
23 | num_samples = 100000
24 | num_features = 2
25 | num_clusters = 7
26 | cluster_std = 0.5
27 |
28 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
29 |
30 |
31 | time02 = time.perf_counter()
32 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
33 | res = cm.f_measure_score()
34 | print("res: ", res, time.perf_counter() - time02 )
35 |
36 | time03 = time.perf_counter()
37 | s3 = cut.calculate_f_measure_score(y_true, y_pred)
38 | print("res: ", s3, time.perf_counter() - time03)
39 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/F2S.rst:
--------------------------------------------------------------------------------
1 | F2 Score (F2S)
2 | ==============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: F2 Score (F2S)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The F2 score combines precision and recall, putting twice as much emphasis on recall as on precision.
18 |
19 | .. math::
20 |
21 |     F2 = \frac{5 \cdot precision \cdot recall}{4 \cdot precision + recall}
22 |
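This is the general :math:`F_\beta` score evaluated at :math:`\beta = 2`:

.. math::

    F_\beta = \frac{(1 + \beta^2) \cdot precision \cdot recall}{\beta^2 \cdot precision + recall}

so that :math:`1 + \beta^2 = 5` and :math:`\beta^2 = 4` give the constants in the formula above.
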
23 | In the multi-class and multi-label case, this is the average of the F2 score of each class with weighting depending on the average parameter.
24 |
25 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
26 | + https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
27 | + https://www.debadityachakravorty.com/ai-ml/cmatrix/
28 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
29 |
30 | Example:
31 |
32 | .. code-block:: python
33 | :emphasize-lines: 11,13-16
34 |
35 | from numpy import array
36 | from permetrics.classification import ClassificationMetric
37 |
38 | ## For integer labels or categorical labels
39 | y_true = [0, 1, 0, 0, 1, 0]
40 | y_pred = [0, 1, 0, 0, 0, 1]
41 |
42 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
43 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
44 |
45 | cm = ClassificationMetric(y_true, y_pred)
46 |
47 | print(cm.f2_score(average=None))
48 | print(cm.F2S(average="micro"))
49 | print(cm.F2S(average="macro"))
50 | print(cm.F2S(average="weighted"))
51 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/NRMSE.rst:
--------------------------------------------------------------------------------
1 | NRMSE - Normalized Root Mean Square Error
2 | =========================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: NRMSE - Normalized Root Mean Square Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | The NRMSE :cite:`stephen2014improved` is calculated as the RMSE divided by the range of the observed values, expressed as a percentage. The range of the
19 | observed values is the difference between the maximum and minimum values of the observed data.
20 |
21 | + Normalized Root Mean Square Error (NRMSE): Best possible score is 0.0, smaller value is better. Range = [0, +inf)
22 | + `Link to equation `_
23 |
24 |
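A minimal NumPy sketch of the range-normalized form described above (several NRMSE normalizations exist, e.g. by the mean or the standard deviation, so the library's default may differ):

.. code-block:: python

    import numpy as np

    def nrmse_sketch(y_true, y_pred):
        rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
        # Normalize by the range of the observed values
        return rmse / (y_true.max() - y_true.min())

    print(nrmse_sketch(np.array([3.0, -0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
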
25 | Example to use NRMSE metric:
26 |
27 | .. code-block:: python
28 | :emphasize-lines: 8-9,15-16
29 |
30 | from numpy import array
31 | from permetrics.regression import RegressionMetric
32 |
33 | ## For 1-D array
34 | y_true = array([3, -0.5, 2, 7])
35 | y_pred = array([2.5, 0.0, 2, 8])
36 |
37 | evaluator = RegressionMetric(y_true, y_pred)
38 | print(evaluator.normalized_root_mean_square_error())
39 |
40 | ## For > 1-D array
41 | y_true = array([[0.5, 1], [-1, 1], [7, -6], [1, 2], [2.1, 2.2], [3.4, 5.5]])
42 | y_pred = array([[0, 2], [-1, 2], [8, -5], [1.1, 1.9], [2.0, 2.3], [3.0, 4.2]])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.NRMSE(multi_output="raw_values"))
46 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_DBCVI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from scipy.spatial.distance import cdist
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
17 | centroids = np.random.randn(num_clusters, num_features)
18 | labels = np.random.randint(0, num_clusters, num_samples)
19 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
20 | return data, centroids, labels
21 |
22 |
23 |
24 | num_samples = 10000
25 | num_features = 2
26 | num_clusters = 5
27 | cluster_std = 0.5
28 |
29 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
30 |
31 |
32 | time02 = time.perf_counter()
33 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
34 | res = cm.density_based_clustering_validation_index()
35 | print("res: ", res, time.perf_counter() - time02 )
36 |
37 | time03 = time.perf_counter()
38 | s3 = cut.calculate_density_based_clustering_validation_index(data, labels)
39 | print("res: ", s3, time.perf_counter() - time03)
40 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/SLE.rst:
--------------------------------------------------------------------------------
1 | SLE - Squared Log Error
2 | =======================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: SLE - Squared Log Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | .. math::
18 |
19 | \text{SLE}(y, f_i) = \frac{1}{n}\sum_{i=1}^{n}(\log(y_i + 1) - \log(f_i + 1))^2
20 |
21 | Latex equation code::
22 |
23 | \text{SLE}(y, f_i) = \frac{1}{n}\sum_{i=1}^{n}(\log(y_i + 1) - \log(f_i + 1))^2
24 |
25 | + Squared Log Error (SLE): Best possible score is 0.0, smaller value is better. Range = [0, +inf)
26 | + Note: Computes the squared log error between two numbers, or element-wise between a pair of lists, tuples, or numpy arrays.
27 | + The Squared Log Error (SLE) is a metric used to evaluate the accuracy of regression models that predict logarithmic values. It measures the average of the
28 | squared differences between the logarithm of the predicted values and the logarithm of the actual values.
29 |
30 |
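A minimal NumPy sketch of the formula above using ``np.log1p`` (which computes ``log(1 + x)``); note that it only makes sense for values greater than -1:

.. code-block:: python

    import numpy as np

    def sle_sketch(y_true, y_pred):
        # Mean of squared differences between log(1 + y) and log(1 + f)
        return np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2)

    print(sle_sketch(np.array([3.0, 0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
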
31 | Example to use SLE metric:
32 |
33 | .. code-block:: python
34 | :emphasize-lines: 8-9,15-16
35 |
36 | from numpy import array
37 | from permetrics.regression import RegressionMetric
38 |
39 | ## For 1-D array
40 | y_true = array([3, -0.5, 2, 7])
41 | y_pred = array([2.5, 0.0, 2, 8])
42 |
43 | evaluator = RegressionMetric(y_true, y_pred)
44 | print(evaluator.single_squared_log_error())
45 |
46 | ## For > 1-D array
47 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
48 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
49 |
50 | evaluator = RegressionMetric(y_true, y_pred)
51 | print(evaluator.SLE())
52 |
--------------------------------------------------------------------------------
/examples/classification/06_check_ranges_4labels.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 14:25, 03/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics.classification import ClassificationMetric
9 |
10 | y_true = np.array([0, 1, 0, 2, 3, 1, 3, 2, 0, 1, 3, 2, 3, 1, 0, 2, 3, 1, 1, 3, 2, 0])
11 | y_pred_rand = []
12 | for idx in range(0, len(y_true)):
13 |     y_pred_rand.append(np.random.choice(list(set(range(0, 4)) - {y_true[idx]})))  # pick a random label different from the true label
14 | t1 = [
15 | y_true.copy(),
16 | y_pred_rand,
17 | np.zeros(len(y_true)),
18 | np.ones(len(y_true)),
19 | np.random.randint(0, 4, len(y_true))
20 | ]
21 | for idx in range(len(t1)):
22 | evaluator = ClassificationMetric(y_true, t1[idx])
23 | print(evaluator.gini_index())
24 |
25 | # CM = confusion_matrix
26 | # PS = precision_score
27 | # NPV = negative_predictive_value
28 | # RS = recall_score
29 | # AS = accuracy_score
30 | # F1S = f1_score
31 | # F2S = f2_score
32 | # FBS = fbeta_score
33 | # SS = specificity_score
34 | # MCC = matthews_correlation_coefficient
35 | # HS = hamming_score
36 | # LS = lift_score
37 | # CKS = cohen_kappa_score
38 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
39 | # GMS = g_mean_score
40 | # GINI = gini_index
41 | # ROC = AUC = RAS = roc_auc_score
42 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/RSE.rst:
--------------------------------------------------------------------------------
1 | RSE - Residual Standard Error
2 | =============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: RSE - Residual Standard Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{RSE}(y, f_i) = \sqrt{\frac{\sum_{i=1}^{n}(y_i - f_i)^2}{n-p-1}}
21 |
22 | Latex equation code::
23 |
24 | \text{RSE}(y, f_i) = \sqrt{\frac{\sum_{i=1}^{n}(y_i - f_i)^2}{n-p-1}}
25 |
26 |
27 | + Residual Standard Error (RSE): Best possible score is 0.0, smaller value is better. Range = [0, +inf)
28 | + `Link to equation `_
29 | + `Link to equation `_
30 | + The Residual Standard Error (RSE) is a metric used to evaluate the goodness of fit of a regression model. It measures the average distance between the
31 | observed values and the predicted values.
32 |
33 |
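Here, :math:`n` is the number of observations and :math:`p` is the number of predictors in the model. A minimal NumPy sketch of the formula above, assuming a single predictor (p = 1); the library may choose p differently:

.. code-block:: python

    import numpy as np

    def rse_sketch(y_true, y_pred, p=1):
        n = len(y_true)
        # Residual sum of squares divided by the residual degrees of freedom
        return np.sqrt(np.sum((y_true - y_pred) ** 2) / (n - p - 1))

    print(rse_sketch(np.array([3.0, -0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
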
34 | Example to use RSE metric:
35 |
36 | .. code-block:: python
37 | :emphasize-lines: 8-9,15-16
38 |
39 | from numpy import array
40 | from permetrics.regression import RegressionMetric
41 |
42 | ## For 1-D array
43 | y_true = array([3, -0.5, 2, 7])
44 | y_pred = array([2.5, 0.0, 2, 8])
45 |
46 | evaluator = RegressionMetric(y_true, y_pred)
47 | print(evaluator.residual_standard_error())
48 |
49 | ## For > 1-D array
50 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
51 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
52 |
53 | evaluator = RegressionMetric(y_true, y_pred)
54 | print(evaluator.RSE(multi_output="raw_values"))
55 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/SSEI.rst:
--------------------------------------------------------------------------------
1 | Sum of Squared Error Index (SSEI)
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Sum of Squared Error Index (SSEI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | Sum of Squared Error (SSE) is a commonly used metric to evaluate the quality of clustering in unsupervised learning problems.
18 | SSE measures the sum of squared distances between each data point and its corresponding centroid or cluster center.
19 | It quantifies the compactness of the clusters.
20 |
21 | Here's how you can calculate the SSE in a clustering problem::
22 |
23 | 1) Assign each data point to its nearest centroid or cluster center based on some distance metric (e.g., Euclidean distance).
24 | 2) For each data point, calculate the squared Euclidean distance between the data point and its assigned centroid.
25 | 3) Sum up the squared distances for all data points to obtain the SSE.
26 |
27 | Higher SSE values indicate higher dispersion or greater variance within the clusters, while lower SSE values indicate
28 | more compact and well-separated clusters. Therefore, minimizing the SSE is often a goal in clustering algorithms.
29 |
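Following the three steps above, a minimal NumPy sketch that computes the SSE from data, labels, and centroids (the library derives the centroids from ``X`` and ``y_pred`` internally; here they are passed explicitly):

.. code-block:: python

    import numpy as np

    def sse_sketch(X, labels, centroids):
        # Squared Euclidean distance of each point to its assigned centroid, summed over all points
        return np.sum((X - centroids[labels]) ** 2)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], dtype=float)
    labels = np.array([0, 0, 1, 1, 1])
    centroids = np.array([X[labels == k].mean(axis=0) for k in np.unique(labels)])
    print(sse_sketch(X, labels, centroids))
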
30 | Example:
31 |
32 | .. code-block:: python
33 |
34 | import numpy as np
35 | from permetrics import ClusteringMetric
36 |
37 | ## For integer labels or categorical labels
38 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
39 | y_pred = np.array([0, 0, 1, 1, 1])
40 |
41 | cm = ClusteringMetric(X=data, y_pred=y_pred)
42 |
43 | print(cm.sum_squared_error_index())
44 | print(cm.SSEI())
45 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/AR.rst:
--------------------------------------------------------------------------------
1 | AR - Absolute Pearson’s Correlation Index
2 | =========================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: AR - Absolute Pearson’s Correlation Index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 |     \text{AR}(y, \hat{y}) = \frac{ \sum_{i=0}^{N - 1} (|y_i - mean(y)|*|\hat{y_i} - mean(\hat{y})|) }{ \sqrt{ \sum_{i=0}^{N - 1} (y_i - mean(y))^2} *\sqrt{\sum_{i=0}^{N - 1} (\hat{y_i} - mean(\hat{y}))^2} }
21 |
22 | Latex equation code::
23 |
24 | \text{AR}(y, \hat{y}) = \frac{ \sum_{i=0}^{N - 1} (|y_i - mean(y)|*|\hat{y_i} - mean(\hat{y})|) }{ \sqrt{ \sum_{i=0}^{N - 1} (y_i - mean(y))^2} *\sqrt{\sum_{i=0}^{N - 1} (\hat{y_i} - mean(\hat{y}))^2} }
25 |
26 | + Absolute Pearson’s Correlation Coefficient (APCC or AR): Best possible score is 1.0, bigger value is better. Range = [0, 1]
27 | + I developed this method, but have not had enough time to fully analyze this metric.
28 |
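A minimal NumPy sketch that translates the displayed formula directly (absolute deviations in the numerator, ordinary standard deviations in the denominator); this is only a hand-rolled sketch of the equation above, not the library implementation:

.. code-block:: python

    import numpy as np

    def ar_sketch(y_true, y_pred):
        # Numerator: product of absolute deviations from the means
        num = np.sum(np.abs(y_true - y_true.mean()) * np.abs(y_pred - y_pred.mean()))
        # Denominator: product of the (unnormalized) standard deviations
        den = np.sqrt(np.sum((y_true - y_true.mean()) ** 2)) * np.sqrt(np.sum((y_pred - y_pred.mean()) ** 2))
        return num / den

    print(ar_sketch(np.array([3.0, -0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
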
29 |
30 | Example to use AR metric:
31 |
32 | .. code-block:: python
33 | :emphasize-lines: 8-9,15-16
34 |
35 | from numpy import array
36 | from permetrics.regression import RegressionMetric
37 |
38 | ## For 1-D array
39 | y_true = array([3, -0.5, 2, 7])
40 | y_pred = array([2.5, 0.0, 2, 8])
41 |
42 | evaluator = RegressionMetric(y_true, y_pred)
43 | print(evaluator.absolute_pearson_correlation_coefficient())
44 |
45 | ## For > 1-D array
46 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
47 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
48 |
49 | evaluator = RegressionMetric(y_true, y_pred)
50 | print(evaluator.AR(multi_output="raw_values"))
51 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/NSE.rst:
--------------------------------------------------------------------------------
1 | NSE - Nash-Sutcliffe Efficiency
2 | ===============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: NSE - Nash-Sutcliffe Efficiency
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 | \text{NSE}(y, \hat{y}) = 1 - \frac{\sum_{i=0}^{N - 1} (y_i - \hat{y_i})^2}{ \sum_{i=0}^{N - 1} (y_i - mean(y))^2}
22 |
23 | Latex equation code::
24 |
25 | \text{NSE}(y, \hat{y}) = 1 - \frac{\sum_{i=0}^{N - 1} (y_i - \hat{y_i})^2}{ \sum_{i=0}^{N - 1} (y_i - mean(y))^2}
26 |
27 |
28 | The NSE :cite:`xie2021predicting` is calculated as one minus the ratio of the mean squared error between the observed and simulated streamflow to the variance of the
29 | observed streamflow. The NSE ranges between -inf and 1, with a value of 1 indicating perfect agreement between the observed and simulated streamflow.
30 | + `Link to equation `_
31 |
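A minimal NumPy sketch of the formula above (a sketch only, not the library implementation):

.. code-block:: python

    import numpy as np

    def nse_sketch(y_true, y_pred):
        # 1 minus the ratio of residual variance to the variance of the observations
        return 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - y_true.mean()) ** 2)

    print(nse_sketch(np.array([3.0, -0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
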
32 |
33 | Example to use NSE metric:
34 |
35 | .. code-block:: python
36 | :emphasize-lines: 8-9,15-16
37 |
38 | from numpy import array
39 | from permetrics.regression import RegressionMetric
40 |
41 | ## For 1-D array
42 | y_true = array([3, -0.5, 2, 7])
43 | y_pred = array([2.5, 0.0, 2, 8])
44 |
45 | evaluator = RegressionMetric(y_true, y_pred)
46 | print(evaluator.nash_sutcliffe_efficiency())
47 |
48 | ## For > 1-D array
49 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
50 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.NSE(multi_output="raw_values"))
54 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_RaS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import rand_score as sk_rs
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_rs(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.RaS()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_rand_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/VAF.rst:
--------------------------------------------------------------------------------
1 | VAF - Variance Accounted For
2 | ============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: VAF - Variance Accounted For
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{VAF}(y, f_i) = 100\% \times \frac{\sum_{i=1}^{n}(y_i - \bar{y})(f_i - \bar{f})}{\sum_{i=1}^{n}(y_i - \bar{y})^2}
21 |
22 | Latex equation code::
23 |
24 | \text{VAF}(y, f_i) = 100\% \times \frac{\sum_{i=1}^{n}(y_i - \bar{y})(f_i - \bar{f})}{\sum_{i=1}^{n}(y_i - \bar{y})^2}
25 |
26 | + Variance Accounted For (VAF) is a metric used to evaluate the performance of a regression model. It measures the proportion of the total variance in the
27 | actual values that is accounted for by the variance in the predicted values.
28 | + Variance Accounted For between 2 signals (VAF): Best possible score is 100% (identical signal), bigger value is better. Range = (-inf, 100%]
29 | + `Link to equation `_
30 |
31 |
32 | Example to use VAF metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.variance_accounted_for())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.VAF(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/examples/classification/04_confusion_matrix.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 11:34, 23/05/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics.classification import ClassificationMetric
9 |
10 | y_true = [0, 1, 0, 0, 1, 0]
11 | y_pred = [0, 1, 0, 0, 0, 1]
12 |
13 | evaluator = ClassificationMetric(y_true, y_pred)
14 | cm, imap, imap_count = evaluator.confusion_matrix()
15 | print(cm)
16 | print(imap)
17 | print(imap_count)
18 |
19 | print("======================================")
20 |
21 | y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
22 | y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
23 |
24 | evaluator = ClassificationMetric(y_true, y_pred)
25 | cm, imap, imap_count = evaluator.confusion_matrix()
26 | print(cm)
27 | print(imap)
28 | print(imap_count)
29 |
30 | # CM = confusion_matrix
31 | # PS = precision_score
32 | # NPV = negative_predictive_value
33 | # RS = recall_score
34 | # AS = accuracy_score
35 | # F1S = f1_score
36 | # F2S = f2_score
37 | # FBS = fbeta_score
38 | # SS = specificity_score
39 | # MCC = matthews_correlation_coefficient
40 | # HS = hamming_score
41 | # LS = lift_score
42 | # CKS = cohen_kappa_score
43 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
44 | # GMS = g_mean_score
45 | # GINI = gini_index
46 | # ROC = AUC = RAS = roc_auc_score
47 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/CRM.rst:
--------------------------------------------------------------------------------
1 | CRM - Coefficient of Residual Mass
2 | ==================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: CRM - Coefficient of Residual Mass
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 | \text{CRM}(y, \hat{y}) = \frac{\sum{\hat{Y}} - \sum{Y}}{\sum{Y}}
22 |
23 |
24 | The CRM :cite:`almodfer2022modeling` is a measure of the accuracy of the model in predicting the values of the dependent variable. A CRM value closer to zero
25 | indicates that the model is better at predicting the values of the dependent variable, while a value farther from zero indicates poorer performance. The coefficient of
26 | residual mass is typically used in environmental engineering and hydrology to measure the accuracy of models used to predict water quality and quantity,
27 | sediment transport, and erosion.
28 | + Best possible value = 0, a value closer to 0 is better. Range = (-inf, +inf)
29 | + `Link to equation `_
30 |
31 |
32 | Example to use CRM metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.coefficient_of_residual_mass())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.CRM(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_VMS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import v_measure_score as sk_vms
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_vms(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.v_measure_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_v_measure_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/DI.rst:
--------------------------------------------------------------------------------
1 | Dunn Index (DI)
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Dunn Index (DI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | The Dunn Index is a measure used to evaluate the performance of clustering algorithms. It aims to quantify the compactness and separation
19 | between clusters in a clustering solution. It helps assess the quality of the clustering by considering both the distance between points within the same cluster (intra-cluster distance) and the distance between points in different clusters (inter-cluster distance).
20 |
21 |
22 | .. image:: /_static/images/DI.png
23 |
24 |
25 | A higher Dunn Index value indicates better clustering quality – it suggests that the clusters are well separated from each other while being compact internally. Conversely, a lower Dunn Index value may indicate that the clusters are too spread out or not well separated.
26 |
27 | However, like any clustering evaluation metric, the Dunn Index has its limitations and should be used in conjunction with other metrics and domain knowledge. It's worth noting that the choice of clustering algorithm, distance metric, and dataset characteristics can influence the interpretation of the Dunn Index.
28 |
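One common formulation, shown here as a minimal NumPy/SciPy sketch, is the smallest inter-cluster distance divided by the largest intra-cluster diameter (other definitions of these two quantities exist, so this sketch may not match the library's implementation exactly):

.. code-block:: python

    import numpy as np
    from scipy.spatial.distance import cdist, pdist

    def dunn_index_sketch(X, labels):
        clusters = [X[labels == k] for k in np.unique(labels)]
        # Smallest distance between points of different clusters
        min_between = min(cdist(a, b).min() for i, a in enumerate(clusters)
                          for b in clusters[i + 1:])
        # Largest distance between points of the same cluster (cluster diameter)
        max_within = max(pdist(c).max() for c in clusters if len(c) > 1)
        return min_between / max_within

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], dtype=float)
    y_pred = np.array([0, 0, 1, 1, 1])
    print(dunn_index_sketch(X, y_pred))
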
29 |
30 | Example:
31 |
32 | .. code-block:: python
33 |
34 | import numpy as np
35 | from permetrics import ClusteringMetric
36 |
37 | ## For integer labels or categorical labels
38 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
39 | y_pred = np.array([0, 0, 1, 1, 1])
40 |
41 | cm = ClusteringMetric(X=data, y_pred=y_pred)
42 |
43 | print(cm.dunn_index())
44 | print(cm.DI())
45 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_HS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import homogeneity_score as sk_hs
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_hs(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.homogeneity_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_homogeneity_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/PCD.rst:
--------------------------------------------------------------------------------
1 | PCD - Prediction of Change in Direction
2 | =======================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: PCD - Prediction of Change in Direction
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{PCD}(y, \hat{y}) = \frac{1}{n-1} \sum_{i=2}^{n} I\left((f_{i}-f_{i-1}) (y_{i}-y_{i-1}) > 0\right)
21 |
22 | Latex equation code::
23 |
24 | \text{PCD}(y, \hat{y}) = \frac{1}{n-1} \sum_{i=2}^{n} I\left((f_{i}-f_{i-1}) (y_{i}-y_{i-1}) > 0\right)
25 |
26 | + where :math:`f_i` is the predicted value at time :math:`i`, :math:`y_i` is the actual value at time :math:`i`, :math:`n` is the total number of predictions, and :math:`I(\cdot)` is the
27 | indicator function which equals 1 if the argument is true and 0 otherwise.
28 | + Best possible score is 1.0, bigger value is better . Range = [0, 1]
29 | + The Prediction of Change in Direction (PCD) metric is used to evaluate the performance of regression models on detecting changes in the direction of a target variable.
30 |
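A minimal NumPy sketch of the indicator-based formula above, counting how often the predicted and actual series move in the same direction between consecutive time steps (a sketch only, not the library implementation):

.. code-block:: python

    import numpy as np

    def pcd_sketch(y_true, y_pred):
        # Successive differences of the actual and predicted series
        dy = np.diff(y_true)
        df = np.diff(y_pred)
        # Fraction of steps where both series change in the same direction
        return np.mean((dy * df) > 0)

    print(pcd_sketch(np.array([3.0, -0.5, 2.0, 7.0]), np.array([2.5, 0.0, 2.0, 8.0])))
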
31 |
32 | Example to use PCD metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.prediction_of_change_in_direction())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.PCD(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_CS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import completeness_score as sk_cs
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_cs(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.completeness_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_completeness_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_MIS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import mutual_info_score as sk_mis
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_mis(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.mutual_info_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_mutual_info_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/PuS.rst:
--------------------------------------------------------------------------------
1 | Purity Score (PuS)
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Purity Score (PuS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | Purity is a metric used to evaluate the quality of clustering results, particularly in situations where the ground truth
18 | labels of the data points are known. It measures the extent to which the clusters produced by a clustering algorithm
19 | match the true class labels of the data. Here's how Purity is calculated::
20 |
21 | 1) For each cluster, find the majority class label among the data points in that cluster.
22 | 2) Sum up the sizes of the clusters that belong to the majority class label.
23 | 3) Divide the sum by the total number of data points.
24 |
25 | The resulting value is the Purity score, which ranges from 0 to 1. A Purity score of 1 indicates a perfect clustering,
26 | where each cluster contains only data points from a single class.
27 |
28 | Purity is a simple and intuitive metric but has some limitations. It does not consider the actual structure or
29 | distribution of the data within the clusters and is sensitive to the number of clusters and class imbalance.
30 | Therefore, it may not be suitable for evaluating clustering algorithms in all scenarios.
31 |
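Following the steps above, a minimal NumPy sketch that computes purity by finding the majority true class inside each predicted cluster (a sketch only, not the library implementation):

.. code-block:: python

    import numpy as np

    def purity_sketch(y_true, y_pred):
        clusters = np.unique(y_pred)
        # For each cluster, count the size of its majority true class, then divide by n
        majority_sizes = [np.bincount(y_true[y_pred == c]).max() for c in clusters]
        return np.sum(majority_sizes) / len(y_true)

    y_true = np.array([0, 0, 1, 1, 1, 2, 2, 1])
    y_pred = np.array([0, 0, 1, 1, 2, 2, 2, 2])
    print(purity_sketch(y_true, y_pred))
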
32 | Example:
33 |
34 | .. code-block:: python
35 |
36 | import numpy as np
37 | from permetrics import ClusteringMetric
38 |
39 | ## For integer labels or categorical labels
40 | y_true = np.array([0, 0, 1, 1, 1, 2, 2, 1])
41 | y_pred = np.array([0, 0, 1, 1, 2, 2, 2, 2])
42 |
43 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
44 |
45 | print(cm.purity_score())
46 | print(cm.PuS())
47 |
--------------------------------------------------------------------------------
/examples/classification/09_crossentropy_loss.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 16:03, 12/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics import ClassificationMetric
9 |
10 |
11 | def multiclass_cross_entropy_loss(y_true, y_pred):
12 | epsilon = 1e-15 # Small value to avoid numerical instability
13 |
14 | # Clip predicted probabilities to a minimum and maximum value
15 | y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
16 |
17 | # Compute multi-class cross-entropy loss
18 | loss = -np.sum(y_true * np.log(y_pred), axis=1)
19 |
20 | # Take the average across samples
21 | loss = np.mean(loss)
22 |
23 | return loss
24 |
25 | # Example usage
26 | y_true_multiclass = np.array([[1, 0, 0],
27 | [0, 1, 0],
28 | [0, 0, 1],
29 | [0, 1, 0],
30 | [0, 0, 1]])
31 | y_pred_multiclass = np.array([[0.2, 0.6, 0.2],
32 | [0.7, 0.1, 0.2],
33 | [0.3, 0.4, 0.3],
34 | [0.8, 0.1, 0.1],
35 | [0.4, 0.2, 0.4]])
36 | multiclass_loss = multiclass_cross_entropy_loss(y_true_multiclass, y_pred_multiclass)
37 | print("Multiclass Cross-Entropy Loss:", multiclass_loss)
38 |
39 | cu = ClassificationMetric()
40 | print(cu.crossentropy_loss(y_true_multiclass, y_pred_multiclass))
41 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_ARS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import adjusted_rand_score as sk_ars
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_ars(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.adjusted_rand_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_adjusted_rand_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_FMS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import fowlkes_mallows_score as sk_fms
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_fms(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.fowlkes_mallows_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_fowlkes_mallows_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/examples/clustering/02_external_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 06:32, 27/07/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## It is better to use the OOP style to call all of the available functions
8 | ## External metrics: need y_true and y_pred, and their names have the suffix "score"
9 |
10 | import numpy as np
11 | from permetrics import ClusteringMetric
12 | from sklearn.datasets import make_blobs
13 |
14 | # generate sample data
15 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
16 | y_pred = np.random.randint(0, 4, size=300)
17 |
18 | evaluator = ClusteringMetric(y_true=y_true, y_pred=y_pred)
19 |
20 | print(evaluator.get_metrics_by_list_names(["MIS", "NMIS", "RaS", "FMS", "HS", "CS", "VMS", "PrS", "ReS", "FmS",
21 | "CDS", "HGS", "JS", "KS", "MNS", "PhS", "RTS", "RRS", "SS1S", "SS2S"]))
22 |
23 | # MIS = mutual_info_score
24 | # NMIS = normalized_mutual_info_score
25 | # RaS = rand_score
26 | # FMS = fowlkes_mallows_score
27 | # HS = homogeneity_score
28 | # CS = completeness_score
29 | # VMS = v_measure_score
30 | # PrS = precision_score
31 | # ReS = recall_score
32 | # FmS = f_measure_score
33 | # CDS = czekanowski_dice_score
34 | # HGS = hubert_gamma_score
35 | # JS = jaccard_score
36 | # KS = kulczynski_score
37 | # MNS = mc_nemar_score
38 | # PhS = phi_score
39 | # RTS = rogers_tanimoto_score
40 | # RRS = russel_rao_score
41 | # SS1S = sokal_sneath1_score
42 | # SS2S = sokal_sneath2_score
43 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/COV.rst:
--------------------------------------------------------------------------------
1 | COV - Covariance
2 | ================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: COV - Covariance
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | Covariance of population:
20 |
21 | .. math::
22 |     \text{COV}(y, \hat{y}) = \frac{\sum_{i=1}^{N} (y_i - mean(Y)) (\hat{y}_i - mean(\hat{Y}))}{N}
23 |
24 | Covariance of sample:
25 |
26 | .. math::
27 |     \text{COV}(y, \hat{y}) = \frac{\sum_{i=1}^{N} (y_i - mean(Y)) (\hat{y}_i - mean(\hat{Y}))}{N - 1}
28 |
29 | + There is no single best value; a bigger value is better. Range = (-inf, +inf)
30 | + Positive covariance: Indicates that two variables tend to move in the same direction.
31 | + Negative covariance: Reveals that two variables tend to move in inverse directions.
32 | + COV is a measure of the relationship between two random variables: it evaluates how much, and to what extent, the variables change together,
33 |   but it does not assess the dependency between the variables.
34 | + `Link to equation `_
35 |
36 |
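As a quick sanity check of the two formulas above, the following is a minimal NumPy sketch (not part of the library) that computes both variants directly from the definitions and compares them against ``np.cov``:

.. code-block:: python

    import numpy as np

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])

    cross = (y_true - y_true.mean()) * (y_pred - y_pred.mean())
    cov_population = np.sum(cross) / y_true.size       # divide by N
    cov_sample = np.sum(cross) / (y_true.size - 1)      # divide by N - 1

    print(cov_population, np.cov(y_true, y_pred, ddof=0)[0, 1])  # should agree
    print(cov_sample, np.cov(y_true, y_pred, ddof=1)[0, 1])      # should agree
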
37 | Example to use COV metric:
38 |
39 | .. code-block:: python
40 | :emphasize-lines: 8-9,15-16
41 |
42 | from numpy import array
43 | from permetrics.regression import RegressionMetric
44 |
45 | ## For 1-D array
46 | y_true = array([3, -0.5, 2, 7])
47 | y_pred = array([2.5, 0.0, 2, 8])
48 |
49 | evaluator = RegressionMetric(y_true, y_pred)
50 | print(evaluator.covariance())
51 |
52 | ## For > 1-D array
53 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
54 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
55 |
56 | evaluator = RegressionMetric(y_true, y_pred)
57 | print(evaluator.COV(multi_output="raw_values"))
58 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/CI.rst:
--------------------------------------------------------------------------------
1 | CI - Confidence Index
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: CI - Confidence Index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{CI}(y, \hat{y}) = \text{R}(y, \hat{y}) * \text{WI}(y, \hat{y})
21 |
22 | Latex equation code::
23 |
24 | \text{CI}(y, \hat{y}) = \text{R}(y, \hat{y}) * \text{WI}(y, \hat{y})
25 |
26 |
27 | Confidence Index :cite:`ahmed2021comprehensive` or Performance Index (CI/PI) is a score that measures the performance of each estimation method, with a higher value
28 | indicating better performance. The range of the CI/PI is (-inf, 1], meaning it can take any value less than or equal to 1, but not including negative infinity.
29 |
30 | + Best possible score is 1.0, bigger value is better. Range = (-inf, 1], meaning of values::
31 |
32 | > 0.85 Excellent Model
33 | 0.76-0.85 Very good
34 | 0.66-0.75 Good
35 | 0.61-0.65 Satisfactory
36 | 0.51-0.60 Poor
37 | 0.41-0.50 Bad
38 | < 0.40 Very bad
39 |
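Since CI is just the product of R and WI, a minimal sketch (assuming R is the Pearson correlation coefficient and WI is the Willmott index described on their own pages) looks like this:

.. code-block:: python

    import numpy as np

    def pearson_r(y_true, y_pred):
        a, b = y_true - y_true.mean(), y_pred - y_pred.mean()
        return np.sum(a * b) / (np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))

    def willmott_index(y_true, y_pred):
        m = y_true.mean()
        return 1 - np.sum((y_pred - y_true) ** 2) / np.sum((np.abs(y_pred - m) + np.abs(y_true - m)) ** 2)

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])
    print(pearson_r(y_true, y_pred) * willmott_index(y_true, y_pred))  # CI = R * WI
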
40 | Example to use CI metric:
41 |
42 | .. code-block:: python
43 | :emphasize-lines: 8-9,15-16
44 |
45 | from numpy import array
46 | from permetrics.regression import RegressionMetric
47 |
48 | ## For 1-D array
49 | y_true = array([3, -0.5, 2, 7])
50 | y_pred = array([2.5, 0.0, 2, 8])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.confidence_index())
54 |
55 | ## For > 1-D array
56 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
57 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
58 |
59 | evaluator = RegressionMetric(y_true, y_pred)
60 | print(evaluator.CI(multi_output="raw_values"))
61 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_NMIS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 | from sklearn.metrics import normalized_mutual_info_score as sk_nmis
13 |
14 | np.random.seed(100)
15 |
16 |
17 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
18 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
19 | cluster_std=1.0, random_state=random_state, return_centers=True)
20 | y_pred = np.random.randint(0, n_clusters, n_samples)
21 | return X, y_true, y_pred, centers
22 |
23 |
24 | num_samples = 10000000
25 | num_features = 2
26 | num_clusters = 7
27 | cluster_std = 0.5
28 |
29 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
30 |
31 |
32 | time03 = time.perf_counter()
33 | s3 = sk_nmis(y_true, y_pred)
34 | print("res: ", s3, time.perf_counter() - time03)
35 |
36 | time02 = time.perf_counter()
37 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
38 | res = cm.normalized_mutual_info_score()
39 | print("res: ", res, time.perf_counter() - time02 )
40 |
41 | time03 = time.perf_counter()
42 | s3 = cut.calculate_normalized_mutual_info_score(y_true, y_pred)
43 | print("res: ", s3, time.perf_counter() - time03)
44 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/RS.rst:
--------------------------------------------------------------------------------
1 | Recall Score (RS)
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Recall Score (RS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. image:: /_static/images/class_score_1.png
19 |
20 | The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives.
21 | The recall is intuitively the ability of the classifier to find all the positive samples.
22 |
23 | In the multi-class and multi-label case, this is the average of the RS score of each class with weighting depending on the average parameter.
24 |
25 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
26 | + https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
27 | + https://www.debadityachakravorty.com/ai-ml/cmatrix/
28 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
29 | + https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score
30 |
31 |
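For the binary labels used in the example below, the tp / (tp + fn) definition can be checked by hand with a short sketch (label 1 is taken as the positive class here, which is an assumption of this illustration):

.. code-block:: python

    import numpy as np

    y_true = np.array([0, 1, 0, 0, 1, 0])
    y_pred = np.array([0, 1, 0, 0, 0, 1])

    tp = np.sum((y_pred == 1) & (y_true == 1))  # true positives
    fn = np.sum((y_pred == 0) & (y_true == 1))  # false negatives
    print(tp / (tp + fn))                       # recall of the positive class
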
32 | Example:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 11,13-16
36 |
37 | from numpy import array
38 | from permetrics.classification import ClassificationMetric
39 |
40 | ## For integer labels or categorical labels
41 | y_true = [0, 1, 0, 0, 1, 0]
42 | y_pred = [0, 1, 0, 0, 0, 1]
43 |
44 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
45 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
46 |
47 | cm = ClassificationMetric(y_true, y_pred)
48 |
49 | print(cm.recall_score(average=None))
50 | print(cm.RS(average="micro"))
51 | print(cm.RS(average="macro"))
52 | print(cm.RS(average="weighted"))
53 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/RAE.rst:
--------------------------------------------------------------------------------
1 | RAE - Relative Absolute Error
2 | =============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: RAE - Relative Absolute Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{RAE}(y, \hat{y}) = \frac{\Big[\sum_{i=1}^{n}(\hat{y}_i - y_i)^2\Big]^{1/2}}{\Big[\sum_{i=1}^{n}(y_i)^2\Big]^{1/2}}
21 |
22 | Latex equation code::
23 |
24 | \text{RAE}(y, \hat{y}) = \frac{\Big[\sum_{i=1}^{n}(\hat{y}_i - y_i)^2\Big]^{1/2}}{\Big[\sum_{i=1}^{n}(y_i)^2\Big]^{1/2}}
25 |
26 | + Relative Absolute Error (RAE): Best possible score is 0.0, smaller value is better. Range = [0, +inf)
27 | + `Link to equation `_
28 | + `Link to equation `_
29 | + The Relative Absolute Error (RAE) is a metric used to evaluate the accuracy of a regression model by measuring the ratio of the mean absolute error to the
30 | mean absolute deviation of the actual values.
31 |
32 |
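The equation displayed above can be written out directly; this is only a sketch of that formula and may not reflect every edge case handled inside the library:

.. code-block:: python

    import numpy as np

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])

    # root of the sum of squared errors, divided by the root of the sum of squared actual values
    rae = np.sqrt(np.sum((y_pred - y_true) ** 2)) / np.sqrt(np.sum(y_true ** 2))
    print(rae)
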
33 | Example to use RAE metric:
34 |
35 | .. code-block:: python
36 | :emphasize-lines: 8-9,15-16
37 |
38 | from numpy import array
39 | from permetrics.regression import RegressionMetric
40 |
41 | ## For 1-D array
42 | y_true = array([3, -0.5, 2, 7])
43 | y_pred = array([2.5, 0.0, 2, 8])
44 |
45 | evaluator = RegressionMetric(y_true, y_pred)
46 | print(evaluator.relative_absolute_error())
47 |
48 | ## For > 1-D array
49 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
50 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.RAE(multi_output="raw_values"))
54 |
--------------------------------------------------------------------------------
/examples/classification/00_all_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 10:13, 23/05/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## 1. Import packages, classes
8 | ## 2. Create object
9 | ## 3. From object call function and use
10 |
11 | import numpy as np
12 | from permetrics.classification import ClassificationMetric
13 |
14 | ## For integer labels or categorical labels
15 | y_true = [0, 1, 0, 0, 1, 0]
16 | y_pred = [0, 1, 0, 0, 0, 1]
17 |
18 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
19 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
20 |
21 | evaluator = ClassificationMetric(y_true, y_pred)
22 |
23 | ## Call specific function inside object, each function has 2 names like below
24 |
25 | print(evaluator.f1_score())
26 | print(evaluator.F1S(average="micro"))
27 | print(evaluator.f1_score(average="macro"))
28 | print(evaluator.F1S(average="weighted"))
29 |
30 | # CM = confusion_matrix
31 | # PS = precision_score
32 | # NPV = negative_predictive_value
33 | # RS = recall_score
34 | # AS = accuracy_score
35 | # F1S = f1_score
36 | # F2S = f2_score
37 | # FBS = fbeta_score
38 | # SS = specificity_score
39 | # MCC = matthews_correlation_coefficient
40 | # HS = hamming_score
41 | # LS = lift_score
42 | # CKS = cohen_kappa_score
43 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
44 | # GMS = g_mean_score
45 | # GINI = gini_index
46 | # ROC = AUC = RAS = roc_auc_score
47 |
--------------------------------------------------------------------------------
/examples/classification/12_brier_score_loss.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:49, 12/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics import ClassificationMetric
9 | from sklearn.metrics import brier_score_loss
10 |
11 |
12 | def brier_score_loss2(y_true, y_pred):
13 | num_classes = y_pred.shape[1] if len(y_pred.shape) > 1 else 1
14 | if num_classes == 1: # Binary classification
15 | return np.mean((y_true - y_pred) ** 2)
16 | else: # Multi-class classification
17 | return np.mean(np.sum((y_true - y_pred) ** 2, axis=1))
18 |
19 |
20 | # Binary classification example
21 | y_true_binary = np.array([0, 1, 1, 0]) # True binary labels
22 | y_pred_binary = np.array([0.3, 0.7, 0.9, 0.2]) # Predicted probabilities
23 | cu = ClassificationMetric()
24 | print(cu.brier_score_loss(y_true_binary, y_pred_binary))
25 | print(brier_score_loss2(y_true_binary, y_pred_binary))
26 | print(brier_score_loss(y_true_binary, y_pred_binary))
27 |
28 | # Multi-Class Classification Example
29 | y_true_multiclass = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) # True class labels (one-hot encoded)
30 | y_pred_multiclass = np.array([[0.7, 0.2, 0.1], [0.3, 0.6, 0.1], [0.2, 0.1, 0.7]]) # Predicted class probabilities
31 | print(cu.brier_score_loss(y_true_multiclass, y_pred_multiclass))
32 | print(brier_score_loss2(y_true_multiclass, y_pred_multiclass))
33 | # print(brier_score_loss(y_true_multiclass, y_pred_multiclass))
34 | # Scikit-Learn's brier_score_loss cannot handle the multi-class case.
35 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/PS.rst:
--------------------------------------------------------------------------------
1 | Precision Score (PS)
2 | ====================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Precision Score (PS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. image:: /_static/images/class_score_1.png
19 |
20 | The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives.
21 | The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
22 |
23 | In the multi-class and multi-label case, this is the average of the PS score of each class with weighting depending on the average parameter.
24 |
25 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
26 | + https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
27 | + https://www.debadityachakravorty.com/ai-ml/cmatrix/
28 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
29 | + https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
30 |
31 |
32 | Example:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 11,13-16
36 |
37 | from numpy import array
38 | from permetrics.classification import ClassificationMetric
39 |
40 | ## For integer labels or categorical labels
41 | y_true = [0, 1, 0, 0, 1, 0]
42 | y_pred = [0, 1, 0, 0, 0, 1]
43 |
44 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
45 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
46 |
47 | cm = ClassificationMetric(y_true, y_pred)
48 |
49 | print(cm.PS(average=None))
50 | print(cm.PS(average="micro"))
51 | print(cm.PS(average="macro"))
52 | print(cm.PS(average="weighted"))
53 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/HI.rst:
--------------------------------------------------------------------------------
1 | Hartigan Index (HI)
2 | ===================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Hartigan Index (HI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Hartigan index, also known as Hartigan's criterion, is a measure used for evaluating the quality of clustering solutions.
18 | It is specifically designed for assessing the goodness of fit of a clustering algorithm, particularly the k-means algorithm.
19 |
20 | .. image:: /_static/images/HI.png
21 |
22 | The Hartigan index quantifies the compactness of clusters and the separation between clusters in a clustering solution.
23 | It aims to find a balance between minimizing the within-cluster variance (compactness) and maximizing the between-cluster variance (separation).
24 |
25 | While the Hartigan index is a useful measure, it is not as widely used as other clustering evaluation indices
26 | like the Silhouette coefficient or Dunn index. Nevertheless, it can provide insights into the quality of a clustering solution,
27 | particularly when comparing different clustering algorithms or determining the optimal number of clusters.
28 |
29 | The goal of the Hartigan index is to minimize this ratio. Lower values of the Hartigan index indicate better clustering
30 | solutions with lower within-cluster variance and higher separation between clusters.
31 |
32 |
33 | Example:
34 |
35 | .. code-block:: python
36 |
37 | import numpy as np
38 | from permetrics import ClusteringMetric
39 |
40 | ## For integer labels or categorical labels
41 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
42 | y_pred = np.array([0, 0, 1, 1, 1])
43 |
44 | cm = ClusteringMetric(X=data, y_pred=y_pred)
45 |
46 | print(cm.hartigan_index())
47 | print(cm.HI())
48 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/NPV.rst:
--------------------------------------------------------------------------------
1 | Negative Predictive Value (NPV)
2 | ===============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Negative Predictive Value (NPV)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. image:: /_static/images/class_score_1.png
19 |
20 | The negative predictive value is defined as the number of true negatives (people who test negative who don't have a condition) divided
21 | by the total number of people who test negative.
22 |
23 | The negative predictive value is the ratio tn / (tn + fn) where tn is the number of true negatives and fn the number of false negatives.
24 |
25 | In the multi-class and multi-label case, this is the average of the NPV score of each class with weighting depending on the average parameter.
26 |
27 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
28 | + https://www.debadityachakravorty.com/ai-ml/cmatrix/
29 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
30 | + https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
31 |
32 |
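For a quick binary check of the tn / (tn + fn) definition (label 1 taken as the positive class, an assumption of this illustration):

.. code-block:: python

    import numpy as np

    y_true = np.array([0, 1, 0, 0, 1, 0])
    y_pred = np.array([0, 1, 0, 0, 0, 1])

    tn = np.sum((y_pred == 0) & (y_true == 0))  # true negatives
    fn = np.sum((y_pred == 0) & (y_true == 1))  # false negatives
    print(tn / (tn + fn))                       # negative predictive value
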
33 | Example:
34 |
35 | .. code-block:: python
36 | :emphasize-lines: 11,13-16
37 |
38 | from numpy import array
39 | from permetrics.classification import ClassificationMetric
40 |
41 | ## For integer labels or categorical labels
42 | y_true = [0, 1, 0, 0, 1, 0]
43 | y_pred = [0, 1, 0, 0, 0, 1]
44 |
45 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
46 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
47 |
48 | cm = ClassificationMetric(y_true, y_pred)
49 |
50 | print(cm.npv(average=None))
51 | print(cm.NPV(average="micro"))
52 | print(cm.NPV(average="macro"))
53 | print(cm.NPV(average="weighted"))
54 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/FBS.rst:
--------------------------------------------------------------------------------
1 | F-Beta Score (FBS)
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: F-Beta Score (FBS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | The F-beta score is the weighted harmonic mean of precision and recall, reaching its optimal value at 1 and its worst value at 0.
19 |
20 | The beta parameter determines the weight of recall in the combined score. beta < 1 lends more weight to precision,
21 | while beta > 1 favors recall (beta -> 0 considers only precision, beta -> +inf only recall).
22 |
23 | .. math::
24 |
25 |     F_\beta = (1 + \beta^2) \cdot \frac{\text{precision} \cdot \text{recall}}{\beta^2 \cdot \text{precision} + \text{recall}}
26 |
27 | In the multi-class and multi-label case, this is the average of the FBS score of each class with weighting depending on the average parameter.
28 |
29 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
30 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
31 | + https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fbeta_score.html#sklearn.metrics.fbeta_score
32 |
33 |
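The formula above can be sketched as a plain function of precision and recall; this is only an illustration of the weighting behaviour, not the library's implementation:

.. code-block:: python

    def fbeta(precision, recall, beta):
        # weighted harmonic mean of precision and recall
        return (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)

    print(fbeta(0.5, 1.0, beta=1.0))  # equals the F1 score when beta = 1
    print(fbeta(0.5, 1.0, beta=2.0))  # beta > 1 weights recall more heavily
    print(fbeta(0.5, 1.0, beta=0.5))  # beta < 1 weights precision more heavily
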
34 | Example:
35 |
36 | .. code-block:: python
37 | :emphasize-lines: 11,13-16
38 |
39 | from numpy import array
40 | from permetrics.classification import ClassificationMetric
41 |
42 | ## For integer labels or categorical labels
43 | y_true = [0, 1, 0, 0, 1, 0]
44 | y_pred = [0, 1, 0, 0, 0, 1]
45 |
46 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
47 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
48 |
49 | cm = ClassificationMetric(y_true, y_pred)
50 |
51 | print(cm.fbeta_score(average=None))
52 | print(cm.fbeta_score(average="micro"))
53 | print(cm.FBS(average="macro"))
54 | print(cm.FBS(average="weighted"))
55 |
56 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/SS.rst:
--------------------------------------------------------------------------------
1 | Specificity Score (SS)
2 | ======================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Specificity Score (SS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. image:: /_static/images/class_score_1.png
19 |
20 | The specificity score is the ratio tn / (tn + fp) where tn is the number of true negatives and fp the number of false positives.
21 | It measures how many observations, out of all negative observations, we have classified as negative.
22 | In a fraud detection example, it tells us how many transactions, out of all non-fraudulent transactions, we marked as clean.
23 |
24 | In the multi-class and multi-label case, this is the average of the SS score of each class with weighting depending on the average parameter.
25 |
26 |
27 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
28 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
29 | + https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
30 | + https://www.debadityachakravorty.com/ai-ml/cmatrix/
31 |
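The tn / (tn + fp) definition can be verified by hand for the binary labels used below (label 1 taken as the positive class, an assumption of this illustration):

.. code-block:: python

    import numpy as np

    y_true = np.array([0, 1, 0, 0, 1, 0])
    y_pred = np.array([0, 1, 0, 0, 0, 1])

    tn = np.sum((y_pred == 0) & (y_true == 0))  # true negatives
    fp = np.sum((y_pred == 1) & (y_true == 0))  # false positives
    print(tn / (tn + fp))                       # specificity
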
32 | Example:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 11,13-16
36 |
37 | from numpy import array
38 | from permetrics.classification import ClassificationMetric
39 |
40 | ## For integer labels or categorical labels
41 | y_true = [0, 1, 0, 0, 1, 0]
42 | y_pred = [0, 1, 0, 0, 0, 1]
43 |
44 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
45 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
46 |
47 | cm = ClassificationMetric(y_true, y_pred)
48 |
49 | print(cm.specificity_score(average=None))
50 | print(cm.ss(average="micro"))
51 | print(cm.SS(average="macro"))
52 | print(cm.SS(average="weighted"))
53 |
--------------------------------------------------------------------------------
/examples/classification/02_oop_style_metric.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 11:35, 25/03/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## This is the modern and better way to use metrics
8 | ## You only need to pass y_true, y_pred once, when creating the metric object.
9 | ## After that, you can get the value of any metric without passing y_true, y_pred again.
10 |
11 | ## 1. Import packages, classes
12 | ## 2. Create object
13 | ## 3. From object call function and use
14 |
15 | import numpy as np
16 | from permetrics.classification import ClassificationMetric
17 |
18 | y_true = [0, 1, 0, 0, 1, 0]
19 | y_pred = [0, 1, 0, 0, 0, 1]
20 |
21 | evaluator = ClassificationMetric(y_true, y_pred)
22 |
23 | ## Get the result of any function you want to
24 |
25 | hamming_score = evaluator.hamming_score()
26 | mcc = evaluator.matthews_correlation_coefficient()
27 | specificity = evaluator.specificity_score()
28 |
29 | print(f"HS: {hamming_score}, MCC: {mcc}, specificity: {specificity}")
30 |
31 | # CM = confusion_matrix
32 | # PS = precision_score
33 | # NPV = negative_predictive_value
34 | # RS = recall_score
35 | # AS = accuracy_score
36 | # F1S = f1_score
37 | # F2S = f2_score
38 | # FBS = fbeta_score
39 | # SS = specificity_score
40 | # MCC = matthews_correlation_coefficient
41 | # HS = hamming_score
42 | # LS = lift_score
43 | # CKS = cohen_kappa_score
44 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
45 | # GMS = g_mean_score
46 | # GINI = gini_index
47 | # ROC = AUC = RAS = roc_auc_score
48 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/EC.rst:
--------------------------------------------------------------------------------
1 | EC - Efficiency Coefficient
2 | ===========================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: EC - Efficiency Coefficient
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 | \text{EC}(y, \hat{y}) = 1 - \frac{ \sum_{i=1}^n (y_i - \hat{y_i})^2 }{ \sum_{i=1}^n (y_i - mean(Y))^2 }
22 |
23 | Latex equation code::
24 |
25 | \text{EC}(y, \hat{y}) = 1 - \frac{ \sum_{i=1}^n (y_i - \hat{y_i})^2 }{ \sum_{i=1}^n (y_i - mean(Y))^2 }
26 |
27 | Efficiency Coefficient (EC) :cite:`joreskog1978structural` is a metric used to evaluate the accuracy of a regression model in predicting continuous values.
28 |
29 | + Best possible value = 1, bigger value is better. Range = (-inf, +1]
30 | + The EC ranges from negative infinity to 1, where a value of 1 indicates a perfect match between the model predictions and the observed data, and a value
31 | of 0 indicates that the model predictions are no better than the benchmark prediction.
32 | + A negative value indicates that the model predictions are worse than the benchmark prediction.
33 | + `Link to equation `_
34 |
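The equation above amounts to one minus the ratio of the squared prediction error to the total variability of the observations around their mean; a minimal NumPy sketch of exactly that formula:

.. code-block:: python

    import numpy as np

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])

    ec = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - y_true.mean()) ** 2)
    print(ec)
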
35 | Example to use EC metric:
36 |
37 | .. code-block:: python
38 | :emphasize-lines: 8-9,15-16
39 |
40 | from numpy import array
41 | from permetrics.regression import RegressionMetric
42 |
43 | ## For 1-D array
44 | y_true = array([3, -0.5, 2, 7])
45 | y_pred = array([2.5, 0.0, 2, 8])
46 |
47 | evaluator = RegressionMetric(y_true, y_pred)
48 | print(evaluator.efficiency_coefficient())
49 |
50 | ## For > 1-D array
51 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
52 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
53 |
54 | evaluator = RegressionMetric(y_true, y_pred)
55 | print(evaluator.EC(multi_output="raw_values"))
56 |
--------------------------------------------------------------------------------
/examples/classification/01_functional_style_metric.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 11:36, 25/03/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## This is the traditional way to call a specific metric you want to use.
8 | ## Every time you want to use a function, you need to pass y_true and y_pred
9 |
10 | ## 1. Import packages, classes
11 | ## 2. Create object
12 | ## 3. From object call function and use
13 |
14 | import numpy as np
15 | from permetrics.classification import ClassificationMetric
16 |
17 | y_true = [0, 1, 0, 0, 1, 0]
18 | y_pred = [0, 1, 0, 0, 0, 1]
19 |
20 | evaluator = ClassificationMetric()
21 |
22 | ## 3.1 Call specific function inside object, each function has 2 names like below
23 |
24 | ps1 = evaluator.precision_score(y_true, y_pred)
25 | ps2 = evaluator.PS(y_true, y_pred)
26 | print(f"Precision: {ps1}, {ps2}")
27 |
28 | recall = evaluator.recall_score(y_true, y_pred)
29 | accuracy = evaluator.accuracy_score(y_true, y_pred)
30 | print(f"recall: {recall}, accuracy: {accuracy}")
31 |
32 | # CM = confusion_matrix
33 | # PS = precision_score
34 | # NPV = negative_predictive_value
35 | # RS = recall_score
36 | # AS = accuracy_score
37 | # F1S = f1_score
38 | # F2S = f2_score
39 | # FBS = fbeta_score
40 | # SS = specificity_score
41 | # MCC = matthews_correlation_coefficient
42 | # HS = hamming_score
43 | # LS = lift_score
44 | # CKS = cohen_kappa_score
45 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
46 | # GMS = g_mean_score
47 | # GINI = gini_index
48 | # ROC = AUC = RAS = roc_auc_score
49 |
--------------------------------------------------------------------------------
/examples/clustering/03_exam_cases_indexes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 09:22, 02/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics import ClusteringMetric
9 | from sklearn.datasets import make_blobs
10 |
11 | # generate sample data
12 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
13 | y_pred_rand = []
14 | for idx in range(0, len(y_true)):
15 |     y_pred_rand.append(np.random.choice(list(set(range(0, 4)) - {y_true[idx]})))  # a random label different from the true label
16 | temp = [
17 | y_true.copy(), y_pred_rand, np.random.randint(0, 4, size=300),
18 | np.random.randint(0, 2, 300), np.random.randint(0, 6, 300),
19 | np.ones((300,)), np.zeros((300,))
20 | ]
21 | for idx in range(7):
22 | evaluator = ClusteringMetric(y_pred=temp[idx], X=X)
23 | print(evaluator.hartigan_index())
24 |
25 | # print(evaluator.get_metrics_by_list_names(["BHI", "XBI", "DBI", "BRI", "KDI", "DRI", "DI", "CHI",
26 | # "LDRI", "LSRI", "SI", "SSEI", "MSEI", "DHI", "BI", "RSI", "DBCVI", "HI"]))
27 |
28 | # BHI = ball_hall_index
29 | # XBI = xie_beni_index
30 | # DBI = davies_bouldin_index
31 | # BRI = banfeld_raftery_index
32 | # KDI = ksq_detw_index
33 | # DRI = det_ratio_index
34 | # DI = dunn_index
35 | # CHI = calinski_harabasz_index
36 | # LDRI = log_det_ratio_index
37 | # LSRI = log_ss_ratio_index
38 | # SI = silhouette_index
39 | # SSEI = sum_squared_error_index
40 | # MSEI = mean_squared_error_index
41 | # DHI = duda_hart_index
42 | # BI = beale_index
43 | # RSI = r_squared_index
44 | # DBCVI = density_based_clustering_validation_index
45 | # HI = hartigan_index
46 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/A30.rst:
--------------------------------------------------------------------------------
1 | A30 - A30 index
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: A30 - A30 index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{A30}(y, \hat{y}) = \frac{1}{n}\sum_{i=1}^{n} \left\{\begin{array}{ll} 1, & \textrm{if } \frac{|\hat{y}_i - y_i|}{y_i} \leq 0.3\\ 0, & \textrm{otherwise} \end{array}\right.
21 |
22 | Latex equation code::
23 |
24 | \text{A30}(y, \hat{y}) = \frac{1}{n}\sum_{i=1}^{n} \left\{\begin{array}{ll} 1, & \textrm{if } \frac{|\hat{y}_i - y_i|}{y_i} \leq 0.3\\ 0, & \textrm{otherwise} \end{array}\right.
25 |
26 |
27 | + Best possible score is 1.0, bigger value is better. Range = [0, 1]
28 | + The a30-index (A30) :cite:`van2023groundwater` is an evaluation metric showing the number of samples that fit the prediction values with a deviation of ±30%
29 |   compared to experimental values.
30 | + In other words, the A30 metric measures the proportion of cases where the absolute difference between the predicted and actual values is less than or equal
31 | to 30% of the actual value. A higher A30 score indicates better predictive accuracy, as the model is able to make more accurate predictions that are closer
32 | to the actual values.
33 |
34 |
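The A30 index is simply the fraction of samples whose relative deviation stays within ±30%. A minimal sketch of the equation above (using the absolute actual value in the denominator, an assumption made here so the ratio stays non-negative):

.. code-block:: python

    import numpy as np

    y_true = np.array([3.0, 2.0, 4.0, 5.0])
    y_pred = np.array([3.2, 2.9, 4.1, 5.0])

    a30 = np.mean(np.abs(y_pred - y_true) / np.abs(y_true) <= 0.3)
    print(a30)  # 3 of 4 samples are within ±30%, so 0.75
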
35 | Example to use A30 metric:
36 |
37 | .. code-block:: python
38 | :emphasize-lines: 8-9,15-16
39 |
40 | from numpy import array
41 | from permetrics.regression import RegressionMetric
42 |
43 | ## For 1-D array
44 | y_true = array([3, -0.5, 2, 7])
45 | y_pred = array([2.5, 0.0, 2, 8])
46 |
47 | evaluator = RegressionMetric(y_true, y_pred)
48 | print(evaluator.a30_index())
49 |
50 | ## For > 1-D array
51 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
52 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
53 |
54 | evaluator = RegressionMetric(y_true, y_pred)
55 | print(evaluator.A30(multi_output="raw_values"))
56 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/ES.rst:
--------------------------------------------------------------------------------
1 | Entropy Score (ES)
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Entropy Score (ES)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | Entropy is a metric used to evaluate the quality of clustering results, particularly when the ground truth labels of the data
18 | points are known. It measures the amount of uncertainty or disorder within the clusters produced by a clustering algorithm.
19 |
20 | Here's how the Entropy score is calculated::
21 |
22 | 1) For each cluster, compute the class distribution by counting the occurrences of each class label within the cluster.
23 | 2) Normalize the class distribution by dividing the count of each class label by the total number of data points in the cluster.
24 | 3) Compute the entropy for each cluster using the normalized class distribution.
25 | 4) Weight the entropy of each cluster by its relative size (proportion of data points in the whole dataset).
26 | 5) Sum up the weighted entropies of all clusters.
27 |
28 | The resulting value is the Entropy score, which typically ranges from 0 to 1. A lower Entropy score indicates better clustering,
29 | as it implies more purity and less uncertainty within the clusters.
30 |
31 | Entropy score considers both the composition of each cluster and the distribution of classes within the clusters.
32 | It provides a more comprehensive evaluation of clustering performance compared to simple metrics like Purity.
33 |
34 |
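The five steps above translate almost line by line into NumPy. The sketch below uses a base-2 logarithm and no extra normalization, so its exact values may differ slightly from the library's implementation:

.. code-block:: python

    import numpy as np

    def entropy_score(y_true, y_pred):
        n, total = len(y_true), 0.0
        for cluster in np.unique(y_pred):
            members = y_true[y_pred == cluster]
            _, counts = np.unique(members, return_counts=True)
            p = counts / counts.sum()           # steps 1-2: normalized class distribution
            h = -np.sum(p * np.log2(p))         # step 3: entropy of this cluster
            total += (len(members) / n) * h     # steps 4-5: weight by cluster size and sum
        return total

    y_true = np.array([0, 0, 1, 1, 1, 2, 2, 1])
    y_pred = np.array([0, 0, 1, 1, 2, 2, 2, 2])
    print(entropy_score(y_true, y_pred))
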
35 | Example:
36 |
37 | .. code-block:: python
38 |
39 | import numpy as np
40 | from permetrics import ClusteringMetric
41 |
42 | ## For integer labels or categorical labels
43 | y_true = np.array([0, 0, 1, 1, 1, 2, 2, 1])
44 | y_pred = np.array([0, 0, 1, 1, 2, 2, 2, 2])
45 |
46 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
47 |
48 | print(cm.entropy_score())
49 | print(cm.ES())
50 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_SSEI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 |
12 | np.random.seed(100)
13 |
14 |
15 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
16 | centroids = np.random.randn(num_clusters, num_features)
17 | labels = np.random.randint(0, num_clusters, num_samples)
18 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
19 | return data, centroids, labels
20 |
21 |
22 | def calculate_sse(data, centroids, labels):
23 | centroid_distances = centroids[labels]
24 | squared_distances = np.sum(np.square(data - centroid_distances), axis=1)
25 | sse = np.sum(squared_distances)
26 | return sse
27 |
28 |
29 | num_samples = 10000000
30 | num_features = 2
31 | num_clusters = 5
32 | cluster_std = 0.5
33 |
34 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
35 |
36 | # Calculate SSE using the optimized function
37 | time01 = time.perf_counter()
38 | sse = calculate_sse(data, centroids, labels)
39 | print("Sum of Squared Errors:", sse, time.perf_counter() - time01)
40 |
41 | time02 = time.perf_counter()
42 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
43 | sse02 = cm.sum_squared_error_index()
44 | print("SSE: ", sse02, time.perf_counter() - time02 )
45 |
46 | time03 = time.perf_counter()
47 | s3 = cut.calculate_sum_squared_error_index(data, labels)
48 | print("SSE1: ", s3, time.perf_counter() - time03)
49 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/A20.rst:
--------------------------------------------------------------------------------
1 | A20 - A20 index
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: A20 - A20 index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 |     \text{A20}(y, \hat{y}) = \frac{1}{n}\sum_{i=1}^{n} \left\{\begin{array}{ll} 1, & \textrm{if } \frac{|\hat{y}_i - y_i|}{y_i} \leq 0.2\\ 0, & \textrm{otherwise} \end{array}\right.
21 |
22 | Latex equation code::
23 |
24 | \text{A20}(y, \hat{y}) = \frac{1}{n}\sum_{i=1}^{n} \left\{\begin{array}{ll} 1, & \textrm{if } \frac{|\hat{y}_i - y_i|}{y_i} \leq 0.2\\ 0, & \textrm{otherwise} \end{array}\right.
25 |
26 |
27 | + The a20-index (A20) :cite:`van2023groundwater` is an evaluation metric showing the number of samples that fit the prediction values with a deviation of ±20%
28 |   compared to experimental values.
29 | + In other words, the A20 metric measures the proportion of cases where the absolute difference between the predicted and actual values is less than or equal
30 | to 20% of the actual value. A higher A20 score indicates better predictive accuracy, as the model is able to make more accurate predictions that are closer
31 | to the actual values.
32 | + Best possible score is 1.0, bigger value is better. Range = [0, 1]
33 |
34 |
35 | Example to use A20 metric:
36 |
37 | .. code-block:: python
38 | :emphasize-lines: 8-9,15-16
39 |
40 | from numpy import array
41 | from permetrics.regression import RegressionMetric
42 |
43 | ## For 1-D array
44 | y_true = array([3, -0.5, 2, 7])
45 | y_pred = array([2.5, 0.0, 2, 8])
46 |
47 | evaluator = RegressionMetric(y_true, y_pred)
48 | print(evaluator.a20_index())
49 |
50 | ## For > 1-D array
51 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
52 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
53 |
54 | evaluator = RegressionMetric(y_true, y_pred)
55 | print(evaluator.A20(multi_output="raw_values"))
56 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/EVS.rst:
--------------------------------------------------------------------------------
1 | EVS - Explained Variance Score
2 | ==============================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: EVS - Explained Variance Score
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | EVS = 1 - \frac{Var\{ y_{true} - y_{pred} \}}{Var \{ y_{true} \} }
21 |
22 | + `Link to equation `_
23 |
24 | The given math formula defines the explained variance score (EVS) :cite:`nguyen2020eo`, which is a metric used in regression analysis to evaluate the
25 | performance of a model. The formula computes one minus the ratio of the variance of the residuals (y_true - y_pred) to
26 | the variance of the true values y_true.
27 |
28 | The resulting score ranges between -infinity and 1, with a score of 1 indicating a perfect match between the true and predicted values and a score of 0
29 | indicating that the model does not perform better than predicting the mean of the true values.
30 |
31 | A higher value of EVS indicates a better performance of the model. Best possible score is 1.0, greater values are better. Range = (-inf, 1.0].
32 |
33 |
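A one-line NumPy sketch of the EVS formula above (population variance via ``np.var``; the library may handle multi-output data and edge cases differently):

.. code-block:: python

    import numpy as np

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])

    evs = 1 - np.var(y_true - y_pred) / np.var(y_true)
    print(evs)
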
34 | Example to use EVS metric:
35 |
36 | .. code-block:: python
37 | :emphasize-lines: 8-9,15-16
38 |
39 | from numpy import array
40 | from permetrics.regression import RegressionMetric
41 |
42 | ## For 1-D array
43 | y_true = array([3, -0.5, 2, 7])
44 | y_pred = array([2.5, 0.0, 2, 8])
45 |
46 | evaluator = RegressionMetric(y_true, y_pred)
47 | print(evaluator.explained_variance_score())
48 |
49 | ## For > 1-D array
50 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
51 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
52 |
53 | evaluator = RegressionMetric(y_true, y_pred)
54 | print(evaluator.EVS(multi_output="raw_values"))
55 |
--------------------------------------------------------------------------------
/permetrics/utils/regressor_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 12:23, 19/05/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 |
9 |
10 | def calculate_nse(y_true, y_pred):
11 | return 1 - np.sum((y_true - y_pred) ** 2, axis=0) / np.sum((y_true - np.mean(y_true, axis=0)) ** 2, axis=0)
12 |
13 |
14 | def calculate_wi(y_true, y_pred):
15 | m1 = np.mean(y_true, axis=0)
16 | return 1 - np.sum((y_pred - y_true) ** 2, axis=0) / np.sum((np.abs(y_pred - m1) + np.abs(y_true - m1)) ** 2, axis=0)
17 |
18 |
19 | def calculate_pcc(y_true, y_pred):
20 | m1, m2 = np.mean(y_true, axis=0), np.mean(y_pred, axis=0)
21 | numerator = np.sum((y_true - m1) * (y_pred - m2), axis=0)
22 | denominator = np.sqrt(np.sum((y_true - m1) ** 2, axis=0)) * np.sqrt(np.sum((y_pred - m2) ** 2, axis=0))
23 | return numerator / denominator
24 |
25 |
26 | def calculate_absolute_pcc(y_true, y_pred):
27 | m1, m2 = np.mean(y_true, axis=0), np.mean(y_pred, axis=0)
28 | numerator = np.sum(np.abs(y_true - m1) * np.abs(y_pred - m2), axis=0)
29 | denominator = np.sqrt(np.sum((y_true - m1) ** 2, axis=0)) * np.sqrt(np.sum((y_pred - m2) ** 2, axis=0))
30 | return numerator / denominator
31 |
32 |
33 | def calculate_entropy(y_true, y_pred):
34 | return -np.sum(y_true * np.log2(y_pred), axis=0)
35 |
36 |
37 | def calculate_ec(y_true, y_pred):
38 | m1 = np.mean(y_true, axis=0)
39 | numerator = np.sum((y_true - y_pred)**2, axis=0)
40 | denominator = np.sum((y_true - m1) ** 2, axis=0)
41 | return 1.0 - numerator / denominator
42 |
43 |
44 | def calculate_mse(y_true, y_pred):
45 | return np.mean((y_true - y_pred) ** 2, axis=0)
46 |
--------------------------------------------------------------------------------
/docs/source/pages/classification/F1S.rst:
--------------------------------------------------------------------------------
1 | F1 Score (F1S)
2 | ==============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: F1 Score (F1S)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. image:: /_static/images/class_score_1.png
19 |
20 | Compute the F1 score, also known as balanced F-score or F-measure.
21 |
22 | The F1 score can be interpreted as a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0.
23 | The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is
24 |
25 | .. math::
26 |
27 |     F1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}
28 |
29 | In the multi-class and multi-label case, this is the average of the F1 score of each class with weighting depending on the average parameter.
30 |
31 | + Best possible score is 1.0, higher value is better. Range = [0, 1]
32 | + https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
33 | + https://www.debadityachakravorty.com/ai-ml/cmatrix/
34 | + https://neptune.ai/blog/evaluation-metrics-binary-classification
35 | + https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
36 |
37 |
38 | Example:
39 |
40 | .. code-block:: python
41 | :emphasize-lines: 11,13-16
42 |
43 | from numpy import array
44 | from permetrics.classification import ClassificationMetric
45 |
46 | ## For integer labels or categorical labels
47 | y_true = [0, 1, 0, 0, 1, 0]
48 | y_pred = [0, 1, 0, 0, 0, 1]
49 |
50 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
51 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
52 |
53 | cm = ClassificationMetric(y_true, y_pred)
54 |
55 | print(cm.f1_score(average=None))
56 | print(cm.F1S(average="micro"))
57 | print(cm.F1S(average="macro"))
58 | print(cm.F1S(average="weighted"))
59 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MAE.rst:
--------------------------------------------------------------------------------
1 | MAE - Mean Absolute Error
2 | =========================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MAE - Mean Absolute Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{MAE}(y, \hat{y}) = \frac{ \sum_{i=0}^{N - 1} |y_i - \hat{y}_i| }{N}
21 |
22 | Mean Absolute Error (MAE) :cite:`nguyen2018resource` is a statistical measure used to evaluate the accuracy of a forecasting model, such as a regression
23 | model or a time series model. It measures the average magnitude of the errors between the predicted values and the actual values in the units of the response
24 | variable. The MAE is calculated as the average of the absolute differences between the predicted values and the actual values. In other words, it is the mean
25 | of the absolute errors. Best possible score is 0.0, smaller value is better. Range = [0, +inf)
26 |
27 | The MAE is a widely used measure of forecast accuracy because it is easy to understand and interpret. A lower MAE indicates better forecast accuracy.
28 | However, like the RMSE, the MAE is not normalized and is dependent on the scale of the response variable, making it difficult to compare the MAE values
29 | across different datasets with different scales.
30 |
31 |
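As a reference point, the MAE definition above is a one-liner in NumPy; this sketch is only for intuition and ignores the multi-output handling of the library:

.. code-block:: python

    import numpy as np

    y_true = np.array([3, -0.5, 2, 7])
    y_pred = np.array([2.5, 0.0, 2, 8])

    print(np.mean(np.abs(y_true - y_pred)))  # mean of the absolute errors
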
32 | Example to use MAE metric:
33 |
34 | .. code-block:: python
35 | :emphasize-lines: 8-9,15-16
36 |
37 | from numpy import array
38 | from permetrics.regression import RegressionMetric
39 |
40 | ## For 1-D array
41 | y_true = array([3, -0.5, 2, 7])
42 | y_pred = array([2.5, 0.0, 2, 8])
43 |
44 | evaluator = RegressionMetric(y_true, y_pred)
45 | print(evaluator.mean_absolute_error())
46 |
47 | ## For > 1-D array
48 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
49 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.MAE(multi_output="raw_values"))
53 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/DBCVI.rst:
--------------------------------------------------------------------------------
1 | Density-Based Clustering Validation Index (DBCVI)
2 | =================================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Density-Based Clustering Validation Index (DBCVI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Density-Based Clustering Validation (DBCV) metric is another clustering validation metric that is used to evaluate the quality
18 | of a clustering solution, particularly for density-based clustering algorithms such as DBSCAN.
19 |
20 | The DBCV metric measures the average ratio of the distances between the data points and their cluster centroids, to the
21 | distances between the data points and the nearest data points in other clusters. The idea is that a good clustering solution
22 | should have compact and well-separated clusters, so the ratio of these distances should be high.
23 |
24 | The DBCV metric is calculated using the following formula::
25 |
26 | DBCV = (1 / n) * sum_{i=1}^n (sum_{j=1}^n (d(i,j) / max{d(i,k), k!=j}))
27 |
28 | where n is the number of data points, d(i,j) is the Euclidean distance between data points i and j, and max{d(i,k), k!=j} is
29 | the maximum distance between data point i and any other data point in a different cluster.
30 |
31 | The DBCV metric ranges from 0 to 1, with lower values indicating better clustering solutions. A value of 0 indicates a perfect
32 | clustering solution, where all data points belong to their own cluster and the distances between clusters are maximized.
33 |
34 |
35 | Example:
36 |
37 | .. code-block:: python
38 |
39 | import numpy as np
40 | from permetrics import ClusteringMetric
41 |
42 | ## For integer labels or categorical labels
43 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
44 | y_pred = np.array([0, 0, 1, 1, 1])
45 |
46 | cm = ClusteringMetric(X=data, y_pred=y_pred)
47 |
48 | print(cm.density_based_clustering_validation_index())
49 | print(cm.DBCVI())
50 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/A10.rst:
--------------------------------------------------------------------------------
1 | A10 - A10 index
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: A10 - A10 index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{A10}(y, \hat{y}) = \frac{1}{n}\sum_{i=1}^{n} \left\{\begin{array}{ll} 1, & \textrm{if } \frac{|\hat{y}_i - y_i|}{y_i} \leq 0.1\\ 0, & \textrm{otherwise} \end{array}\right.
21 |
22 | Latex equation code::
23 |
24 | \text{A10}(y, \hat{y}) = \frac{1}{n}\sum_{i=1}^{n} \left\{\begin{array}{ll} 1, & \textrm{if } \frac{|\hat{y}_i - y_i|}{y_i} \leq 0.1\\ 0, & \textrm{otherwise} \end{array}\right.
25 |
26 |
27 | + Best possible score is 1.0, bigger value is better. Range = [0, 1]
28 | + The a10-index is an engineering index for evaluating artificial intelligence models by showing the number of samples that fit the
29 |   prediction values with a deviation of ±10% compared to experimental values.
30 | + `Link to equation `_
31 |
32 | In other words, the A10 metric measures the proportion of cases where the absolute difference between the predicted and actual values is less than or equal
33 | to 10% of the actual value. A higher A10 score indicates better predictive accuracy, as the model is able to make more accurate predictions that are closer
34 | to the actual values.
35 |
36 |
37 | Example to use A10 metric:
38 |
39 | .. code-block:: python
40 | :emphasize-lines: 8-9,15-16
41 |
42 | from numpy import array
43 | from permetrics.regression import RegressionMetric
44 |
45 | ## For 1-D array
46 | y_true = array([3, -0.5, 2, 7])
47 | y_pred = array([2.5, 0.0, 2, 8])
48 |
49 | evaluator = RegressionMetric(y_true, y_pred)
50 | print(evaluator.a10_index())
51 |
52 | ## For > 1-D array
53 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
54 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
55 |
56 | evaluator = RegressionMetric(y_true, y_pred)
57 | print(evaluator.A10(multi_output="raw_values"))
58 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | run.py
3 | docs/_build/
4 | docs/build/
5 | .github/assets/pdf/
6 | # Pycharm
7 | .idea/
8 | tut_upcode.md
9 | drafts/
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 | .vscode/settings.json
116 | *.stackdump
117 | examples/clustering/Clustering.ipynb
118 | examples/clustering/Inputs.ipynb
119 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MAPE.rst:
--------------------------------------------------------------------------------
1 | MAPE - Mean Absolute Percentage Error
2 | =====================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MAPE - Mean Absolute Percentage Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{MAPE}(y, \hat{y}) = \frac{100\%}{N} \sum_{i=0}^{N - 1} \frac{|y_i - \hat{y}_i|}{|y_i|}
21 |
22 | The Mean Absolute Percentage Error (MAPE) :cite:`nguyen2020new` is a statistical measure of the accuracy of a forecasting model, commonly used in
23 | business and economics. The MAPE measures the average percentage difference between the forecasted and actual values, with a lower MAPE indicating better
24 | forecast accuracy.
25 |
26 | The MAPE is expressed as a percentage, and a commonly used benchmark for a good forecast model is a MAPE of less than 20%. However, the benchmark may vary
27 | depending on the specific application and industry. The MAPE has a range of [0, +infinity), with a best possible score of 0.0, indicating perfect forecast
28 | accuracy. A larger MAPE indicates a larger average percentage difference between the forecasted and actual values, with infinite MAPE indicating a complete
29 | failure of the forecasting model.
30 | + `Link equation `_
31 |
32 |
33 | Example to use MAPE metric:
34 |
35 | .. code-block:: python
36 | :emphasize-lines: 8-9,15-16
37 |
38 | from numpy import array
39 | from permetrics.regression import RegressionMetric
40 |
41 | ## For 1-D array
42 | y_true = array([3, -0.5, 2, 7])
43 | y_pred = array([2.5, 0.0, 2, 8])
44 |
45 | evaluator = RegressionMetric(y_true, y_pred)
46 | print(evaluator.mean_absolute_percentage_error())
47 |
48 | ## For > 1-D array
49 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
50 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.MAPE(multi_output="raw_values"))
54 |
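55 | For reference, the snippet below is a minimal NumPy sketch of the formula above; it only illustrates the
56 | calculation and is not the library's implementation (which also handles multi-output data).
57 |
58 | .. code-block:: python
59 |
60 |     import numpy as np
61 |
62 |     y_true = np.array([3, -0.5, 2, 7])
63 |     y_pred = np.array([2.5, 0.0, 2, 8])
64 |
65 |     # Mean of |y_i - y_hat_i| / |y_i|, multiplied by 100 to express it as a percentage
66 |     mape = 100 * np.mean(np.abs(y_true - y_pred) / np.abs(y_true))
67 |     print(mape)
68 |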
--------------------------------------------------------------------------------
/docs/source/pages/regression/AR2.rst:
--------------------------------------------------------------------------------
1 | AR2 - Adjusted R2
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: AR2 - Adjusted R2
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{AR2}(y, \hat{y}) = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}
21 |
22 | where :math:`n` is the sample size, :math:`k` is the number of predictors, and :math:`R^2` is the coefficient of determination
23 |
24 | Latex equation code::
25 |
26 | \text{AR2}(y, \hat{y}) = \textrm{Adjusted R}^2 = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}
27 |
28 |
29 | + Adjusted Coefficient of Determination (ACOD/AR2): Best possible score is 1.0, bigger value is better. Range = (-inf, 1]
30 | + `Link to equation `_
31 | + Scikit-learn and other websites denote COD as R^2 (or R squared), which leads to the misunderstanding of R^2 in which R is the PCC.
32 | + We should denote it as COD or R2 only.
33 |
34 | The Adjusted R2 is calculated as a modification of the R2 that takes into account the number of predictors :math:`k` in the model.
35 | It provides a more accurate measure of the goodness-of-fit of a model
36 | with multiple predictors.
37 |
38 |
39 | Example to use AR2 metric:
40 |
41 | .. code-block:: python
42 | :emphasize-lines: 8-9,15-16
43 |
44 | from numpy import array
45 | from permetrics.regression import RegressionMetric
46 |
47 | ## For 1-D array
48 | y_true = array([3, -0.5, 2, 7])
49 | y_pred = array([2.5, 0.0, 2, 8])
50 |
51 | evaluator = RegressionMetric(y_true, y_pred)
52 | print(evaluator.adjusted_coefficient_of_determination())
53 |
54 | ## For > 1-D array
55 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
56 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
57 |
58 | evaluator = RegressionMetric(y_true, y_pred)
59 | print(evaluator.AR2(multi_output="raw_values"))
60 |
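61 | For reference, the snippet below is a minimal NumPy sketch of the Latex equation above. The number of
62 | predictors :math:`k` is set to 1 purely as an illustrative assumption; the library's method may use a
63 | different default, so this is not its implementation.
64 |
65 | .. code-block:: python
66 |
67 |     import numpy as np
68 |
69 |     y_true = np.array([3, -0.5, 2, 7])
70 |     y_pred = np.array([2.5, 0.0, 2, 8])
71 |
72 |     n, k = len(y_true), 1   # k = 1 predictor is an assumption made only for this illustration
73 |     ss_res = np.sum((y_true - y_pred) ** 2)
74 |     ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
75 |     r2 = 1 - ss_res / ss_tot
76 |     ar2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
77 |     print(r2, ar2)
78 |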
--------------------------------------------------------------------------------
/docs/source/pages/regression/NNSE.rst:
--------------------------------------------------------------------------------
1 | NNSE - Normalized NSE
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: NNSE - Normalized NSE
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{NNSE}(y, \hat{y}) = \frac{1}{2 - NSE}
21 |
22 | Latex equation code::
23 |
24 | \text{NNSE}(y, \hat{y}) = \frac{1}{2 - NSE}
25 |
26 |
27 | The Normalized NSE (NNSE) :cite:`ahmed2021comprehensive` is a statistical measure used to evaluate the performance of hydrological models in simulating
28 | streamflow. It is a variant of the Nash-Sutcliffe Efficiency (NSE), which is a widely used measure of model performance in hydrology.
29 |
30 | The NNSE accounts for the variability in the observed streamflow and provides a more objective measure of model performance than the NSE alone. The NNSE is
31 | commonly used in hydrology and water resources engineering to evaluate the performance of hydrological models in simulating streamflow and to compare the
32 | performance of different models.
33 |
34 | + Normalized Nash-Sutcliffe Efficiency (NNSE): Best possible score is 1.0, greater value is better. Range = [0, 1]
35 | + `Link to equation `_
36 |
37 |
38 | Example to use NNSE metric:
39 |
40 | .. code-block:: python
41 | :emphasize-lines: 8-9,15-16
42 |
43 | from numpy import array
44 | from permetrics.regression import RegressionMetric
45 |
46 | ## For 1-D array
47 | y_true = array([3, -0.5, 2, 7])
48 | y_pred = array([2.5, 0.0, 2, 8])
49 |
50 | evaluator = RegressionMetric(y_true, y_pred)
51 | print(evaluator.normalized_nash_sutcliffe_efficiency())
52 |
53 | ## For > 1-D array
54 | y_true = array([[0.5, 1], [-1, 1], [7, -6], [1, 2], [2.1, 2.2], [3.4, 5.5]])
55 | y_pred = array([[0, 2], [-1, 2], [8, -5], [1.1, 1.9], [2.0, 2.3], [3.0, 4.2]])
56 |
57 | evaluator = RegressionMetric(y_true, y_pred)
58 | print(evaluator.NNSE(multi_output="raw_values"))
59 |
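60 | For reference, the snippet below is a minimal NumPy sketch of the equation above: the NSE is computed first and
61 | then mapped to NNSE = 1 / (2 - NSE). It only illustrates the formula and is not the library's implementation.
62 |
63 | .. code-block:: python
64 |
65 |     import numpy as np
66 |
67 |     y_true = np.array([3, -0.5, 2, 7])
68 |     y_pred = np.array([2.5, 0.0, 2, 8])
69 |
70 |     # Nash-Sutcliffe Efficiency, then its normalized form
71 |     nse = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
72 |     nnse = 1 / (2 - nse)
73 |     print(nse, nnse)
74 |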
--------------------------------------------------------------------------------
/docs/source/pages/clustering/RSI.rst:
--------------------------------------------------------------------------------
1 | R-Squared Index (RSI)
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: R-Squared Index (RSI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The R-squared index is another clustering validation metric that is used to measure the quality of a clustering solution.
18 | It is based on the idea of comparing the variance of the data before and after clustering.
19 | The R-squared index measures the proportion of the total variance in the data that is explained by the clustering solution.
20 |
21 | The R-squared index is calculated using the following formula::
22 |
23 | R-squared = (total variance - variance within clusters) / total variance
24 |
25 | where total variance is the variance of the entire dataset, and variance within clusters is the sum of the variances
26 | of each cluster.
27 | The R-squared index ranges from -inf to 1, with higher values indicating better clustering solutions. A negative value indicates that the clustering
28 | solution is worse than random, while a value of 0 indicates that the clustering solution explains no variance beyond chance.
29 | A value of 1 indicates that the clustering solution perfectly explains all the variance in the data.
30 |
31 |
32 | Note that the R-squared index has some limitations, as it can be biased towards solutions with more clusters.
33 | It is also sensitive to the scale and dimensionality of the data, and may not be appropriate for all clustering problems.
34 | Therefore, it's important to consider multiple validation metrics when evaluating clustering solutions.
35 |
36 | Example:
37 |
38 | .. code-block:: python
39 |
40 | import numpy as np
41 | from permetrics import ClusteringMetric
42 |
43 | ## For integer labels or categorical labels
44 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
45 | y_pred = np.array([0, 0, 1, 1, 1])
46 |
47 | cm = ClusteringMetric(X=data, y_pred=y_pred)
48 |
49 | print(cm.r_squared_index())
50 | print(cm.RSI())
51 |
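52 | For reference, the snippet below is a rough NumPy sketch of the formula above, reading the "variance" terms as
53 | sums of squared deviations from the respective means. It is only an illustration under that assumption; the
54 | library's r_squared_index may differ in the exact normalization it uses.
55 |
56 | .. code-block:: python
57 |
58 |     import numpy as np
59 |
60 |     data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
61 |     y_pred = np.array([0, 0, 1, 1, 1])
62 |
63 |     # Total sum of squared deviations from the overall mean
64 |     ss_total = np.sum((data - data.mean(axis=0)) ** 2)
65 |     # Sum of squared deviations of each point from its own cluster mean
66 |     ss_within = sum(np.sum((data[y_pred == k] - data[y_pred == k].mean(axis=0)) ** 2)
67 |                     for k in np.unique(y_pred))
68 |     print((ss_total - ss_within) / ss_total)
69 |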
--------------------------------------------------------------------------------
/docs/source/pages/clustering/BI.rst:
--------------------------------------------------------------------------------
1 | Beale Index (BI)
2 | ================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Beale Index (BI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Beale Index is a clustering validation metric that measures the quality of a clustering solution by computing the
18 | ratio of the within-cluster sum of squares to the between-cluster sum of squares.
19 | It is also known as the "variance ratio criterion" or the "F-ratio".
20 |
21 | The within-cluster sum of squares is a measure of the variability of the data points within each cluster,
22 | while the between-cluster sum of squares is a measure of the variability between the clusters.
23 | The idea is that a good clustering solution should have low within-cluster variation and high
24 | between-cluster variation, which results in a high Beale Index value.
25 |
26 | The Beale Index can be calculated using the following formula::
27 |
28 | Beale Index = (sum of squared errors within clusters / degrees of freedom within clusters) / (sum of squared errors between clusters / degrees of freedom between clusters)
29 |
30 | where the degrees of freedom are the number of data points minus the number of clusters, and the sum of squared errors is
31 | the sum of the squared distances between each data point and the centroid of its assigned cluster.
32 |
33 | The Beale Index ranges from 0 to infinity, with higher values indicating better clustering solutions.
34 | However, the Beale Index has a tendency to favor solutions with more clusters, so it's important to
35 | consider other metrics in conjunction with it.
36 |
37 |
38 | Example:
39 |
40 | .. code-block:: python
41 |
42 | import numpy as np
43 | from permetrics import ClusteringMetric
44 |
45 | ## For integer labels or categorical labels
46 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
47 | y_pred = np.array([0, 0, 1, 1, 1])
48 |
49 | cm = ClusteringMetric(X=data, y_pred=y_pred)
50 |
51 | print(cm.beale_index())
52 | print(cm.BI())
53 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/COR.rst:
--------------------------------------------------------------------------------
1 | COR - Correlation
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: COR - Correlation
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 | \text{COR}(y, \hat{y}) = \frac{ COV(y, \hat{y}) }{ std(y) * std(\hat{y})}
22 |
23 | Correlation :cite:`joreskog1978structural` measures the strength of the relationship between the variables and is a scaled measure of covariance. The
24 | correlation coefficient ranges from -1 to +1, where a value of 1 indicates a perfect positive correlation, a value of -1 indicates a perfect negative
25 | correlation, and a value of 0 indicates no correlation.
26 |
27 | To calculate the correlation coefficient, you divide the covariance of the variables by the product of their standard deviations. This normalization allows
28 | for comparison between different pairs of variables. The correlation coefficient is dimensionless and does not have any specific units of measurement.
29 |
30 | + Best possible value = 1, bigger value is better. Range = [-1, +1]
31 | + Measures the strength of the relationship between variables; it is a scaled measure of covariance and is dimensionless.
32 | + The correlation coefficient is always a pure number and is not measured in any units.
33 | + `Link to equation `_
34 |
35 |
36 | Example to use COR metric:
37 |
38 | .. code-block:: python
39 | :emphasize-lines: 8-9,15-16
40 |
41 | from numpy import array
42 | from permetrics.regression import RegressionMetric
43 |
44 | ## For 1-D array
45 | y_true = array([3, -0.5, 2, 7])
46 | y_pred = array([2.5, 0.0, 2, 8])
47 |
48 | evaluator = RegressionMetric(y_true, y_pred)
49 | print(evaluator.correlation())
50 |
51 | ## For > 1-D array
52 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
53 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
54 |
55 | evaluator = RegressionMetric(y_true, y_pred)
56 | print(evaluator.COR(multi_output="raw_values"))
57 |
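58 | For reference, the snippet below is a minimal NumPy sketch of the equation above (covariance divided by the
59 | product of the standard deviations); it only illustrates the formula, not the library's implementation.
60 |
61 | .. code-block:: python
62 |
63 |     import numpy as np
64 |
65 |     y_true = np.array([3, -0.5, 2, 7])
66 |     y_pred = np.array([2.5, 0.0, 2, 8])
67 |
68 |     cov = np.mean((y_true - y_true.mean()) * (y_pred - y_pred.mean()))
69 |     cor = cov / (y_true.std() * y_pred.std())
70 |     print(cor)   # matches np.corrcoef(y_true, y_pred)[0, 1]
71 |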
--------------------------------------------------------------------------------
/docs/source/pages/regression/R2s.rst:
--------------------------------------------------------------------------------
1 | R2s - (Pearson’s Correlation Index)**2
2 | ======================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: R2s - (Pearson’s Correlation Index)**2
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{R2s}(y, \hat{y}) = \Bigg[ \frac{ \sum_{i=0}^{N - 1} ((y_i - mean(y))*(\hat{y_i} - mean(\hat{y}))) }{ \sqrt{ \sum_{i=0}^{N - 1} (y_i - mean(y))^2}*\sqrt{\sum_{i=0}^{N - 1} (\hat{y_i} - mean(\hat{y}))^2} } \Bigg]^2
21 |
22 | Latex equation code::
23 |
24 | \text{R2s}(y, \hat{y}) = \Bigg[ \frac{ \sum_{i=0}^{N - 1} ((y_i - mean(y))*(\hat{y_i} - mean(\hat{y}))) }{ \sqrt{ \sum_{i=0}^{N - 1} (y_i - mean(y))^2}*\sqrt{\sum_{i=0}^{N - 1} (\hat{y_i} - mean(\hat{y}))^2} } \Bigg]^2
25 |
26 | + (Pearson’s Correlation Index)^2 = R^2 = R2s (R square): Best possible score is 1.0, bigger value is better. Range = [0, 1]
27 | + This is actually a useless metric that I implemented here just to demonstrate the common misunderstanding between R2s and R2 (Coefficient of Determination).
28 | + Most online tutorials (articles, Wikipedia, ...) and even the scikit-learn library denote R2s and R2 incorrectly.
29 | + Writing R^2 = R2s = R squared makes people think of it as (Pearson’s Correlation Index)^2
30 | + However, R2 = Coefficient of Determination, `link `_
31 |
32 |
33 | Example to use R2s metric:
34 |
35 | .. code-block:: python
36 | :emphasize-lines: 8-9,15-16
37 |
38 | from numpy import array
39 | from permetrics.regression import RegressionMetric
40 |
41 | ## For 1-D array
42 | y_true = array([3, -0.5, 2, 7])
43 | y_pred = array([2.5, 0.0, 2, 8])
44 |
45 | evaluator = RegressionMetric(y_true, y_pred)
46 | print(evaluator.pearson_correlation_coefficient_square())
47 |
48 | ## For > 1-D array
49 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
50 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.R2s(multi_output="raw_values"))
54 |
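55 | To make the distinction above concrete, the snippet below is a small NumPy sketch that computes both quantities
56 | side by side: R2s as the squared Pearson correlation and R2 as the coefficient of determination. It is only an
57 | illustration, not the library's implementation.
58 |
59 | .. code-block:: python
60 |
61 |     import numpy as np
62 |
63 |     y_true = np.array([3, -0.5, 2, 7])
64 |     y_pred = np.array([2.5, 0.0, 2, 8])
65 |
66 |     # R2s: square of Pearson's correlation coefficient
67 |     r2s = np.corrcoef(y_true, y_pred)[0, 1] ** 2
68 |     # R2: coefficient of determination, a different quantity
69 |     r2 = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
70 |     print(r2s, r2)
71 |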
--------------------------------------------------------------------------------
/permetrics/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 11:23, 16/03/2020 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 | #
7 | # ## 1. Import packages, classes
8 | # ## 2. Create object
9 | # ## 3. From object calls function and use
10 | #
11 | # import numpy as np
12 | # from permetrics import RegressionMetric, ClassificationMetric, ClusteringMetric
13 | #
14 | # ##### Regression performance
15 | # y_true = np.array([3, -0.5, 2, 7, 5, 6])
16 | # y_pred = np.array([2.5, 0.0, 2, 8, 5, 6])
17 | #
18 | # evaluator = RegressionMetric(y_true, y_pred)
19 | #
20 | # ## Get the result of any function you want to
21 | # rmse = evaluator.RMSE()
22 | # mse = evaluator.MSE()
23 | # mae = evaluator.MAE()
24 | # print(f"RMSE: {rmse}, MSE: {mse}, MAE: {mae}")
25 | #
26 | #
27 | # ##### Classification performance
28 | # y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "bird"]
29 | # y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "ant"]
30 | #
31 | # evaluator = ClassificationMetric(y_true, y_pred)
32 | #
33 | # ## Get the result of any function you want to
34 | # print(evaluator.f1_score())
35 | # print(evaluator.F1S(average="micro"))
36 | # print(evaluator.F1S(average="macro"))
37 | # print(evaluator.F1S(average="weighted"))
38 | #
39 | #
40 | # ##### Clustering performance
41 | # X = np.random.uniform(-1, 10, size=(300, 5))
42 | # y_true = np.random.randint(0, 4, size=300)
43 | # y_pred = np.random.randint(0, 4, size=300)
44 | #
45 | # external_evaluator = ClusteringMetric(y_true=y_true, y_pred=y_pred)
46 | # print(external_evaluator.mutual_info_score())
47 | # print(external_evaluator.MIS())
48 | #
49 | # internal_evaluator = ClusteringMetric(y_pred=y_pred, X=X)
50 | # print(internal_evaluator.banfeld_raftery_index())
51 | # print(internal_evaluator.BRI())
52 |
53 |
54 | __version__ = "2.0.0"
55 |
56 | from .evaluator import Evaluator
57 | from .classification import ClassificationMetric
58 | from .regression import RegressionMetric
59 | from .clustering import ClusteringMetric
60 |
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_XBI.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from scipy.spatial.distance import cdist
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(num_samples, num_features, num_clusters, cluster_std):
17 | centroids = np.random.randn(num_clusters, num_features)
18 | labels = np.random.randint(0, num_clusters, num_samples)
19 | data = centroids[labels] + np.random.randn(num_samples, num_features) * cluster_std
20 | return data, centroids, labels
21 |
22 |
23 | def xie_beni_index(data, labels, centroids):
24 | num_clusters = len(np.unique(labels)) # Number of clusters
25 | wgss = np.sum(np.min(cdist(data, centroids, metric='euclidean'), axis=1) ** 2)
26 | list_dist = []
27 | for k in range(num_clusters):
28 | for k0 in range(k + 1, num_clusters):
29 | list_dist.append(np.sum((centroids[k] - centroids[k0]) ** 2))
30 | C = (wgss / np.min(list_dist)) / len(data)
31 | return C
32 |
33 |
34 | num_samples = 10000000
35 | num_features = 2
36 | num_clusters = 5
37 | cluster_std = 0.5
38 |
39 | data, centroids, labels = generate_dataset(num_samples, num_features, num_clusters, cluster_std)
40 |
41 | time03 = time.perf_counter()
42 | s3 = xie_beni_index(data, labels, centroids)
43 | print("XBI 1: ", s3, time.perf_counter() - time03)
44 |
45 | time02 = time.perf_counter()
46 | cm = ClusteringMetric(y_true=labels, y_pred=labels, X=data)
47 | res = cm.xie_beni_index()
48 | print("XBI 2: ", res, time.perf_counter() - time02 )
49 |
50 | time03 = time.perf_counter()
51 | s3 = cut.calculate_xie_beni_index(data, labels)
52 | print("XBI 3: ", s3, time.perf_counter() - time03)
53 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/BHI.rst:
--------------------------------------------------------------------------------
1 | Ball Hall Index
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Ball Hall Index
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Ball Hall Index is a clustering validity index that measures the compactness and separation of clusters in a clustering result. It provides a quantitative measure of how well-separated and tight the clusters are.
18 |
19 | The formula for calculating the Ball Hall Index is as follows::
20 |
21 | BHI = (1 / K) * sum_k [ (1 / n_k) * sum_{x in C_k} d(x, c_k)^2 ]
22 |
23 | Where:
24 |
25 | K is the number of clusters
26 | n_k is the number of data points in cluster k
27 | d(x, c_k) is the Euclidean distance between a data point x and the centroid c_k of cluster k
28 |
29 | The Ball Hall Index computes the mean squared distance between each data point and its cluster centroid and then averages this quantity across all clusters. The index decreases as clusters become more compact; a smaller BHI value indicates better-defined and well-separated clusters.
30 |
31 | A lower BHI value indicates better clustering, as it signifies that the data points are closer to their own cluster centroid than to the centroids of other clusters, indicating a clear separation between clusters.
32 |
33 | The Ball Hall Index is often used as an internal evaluation metric for clustering algorithms to compare different clustering results or to determine the optimal number of clusters. However, it should be noted that it is not without limitations and should be used in conjunction with other evaluation metrics and domain knowledge for a comprehensive assessment of clustering results.
34 |
35 |
36 | Example:
37 |
38 | .. code-block:: python
39 |
40 | import numpy as np
41 | from permetrics import ClusteringMetric
42 |
43 | ## For integer labels or categorical labels
44 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
45 | y_pred = np.array([0, 0, 1, 1, 1])
46 |
47 | cm = ClusteringMetric(X=data, y_pred=y_pred)
48 |
49 | print(cm.ball_hall_index())
50 | print(cm.BHI())
51 |
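52 | For reference, the snippet below is a minimal NumPy sketch of the formula above, using squared Euclidean
53 | distances as in the standard Ball Hall definition. It is only an illustration and may differ slightly from the
54 | value returned by cm.ball_hall_index().
55 |
56 | .. code-block:: python
57 |
58 |     import numpy as np
59 |
60 |     data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
61 |     y_pred = np.array([0, 0, 1, 1, 1])
62 |
63 |     # Mean, over clusters, of the mean squared distance to the cluster centroid
64 |     dispersions = []
65 |     for k in np.unique(y_pred):
66 |         points = data[y_pred == k]
67 |         centroid = points.mean(axis=0)
68 |         dispersions.append(np.mean(np.sum((points - centroid) ** 2, axis=1)))
69 |     print(np.mean(dispersions))
70 |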
--------------------------------------------------------------------------------
/examples/classification/03_multiple_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 11:37, 25/03/2022 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## To reduce coding time when using multiple metrics, there are a few ways to do it with permetrics.
8 | ## We have to use the OOP style.
9 |
10 | import numpy as np
11 | from permetrics.classification import ClassificationMetric
12 |
13 | y_true = [0, 1, 0, 0, 1, 0]
14 | y_pred = [0, 1, 0, 0, 0, 1]
15 |
16 | evaluator = ClassificationMetric(y_true, y_pred)
17 |
18 | ## Define list of metrics you want to use
19 |
20 |
21 | ## 1. Get list metrics by using loop
22 | list_metrics = ["PS", "RS", "LS", "SS"]
23 | list_results = []
24 | for metric in list_metrics:
25 | list_results.append( evaluator.get_metric_by_name(metric) )
26 | print(list_results)
27 |
28 |
29 | ## 2. Get list metrics by using function
30 | dict_result_2 = evaluator.get_metrics_by_list_names(list_metrics)
31 | print(dict_result_2)
32 |
33 |
34 | ## 3. Get list metrics by using function and parameters
35 | dict_metrics = {
36 | "PS": {"average": "micro"},
37 | "RS": {"average": "macro"},
38 | "LS": None,
39 | "SS": {"average": "weighted"},
40 | }
41 | dict_result_3 = evaluator.get_metrics_by_dict(dict_metrics)
42 | print(dict_result_3)
43 |
44 | # CM = confusion_matrix
45 | # PS = precision_score
46 | # NPV = negative_predictive_value
47 | # RS = recall_score
48 | # AS = accuracy_score
49 | # F1S = f1_score
50 | # F2S = f2_score
51 | # FBS = fbeta_score
52 | # SS = specificity_score
53 | # MCC = matthews_correlation_coefficient
54 | # HS = hamming_score
55 | # LS = lift_score
56 | # CKS = cohen_kappa_score
57 | # JSI = JSC = jaccard_similarity_coefficient = jaccard_similarity_index
58 | # GMS = g_mean_score
59 | # GINI = gini_index
60 | # ROC = AUC = RAS = roc_auc_score
61 |
--------------------------------------------------------------------------------
/examples/clustering/00_all_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 06:17, 27/07/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | ## 1. Import packages, classes
8 | ## 2. Create object
9 | ## 3. From object call function and use
10 |
11 | import numpy as np
12 | from permetrics import ClusteringMetric
13 | from sklearn.datasets import make_blobs
14 |
15 | # generate sample data
16 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
17 | y_pred = np.random.randint(0, 4, size=300)
18 |
19 | evaluator = ClusteringMetric(y_true=y_true, y_pred=y_pred, X=X)
20 |
21 | ## Call a specific function inside the object; each function has 2 names (full name and short name)
22 | ## + Internal metrics: need X and y_pred, and their full names end with "index"
23 | ## + External metrics: need y_true and y_pred, and their full names end with "score"
24 |
25 | print(evaluator.ball_hall_index())
26 | print(evaluator.BHI())
27 |
28 | # BHI = ball_hall_index
29 | # CHI = calinski_harabasz_index
30 | # XBI = xie_beni_index
31 | # BRI = banfeld_raftery_index
32 | # DBI = davies_bouldin_index
33 | # DRI = det_ratio_index
34 | # DI = dunn_index
35 | # KDI = ksq_detw_index
36 | # LDRI = log_det_ratio_index
37 | # LSRI = log_ss_ratio_index
38 | # SI = silhouette_index
39 | #
40 | # MIS = mutual_info_score
41 | # NMIS = normalized_mutual_info_score
42 | # RaS = rand_score
43 | # FMS = fowlkes_mallows_score
44 | # HS = homogeneity_score
45 | # CS = completeness_score
46 | # VMS = v_measure_score
47 | # PrS = precision_score
48 | # ReS = recall_score
49 | # FmS = f_measure_score
50 | # CDS = czekanowski_dice_score
51 | # HGS = hubert_gamma_score
52 | # JS = jaccard_score
53 | # KS = kulczynski_score
54 | # MNS = mc_nemar_score
55 | # PhS = phi_score
56 | # RTS = rogers_tanimoto_score
57 | # RRS = russel_rao_score
58 | # SS1S = sokal_sneath1_score
59 | # SS2S = sokal_sneath2_score
60 |
--------------------------------------------------------------------------------
/docs/source/pages/examples/classification.rst:
--------------------------------------------------------------------------------
1 | Classification Metrics
2 | ======================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Classification Metrics
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | Functional Style
19 | ----------------
20 |
21 | + This is a traditional way to call a specific metric you want to use. Every time you want to use a metric, you need to pass y_true and y_pred
22 |
23 | .. code-block:: python
24 | :emphasize-lines: 6,11,14-16,19-20
25 |
26 | ## 1. Import packages, classes
27 | ## 2. Create object
28 | ## 3. From object call function and use
29 |
30 | import numpy as np
31 | from permetrics import ClassificationMetric
32 |
33 | y_true = [0, 1, 0, 0, 1, 0]
34 | y_pred = [0, 1, 0, 0, 0, 1]
35 |
36 | evaluator = ClassificationMetric()
37 |
38 | ## 3.1 Call specific function inside object, each function has 2 names like below
39 | ps1 = evaluator.precision_score(y_true, y_pred)
40 | ps2 = evaluator.PS(y_true, y_pred)
41 | ps3 = evaluator.PS(y_true, y_pred)
42 | print(f"Precision: {ps1}, {ps2}, {ps3}")
43 |
44 | recall = evaluator.recall_score(y_true, y_pred)
45 | accuracy = evaluator.accuracy_score(y_true, y_pred)
46 | print(f"recall: {recall}, accuracy: {accuracy}")
47 |
48 |
49 | Object-Oriented Style
50 | ---------------------
51 |
52 | + This is a more modern and convenient way to use metrics. You only need to pass y_true and y_pred once, when creating the metric object.
53 | + After that, you can get the value of any metric without passing y_true and y_pred again
54 |
55 | .. code-block:: python
56 | :emphasize-lines: 2,7,11-13
57 |
58 | import numpy as np
59 | from permetrics import ClassificationMetric
60 |
61 | y_true = [0, 1, 0, 0, 1, 0]
62 | y_pred = [0, 1, 0, 0, 0, 1]
63 |
64 | evaluator = ClassificationMetric(y_true, y_pred)
65 |
66 | ## Get the result of any function you want to
67 |
68 | hamming_score = evaluator.hamming_score()
69 | mcc = evaluator.matthews_correlation_coefficient()
70 | specificity = evaluator.specificity_score()
71 | print(f"HL: {hamming_score}, MCC: {mcc}, specificity: {specificity}")
72 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/SMAPE.rst:
--------------------------------------------------------------------------------
1 | SMAPE - Symmetric Mean Absolute Percentage Error
2 | ================================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: SMAPE - Symmetric Mean Absolute Percentage Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | \text{SMAPE}(y, \hat{y}) = \frac{100\%}{N} \sum_{i=0}^{N - 1} \frac{ 2*|y_i - \hat{y}_i|}{|y_i| + |\hat{y}_i|}
21 |
22 | Latex equation code::
23 |
24 | \text{SMAPE}(y, \hat{y}) = \frac{100\%}{N} \sum_{i=0}^{N - 1} \frac{ 2*|y_i - \hat{y}_i|}{|y_i| + |\hat{y}_i|}
25 |
26 |
27 | Symmetric Mean Absolute Percentage Error (SMAPE) :cite:`thieu2019efficient`, which is an accuracy measure commonly used in forecasting and time series
28 | analysis.
29 |
30 | Given the actual values y and the predicted values y_hat, the SMAPE is calculated as the average of the absolute errors between the two, where
31 | each error is divided by the average of the absolute values of the actual and predicted values.
32 |
33 | The resulting score ranges between 0 and 1, where a score of 0 indicates a perfect match between the actual and predicted values, and a score of 1 indicates
34 | no match at all. A smaller value of SMAPE is better, and it is often multiplied by 100% to obtain the percentage error. Best possible score is 0.0, smaller
35 | value is better. Range = [0, 1].
36 |
37 | + `Link to equation `_
38 |
39 |
40 | Example to use SMAPE metric:
41 |
42 | .. code-block:: python
43 | :emphasize-lines: 8-9,15-16
44 |
45 | from numpy import array
46 | from permetrics.regression import RegressionMetric
47 |
48 | ## For 1-D array
49 | y_true = array([3, -0.5, 2, 7])
50 | y_pred = array([2.5, 0.0, 2, 8])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.symmetric_mean_absolute_percentage_error())
54 |
55 | ## For > 1-D array
56 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
57 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
58 |
59 | evaluator = RegressionMetric(y_true, y_pred)
60 | print(evaluator.SMAPE(multi_output="raw_values"))
61 |
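62 | For reference, the snippet below is a minimal NumPy sketch of the formula above, returning the value as a
63 | fraction (multiply by 100 to express it as a percentage). It only illustrates the formula; the library's SMAPE
64 | may use a slightly different scaling.
65 |
66 | .. code-block:: python
67 |
68 |     import numpy as np
69 |
70 |     y_true = np.array([3, -0.5, 2, 7])
71 |     y_pred = np.array([2.5, 0.0, 2, 8])
72 |
73 |     # Mean of 2 * |y_i - y_hat_i| / (|y_i| + |y_hat_i|)
74 |     smape = np.mean(2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred)))
75 |     print(smape)
76 |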
--------------------------------------------------------------------------------
/examples/clustering/06_speed_up_TS-GAS-GPS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:17, 22/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import time
9 | from permetrics import ClusteringMetric
10 | import permetrics.utils.cluster_util as cut
11 | from sklearn.datasets import make_blobs
12 |
13 | np.random.seed(100)
14 |
15 |
16 | def generate_dataset(n_samples, n_features, n_clusters, random_state=42):
17 | X, y_true, centers = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters,
18 | cluster_std=1.0, random_state=random_state, return_centers=True)
19 | y_pred = np.random.randint(0, n_clusters, n_samples)
20 | return X, y_true, y_pred, centers
21 |
22 |
23 | num_samples = 100000
24 | num_features = 2
25 | num_clusters = 7
26 | cluster_std = 0.5
27 |
28 | data, y_true, y_pred, centers = generate_dataset(num_samples, num_features, num_clusters)
29 |
30 |
31 | # time02 = time.perf_counter()
32 | # cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
33 | # res = cm.tau_score()
34 | # print("res: ", res, time.perf_counter() - time02 )
35 | #
36 | # time03 = time.perf_counter()
37 | # s3 = cut.calculate_tau_score(y_true, y_pred)
38 | # print("res: ", s3, time.perf_counter() - time03)
39 |
40 |
41 | # time02 = time.perf_counter()
42 | # cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
43 | # res = cm.gamma_score()
44 | # print("res: ", res, time.perf_counter() - time02 )
45 | #
46 | # time03 = time.perf_counter()
47 | # s3 = cut.calculate_gamma_score(y_true, y_pred)
48 | # print("res: ", s3, time.perf_counter() - time03)
49 |
50 |
51 | time02 = time.perf_counter()
52 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
53 | res = cm.gplus_score()
54 | print("res: ", res, time.perf_counter() - time02 )
55 |
56 | time03 = time.perf_counter()
57 | s3 = cut.calculate_gplus_score(y_true, y_pred)
58 | print("res: ", s3, time.perf_counter() - time03)
59 |
--------------------------------------------------------------------------------
/examples/classification/11_kld_loss.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:10, 12/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics import ClassificationMetric
9 |
10 |
11 | def kl_divergence_loss_binary(y_true, y_pred):
12 | epsilon = 1e-10 # Small epsilon value to avoid division by zero
13 | y_true = np.clip(y_true, epsilon, 1 - epsilon) # Clip true labels
14 | y_pred = np.clip(y_pred, epsilon, 1 - epsilon) # Clip predicted probabilities
15 | loss = y_true * np.log(y_true / y_pred) + (1 - y_true) * np.log((1 - y_true) / (1 - y_pred))
16 | return np.mean(loss)
17 |
18 | def kl_divergence_loss_multiclass(y_true, y_pred):
19 | epsilon = 1e-10 # Small epsilon value to avoid division by zero
20 | y_true = np.clip(y_true, epsilon, 1 - epsilon) # Clip true labels
21 | y_pred = np.clip(y_pred, epsilon, 1 - epsilon) # Clip predicted probabilities
22 | loss = np.sum(y_true * np.log(y_true / y_pred), axis=1)
23 | return np.mean(loss)
24 |
25 | # Binary classification example
26 | y_true_binary = np.array([0, 1, 1, 0]) # True binary labels
27 | y_pred_binary = np.array([0.3, 0.7, 0.9, 0.2]) # Predicted probabilities
28 | binary_loss = kl_divergence_loss_binary(y_true_binary, y_pred_binary)
29 | print("Binary Loss:", binary_loss)
30 |
31 | # Multi-class classification example
32 | y_true_multiclass = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) # True class labels (one-hot encoded)
33 | y_pred_multiclass = np.array([[0.7, 0.2, 0.1], [0.3, 0.6, 0.1], [0.2, 0.1, 0.7]]) # Predicted class probabilities
34 | multiclass_loss = kl_divergence_loss_multiclass(y_true_multiclass, y_pred_multiclass)
35 | print("Multi-Class Loss:", multiclass_loss)
36 |
37 | cu = ClassificationMetric(y_true_binary, y_pred_binary)
38 | print(cu.kullback_leibler_divergence_loss())
39 |
40 | cu = ClassificationMetric()
41 | print(cu.kullback_leibler_divergence_loss(y_true_multiclass, y_pred_multiclass))
42 |
--------------------------------------------------------------------------------
/examples/clustering/04_exam_cases_scores.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 10:56, 02/08/2023 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | from permetrics import ClusteringMetric
9 | from sklearn.datasets import make_blobs
10 | from sklearn.metrics import mutual_info_score, completeness_score, recall_score
11 |
12 | # generate sample data
13 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
14 | y_pred_rand = []
15 | for idx in range(0, len(y_true)):
16 | y_pred_rand.append(np.random.choice(list(set(range(0, 4)) - {idx})))
17 |
18 | temp = [
19 | y_true.copy(), y_pred_rand,
20 | np.random.randint(0, 2, 300), np.random.randint(0, 6, 300),
21 | np.zeros((300,)), np.ones((300,)),
22 | 9*np.ones((300))
23 | ]
24 | for idx in range(7):
25 | evaluator = ClusteringMetric(y_true=y_true, y_pred=temp[idx], X=X)
26 | print(evaluator.gplus_score())
27 |
28 | # print(evaluator.get_metrics_by_list_names(["MIS", "NMIS", "RaS", "ARS", "FMS", "HS", "CS", "VMS", "PrS", "ReS", "FmS",
29 | # "CDS", "HGS", "JS", "KS", "MNS", "PhS", "RTS", "RRS", "SS1S", "SS2S",
30 | # "PuS", "ES", "TS", "GAS", "GPS"]))
31 |
32 | # MIS = mutual_info_score
33 | # NMIS = normalized_mutual_info_score
34 | # RaS = rand_score
35 | # ARS = adjusted_rand_score
36 | # FMS = fowlkes_mallows_score
37 | # HS = homogeneity_score
38 | # CS = completeness_score
39 | # VMS = v_measure_score
40 | # PrS = precision_score
41 | # ReS = recall_score
42 | # FmS = f_measure_score
43 | # CDS = czekanowski_dice_score
44 | # HGS = hubert_gamma_score
45 | # JS = jaccard_score
46 | # KS = kulczynski_score
47 | # MNS = mc_nemar_score
48 | # PhS = phi_score
49 | # RTS = rogers_tanimoto_score
50 | # RRS = russel_rao_score
51 | # SS1S = sokal_sneath1_score
52 | # SS2S = sokal_sneath2_score
53 | # PuS = purity_score
54 | # ES = entropy_score
55 | # TS = tau_score
56 | # GAS = gamma_score
57 | # GPS = gplus_score
58 |
--------------------------------------------------------------------------------
/docs/source/pages/clustering/DHI.rst:
--------------------------------------------------------------------------------
1 | Duda Hart Index (DHI)
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Duda Hart Index (DHI)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Duda index, also known as the D-index or Duda-Hart index, is a clustering evaluation metric that measures the compactness and
18 | separation of clusters. It was proposed by Richard O. Duda and Peter E. Hart in their book "Pattern Classification and Scene Analysis."
19 |
20 | The Duda index is defined as the ratio between the average pairwise distance within clusters and the average pairwise distance
21 | between clusters. A lower value of the Duda index indicates better clustering, indicating that the clusters are more compact
22 | and well-separated. Here's the formula to calculate the Duda index::
23 |
24 | Duda Index = (Average pairwise intra-cluster distance) / (Average pairwise inter-cluster distance)
25 |
26 | To calculate the Duda index, you need the following steps::
27 |
28 | Compute the average pairwise distance within each cluster (intra-cluster distance).
29 | Compute the average pairwise distance between different clusters (inter-cluster distance).
30 | Divide the average intra-cluster distance by the average inter-cluster distance to obtain the Duda index.
31 |
32 | The Duda index is a useful metric for evaluating clustering results, particularly when the compactness and separation of
33 | clusters are important. However, it's worth noting that the Duda index assumes Euclidean distance and may not work well
34 | with all types of data or distance metrics.
35 |
36 | When implementing the Duda index, you'll need to calculate the pairwise distances between data points within and between
37 | clusters. You can use distance functions like Euclidean distance or other suitable distance metrics based on your
38 | specific problem and data characteristics.
39 |
40 |
41 | Example:
42 |
43 | .. code-block:: python
44 |
45 | import numpy as np
46 | from permetrics import ClusteringMetric
47 |
48 | ## For integer labels or categorical labels
49 | data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
50 | y_pred = np.array([0, 0, 1, 1, 1])
51 |
52 | cm = ClusteringMetric(X=data, y_pred=y_pred)
53 |
54 | print(cm.duda_hart_index())
55 | print(cm.DHI())
56 |
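57 | For reference, the snippet below is a rough NumPy sketch of the recipe described above: the average pairwise
58 | intra-cluster distance divided by the average pairwise inter-cluster distance. It is only an illustration and is
59 | not guaranteed to match the value returned by cm.duda_hart_index().
60 |
61 | .. code-block:: python
62 |
63 |     import numpy as np
64 |
65 |     data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
66 |     y_pred = np.array([0, 0, 1, 1, 1])
67 |
68 |     intra, inter = [], []
69 |     for i in range(len(data)):
70 |         for j in range(i + 1, len(data)):
71 |             d = np.linalg.norm(data[i] - data[j])
72 |             (intra if y_pred[i] == y_pred[j] else inter).append(d)
73 |     print(np.mean(intra) / np.mean(inter))
74 |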
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('.'))
16 | sys.path.insert(0, os.path.abspath('../../'))
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'Permetrics'
21 | copyright = '2021, Thieu'
22 | author = 'Thieu'
23 |
24 | # The full version, including alpha/beta/rc tags
25 | release = '2.0.0'
26 |
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 | "sphinx.ext.autodoc",
35 | "sphinx.ext.napoleon",
36 | "sphinx.ext.intersphinx",
37 | "sphinx.ext.viewcode",
38 | "sphinxcontrib.bibtex",
39 | ]
40 |
41 | # Add any paths that contain templates here, relative to this directory.
42 | templates_path = ['_templates']
43 |
44 | # List of patterns, relative to source directory, that match files and
45 | # directories to ignore when looking for source files.
46 | # This pattern also affects html_static_path and html_extra_path.
47 | exclude_patterns = []
48 |
49 |
50 | # -- Options for HTML output -------------------------------------------------
51 |
52 | # The theme to use for HTML and HTML Help pages. See the documentation for
53 | # a list of builtin themes.
54 | #
55 | # html_theme = 'alabaster'
56 | html_theme = 'sphinx_rtd_theme'
57 |
58 | # Add any paths that contain custom static files (such as style sheets) here,
59 | # relative to this directory. They are copied after the builtin static files,
60 | # so a file named "default.css" will overwrite the builtin "default.css".
61 | html_static_path = ['_static']
62 |
63 | bibtex_default_style = 'unsrt'
64 | bibtex_bibfiles = ['mybibfile.bib']
65 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/WI.rst:
--------------------------------------------------------------------------------
1 | WI - Willmott Index
2 | ===================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: WI - Willmott Index
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 | .. math::
19 |
20 | \text{WI}(y, \hat{y}) = 1 - \frac{ \sum_{i=0}^{N - 1} (\hat{y_i} - y_i)^2 }{ \sum_{i=0}^{N - 1} (|\hat{y_i} - mean(y)| + |y_i - mean(y)|)^2}
21 |
22 | Latex equation code::
23 |
24 | \text{WI}(y, \hat{y}) = 1 - \frac{ \sum_{i=0}^{N - 1} (\hat{y_i} - y_i)^2 }{ \sum_{i=0}^{N - 1} (|\hat{y_i} - mean(y)| + |y_i - mean(y)|)^2}
25 |
26 |
27 | The Willmott Index (WI) :cite:`da2017reference` is a statistical measure used to evaluate the performance of a forecasting model, particularly in the
28 | context of hydrological or climate-related variables. The WI compares the accuracy of a model to the accuracy of a reference model that simply predicts the
29 | mean value of the observed variable. Best possible score is 1.0, bigger value is better. Range = [0, 1]
30 |
31 | The WI ranges between 0 and 1, with a value of 1 indicating perfect agreement between the predicted and observed values. A value of 0 indicates that the
32 | predicted values are no better than predicting the mean of the observed values.
33 |
34 | The WI is commonly used in hydrology and climate-related fields to evaluate the accuracy of models that predict variables such as precipitation, temperature,
35 | and evapotranspiration. It is a useful tool for comparing the performance of different models or different methods of estimating a variable.
36 |
37 | + `Link to equation `_
38 |
39 |
40 | Example to use WI metric:
41 |
42 | .. code-block:: python
43 | :emphasize-lines: 8-9,15-16
44 |
45 | from numpy import array
46 | from permetrics.regression import RegressionMetric
47 |
48 | ## For 1-D array
49 | y_true = array([3, -0.5, 2, 7])
50 | y_pred = array([2.5, 0.0, 2, 8])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.willmott_index())
54 |
55 | ## For > 1-D array
56 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
57 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
58 |
59 | evaluator = RegressionMetric(y_true, y_pred)
60 | print(evaluator.WI(multi_output="raw_values"))
61 |
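62 | For reference, the snippet below is a minimal NumPy sketch of the equation above; it only illustrates the
63 | formula and is not the library's implementation.
64 |
65 | .. code-block:: python
66 |
67 |     import numpy as np
68 |
69 |     y_true = np.array([3, -0.5, 2, 7])
70 |     y_pred = np.array([2.5, 0.0, 2, 8])
71 |
72 |     mean_y = np.mean(y_true)
73 |     numerator = np.sum((y_pred - y_true) ** 2)
74 |     denominator = np.sum((np.abs(y_pred - mean_y) + np.abs(y_true - mean_y)) ** 2)
75 |     print(1 - numerator / denominator)
76 |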
--------------------------------------------------------------------------------
/docs/source/pages/regression/MBE.rst:
--------------------------------------------------------------------------------
1 | MBE - Mean Bias Error
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MBE - Mean Bias Error
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 |
19 | .. math::
20 |
21 | \text{MBE}(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^{n}(f_i - y_i)
22 |
23 | Latex equation code::
24 |
25 | \text{MBE}(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^{n}(f_i - y_i)
26 |
27 |
28 | The Mean Bias Error (MBE) :cite:`kato2016prediction` is a statistical measure used to assess the bias of a forecasting model. The MBE measures the average
29 | signed difference between the forecasted and actual values, so errors of opposite sign can cancel each other out.
30 |
31 | The MBE is expressed in the same units as the forecasted and actual values, and a best possible score of 0.0 indicates no bias in the forecasting model. The
32 | MBE has a range of (-infinity, +infinity), with a positive MBE indicating that the forecasted values are, on average, larger than the actual values, and a
33 | negative MBE indicating the opposite.
34 |
35 | The MBE is a useful measure to evaluate the systematic errors of a forecasting model, such as overestimation or underestimation of the forecasted values.
36 | However, it does not provide information about the magnitude or direction of the individual errors, and it should be used in conjunction with other
37 | statistical measures, such as the Mean Absolute Error (MAE), to provide a more comprehensive evaluation of the forecasting model's accuracy.
38 |
39 | It is important to note that the MBE is sensitive to outliers and may not be appropriate for data with non-normal distributions or extreme values. In such
40 | cases, other measures, such as the Median Bias Error, may be more appropriate.
41 |
42 |
43 | Example to use MBE metric:
44 |
45 | .. code-block:: python
46 | :emphasize-lines: 8-9,15-16
47 |
48 | from numpy import array
49 | from permetrics.regression import RegressionMetric
50 |
51 | ## For 1-D array
52 | y_true = array([3, -0.5, 2, 7])
53 | y_pred = array([2.5, 0.0, 2, 8])
54 |
55 | evaluator = RegressionMetric(y_true, y_pred)
56 | print(evaluator.mean_bias_error())
57 |
58 | ## For > 1-D array
59 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
60 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
61 |
62 | evaluator = RegressionMetric(y_true, y_pred)
63 | print(evaluator.MBE(multi_output="raw_values"))
64 |
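65 | For reference, the snippet below is a minimal NumPy sketch of the equation above: the signed average of the
66 | errors. It only illustrates the formula, not the library's implementation.
67 |
68 | .. code-block:: python
69 |
70 |     import numpy as np
71 |
72 |     y_true = np.array([3, -0.5, 2, 7])
73 |     y_pred = np.array([2.5, 0.0, 2, 8])
74 |
75 |     # Positive values mean the forecasts are, on average, larger than the observations
76 |     print(np.mean(y_pred - y_true))
77 |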
--------------------------------------------------------------------------------
/docs/source/pages/regression/R2.rst:
--------------------------------------------------------------------------------
1 | R2 - Coefficient of Determination
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: R2 - Coefficient of Determination
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | .. math::
19 |
20 | R2(y, \hat{y}) = 1 - \frac{\sum_{i=1}^{N} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{N} (y_i - \bar{y})^2}
21 |
22 | where :math:`\bar{y} = \frac{1}{N} \sum_{i=1}^{N} y_i` and :math:`\sum_{i=1}^{N} (y_i - \hat{y}_i)^2 = \sum_{i=1}^{N} \epsilon_i^2`
23 |
24 | Latex equation code::
25 |
26 | R2(y, \hat{y}) = 1 - \frac{\sum_{i=1}^{N} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{N} (y_i - \bar{y})^2}
27 |
28 |
29 | + Coefficient of Determination (COD/R2) :cite:`nguyen2021nqsv`: Best possible score is 1.0, bigger value is better. Range = (-inf, 1]
30 | + `Link to equation `_
31 | + Scikit-learn and other websites denote COD as R^2 (or R squared), which leads to the misunderstanding of R^2 in which R is Pearson’s Correlation Coefficient.
32 | + We should denote it as COD or R2 only.
33 |
34 | + It represents the proportion of variance (of y) that has been explained by the independent variables in the model. It provides an indication of goodness of
35 | fit and therefore a measure of how well unseen samples are likely to be predicted by the model, through the proportion of explained variance.
36 | + As such variance is dataset dependent, R2 may not be meaningfully comparable across different datasets.
37 | + Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of
38 | y, disregarding the input features, would get a R2 score of 0.0.
39 |
40 |
41 | Example to use R2 metric:
42 |
43 | .. code-block:: python
44 | :emphasize-lines: 8-9,15-16
45 |
46 | from numpy import array
47 | from permetrics.regression import RegressionMetric
48 |
49 | ## For 1-D array
50 | y_true = array([3, -0.5, 2, 7])
51 | y_pred = array([2.5, 0.0, 2, 8])
52 |
53 | evaluator = RegressionMetric(y_true, y_pred)
54 | print(evaluator.coefficient_of_determination())
55 |
56 | ## For > 1-D array
57 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
58 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
59 |
60 | evaluator = RegressionMetric(y_true, y_pred)
61 | print(evaluator.R2(multi_output="raw_values"))
62 |
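63 | For reference, the snippet below is a minimal NumPy sketch of the equation above (one minus the ratio of the
64 | residual sum of squares to the total sum of squares); it only illustrates the formula, not the library's
65 | implementation.
66 |
67 | .. code-block:: python
68 |
69 |     import numpy as np
70 |
71 |     y_true = np.array([3, -0.5, 2, 7])
72 |     y_pred = np.array([2.5, 0.0, 2, 8])
73 |
74 |     ss_res = np.sum((y_true - y_pred) ** 2)   # residual sum of squares
75 |     ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)   # total sum of squares
76 |     print(1 - ss_res / ss_tot)
77 |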
--------------------------------------------------------------------------------
/docs/source/pages/regression/KLD.rst:
--------------------------------------------------------------------------------
1 | KLD - Kullback-Leibler Divergence
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: KLD - Kullback-Leibler Divergence
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 |
18 | The Kullback-Leibler Divergence (KLD), :cite:`hershey2007approximating` also known as relative entropy, is a statistical measure of how different two probability
19 | distributions are from each other. It was introduced by Solomon Kullback and Richard Leibler in 1951. The KLD is calculated as the sum of the logarithmic
20 | differences between the probabilities of each possible outcome, weighted by the probability of the outcome in the reference distribution. The KLD is always
21 | non-negative, and it is equal to zero if and only if the two distributions are identical. The equation for KLD between two probability distributions P and Q
22 | is given by:
23 |
24 | .. math::
25 |
26 | D_{KL}(P||Q) = \sum_{i} P(i) \log\frac{P(i)}{Q(i)}
27 |
28 | where P(i) and Q(i) are the probabilities of the i-th possible outcome in the two distributions, respectively.
29 |
30 | The KLD measures the information lost when approximating one probability distribution by another. It is widely used in information theory, machine learning,
31 | and data science applications, such as clustering, classification, and data compression. The KLD has also found applications in other fields, such as
32 | physics, economics, and biology, to measure the distance between two probability distributions.
33 |
34 | + Best possible score is 0.0, smaller value is better. Range = (-inf, +inf)
35 | + `Link to equation `_
36 |
37 |
38 | Example to use KLD metric:
39 |
40 | .. code-block:: python
41 | :emphasize-lines: 8-9,15-16
42 |
43 | from numpy import array
44 | from permetrics.regression import RegressionMetric
45 |
46 | ## For 1-D array
47 | y_true = array([3, -0.5, 2, 7])
48 | y_pred = array([2.5, 0.0, 2, 8])
49 |
50 | evaluator = RegressionMetric(y_true, y_pred)
51 | print(evaluator.kullback_leibler_divergence())
52 |
53 | ## For > 1-D array
54 | y_true = array([[0.5, 1], [-1, 1], [7, -6], [1, 2], [2.1, 2.2], [3.4, 5.5]])
55 | y_pred = array([[0, 2], [-1, 2], [8, -5], [1.1, 1.9], [2.0, 2.3], [3.0, 4.2]])
56 |
57 | evaluator = RegressionMetric(y_true, y_pred)
58 | print(evaluator.KLD(multi_output="raw_values"))
59 |
60 |
61 |
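62 | For reference, the snippet below is a minimal NumPy sketch of the definition above, applied to two small example
63 | probability vectors chosen only for illustration. The RegressionMetric call above operates on raw arrays, so the
64 | library's result is not expected to match this toy computation.
65 |
66 | .. code-block:: python
67 |
68 |     import numpy as np
69 |
70 |     # Two example discrete probability distributions (chosen only for illustration)
71 |     p = np.array([0.1, 0.4, 0.5])
72 |     q = np.array([0.2, 0.3, 0.5])
73 |
74 |     # D_KL(P || Q) = sum_i P(i) * log(P(i) / Q(i))
75 |     print(np.sum(p * np.log(p / q)))
76 |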
--------------------------------------------------------------------------------
/docs/source/pages/clustering/TS.rst:
--------------------------------------------------------------------------------
1 | Tau Score (TS)
2 | ==============
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: Tau Score (TS)
7 |
8 | .. toctree::
9 | :maxdepth: 3
10 |
11 | .. toctree::
12 | :maxdepth: 3
13 |
14 | .. toctree::
15 | :maxdepth: 3
16 |
17 | The Tau index, also known as the Tau coefficient, is a measure of agreement or similarity between two clustering solutions.
18 | It is commonly used to compare the similarity of two different clusterings or to evaluate the stability of a clustering algorithm.
19 |
20 | The Tau index is based on the concept of concordance, which measures the extent to which pairs of objects are assigned to
21 | the same clusters in two different clustering solutions. The index ranges from -1 to 1, where 1 indicates perfect agreement,
22 | 0 indicates random agreement, and -1 indicates perfect disagreement or inversion of the clustering solutions.
23 |
24 | The calculation of the Tau index involves constructing a contingency table that counts the number of pairs of objects
25 | that are concordant (i.e., assigned to the same cluster in both solutions) and discordant
26 | (i.e., assigned to different clusters in the two solutions).
27 |
28 | The formula for calculating the Tau index is as follows::
29 |
30 | Tau = (concordant_pairs - discordant_pairs) / (concordant_pairs + discordant_pairs)
31 |
32 | A higher value of the Tau index indicates greater similarity or agreement between the two clusterings,
33 | while a lower value indicates less agreement. It's important to note that the interpretation of the Tau index
34 | depends on the specific clustering algorithm and the data being clustered.
35 |
36 | The Tau index can be useful in various applications, such as evaluating the stability of clustering algorithms,
37 | comparing different clustering solutions, or assessing the robustness of a clustering method to perturbations
38 | in the data. However, like any clustering evaluation measure, it has its limitations and should be used
39 | in conjunction with other evaluation techniques to gain a comprehensive understanding of the clustering performance.
40 |
41 | Example:
42 |
43 | .. code-block:: python
44 |
45 | import numpy as np
46 | from permetrics import ClusteringMetric
47 |
48 | ## For integer labels or categorical labels
49 | y_true = np.array([0, 0, 1, 1, 1, 2, 2, 1])
50 | y_pred = np.array([0, 0, 1, 1, 2, 2, 2, 2])
51 |
52 | cm = ClusteringMetric(y_true=y_true, y_pred=y_pred)
53 |
54 | print(cm.tau_score())
55 | print(cm.TS())
56 |
--------------------------------------------------------------------------------
/tests/test_comparisons/test_sklearn_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Created by "Thieu" at 18:21, 22/02/2024 ----------%
3 | # Email: nguyenthieu2102@gmail.com %
4 | # Github: https://github.com/thieu1995 %
5 | # --------------------------------------------------%
6 |
7 | import numpy as np
8 | import pytest
9 | from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, \
10 | mean_squared_error, median_absolute_error, r2_score, mean_absolute_percentage_error
11 |
12 | from permetrics import RegressionMetric
13 |
14 |
15 | def is_close_enough(x1, x2, eps=1e-5):
16 |     """Return True when the absolute difference between x1 and x2
17 |     is at most the tolerance eps."""
18 |     return abs(x1 - x2) <= eps
19 |
20 |
21 | @pytest.fixture(scope="module")  # module scope: built once and shared by every test in this file
22 | def data():
23 | y_true = np.array([3, -0.5, 2, 7, 5, 3, 4, -3, 10])
24 | y_pred = np.array([2.5, 0.0, 2, 8, 5, 2, 3.5, -4, 9])
25 | rm = RegressionMetric(y_true=y_true, y_pred=y_pred)
26 | return y_true, y_pred, rm
27 |
28 |
29 | def test_EVS(data):
30 | y_true, y_pred, rm = data
31 | res11 = rm.EVS()
32 | res12 = explained_variance_score(y_true, y_pred)
33 | assert is_close_enough(res11, res12)
34 |
35 |
36 | def test_ME(data):
37 | y_true, y_pred, rm = data
38 | res11 = rm.ME()
39 | res12 = max_error(y_true, y_pred)
40 | assert is_close_enough(res11, res12)
41 |
42 |
43 | def test_MAE(data):
44 | y_true, y_pred, rm = data
45 | res11 = rm.MAE()
46 | res12 = mean_absolute_error(y_true, y_pred)
47 | assert is_close_enough(res11, res12)
48 |
49 |
50 | def test_MSE(data):
51 | y_true, y_pred, rm = data
52 | res11 = rm.MSE()
53 | res12 = mean_squared_error(y_true, y_pred)
54 | assert is_close_enough(res11, res12)
55 |
56 |
57 | def test_MedAE(data):
58 | y_true, y_pred, rm = data
59 | res11 = rm.MedAE()
60 | res12 = median_absolute_error(y_true, y_pred)
61 | assert is_close_enough(res11, res12)
62 |
63 |
64 | def test_R2(data):
65 | y_true, y_pred, rm = data
66 | res11 = rm.R2()
67 | res12 = r2_score(y_true, y_pred)
68 | assert is_close_enough(res11, res12)
69 |
70 |
71 | def test_MAPE(data):
72 | y_true, y_pred, rm = data
73 | res11 = rm.MAPE()
74 | res12 = mean_absolute_percentage_error(y_true, y_pred)
75 | assert is_close_enough(res11, res12)
76 |
--------------------------------------------------------------------------------
/docs/source/pages/regression/MAAPE.rst:
--------------------------------------------------------------------------------
1 | MAAPE - Mean Arctangent Absolute Percentage Error
2 | =================================================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 | :caption: MAAPE - Mean Arctangent Absolute Percentage Error
7 |
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 |
12 | .. toctree::
13 | :maxdepth: 3
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 |
19 | .. math::
20 |
21 |     MAAPE = \frac{1}{n} \sum_{i=1}^{n} \arctan\left(\left|\frac{A_i - F_i}{A_i}\right|\right)
22 |
23 | where A_i is the i-th actual value, F_i is the i-th forecasted value, and n is the number of observations.
24 |
25 | The Mean Arctangent Absolute Percentage Error (MAAPE) is a statistical measure used to evaluate the accuracy of a forecasting model. It was proposed by
26 | Kim and Kim (2016) as an alternative to the Mean Absolute Percentage Error (MAPE) that remains well defined and bounded when actual values are zero or near zero.
27 |
28 | The MAAPE is calculated as the average of the arctangent of the absolute percentage errors between the forecasted and actual values. Because the absolute
29 | percentage errors are non-negative and unbounded, the arctangent maps them into the bounded range [0, pi/2), which is more suitable for averaging than the
30 | unbounded range of the raw percentage errors.
31 |
32 | The MAAPE measures the average magnitude of the errors between the forecasted and actual values (the absolute value discards their direction), with values
33 | ranging from 0 to pi/2 radians. A lower MAAPE indicates better forecast accuracy. The MAAPE is commonly used in time series forecasting applications, such as
34 | sales forecasting, stock price prediction, and demand forecasting.
35 |
36 | + Best possible score is 0.0, smaller value is better. Range = [0, pi/2)
37 | + `Link to equation `_
38 |
39 |
40 | Example of using the MAAPE metric (a direct NumPy check of the formula is shown after it):
41 |
42 | .. code-block:: python
43 | :emphasize-lines: 8-9,15-16
44 |
45 | from numpy import array
46 | from permetrics.regression import RegressionMetric
47 |
48 | ## For 1-D array
49 | y_true = array([3, -0.5, 2, 7])
50 | y_pred = array([2.5, 0.0, 2, 8])
51 |
52 | evaluator = RegressionMetric(y_true, y_pred)
53 | print(evaluator.mean_arctangent_absolute_percentage_error())
54 |
55 | ## For > 1-D array
56 | y_true = array([[0.5, 1], [-1, 1], [7, -6]])
57 | y_pred = array([[0, 2], [-1, 2], [8, -5]])
58 |
59 | evaluator = RegressionMetric(y_true, y_pred)
60 | print(evaluator.MAAPE(multi_output="raw_values"))
61 |
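62 | As a sanity check on the formula above, MAAPE can also be computed directly with NumPy as the mean of the arctangent
63 | of the absolute percentage errors. This is only a sketch of the definition, not the library's exact implementation,
64 | and it assumes all actual values are nonzero so that the division raises no warning.
65 | 
66 | .. code-block:: python
67 | 
68 |     import numpy as np
69 |     from permetrics.regression import RegressionMetric
70 | 
71 |     y_true = np.array([3, -0.5, 2, 7])
72 |     y_pred = np.array([2.5, 0.0, 2, 8])
73 | 
74 |     # Mean of arctan(|(A_i - F_i) / A_i|), expressed in radians
75 |     maape_manual = np.mean(np.arctan(np.abs((y_true - y_pred) / y_true)))
76 | 
77 |     evaluator = RegressionMetric(y_true, y_pred)
78 |     print(maape_manual)       # direct computation from the definition
79 |     print(evaluator.MAAPE())  # should agree closely with the value above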
--------------------------------------------------------------------------------