├── .gitattributes ├── .github ├── FUNDING.yml ├── actions │ └── install-env │ │ └── action.yml └── workflows │ ├── code-quality.yml │ ├── hugo.yml │ └── unit-tests.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── .pylintrc ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs ├── .hugo_build.lock ├── archetypes │ └── default.md ├── config.toml ├── content │ ├── _index.md │ ├── ca.ipynb │ ├── famd.ipynb │ ├── faq.ipynb │ ├── gpa.ipynb │ ├── mca.ipynb │ ├── mfa.ipynb │ └── pca.ipynb ├── layouts │ └── _default │ │ └── _markup │ │ └── render-codeblock-mermaid.html ├── static │ ├── favicon.ico │ └── images │ │ ├── favicon.png │ │ └── logo.png └── themes │ └── hugo-bearblog │ ├── archetypes │ ├── blog.md │ └── default.md │ ├── layouts │ ├── 404.html │ ├── _default │ │ ├── baseof.html │ │ ├── list.html │ │ └── single.html │ ├── index.html │ ├── partials │ │ ├── custom_body.html │ │ ├── custom_head.html │ │ ├── favicon.html │ │ ├── footer.html │ │ ├── header.html │ │ ├── nav.html │ │ ├── seo_tags.html │ │ └── style.html │ └── robots.txt │ └── theme.toml ├── figures ├── decastar.svg └── decastar_bis.svg ├── poetry.lock ├── prince ├── __init__.py ├── ca.py ├── datasets.py ├── datasets │ ├── 02-resultats-par-region.csv │ ├── beers.csv.zip │ ├── decathlon.csv │ ├── hearthstone_cards.csv │ ├── per-capita-energy-stacked.csv │ ├── premier_league.csv │ ├── punctuation_marks.csv │ └── resultats-par-departement.csv ├── famd.py ├── gpa.py ├── mca.py ├── mfa.py ├── pca.py ├── plot.py ├── svd.py └── utils.py ├── pyproject.toml └── tests ├── DESCRIPTION ├── __init__.py ├── test_ca.py ├── test_famd.py ├── test_gpa.py ├── test_mca.py ├── test_mfa.py ├── test_pca.py └── test_svd.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.R linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: MaxHalford 2 | -------------------------------------------------------------------------------- /.github/actions/install-env/action.yml: -------------------------------------------------------------------------------- 1 | name: Install env 2 | runs: 3 | using: "composite" 4 | steps: 5 | - name: Check out repository 6 | uses: actions/checkout@v3 7 | 8 | - name: Install R 9 | uses: r-lib/actions/setup-r@v2 10 | 11 | - name: Install R packages 12 | uses: r-lib/actions/setup-r-dependencies@v2 13 | with: 14 | cache-version: 1 15 | working-directory: tests 16 | 17 | - name: Set up Python 18 | id: set-up-python 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: "3.11" 22 | 23 | - name: Load cached Poetry installation 24 | uses: actions/cache@v3 25 | with: 26 | path: ~/.local 27 | key: poetry-0 28 | 29 | - name: Install poetry 30 | uses: snok/install-poetry@v1 31 | with: 32 | virtualenvs-create: true 33 | virtualenvs-in-project: true 34 | installer-parallel: true 35 | 36 | - name: Load cached virtual env 37 | uses: actions/cache@v3 38 | with: 39 | path: .venv 40 | key: venv-${{ runner.os }}-${{ steps.set-up-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 41 | 42 | - name: Install dependencies 43 | shell: bash 44 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 45 | run: poetry install --no-interaction --no-root 46 | 47 | - name: Install project 48 | shell: bash 49 | run: poetry install --no-interaction 50 | 
51 | - name: Activate environment 52 | shell: bash 53 | run: source $VENV 54 | -------------------------------------------------------------------------------- /.github/workflows/code-quality.yml: -------------------------------------------------------------------------------- 1 | name: Code quality 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | push: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | run: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: ./.github/actions/install-env 17 | - run: poetry run pre-commit run --all-files 18 | -------------------------------------------------------------------------------- /.github/workflows/hugo.yml: -------------------------------------------------------------------------------- 1 | # Sample workflow for building and deploying a Hugo site to GitHub Pages 2 | name: Deploy Hugo site to Pages 3 | 4 | on: 5 | # Allows you to run this workflow manually from the Actions tab 6 | workflow_dispatch: 7 | 8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | # Allow one concurrent deployment 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: true 18 | 19 | # Default to bash 20 | defaults: 21 | run: 22 | shell: bash 23 | 24 | jobs: 25 | # Build job 26 | build: 27 | runs-on: ubuntu-latest 28 | env: 29 | HUGO_VERSION: 0.144.2 30 | steps: 31 | - name: Install Hugo CLI 32 | run: | 33 | wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \ 34 | && sudo dpkg -i ${{ runner.temp }}/hugo.deb 35 | 36 | - name: Install Dart Sass 37 | run: sudo snap install dart-sass 38 | 39 | - name: Checkout 40 | uses: actions/checkout@v4 41 | with: 42 | submodules: recursive 43 | fetch-depth: 0 44 | 45 | - name: Install environment 46 | uses: ./.github/actions/install-env 47 | 48 | - name: Execute notebooks 49 | run: poetry run jupyter nbconvert --execute --to notebook --inplace docs/content/*.ipynb 50 | 51 | - name: Convert notebooks 52 | run: poetry run jupyter nbconvert --to markdown docs/content/*.ipynb 53 | 54 | - name: Clean MarkDown 55 | run: (for f in docs/content/*.md; do sed -e '/" 208 | ], 209 | "text/plain": [ 210 | "alt.Chart(...)" 211 | ] 212 | }, 213 | "execution_count": 2, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "import altair as alt\n", 220 | "\n", 221 | "alt.Chart(points).mark_line(opacity=0.5).encode(\n", 222 | " x='x',\n", 223 | " y='y',\n", 224 | " detail='shape',\n", 225 | " color='shape:N'\n", 226 | ")" 227 | ] 228 | }, 229 | { 230 | "attachments": {}, 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "The dataframe of points has to converted to a 3D numpy array of shape `(shapes, points, dims)`. There are many ways to do this. Here, we use xarray as a helper package." 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 3, 240 | "metadata": { 241 | "execution": { 242 | "iopub.execute_input": "2024-09-07T18:18:01.756840Z", 243 | "iopub.status.busy": "2024-09-07T18:18:01.756743Z", 244 | "iopub.status.idle": "2024-09-07T18:18:01.807548Z", 245 | "shell.execute_reply": "2024-09-07T18:18:01.807313Z" 246 | } 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "(3, 3, 2)" 253 | ] 254 | }, 255 | "execution_count": 3, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "ds = points.set_index(['shape', 'point']).to_xarray()\n", 262 | "da = ds.to_stacked_array('xy', ['shape', 'point'])\n", 263 | "shapes = da.values\n", 264 | "shapes.shape" 265 | ] 266 | }, 267 | { 268 | "attachments": {}, 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "This can also be done in NumPy:" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 4, 278 | "metadata": { 279 | "execution": { 280 | "iopub.execute_input": "2024-09-07T18:18:01.809002Z", 281 | "iopub.status.busy": "2024-09-07T18:18:01.808906Z", 282 | "iopub.status.idle": "2024-09-07T18:18:01.818337Z", 283 | "shell.execute_reply": "2024-09-07T18:18:01.818121Z" 284 | } 285 | }, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "(3, 3, 2)" 291 | ] 292 | }, 293 | "execution_count": 4, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "import numpy as np\n", 300 | "\n", 301 | "gb = points.groupby('shape')\n", 302 | "np.stack([gb.get_group(g)[['x', 'y']] for g in gb.groups]).shape" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 5, 308 | "metadata": { 309 | "execution": { 310 | "iopub.execute_input": "2024-09-07T18:18:01.819667Z", 311 | "iopub.status.busy": "2024-09-07T18:18:01.819581Z", 312 | "iopub.status.idle": "2024-09-07T18:18:01.826748Z", 313 | "shell.execute_reply": "2024-09-07T18:18:01.826491Z" 314 | } 315 | }, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "array([[[0., 0.],\n", 321 | " [0., 2.],\n", 322 | " [1., 0.]],\n", 323 | "\n", 324 | " [[3., 2.],\n", 325 | " [1., 2.],\n", 326 | " [3., 3.]],\n", 327 | "\n", 328 | " [[0., 0.],\n", 329 | " [0., 4.],\n", 330 | " [2., 0.]]])" 331 | ] 332 | }, 333 | "execution_count": 5, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "shapes" 340 | ] 341 | }, 342 | { 343 | "attachments": {}, 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "The shapes can now be aligned." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 6, 353 | "metadata": { 354 | "execution": { 355 | "iopub.execute_input": "2024-09-07T18:18:01.828533Z", 356 | "iopub.status.busy": "2024-09-07T18:18:01.828396Z", 357 | "iopub.status.idle": "2024-09-07T18:18:02.157698Z", 358 | "shell.execute_reply": "2024-09-07T18:18:02.157289Z" 359 | } 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "import prince\n", 364 | "\n", 365 | "gpa = prince.GPA()\n", 366 | "aligned_shapes = gpa.fit_transform(shapes)" 367 | ] 368 | }, 369 | { 370 | "attachments": {}, 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "We then convert the 3D numpy array to a dataframe (using `xarray`) for plotting." 
375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 7, 380 | "metadata": { 381 | "execution": { 382 | "iopub.execute_input": "2024-09-07T18:18:02.159473Z", 383 | "iopub.status.busy": "2024-09-07T18:18:02.159364Z", 384 | "iopub.status.idle": "2024-09-07T18:18:02.187045Z", 385 | "shell.execute_reply": "2024-09-07T18:18:02.186796Z" 386 | } 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/html": [ 392 | "\n", 393 | "
\n", 394 | "" 447 | ], 448 | "text/plain": [ 449 | "alt.Chart(...)" 450 | ] 451 | }, 452 | "execution_count": 7, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "da.values = aligned_shapes\n", 459 | "aligned_points = da.to_unstacked_dataset('xy').to_dataframe().reset_index()\n", 460 | "\n", 461 | "alt.Chart(aligned_points).mark_line(opacity=0.5).encode(\n", 462 | " x='x',\n", 463 | " y='y',\n", 464 | " detail='shape',\n", 465 | " color='shape:N'\n", 466 | ")" 467 | ] 468 | }, 469 | { 470 | "attachments": {}, 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "The triangles were all the same shape, so they are now perfectly aligned." 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": ".venv", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | "mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.11.4" 495 | }, 496 | "vscode": { 497 | "interpreter": { 498 | "hash": "441c2ec70d9faeb70e7723f55150c6260f4a26a9c828b90915d3399002e14f43" 499 | } 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 2 504 | } 505 | -------------------------------------------------------------------------------- /docs/layouts/_default/_markup/render-codeblock-mermaid.html: -------------------------------------------------------------------------------- 1 |
2 | {{- .Inner | safeHTML }} 3 | </pre>
4 | {{ .Page.Store.Set "hasMermaid" true }} 5 | -------------------------------------------------------------------------------- /docs/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/favicon.ico -------------------------------------------------------------------------------- /docs/static/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/images/favicon.png -------------------------------------------------------------------------------- /docs/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/images/logo.png -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/archetypes/blog.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "{{ replace .Name "-" " " | title }}" 3 | date = "{{ .Date }}" 4 | 5 | # 6 | # description is optional 7 | # 8 | # description = "An optional description for SEO. If not provided, an automatically created summary will be used." 9 | 10 | tags = [{{ range $plural, $terms := .Site.Taxonomies }}{{ range $term, $val := $terms }}"{{ printf "%s" $term }}",{{ end }}{{ end }}] 11 | +++ 12 | 13 | This is a page about »{{ replace .Name "-" " " | title }}«. 14 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/archetypes/default.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "{{ replace .Name "-" " " | title }}" 3 | date = "{{ .Date }}" 4 | 5 | # 6 | # Set menu to "main" to add this page to 7 | # the main menu on top of the page 8 | # 9 | menu = "main" 10 | 11 | # 12 | # description is optional 13 | # 14 | # description = "An optional description for SEO. If not provided, an automatically created summary will be used." 15 | 16 | # 17 | # tags are optional 18 | # 19 | # tags = [{{ range $plural, $terms := .Site.Taxonomies }}{{ range $term, $val := $terms }}"{{ printf "%s" $term }}",{{ end }}{{ end }}] 20 | +++ 21 | 22 | This is a page about »{{ replace .Name "-" " " | title }}«. 23 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/404.html: -------------------------------------------------------------------------------- 1 | {{ define "title" }}404{{ end }} 2 | 3 | {{ define "main" }} 4 |

<h1>404</h1>

5 |

<p>ʕノ•ᴥ•ʔノ ︵ ┻━┻</p>

6 | {{ end }} 7 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/_default/baseof.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{- partial "favicon.html" . -}} 8 | 9 | {{- block "title" . }}{{ with .Title }}{{ . }} | {{ end }}{{ .Site.Title 10 | }}{{- end }} 11 | 12 | 13 | {{- partial "seo_tags.html" . -}} 14 | 15 | 16 | {{ with .OutputFormats.Get "rss" -}} {{ printf ` 17 | 18 | ` .Rel .MediaType.Type .Permalink $.Site.Title | safeHTML }} {{ end -}} {{- 19 | partial "style.html" . -}} 20 | 21 | 24 | {{- partial "custom_head.html" . -}} {{- if not (eq hugo.Environment 25 | "development") -}} 26 | 31 | {{- end -}} 32 | 33 | 34 | 35 |
{{- partial "header.html" . -}}
36 |
{{- block "main" . }}{{- end }}
37 | 38 | 39 | 42 | {{- partial "custom_body.html" . -}} 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/_default/list.html: -------------------------------------------------------------------------------- 1 | {{ define "main" }} 2 | 3 | {{ if .Data.Singular }} 4 |

Filtering for "{{ .Title }}"

5 | 6 | Remove filter 7 | 8 | {{ end }} 9 | 27 | {{ if .Data.Singular }} 28 | {{else}} 29 | 30 |
31 | {{ range .Site.Taxonomies.tags }} 32 | #{{ .Page.Title }}  33 | {{ end }} 34 |
35 |
36 | {{ end }} 37 |
38 | {{ end }} 39 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/_default/single.html: -------------------------------------------------------------------------------- 1 | {{ define "main" }} 2 | {{ if eq .Type "blog" }}{{ if not .Params.menu }} 3 |

4 | 5 | 8 | 9 |

10 | {{ end }}{{ end }} 11 | 12 |

{{ .Title }}

13 | {{ if and (gt .WordCount 400 ) (.Params.toc) }} 14 |

Table of contents

15 | {{.TableOfContents}} 16 | {{ end }} 17 | {{ .Content }} 18 |
19 |

20 | {{ range (.GetTerms "tags") }} 21 | #{{ .LinkTitle }} 22 | {{ end }} 23 |

24 | {{ end }} 25 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/index.html: -------------------------------------------------------------------------------- 1 | {{ define "main" }} 2 | {{ .Content }} 3 | 4 | {{ if .Page.Store.Get "hasMermaid" }} 5 | 6 | 9 | {{ end }} 10 | 11 | {{ end }} 12 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/custom_body.html: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/custom_head.html: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/favicon.html: -------------------------------------------------------------------------------- 1 | {{ with .Site.Params.favicon }} 2 | {{ end }} 3 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | {{ if ne .Site.Params.hideMadeWithLine true }}Made with Hugo ʕ•ᴥ•ʔ Bear{{ end }} 2 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 |

{{ .Site.Title }} foo

3 |
4 | 5 |
6 |
7 |
8 | 9 |
10 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/nav.html: -------------------------------------------------------------------------------- 1 | {{ range .Site.Menus.main }} 2 | {{ index .Page.Aliases 0 | upper }} 3 | {{ end }} 4 | GitHub 5 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/seo_tags.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{ template "_internal/opengraph.html" . }} 8 | 9 | 10 | {{ template "_internal/twitter_cards.html" . }} 11 | 12 | 13 | {{ template "_internal/schema.html" . }} 14 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/style.html: -------------------------------------------------------------------------------- 1 | 143 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/robots.txt: -------------------------------------------------------------------------------- 1 | User-Agent: * 2 | Sitemap: {{ "sitemap.xml" | absURL }} 3 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/theme.toml: -------------------------------------------------------------------------------- 1 | # theme.toml template for a Hugo theme 2 | # See https://github.com/gohugoio/hugoThemes#themetoml for an example 3 | 4 | name = "Hugo Bear Blog" 5 | license = "MIT" 6 | licenselink = "https://github.com/janraasch/hugo-bearblog/blob/master/LICENSE" 7 | description = "A Hugo theme based on »Bear Blog«. Free, no-nonsense, super-fast blogging. »Bear Blog« now includes a dark color scheme to support dark mode!" 8 | homepage = "https://github.com/janraasch/hugo-bearblog" 9 | demosite = "https://janraasch.github.io/hugo-bearblog/" 10 | tags = ["blog", "responsive", "minimal", "seo", "clean", "simple", "light", "minimalist", "mobile", "fast", "white", "minimalistic", "reading", "dark mode"] 11 | features = ["favicon", "seo", "no stylesheets", "no javascript", "rss", "dark mode"] 12 | min_version = "0.73.0" 13 | # https://gohugo.io/content-management/taxonomies#default-taxonomies 14 | # https://gohugo.io/templates/taxonomy-templates/#example-list-tags-in-a-single-page-template 15 | # https://gohugo.io/templates/taxonomy-templates/#example-list-all-site-tags 16 | 17 | [author] 18 | name = "Jan Raasch" 19 | homepage = "https://www.janraasch.com" 20 | 21 | # If porting an existing theme 22 | [original] 23 | name = "ʕ•ᴥ•ʔ Bear Blog" 24 | homepage = "https://bearblog.dev" 25 | repo = "https://github.com/HermanMartinus/bearblog" 26 | -------------------------------------------------------------------------------- /figures/decastar.svg: -------------------------------------------------------------------------------- 1 | −6−5−4−3−2−1012345component 0 — 31.14%−2.5−2.0−1.5−1.0−0.50.00.51.01.52.02.53.0component 1 — 20.27%DecastarOlympicGcompetition 2 | -------------------------------------------------------------------------------- /prince/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.metadata 4 | 5 | from . 
import datasets 6 | from .ca import CA 7 | from .famd import FAMD 8 | from .gpa import GPA 9 | from .mca import MCA 10 | from .mfa import MFA 11 | from .pca import PCA 12 | 13 | __version__ = importlib.metadata.version("prince") 14 | __all__ = ["CA", "FAMD", "MCA", "MFA", "PCA", "GPA", "datasets"] 15 | -------------------------------------------------------------------------------- /prince/ca.py: -------------------------------------------------------------------------------- 1 | """Correspondence Analysis (CA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import functools 6 | 7 | import altair as alt 8 | import numpy as np 9 | import pandas as pd 10 | from scipy import sparse 11 | from sklearn.utils import check_array 12 | 13 | from prince import svd, utils 14 | 15 | 16 | def select_active_columns(method): 17 | @functools.wraps(method) 18 | def _impl(self, X=None, *method_args, **method_kwargs): 19 | if hasattr(self, "active_cols_") and isinstance(X, pd.DataFrame): 20 | return method(self, X[self.active_cols_], *method_args, **method_kwargs) 21 | return method(self, X, *method_args, **method_kwargs) 22 | 23 | return _impl 24 | 25 | 26 | def select_active_rows(method): 27 | @functools.wraps(method) 28 | def _impl(self, X=None, *method_args, **method_kwargs): 29 | if hasattr(self, "active_rows_") and isinstance(X, pd.DataFrame): 30 | return method(self, X.loc[self.active_rows_], *method_args, **method_kwargs) 31 | return method(self, X, *method_args, **method_kwargs) 32 | 33 | return _impl 34 | 35 | 36 | class CA(utils.EigenvaluesMixin): 37 | def __init__( 38 | self, 39 | n_components=2, 40 | n_iter=10, 41 | copy=True, 42 | check_input=True, 43 | random_state=None, 44 | engine="sklearn", 45 | ): 46 | self.n_components = n_components 47 | self.n_iter = n_iter 48 | self.copy = copy 49 | self.check_input = check_input 50 | self.random_state = random_state 51 | self.engine = engine 52 | 53 | @utils.check_is_dataframe_input 54 | def fit(self, X, y=None): 55 | # Check input 56 | if self.check_input: 57 | check_array(X) 58 | 59 | # Check all values are positive 60 | if (X < 0).any().any(): 61 | raise ValueError("All values in X should be positive") 62 | 63 | _, row_names, _, col_names = utils.make_labels_and_names(X) 64 | 65 | if isinstance(X, pd.DataFrame): 66 | X = X.to_numpy() 67 | 68 | if self.copy: 69 | X = np.copy(X) 70 | 71 | # Compute the correspondence matrix which contains the relative frequencies 72 | X = X.astype(float) / np.sum(X) 73 | 74 | # Compute row and column masses 75 | self.row_masses_ = pd.Series(X.sum(axis=1), index=row_names) 76 | self.col_masses_ = pd.Series(X.sum(axis=0), index=col_names) 77 | 78 | self.active_rows_ = self.row_masses_.index.unique() 79 | self.active_cols_ = self.col_masses_.index.unique() 80 | 81 | # Compute standardised residuals 82 | r = self.row_masses_.to_numpy() 83 | c = self.col_masses_.to_numpy() 84 | S = sparse.diags(r**-0.5) @ (X - np.outer(r, c)) @ sparse.diags(c**-0.5) 85 | 86 | # Compute SVD on the standardised residuals 87 | self.svd_ = svd.compute_svd( 88 | X=S, 89 | n_components=min(self.n_components, min(X.shape) - 1), 90 | n_iter=self.n_iter, 91 | random_state=self.random_state, 92 | engine=self.engine, 93 | ) 94 | 95 | # Compute total inertia 96 | self.total_inertia_ = np.einsum("ij,ji->", S, S.T) 97 | 98 | self.row_contributions_ = pd.DataFrame( 99 | sparse.diags(self.row_masses_.values) 100 | @ np.divide( 101 | # Same as row_coordinates(X) 102 | ( 103 | sparse.diags(self.row_masses_.values**-0.5) 104 | @ self.svd_.U 105 | @ 
sparse.diags(self.svd_.s) 106 | ) 107 | ** 2, 108 | self.eigenvalues_, 109 | out=np.zeros((len(self.row_masses_), len(self.eigenvalues_))), 110 | where=self.eigenvalues_ > 0, 111 | ), 112 | index=self.row_masses_.index, 113 | ) 114 | 115 | self.column_contributions_ = pd.DataFrame( 116 | sparse.diags(self.col_masses_.values) 117 | @ np.divide( 118 | # Same as col_coordinates(X) 119 | ( 120 | sparse.diags(self.col_masses_.values**-0.5) 121 | @ self.svd_.V.T 122 | @ sparse.diags(self.svd_.s) 123 | ) 124 | ** 2, 125 | self.eigenvalues_, 126 | out=np.zeros((len(self.col_masses_), len(self.eigenvalues_))), 127 | where=self.eigenvalues_ > 0, 128 | ), 129 | index=self.col_masses_.index, 130 | ) 131 | 132 | return self 133 | 134 | @property 135 | @utils.check_is_fitted 136 | def eigenvalues_(self): 137 | """Returns the eigenvalues associated with each principal component.""" 138 | return np.square(self.svd_.s) 139 | 140 | @utils.check_is_dataframe_input 141 | @select_active_columns 142 | def row_coordinates(self, X): 143 | """The row principal coordinates.""" 144 | 145 | _, row_names, _, _ = utils.make_labels_and_names(X) 146 | index_name = X.index.name 147 | 148 | if isinstance(X, pd.DataFrame): 149 | try: 150 | X = X.sparse.to_coo().astype(float) 151 | except AttributeError: 152 | X = X.to_numpy() 153 | 154 | if self.copy: 155 | X = X.copy() 156 | 157 | # Normalise the rows so that they sum up to 1 158 | if isinstance(X, np.ndarray): 159 | X = X / X.sum(axis=1)[:, None] 160 | else: 161 | X = X / X.sum(axis=1) 162 | 163 | return pd.DataFrame( 164 | data=X @ sparse.diags(self.col_masses_.to_numpy() ** -0.5) @ self.svd_.V.T, 165 | index=pd.Index(row_names, name=index_name), 166 | ) 167 | 168 | @utils.check_is_dataframe_input 169 | @select_active_columns 170 | def row_cosine_similarities(self, X): 171 | """Return the cos2 for each row against the dimensions. 172 | 173 | The cos2 value gives an indicator of the accuracy of the row projection on the dimension. 174 | 175 | Values above 0.5 usually means that the row is relatively accurately well projected onto that dimension. Its often 176 | used to identify which factor/dimension is important for a given element as the cos2 can be interpreted as the proportion 177 | of the variance of the element attributed to a particular factor. 
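        In terms of the quantities computed below, the cos2 of row `i` on component `k` is
        `F[i, k] ** 2 / d2(i)`, where `F` holds the row principal coordinates and `d2(i)` is the
        squared chi-square distance between the profile of row `i` and the average row profile.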
178 | 179 | """ 180 | F = self.row_coordinates(X) 181 | return self._row_cosine_similarities(X, F) 182 | 183 | @select_active_columns 184 | def _row_cosine_similarities(self, X, F): 185 | # Active 186 | X_act = X.loc[self.active_rows_] 187 | X_act = X_act / X_act.sum().sum() 188 | marge_col = X_act.sum(axis=0) 189 | Tc = X_act.div(X_act.sum(axis=1), axis=0).div(marge_col, axis=1) - 1 190 | dist2_row = (Tc**2).mul(marge_col, axis=1).sum(axis=1) 191 | 192 | # Supplementary 193 | X_sup = X.loc[X.index.difference(self.active_rows_, sort=False)] 194 | X_sup = X_sup.div(X_sup.sum(axis=1), axis=0) 195 | dist2_row_sup = ((X_sup - marge_col) ** 2).div(marge_col, axis=1).sum(axis=1) 196 | 197 | dist2_row = pd.concat((dist2_row, dist2_row_sup)) 198 | 199 | # Can't use pandas.div method because it doesn't support duplicate indices 200 | return F**2 / dist2_row.to_numpy()[:, None] 201 | 202 | @utils.check_is_dataframe_input 203 | @select_active_rows 204 | def column_coordinates(self, X): 205 | """The column principal coordinates.""" 206 | 207 | _, _, _, col_names = utils.make_labels_and_names(X) 208 | index_name = X.columns.name 209 | 210 | if isinstance(X, pd.DataFrame): 211 | is_sparse = X.dtypes.apply(lambda dtype: isinstance(dtype, pd.SparseDtype)).all() 212 | if is_sparse: 213 | X = X.sparse.to_coo() 214 | else: 215 | X = X.to_numpy() 216 | 217 | if self.copy: 218 | X = X.copy() 219 | 220 | # Transpose and make sure the rows sum up to 1 221 | if isinstance(X, np.ndarray): 222 | X = X.T / X.T.sum(axis=1)[:, None] 223 | else: 224 | X = X.T / X.T.sum(axis=1) 225 | 226 | return pd.DataFrame( 227 | data=X @ sparse.diags(self.row_masses_.to_numpy() ** -0.5) @ self.svd_.U, 228 | index=pd.Index(col_names, name=index_name), 229 | ) 230 | 231 | @utils.check_is_dataframe_input 232 | @select_active_rows 233 | def column_cosine_similarities(self, X): 234 | """Return the cos2 for each column against the dimensions. 235 | 236 | The cos2 value gives an indicator of the accuracy of the column projection on the dimension. 237 | 238 | Values above 0.5 usually means that the column is relatively accurately well projected onto that dimension. Its often 239 | used to identify which factor/dimension is important for a given element as the cos2 can be interpreted as the proportion 240 | of the variance of the element attributed to a particular factor. 
241 | """ 242 | G = self.column_coordinates(X) 243 | return self._column_cosine_similarities(X, G) 244 | 245 | @select_active_rows 246 | def _column_cosine_similarities(self, X, G): 247 | # Active 248 | X_act = X[self.active_cols_] 249 | X_act = X_act / X_act.sum().sum() 250 | marge_row = X_act.sum(axis=1) 251 | Tc = X_act.div(marge_row, axis=0).div(X_act.sum(axis=0), axis=1) - 1 252 | dist2_col = (Tc**2).mul(marge_row, axis=0).sum(axis=0) 253 | 254 | # Supplementary 255 | X_sup = X[X.columns.difference(self.active_cols_, sort=False)] 256 | X_sup = X_sup.div(X_sup.sum(axis=0), axis=1) 257 | dist2_col_sup = ((X_sup.sub(marge_row, axis=0)) ** 2).div(marge_row, axis=0).sum(axis=0) 258 | 259 | dist2_col = pd.concat((dist2_col, dist2_col_sup)) 260 | return (G**2).div(dist2_col, axis=0) 261 | 262 | @utils.check_is_dataframe_input 263 | @utils.check_is_fitted 264 | def plot( 265 | self, 266 | X, 267 | x_component=0, 268 | y_component=1, 269 | show_row_markers=True, 270 | show_column_markers=True, 271 | show_row_labels=False, 272 | show_column_labels=False, 273 | ): 274 | eig = self._eigenvalues_summary.to_dict(orient="index") 275 | 276 | row_chart_markers = None 277 | row_chart_labels = None 278 | column_chart_markers = None 279 | column_chart_labels = None 280 | 281 | if show_row_markers or show_row_labels: 282 | row_coords = self.row_coordinates(X) 283 | row_coords.columns = [f"component {i}" for i in row_coords.columns] 284 | row_coords = row_coords.assign( 285 | variable=row_coords.index.name or "row", 286 | value=row_coords.index.astype(str), 287 | ) 288 | row_labels = pd.Series(row_coords.index, index=row_coords.index) 289 | row_chart = alt.Chart(row_coords.assign(label=row_labels)).encode( 290 | x=alt.X( 291 | f"component {x_component}", 292 | scale=alt.Scale(zero=False), 293 | axis=alt.Axis( 294 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 295 | ), 296 | ), 297 | y=alt.Y( 298 | f"component {y_component}", 299 | scale=alt.Scale(zero=False), 300 | axis=alt.Axis( 301 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 302 | ), 303 | ), 304 | ) 305 | row_chart_markers = row_chart.mark_circle(size=50 if show_row_markers else 0).encode( 306 | color="variable", 307 | tooltip=[ 308 | "variable", 309 | "value", 310 | f"component {x_component}", 311 | f"component {y_component}", 312 | ], 313 | ) 314 | if show_row_labels: 315 | row_chart_labels = row_chart.mark_text().encode(text="label:N") 316 | 317 | if show_column_markers or show_column_labels: 318 | column_coords = self.column_coordinates(X) 319 | column_coords.columns = [f"component {i}" for i in column_coords.columns] 320 | column_coords = column_coords.assign( 321 | variable=column_coords.index.name or "column", 322 | value=column_coords.index.astype(str), 323 | ) 324 | column_labels = pd.Series(column_coords.index, index=column_coords.index) 325 | column_chart = alt.Chart(column_coords.assign(label=column_labels)).encode( 326 | x=alt.X( 327 | f"component {x_component}", 328 | scale=alt.Scale(zero=False), 329 | axis=alt.Axis( 330 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 331 | ), 332 | ), 333 | y=alt.Y( 334 | f"component {y_component}", 335 | scale=alt.Scale(zero=False), 336 | axis=alt.Axis( 337 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 338 | ), 339 | ), 340 | ) 341 | column_chart_markers = column_chart.mark_circle( 342 | size=50 if show_column_markers else 0 343 | ).encode( 344 | 
color="variable", 345 | tooltip=[ 346 | "variable", 347 | "value", 348 | f"component {x_component}", 349 | f"component {y_component}", 350 | ], 351 | ) 352 | if show_column_labels: 353 | column_chart_labels = column_chart.mark_text().encode(text="label:N") 354 | 355 | charts = filter( 356 | None, 357 | ( 358 | row_chart_markers, 359 | row_chart_labels, 360 | column_chart_markers, 361 | column_chart_labels, 362 | ), 363 | ) 364 | 365 | return alt.layer(*charts).interactive() 366 | -------------------------------------------------------------------------------- /prince/datasets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | 5 | import pandas as pd 6 | 7 | DATASETS_DIR = pathlib.Path(__file__).parent / "datasets" 8 | 9 | 10 | def load_energy_mix(year=2019, normalize=True): 11 | """Per capita energy mix by country in 2019. 12 | 13 | Each row corresponds to a country. There is one column for each energy source. 14 | A value corresponds to the average energy consumption of a source per capita. 15 | For instance, in France, every citizen consumed 15,186 kWh of nuclear energy. 16 | 17 | This data comes from https://ourworldindata.org/energy-mix 18 | 19 | Parameters 20 | ---------- 21 | year 22 | The year the study was made. 23 | normalize 24 | Whether or not to normalize the kWh by country. 25 | 26 | """ 27 | 28 | df = ( 29 | pd.read_csv(DATASETS_DIR / "per-capita-energy-stacked.csv") 30 | .query("Year == @year") 31 | .query("Entity not in ['Africa', 'Europe', 'North America', 'World']") 32 | .drop(columns=["Code", "Year"]) 33 | .rename(columns={"Entity": "Country"}) 34 | .rename(columns=lambda x: x.replace(" per capita (kWh)", "").lower()) 35 | .set_index(["continent", "country"]) 36 | ) 37 | if normalize: 38 | return df.div(df.sum(axis="columns"), axis="rows") 39 | return df 40 | 41 | 42 | def load_decathlon(): 43 | """The Decathlon dataset from FactoMineR.""" 44 | decathlon = pd.read_csv(DATASETS_DIR / "decathlon.csv") 45 | decathlon.columns = ["athlete", *map(str.lower, decathlon.columns[1:])] 46 | decathlon.athlete = decathlon.athlete.apply(str.title) 47 | decathlon = decathlon.set_index(["competition", "athlete"]) 48 | return decathlon 49 | 50 | 51 | def load_french_elections(): 52 | """Voting data for the 2022 French elections, by region. 53 | 54 | The [original dataset](https://www.data.gouv.fr/fr/datasets/resultats-du-premier-tour-de-lelection-presidentielle-2022-par-commune-et-par-departement/#resources) 55 | has been transformed into a contingency matrix. The latter tallies the number of votes for the 56 | 12 candidates across all 18 regions. The number of blank and abstentions are also recorded. 57 | More information about these regions, including a map, can be found 58 | [on Wikipedia](https://www.wikiwand.com/fr/Région_française). 
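    A minimal usage sketch (it only relies on the public API shown elsewhere in this package):

        import prince

        dataset = prince.datasets.load_french_elections()  # 18 regions x (12 candidates + Abstention + Blank)
        ca = prince.CA(n_components=2)
        ca = ca.fit(dataset)
        ca.row_coordinates(dataset)     # one point per region
        ca.column_coordinates(dataset)  # one point per candidate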
59 | 60 | """ 61 | dataset = pd.read_csv(DATASETS_DIR / "02-resultats-par-region.csv") 62 | cont = dataset.pivot(index="reg_name", columns="cand_nom", values="cand_nb_voix") 63 | cont["Abstention"] = dataset.groupby("reg_name")["abstention_nb"].min() 64 | cont["Blank"] = dataset.groupby("reg_name")["blancs_nb"].min() 65 | cont.columns = [c.title() for c in cont.columns] 66 | cont.index.name = "region" 67 | cont.columns.name = "candidate" 68 | return cont 69 | 70 | 71 | def load_punctuation_marks(): 72 | """Punctuation marks of six French writers.""" 73 | return pd.read_csv(DATASETS_DIR / "punctuation_marks.csv", index_col="author") 74 | 75 | 76 | def load_hearthstone_cards(): 77 | """Hearthstone standard cards. 78 | 79 | Source: https://gist.github.com/MaxHalford/32ed2c80672d7391ec5b4e6f291f14c1 80 | 81 | """ 82 | return pd.read_csv(DATASETS_DIR / "hearthstone_cards.csv", index_col="id") 83 | 84 | 85 | def load_burgundy_wines(): 86 | """Burgundy wines dataset. 87 | 88 | Source: https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf 89 | 90 | """ 91 | wines = pd.DataFrame( 92 | data=[ 93 | ["Yes", "No", "No", "Yes", "No", "No", "No", "No", "No", "No"], 94 | ["No", "Maybe", "Yes", "No", "Yes", "Maybe", "Yes", "No", "Yes", "Yes"], 95 | ["No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes"], 96 | ["No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"], 97 | ["Yes", "No", "No", "Yes", "No", "No", "No", "Yes", "No", "No"], 98 | ["Yes", "Maybe", "No", "Yes", "No", "Maybe", "No", "Yes", "No", "No"], 99 | ], 100 | columns=pd.MultiIndex.from_tuples( 101 | [ 102 | ("Expert 1", "Fruity"), 103 | ("Expert 1", "Woody"), 104 | ("Expert 1", "Coffee"), 105 | ("Expert 2", "Red fruit"), 106 | ("Expert 2", "Roasted"), 107 | ("Expert 2", "Vanillin"), 108 | ("Expert 2", "Woody"), 109 | ("Expert 3", "Fruity"), 110 | ("Expert 3", "Butter"), 111 | ("Expert 3", "Woody"), 112 | ], 113 | names=("expert", "aspect"), 114 | ), 115 | index=[f"Wine {i + 1}" for i in range(6)], 116 | ) 117 | wines.insert(0, "Oak type", [1, 2, 2, 2, 1, 1]) 118 | return wines 119 | 120 | 121 | def load_beers(): 122 | """Beers dataset. 123 | 124 | The data is taken from https://github.com/philipperemy/beer-dataset. 125 | 126 | """ 127 | return pd.read_csv(DATASETS_DIR / "beers.csv.zip", index_col="name") 128 | 129 | 130 | def load_premier_league(): 131 | """Premier League dataset. 132 | 133 | The data is taken from Wikipedia, using pd.read_html. 
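    The bundled CSV has two header rows, so the loaded frame carries a (season, statistic)
    column MultiIndex. A small illustrative sketch:

        df = load_premier_league()
        df["2023-24"]                          # one season's W/D/L/GF/GA/Pts table
        df.loc["Arsenal", ("2023-24", "Pts")]  # a single cell of the standings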
134 | 135 | """ 136 | return pd.read_csv(DATASETS_DIR / "premier_league.csv", index_col=0, header=[0, 1]) 137 | -------------------------------------------------------------------------------- /prince/datasets/beers.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/prince/datasets/beers.csv.zip -------------------------------------------------------------------------------- /prince/datasets/decathlon.csv: -------------------------------------------------------------------------------- 1 | "","100m","Long.jump","Shot.put","High.jump","400m","110m.hurdle","Discus","Pole.vault","Javeline","1500m","Rank","Points","Competition" 2 | "SEBRLE",11.04,7.58,14.83,2.07,49.81,14.69,43.75,5.02,63.19,291.7,1,8217,"Decastar" 3 | "CLAY",10.76,7.4,14.26,1.86,49.37,14.05,50.72,4.92,60.15,301.5,2,8122,"Decastar" 4 | "KARPOV",11.02,7.3,14.77,2.04,48.37,14.09,48.95,4.92,50.31,300.2,3,8099,"Decastar" 5 | "BERNARD",11.02,7.23,14.25,1.92,48.93,14.99,40.87,5.32,62.77,280.1,4,8067,"Decastar" 6 | "YURKOV",11.34,7.09,15.19,2.1,50.42,15.31,46.26,4.72,63.44,276.4,5,8036,"Decastar" 7 | "WARNERS",11.11,7.6,14.31,1.98,48.68,14.23,41.1,4.92,51.77,278.1,6,8030,"Decastar" 8 | "ZSIVOCZKY",11.13,7.3,13.48,2.01,48.62,14.17,45.67,4.42,55.37,268,7,8004,"Decastar" 9 | "McMULLEN",10.83,7.31,13.76,2.13,49.91,14.38,44.41,4.42,56.37,285.1,8,7995,"Decastar" 10 | "MARTINEAU",11.64,6.81,14.57,1.95,50.14,14.93,47.6,4.92,52.33,262.1,9,7802,"Decastar" 11 | "HERNU",11.37,7.56,14.41,1.86,51.1,15.06,44.99,4.82,57.19,285.1,10,7733,"Decastar" 12 | "BARRAS",11.33,6.97,14.09,1.95,49.48,14.48,42.1,4.72,55.4,282,11,7708,"Decastar" 13 | "NOOL",11.33,7.27,12.68,1.98,49.2,15.29,37.92,4.62,57.44,266.6,12,7651,"Decastar" 14 | "BOURGUIGNON",11.36,6.8,13.46,1.86,51.16,15.67,40.49,5.02,54.68,291.7,13,7313,"Decastar" 15 | "Sebrle",10.85,7.84,16.36,2.12,48.36,14.05,48.72,5,70.52,280.01,1,8893,"OlympicG" 16 | "Clay",10.44,7.96,15.23,2.06,49.19,14.13,50.11,4.9,69.71,282,2,8820,"OlympicG" 17 | "Karpov",10.5,7.81,15.93,2.09,46.81,13.97,51.65,4.6,55.54,278.11,3,8725,"OlympicG" 18 | "Macey",10.89,7.47,15.73,2.15,48.97,14.56,48.34,4.4,58.46,265.42,4,8414,"OlympicG" 19 | "Warners",10.62,7.74,14.48,1.97,47.97,14.01,43.73,4.9,55.39,278.05,5,8343,"OlympicG" 20 | "Zsivoczky",10.91,7.14,15.31,2.12,49.4,14.95,45.62,4.7,63.45,269.54,6,8287,"OlympicG" 21 | "Hernu",10.97,7.19,14.65,2.03,48.73,14.25,44.72,4.8,57.76,264.35,7,8237,"OlympicG" 22 | "Nool",10.8,7.53,14.26,1.88,48.81,14.8,42.05,5.4,61.33,276.33,8,8235,"OlympicG" 23 | "Bernard",10.69,7.48,14.8,2.12,49.13,14.17,44.75,4.4,55.27,276.31,9,8225,"OlympicG" 24 | "Schwarzl",10.98,7.49,14.01,1.94,49.76,14.25,42.43,5.1,56.32,273.56,10,8102,"OlympicG" 25 | "Pogorelov",10.95,7.31,15.1,2.06,50.79,14.21,44.6,5,53.45,287.63,11,8084,"OlympicG" 26 | "Schoenbeck",10.9,7.3,14.77,1.88,50.3,14.34,44.41,5,60.89,278.82,12,8077,"OlympicG" 27 | "Barras",11.14,6.99,14.91,1.94,49.41,14.37,44.83,4.6,64.55,267.09,13,8067,"OlympicG" 28 | "Smith",10.85,6.81,15.24,1.91,49.27,14.01,49.02,4.2,61.52,272.74,14,8023,"OlympicG" 29 | "Averyanov",10.55,7.34,14.44,1.94,49.72,14.39,39.88,4.8,54.51,271.02,15,8021,"OlympicG" 30 | "Ojaniemi",10.68,7.5,14.97,1.94,49.12,15.01,40.35,4.6,59.26,275.71,16,8006,"OlympicG" 31 | "Smirnov",10.89,7.07,13.88,1.94,49.11,14.77,42.47,4.7,60.88,263.31,17,7993,"OlympicG" 32 | "Qi",11.06,7.34,13.55,1.97,49.65,14.78,45.13,4.5,60.79,272.63,18,7934,"OlympicG" 33 | 
"Drews",10.87,7.38,13.07,1.88,48.51,14.01,40.11,5,51.53,274.21,19,7926,"OlympicG" 34 | "Parkhomenko",11.14,6.61,15.69,2.03,51.04,14.88,41.9,4.8,65.82,277.94,20,7918,"OlympicG" 35 | "Terek",10.92,6.94,15.15,1.94,49.56,15.12,45.62,5.3,50.62,290.36,21,7893,"OlympicG" 36 | "Gomez",11.08,7.26,14.57,1.85,48.61,14.41,40.95,4.4,60.71,269.7,22,7865,"OlympicG" 37 | "Turi",11.08,6.91,13.62,2.03,51.67,14.26,39.83,4.8,59.34,290.01,23,7708,"OlympicG" 38 | "Lorenzo",11.1,7.03,13.22,1.85,49.34,15.38,40.22,4.5,58.36,263.08,24,7592,"OlympicG" 39 | "Karlivans",11.33,7.26,13.3,1.97,50.54,14.98,43.34,4.5,52.92,278.67,25,7583,"OlympicG" 40 | "Korkizoglou",10.86,7.07,14.81,1.94,51.16,14.96,46.07,4.7,53.05,317,26,7573,"OlympicG" 41 | "Uldal",11.23,6.99,13.53,1.85,50.95,15.09,43.01,4.5,60,281.7,27,7495,"OlympicG" 42 | "Casarsa",11.36,6.68,14.92,1.94,53.2,15.39,48.66,4.4,58.62,296.12,28,7404,"OlympicG" 43 | -------------------------------------------------------------------------------- /prince/datasets/premier_league.csv: -------------------------------------------------------------------------------- 1 | ,2021-22,2021-22,2021-22,2021-22,2021-22,2021-22,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24 2 | ,W,D,L,GF,GA,Pts,W,D,L,GF,GA,Pts,W,D,L,GF,GA,Pts 3 | Team,,,,,,,,,,,,,,,,,, 4 | Arsenal,22,3,13,61,48,69,26,6,6,88,43,84,28,5,5,91,29,89 5 | Aston Villa,13,6,19,52,54,45,18,7,13,51,46,61,20,8,10,76,61,68 6 | Brentford,13,7,18,48,56,46,15,14,9,58,46,59,10,9,19,56,65,39 7 | Brighton & Hove Albion,12,15,11,42,44,51,18,8,12,72,53,62,12,12,14,55,62,48 8 | Chelsea,21,11,6,76,33,74,11,11,16,38,47,44,18,9,11,77,63,63 9 | Crystal Palace,11,15,12,50,46,48,11,12,15,40,49,45,13,10,15,57,58,49 10 | Everton,11,6,21,43,66,39,8,12,18,34,57,36,13,9,16,40,51,40 11 | Liverpool,28,8,2,94,26,92,19,10,9,75,47,67,24,10,4,86,41,82 12 | Manchester City,29,6,3,99,26,93,28,5,5,94,33,89,28,7,3,96,34,91 13 | Manchester United,16,10,12,57,57,58,23,6,9,58,43,75,18,6,14,57,58,60 14 | Newcastle United,13,10,15,44,62,49,19,14,5,68,33,71,18,6,14,85,62,60 15 | Tottenham Hotspur,22,5,11,69,40,71,18,6,14,70,63,60,20,6,12,74,61,66 16 | West Ham United,16,8,14,60,51,56,11,7,20,42,55,40,14,10,14,60,74,52 17 | Wolverhampton Wanderers,15,6,17,38,43,51,11,8,19,31,58,41,13,7,18,50,65,46 18 | -------------------------------------------------------------------------------- /prince/datasets/punctuation_marks.csv: -------------------------------------------------------------------------------- 1 | "author","period","comma","other" 2 | "Rousseau",7836,13112,6026 3 | "Chateaubriand",53655,102383,42413 4 | "Hugo",115615,184541,59226 5 | "Zola",161926,340479,62754 6 | "Proust",38177,105101,12670 7 | "Giraudoux",46371,58367,14299 8 | -------------------------------------------------------------------------------- /prince/famd.py: -------------------------------------------------------------------------------- 1 | """Factor Analysis of Mixed Data (FAMD)""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import sklearn.utils 8 | from sklearn import preprocessing 9 | 10 | from prince import pca, utils 11 | 12 | 13 | class FAMD(pca.PCA): 14 | def __init__( 15 | self, 16 | n_components=2, 17 | n_iter=3, 18 | copy=True, 19 | check_input=True, 20 | random_state=None, 21 | engine="sklearn", 22 | handle_unknown="error", 23 | ): 24 | super().__init__( 25 | rescale_with_mean=True, 26 | rescale_with_std=False, 27 | n_components=n_components, 28 | n_iter=n_iter, 29 | copy=copy, 30 | 
check_input=check_input, 31 | random_state=random_state, 32 | engine=engine, 33 | ) 34 | self.handle_unknown = handle_unknown 35 | 36 | def _check_input(self, X): 37 | if self.check_input: 38 | sklearn.utils.check_array(X, dtype=[str, "numeric"]) 39 | 40 | @utils.check_is_dataframe_input 41 | def fit(self, X, y=None): 42 | # Separate numerical columns from categorical columns 43 | self.num_cols_ = X.select_dtypes(include=["float"]).columns.tolist() 44 | if not self.num_cols_: 45 | raise ValueError("All variables are qualitative: MCA should be used") 46 | self.cat_cols_ = X.columns.difference(self.num_cols_).tolist() 47 | if not self.cat_cols_: 48 | raise ValueError("All variables are quantitative: PCA should be used") 49 | 50 | # Preprocess numerical columns 51 | X_num = X[self.num_cols_].copy() 52 | self.num_scaler_ = preprocessing.StandardScaler().fit(X_num) 53 | X_num[:] = self.num_scaler_.transform(X_num) 54 | 55 | # Preprocess categorical columns 56 | X_cat = X[self.cat_cols_] 57 | self.cat_scaler_ = preprocessing.OneHotEncoder(handle_unknown=self.handle_unknown).fit( 58 | X_cat 59 | ) 60 | X_cat_oh = pd.DataFrame.sparse.from_spmatrix( 61 | self.cat_scaler_.transform(X_cat), 62 | index=X_cat.index, 63 | columns=self.cat_scaler_.get_feature_names_out(self.cat_cols_), 64 | ) 65 | prop = X_cat_oh.sum() / X_cat_oh.sum().sum() * 2 66 | X_cat_oh_norm = X_cat_oh.sub(X_cat_oh.mean(axis="rows")).div(prop**0.5, axis="columns") 67 | 68 | # PCA.fit doesn't work with sparse matrices. Well, it accepts them, but it densifies them. 69 | # We pre-densify them here to avoid a warning. 70 | # TODO: In the future, PCA should be able to handle sparse matrices. 71 | X_cat_oh_norm = X_cat_oh_norm.sparse.to_dense() 72 | 73 | Z = pd.concat([X_num, X_cat_oh_norm], axis=1) 74 | super().fit(Z) 75 | 76 | # Determine column_coordinates_ 77 | # This is based on line 184 in FactoMineR's famd.R file 78 | rc = self.row_coordinates(X) 79 | weights = np.ones(len(X_cat_oh)) / len(X_cat_oh) 80 | norm = (rc**2).multiply(weights, axis=0).sum() 81 | eta2 = pd.DataFrame(index=rc.columns) 82 | for i, col in enumerate(self.cat_cols_): 83 | # TODO: there must be a better way to select a subset of the one-hot encoded matrix 84 | tt = X_cat_oh[[f"{col}_{i}" for i in self.cat_scaler_.categories_[i]]] 85 | ni = (tt / len(tt)).sum() 86 | eta2[col] = ( 87 | rc.apply(lambda x: (tt.multiply(x * weights, axis=0).sum() ** 2 / ni).sum()) / norm 88 | ).values 89 | self.column_coordinates_ = pd.concat( 90 | [self.column_coordinates_.loc[self.num_cols_] ** 2, eta2.T] 91 | ) 92 | self.column_coordinates_.columns.name = "component" 93 | self.column_coordinates_.index.name = "variable" 94 | 95 | return self 96 | 97 | @utils.check_is_dataframe_input 98 | @utils.check_is_fitted 99 | def row_coordinates(self, X): 100 | # Separate numerical columns from categorical columns 101 | X_num = X[self.num_cols_].copy() 102 | X_cat = X[self.cat_cols_] 103 | 104 | # Preprocess numerical columns 105 | X_num[:] = self.num_scaler_.transform(X_num) 106 | 107 | # Preprocess categorical columns 108 | X_cat = pd.DataFrame.sparse.from_spmatrix( 109 | self.cat_scaler_.transform(X_cat), 110 | index=X_cat.index, 111 | columns=self.cat_scaler_.get_feature_names_out(self.cat_cols_), 112 | ) 113 | prop = X_cat.sum() / X_cat.sum().sum() * 2 114 | X_cat = X_cat.sub(X_cat.mean(axis="rows")).div(prop**0.5, axis="columns") 115 | 116 | Z = pd.concat([X_num, X_cat.sparse.to_dense()], axis=1).fillna(0.0) 117 | 118 | return super().row_coordinates(Z) 119 | 120 | 
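    # A minimal usage sketch (illustrative): `df` is any pandas DataFrame that mixes float
    # columns with string/categorical columns, e.g. the beers dataset bundled with this package.
    #
    #     famd = FAMD(n_components=2)
    #     famd = famd.fit(df)
    #     famd.row_coordinates(df)    # row factor coordinates
    #     famd.column_coordinates_    # squared correlations / correlation ratios per variable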
@utils.check_is_dataframe_input 121 | @utils.check_is_fitted 122 | def inverse_transform(self, X): 123 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 124 | 125 | @utils.check_is_dataframe_input 126 | @utils.check_is_fitted 127 | def row_standard_coordinates(self, X): 128 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 129 | 130 | @utils.check_is_dataframe_input 131 | @utils.check_is_fitted 132 | def row_cosine_similarities(self, X): 133 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 134 | 135 | @utils.check_is_dataframe_input 136 | @utils.check_is_fitted 137 | def column_correlations(self, X): 138 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 139 | 140 | @utils.check_is_dataframe_input 141 | @utils.check_is_fitted 142 | def column_cosine_similarities_(self, X): 143 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 144 | 145 | @property 146 | def column_contributions_(self): 147 | return self.column_coordinates_ / self.eigenvalues_ 148 | -------------------------------------------------------------------------------- /prince/gpa.py: -------------------------------------------------------------------------------- 1 | """Generalized Procrustes Analysis (GPA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | from scipy.linalg import orthogonal_procrustes 7 | from scipy.spatial import procrustes 8 | from sklearn import base 9 | from sklearn import utils as sk_utils 10 | 11 | from prince import utils 12 | 13 | 14 | class GPA(base.BaseEstimator, base.TransformerMixin): 15 | """Generalized Procrustes Analysis (GPA). 16 | 17 | Algorithm outline: 18 | 19 | 1. Choose a reference shape. 20 | 2. Apply Procrustes Analysis to superimpose all shapes to the reference shape. 21 | 3. Compute the mean shape of the superimposed shapes. 22 | 4. Repeat steps 2 and 3 until convergence. 23 | 24 | Parameters 25 | ---------- 26 | max_iter 27 | The maximum number of Procrustes analysis iterations. 28 | tol 29 | The tolerance for the optimization; stops if the Procrustes distance decreases by less or 30 | equal to `tol` between iterations. 31 | init 32 | Method for initializing reference shape. 33 | - 'random' : choose reference shape from shape list 34 | - 'mean' : initialize reference shape as mean of shape list 35 | scale 36 | Whether to compute transformations with a scale component. 37 | copy 38 | Whether to copy data or perform the computations inplace. If False, data passed to fit are 39 | overwritten and running fit(X).transform(X) will not yield the expected results, 40 | use fit_transform(X) instead. 41 | check_input 42 | Whether to check the consistency of the inputs. 43 | random_state 44 | Determines random number generation for initialization when `init=='random'`. 
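    Examples
    --------
    A minimal sketch, using the three triangles from the GPA guide in the docs. `shapes` must be
    an array of shape `(n_shapes, n_points, n_dim)`:

        import numpy as np
        import prince

        shapes = np.array([
            [[0.0, 0.0], [0.0, 2.0], [1.0, 0.0]],
            [[3.0, 2.0], [1.0, 2.0], [3.0, 3.0]],
            [[0.0, 0.0], [0.0, 4.0], [2.0, 0.0]],
        ])
        gpa = prince.GPA()
        aligned = gpa.fit_transform(shapes)  # same array shape as the input, shapes superimposed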
45 | 46 | References 47 | ---------- 48 | https://wikipedia.org/wiki/Generalized_Procrustes_analysis 49 | https://medium.com/@olga_kravchenko/generalized-procrustes-analysis-with-python-numpy-c571e8e8a421 50 | 51 | """ 52 | 53 | def __init__( 54 | self, 55 | max_iter=10, 56 | tol=1e-4, 57 | init="random", 58 | scale=True, 59 | copy=True, 60 | check_input=True, 61 | random_state=None, 62 | ): 63 | self.max_iter = max_iter 64 | self.tol = tol 65 | self.init = init 66 | self.scale = scale 67 | self.copy = copy 68 | self.check_input = check_input 69 | self.random_state = random_state 70 | 71 | def fit(self, X, y=None): 72 | """Fit the model with X. 73 | 74 | The algorithm naturally fits and transforms at the same time, so this 75 | simply calls ``.fit_transform`` 76 | 77 | Parameters: 78 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 79 | shapes to match to each other. 80 | y: Ignored 81 | 82 | Returns: 83 | self (object): The instance itself 84 | """ 85 | self.fit_transform(X) 86 | 87 | return self 88 | 89 | @utils.check_is_fitted 90 | def transform(self, X): 91 | """Align X to the reference shape. 92 | 93 | Parameters: 94 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 95 | shapes to align to the refernce shape. 96 | 97 | Returns: 98 | X_new (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 99 | aligned shapes 100 | """ 101 | self._check_is_fitted() 102 | if self.check_input: 103 | self._check_input(X) 104 | 105 | X_new = np.empty(X.shape) 106 | for shape_idx in range(X.shape[0]): 107 | _, X_new[shape_idx], _ = procrustes(self.reference_shape, X[shape_idx]) 108 | 109 | return X_new 110 | 111 | def fit_transform(self, X, y=None): 112 | """Fit the model with X and return the aligned shapes. 113 | 114 | Parameters: 115 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 116 | shapes to match to each other. 
117 | y: Ignored 118 | 119 | Returns: 120 | X_new (array-like of shape (n_shapes, n_points, n_dim)): Matrix X 121 | of aligned shapes 122 | """ 123 | 124 | # Check input 125 | if self.check_input: 126 | self._check_input(X) 127 | 128 | # Copy data 129 | if self.copy: 130 | X = np.array(X, copy=True) 131 | 132 | # scikit-learn SLEP010 133 | n_shapes, n_points, n_dim = X.shape 134 | self.n_features_in_ = n_dim 135 | 136 | # Pick reference shape 137 | if self.init == "random": 138 | random_state = sk_utils.check_random_state(self.random_state) 139 | ref_shape_idx = random_state.randint(X.shape[0]) 140 | reference_shape = X[ref_shape_idx].copy() 141 | elif self.init == "mean": 142 | reference_shape = X.mean(axis=0) 143 | else: 144 | raise ValueError("init method must be one of ('random', 'mean')") 145 | 146 | for iter_idx in range(self.max_iter): 147 | # Align each shape to reference shape 148 | for shape_idx in range(X.shape[0]): 149 | if self.scale: 150 | _, X[shape_idx], _ = procrustes(reference_shape, X[shape_idx]) 151 | else: 152 | _, X[shape_idx] = unscaled_procrustes(reference_shape, X[shape_idx]) 153 | 154 | # Compute diagnostics 155 | mean_shape = X.mean(axis=0) 156 | procrustes_distance = np.linalg.norm(reference_shape - mean_shape) 157 | 158 | # Update reference shape 159 | reference_shape = mean_shape 160 | 161 | # Check for convergence 162 | if procrustes_distance <= self.tol: 163 | break 164 | 165 | # Store properties 166 | self._reference_shape = reference_shape 167 | 168 | # Return the aligned shapes 169 | return X 170 | 171 | def _check_input(self, X): 172 | sk_utils.check_array(X, allow_nd=True) 173 | if X.ndim != 3: 174 | raise ValueError("Expected 3-dimensional input of (n_shapes, n_points, n_dim)") 175 | 176 | def _check_is_fitted(self): 177 | sk_utils.validation.check_is_fitted(self, "_reference_shape") 178 | 179 | @property 180 | def reference_shape(self): 181 | """Returns the final reference shape.""" 182 | self._check_is_fitted() 183 | return self._reference_shape 184 | 185 | 186 | def unscaled_procrustes(reference, data): 187 | """Fit `data` to `reference` using procrustes analysis without scaling. 188 | Uses translation (mean-centering), reflection, and orthogonal rotation. 
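    Unlike `scipy.spatial.procrustes`, which also rescales both inputs to unit norm, this helper
    only centres, rotates and reflects; `GPA.fit_transform` falls back to it when `scale=False`.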
189 | 190 | Parameters: 191 | reference (array-like of shape (n_points, n_dim)): reference shape to 192 | fit `data` to 193 | data (array-like of shape (n_points, n_dim)): shape to align to 194 | `reference` 195 | 196 | Returns: 197 | reference_centered (np.ndarray of shape (n_points, n_dim)): 0-centered 198 | `reference` shape 199 | data_aligned (np.ndarray of shape (n_points, n_dim)): `data` aligned to 200 | the reference shape 201 | """ 202 | # Convert inputs to np.ndarray types 203 | reference = np.array(reference, dtype=np.double) 204 | data = np.array(data, dtype=np.double) 205 | 206 | # Translate data to the origin 207 | reference_centered = reference - reference.mean(axis=0) 208 | data_centered = data - data.mean(axis=0) 209 | 210 | # Rotate / reflect data to match reference 211 | # transform mtx2 to minimize disparity 212 | R, _ = orthogonal_procrustes(data_centered, reference_centered) 213 | data_aligned = data_centered @ R 214 | 215 | return reference_centered, data_aligned 216 | -------------------------------------------------------------------------------- /prince/mca.py: -------------------------------------------------------------------------------- 1 | """Multiple Correspondence Analysis (MCA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import sklearn.base 8 | import sklearn.preprocessing 9 | import sklearn.utils 10 | 11 | from prince import utils 12 | 13 | from . import ca 14 | 15 | 16 | class MCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, ca.CA): 17 | def __init__( 18 | self, 19 | n_components=2, 20 | n_iter=10, 21 | copy=True, 22 | check_input=True, 23 | random_state=None, 24 | engine="sklearn", 25 | one_hot=True, 26 | correction=None, 27 | ): 28 | if correction is not None: 29 | if correction not in {"benzecri", "greenacre"}: 30 | raise ValueError("correction must be either 'benzecri' or 'greenacre' if provided.") 31 | if not one_hot: 32 | raise ValueError( 33 | "correction can only be applied when one_hot is True. This is because the " 34 | "number of original variables is needed to apply the correction." 
35 | ) 36 | 37 | super().__init__( 38 | n_components=n_components, 39 | n_iter=n_iter, 40 | copy=copy, 41 | check_input=check_input, 42 | random_state=random_state, 43 | engine=engine, 44 | ) 45 | self.one_hot = one_hot 46 | self.correction = correction 47 | 48 | def _prepare(self, X): 49 | if self.one_hot: 50 | X = pd.get_dummies(X, columns=X.columns, prefix_sep="__") 51 | if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None: 52 | X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False) 53 | return X 54 | 55 | def get_feature_names_out(self, input_features=None): 56 | return np.arange(self.n_components_) 57 | 58 | @property 59 | def eigenvalues_(self): 60 | """Returns the eigenvalues associated with each principal component.""" 61 | eigenvalues = super().eigenvalues_ 62 | # Benzécri and Greenacre corrections 63 | if self.correction in {"benzecri", "greenacre"}: 64 | K = self.K_ 65 | return np.array( 66 | [(K / (K - 1) * (eig - 1 / K)) ** 2 if eig > 1 / K else 0 for eig in eigenvalues] 67 | ) 68 | return eigenvalues 69 | 70 | @property 71 | @utils.check_is_fitted 72 | def percentage_of_variance_(self): 73 | """Returns the percentage of explained inertia per principal component.""" 74 | # Benzécri correction 75 | if self.correction == "benzecri": 76 | eigenvalues = self.eigenvalues_ 77 | return 100 * eigenvalues / eigenvalues.sum() 78 | # Greenacre correction 79 | if self.correction == "greenacre": 80 | eigenvalues = super().eigenvalues_ 81 | benzecris = self.eigenvalues_ 82 | K, J = (self.K_, self.J_) 83 | average_inertia = (K / (K - 1)) * ((eigenvalues**2).sum() - (J - K) / K**2) 84 | return 100 * benzecris / average_inertia 85 | # No correction 86 | return super().percentage_of_variance_ 87 | 88 | @utils.check_is_dataframe_input 89 | def fit(self, X, y=None): 90 | """Fit the MCA for the dataframe X. 91 | 92 | The MCA is computed on the indicator matrix, obtained with `pd.get_dummies(X)`. If your columns are already 93 | in indicator (one-hot) format, instantiate the class with `one_hot=False`; note that the Benzécri and Greenacre 94 | corrections require `one_hot=True`, because the number of original variables is needed to correct the inertia of each dimension.
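A minimal usage sketch (added for illustration; the column names and values are made up):

>>> df = pd.DataFrame({"colour": ["red", "blue", "red", "green"], "size": ["S", "M", "M", "S"]})
>>> mca = MCA(n_components=2).fit(df)
>>> mca.transform(df).shape
(4, 2)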
95 | 96 | """ 97 | 98 | if self.check_input: 99 | sklearn.utils.check_array(X, dtype=[str, "numeric"]) 100 | 101 | # K is the number of actual variables, to apply the Benzécri correction 102 | self.K_ = X.shape[1] 103 | 104 | # One-hot encode the data 105 | one_hot = self._prepare(X) 106 | self.one_hot_columns_ = one_hot.columns 107 | 108 | # We need the number of columns to apply the Greenacre correction 109 | self.J_ = one_hot.shape[1] 110 | 111 | # Apply CA to the indicator matrix 112 | super().fit(one_hot) 113 | 114 | return self 115 | 116 | @utils.check_is_dataframe_input 117 | @utils.check_is_fitted 118 | def row_coordinates(self, X): 119 | return super().row_coordinates(self._prepare(X)) 120 | 121 | @utils.check_is_dataframe_input 122 | @utils.check_is_fitted 123 | def row_cosine_similarities(self, X): 124 | oh = self._prepare(X) 125 | return super()._row_cosine_similarities(X=oh, F=super().row_coordinates(oh)) 126 | 127 | @utils.check_is_dataframe_input 128 | @utils.check_is_fitted 129 | def column_coordinates(self, X): 130 | return super().column_coordinates(self._prepare(X)) 131 | 132 | @utils.check_is_dataframe_input 133 | @utils.check_is_fitted 134 | def column_cosine_similarities(self, X): 135 | oh = self._prepare(X) 136 | return super()._column_cosine_similarities(X=oh, G=super().column_coordinates(oh)) 137 | 138 | @utils.check_is_dataframe_input 139 | @utils.check_is_fitted 140 | def transform(self, X): 141 | """Computes the row principal coordinates of a dataset.""" 142 | if self.check_input: 143 | sklearn.utils.check_array(X, dtype=[str, "numeric"]) 144 | return self.row_coordinates(X) 145 | -------------------------------------------------------------------------------- /prince/mfa.py: -------------------------------------------------------------------------------- 1 | """Multiple Factor Analysis (MFA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import collections 6 | 7 | import altair as alt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from prince import pca, utils 12 | 13 | 14 | class MFA(pca.PCA, collections.UserDict): 15 | def __init__( 16 | self, 17 | n_components=2, 18 | n_iter=3, 19 | copy=True, 20 | check_input=True, 21 | random_state=None, 22 | engine="sklearn", 23 | ): 24 | super().__init__( 25 | rescale_with_mean=True, 26 | rescale_with_std=True, 27 | n_components=n_components, 28 | n_iter=n_iter, 29 | copy=copy, 30 | check_input=check_input, 31 | random_state=random_state, 32 | engine=engine, 33 | ) 34 | collections.UserDict.__init__(self) 35 | 36 | @utils.check_is_dataframe_input 37 | def fit(self, X, y=None, groups=None, supplementary_groups=None): 38 | # Checks groups are provided 39 | self.groups_ = self._determine_groups(X, groups) 40 | if supplementary_groups is not None: 41 | for group in supplementary_groups: 42 | if group not in self.groups_: 43 | raise ValueError(f"Supplementary group '{group}' is not in the groups") 44 | self.supplementary_groups_ = supplementary_groups 45 | 46 | # Check group types are consistent 47 | self.all_nums_ = {} 48 | for group, cols in sorted(self.groups_.items()): 49 | all_num = all(pd.api.types.is_numeric_dtype(X[c]) for c in cols) 50 | all_cat = all(pd.api.types.is_string_dtype(X[c]) for c in cols) 51 | if not (all_num or all_cat): 52 | raise ValueError(f'Not all columns in "{group}" group are of the same type') 53 | self.all_nums_[group] = all_num 54 | 55 | # Run a factor analysis in each group 56 | for group, cols in sorted(self.groups_.items()): 57 | if self.all_nums_[group]: 58 | fa = pca.PCA( 
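# NOTE (annotation added for clarity; not in the original source): each all-numeric
# group is analysed with its own standardized PCA. Further down, every column of a
# group is weighted by 1 / (first eigenvalue of that group's partial PCA), which is
# what balances the groups' influence in the global PCA.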
59 | rescale_with_mean=True, 60 | rescale_with_std=True, 61 | n_components=self.n_components, 62 | n_iter=self.n_iter, 63 | copy=True, 64 | random_state=self.random_state, 65 | engine=self.engine, 66 | ) 67 | else: 68 | raise NotImplementedError("Groups of non-numerical variables are not supported yet") 69 | self[group] = fa.fit(X.loc[:, cols]) 70 | 71 | # Fit the global PCA 72 | Z = self._build_Z(X) 73 | column_weights = np.array( 74 | [ 75 | 1 / self[group].eigenvalues_[0] 76 | for group, cols in self.groups_.items() 77 | for _ in cols 78 | if group not in getattr(self, "supplementary_groups_", []) 79 | ] 80 | ) 81 | super().fit( 82 | Z, 83 | column_weight=column_weights, 84 | supplementary_columns=[ 85 | column 86 | for group in getattr(self, "supplementary_groups_", []) 87 | for column in self.groups_[group] 88 | ], 89 | ) 90 | 91 | return self 92 | 93 | def _determine_groups(self, X: pd.DataFrame, groups: dict | list | None) -> dict: 94 | if groups is None: 95 | if isinstance(X.columns, pd.MultiIndex): 96 | groups = X.columns.get_level_values(0).unique().tolist() 97 | else: 98 | raise ValueError("Groups have to be specified") 99 | 100 | if isinstance(groups, list): 101 | if not isinstance(X.columns, pd.MultiIndex): 102 | raise ValueError( 103 | "X has to have MultiIndex columns if groups are provided as a list" 104 | ) 105 | groups = { 106 | group: [ 107 | (group, column) 108 | for column in X.columns.get_level_values(1)[ 109 | X.columns.get_level_values(0) == group 110 | ] 111 | ] 112 | for group in groups 113 | } 114 | return groups 115 | 116 | def _build_Z(self, X): 117 | return pd.concat( 118 | (X[cols] for _, cols in self.groups_.items()), 119 | axis="columns", 120 | ) 121 | 122 | @utils.check_is_dataframe_input 123 | @utils.check_is_fitted 124 | def row_coordinates(self, X): 125 | """Returns the row principal coordinates.""" 126 | Z = self._build_Z(X) 127 | return super().row_coordinates(Z) 128 | 129 | @utils.check_is_dataframe_input 130 | @utils.check_is_fitted 131 | def partial_row_coordinates(self, X): 132 | """Returns the partial row principal coordinates.""" 133 | Z = self._build_Z(X) 134 | coords = [] 135 | for _, names in self.groups_.items(): 136 | partial_coords = pd.DataFrame(0.0, index=Z.index, columns=Z.columns) 137 | partial_coords.loc[:, names] = (Z[names] - Z[names].mean()) / Z[names].std(ddof=0) 138 | partial_coords = partial_coords * self.column_weight_ 139 | partial_coords = (len(self.groups_) * partial_coords).dot(self.svd_.V.T) 140 | coords.append(partial_coords) 141 | coords = pd.concat(coords, axis=1, keys=self.groups_.keys()) 142 | coords.columns.name = "component" 143 | return coords 144 | 145 | @utils.check_is_dataframe_input 146 | @utils.check_is_fitted 147 | def column_coordinates(self, X): 148 | Z = self._build_Z(X) 149 | return super().column_coordinates(Z) 150 | 151 | @utils.check_is_dataframe_input 152 | @utils.check_is_fitted 153 | def inverse_transform(self, X): 154 | raise NotImplementedError("MFA inherits from PCA, but this method is not implemented yet") 155 | 156 | @utils.check_is_dataframe_input 157 | @utils.check_is_fitted 158 | def row_standard_coordinates(self, X): 159 | Z = self._build_Z(X) 160 | return super().row_standard_coordinates(Z) 161 | 162 | @utils.check_is_dataframe_input 163 | @utils.check_is_fitted 164 | def row_cosine_similarities(self, X): 165 | Z = self._build_Z(X) 166 | return super().row_cosine_similarities(Z) 167 | 168 | @utils.check_is_dataframe_input 169 | @utils.check_is_fitted 170 | def 
column_cosine_similarities_(self, X): 171 | Z = self._build_Z(X) 172 | return super().column_cosine_similarities_(Z) 173 | 174 | @utils.check_is_dataframe_input 175 | @utils.check_is_fitted 176 | def plot(self, X, x_component=0, y_component=1, show_partial_rows=False, **params): 177 | index_name = X.index.name or "index" 178 | 179 | params["tooltip"] = ( 180 | X.index.names if isinstance(X.index, pd.MultiIndex) else [index_name] 181 | ) + [ 182 | "group", 183 | f"component {x_component}", 184 | f"component {y_component}", 185 | ] 186 | 187 | eig = self._eigenvalues_summary.to_dict(orient="index") 188 | 189 | row_plot = None 190 | partial_row_plot = None 191 | edges_plot = None 192 | 193 | # Barycenters 194 | row_coords = self.row_coordinates(X) 195 | row_coords.columns = [f"component {i}" for i in row_coords.columns] 196 | row_coords = row_coords.reset_index() 197 | row_coords["group"] = "Global" 198 | if show_partial_rows: 199 | params["color"] = "group:N" 200 | row_plot = ( 201 | alt.Chart(row_coords) 202 | .mark_point(filled=True, size=50) 203 | .encode( 204 | alt.X( 205 | f"component {x_component}", 206 | scale=alt.Scale(zero=False), 207 | axis=alt.Axis( 208 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 209 | ), 210 | ), 211 | alt.Y( 212 | f"component {y_component}", 213 | scale=alt.Scale(zero=False), 214 | axis=alt.Axis( 215 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 216 | ), 217 | ), 218 | **params, 219 | ) 220 | ) 221 | 222 | # Partial row coordinates 223 | if show_partial_rows: 224 | partial_row_coords = self.partial_row_coordinates(X).stack(level=0, future_stack=True) 225 | partial_row_coords.columns = [f"component {i}" for i in partial_row_coords.columns] 226 | partial_row_coords = partial_row_coords.reset_index(names=[index_name, "group"]) 227 | 228 | partial_row_plot = ( 229 | alt.Chart(partial_row_coords) 230 | .mark_point(shape="circle") 231 | .encode( 232 | alt.X(f"component {x_component}", scale=alt.Scale(zero=False)), 233 | alt.Y(f"component {y_component}", scale=alt.Scale(zero=False)), 234 | **params, 235 | ) 236 | ) 237 | 238 | # Edges to connect the main markers to the partial markers 239 | if show_partial_rows: 240 | edges = pd.merge( 241 | left=row_coords[ 242 | [index_name, f"component {x_component}", f"component {y_component}"] 243 | ], 244 | right=partial_row_coords[ 245 | [index_name, f"component {x_component}", f"component {y_component}", "group"] 246 | ], 247 | on=index_name, 248 | suffixes=("_global", "_partial"), 249 | ) 250 | edges_plot = ( 251 | alt.Chart(edges) 252 | .mark_line(opacity=0.7) 253 | .encode( 254 | x=f"component {x_component}_global:Q", 255 | y=f"component {y_component}_global:Q", 256 | x2=f"component {x_component}_partial:Q", 257 | y2=f"component {y_component}_partial:Q", 258 | color="group:N", 259 | strokeDash=alt.value([2, 2]), 260 | ) 261 | ) 262 | 263 | charts = filter( 264 | None, 265 | (row_plot, partial_row_plot, edges_plot), 266 | ) 267 | 268 | return alt.layer(*charts).interactive() 269 | -------------------------------------------------------------------------------- /prince/pca.py: -------------------------------------------------------------------------------- 1 | """Principal Component Analysis (PCA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import functools 6 | 7 | import altair as alt 8 | import numpy as np 9 | import pandas as pd 10 | import sklearn.base 11 | import sklearn.utils 12 | from sklearn import preprocessing 13 | 14 | from prince 
import svd, utils 15 | 16 | 17 | def select_active_variables(method): 18 | @functools.wraps(method) 19 | def _impl(self, X=None, *method_args, **method_kwargs): 20 | if hasattr(self, "feature_names_in_") and isinstance(X, pd.DataFrame): 21 | return method(self, X[self.feature_names_in_], *method_args, **method_kwargs) 22 | return method(self, X, *method_args, **method_kwargs) 23 | 24 | return _impl 25 | 26 | 27 | class PCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, utils.EigenvaluesMixin): 28 | """Principal Component Analysis (PCA). 29 | 30 | Parameters 31 | ---------- 32 | rescale_with_mean 33 | Whether or not to subtract each column's mean before performing SVD. 34 | rescale_with_std 35 | Whether or not to standardize each column before performing SVD. 36 | n_components 37 | The number of principal components to compute. 38 | n_iter 39 | The number of iterations used for computing the SVD. 40 | copy 41 | Whether to copy the data; if False, the computations may be done inplace. 42 | check_input 43 | Whether to check the coherence of the inputs or not. 44 | 45 | """ 46 | 47 | def __init__( 48 | self, 49 | rescale_with_mean=True, 50 | rescale_with_std=True, 51 | n_components=2, 52 | n_iter=3, 53 | copy=True, 54 | check_input=True, 55 | random_state=None, 56 | engine="sklearn", 57 | ): 58 | self.n_components = n_components 59 | self.n_iter = n_iter 60 | self.rescale_with_mean = rescale_with_mean 61 | self.rescale_with_std = rescale_with_std 62 | self.copy = copy 63 | self.check_input = check_input 64 | self.random_state = random_state 65 | self.engine = engine 66 | 67 | def _check_input(self, X): 68 | if self.check_input: 69 | sklearn.utils.check_array(X) 70 | 71 | def get_feature_names_out(self, input_features=None): 72 | return np.arange(self.n_components_) 73 | 74 | @utils.check_is_dataframe_input 75 | def fit( 76 | self, 77 | X, 78 | y=None, 79 | sample_weight=None, 80 | column_weight=None, 81 | supplementary_columns=None, 82 | ): 83 | self._check_input(X) 84 | 85 | # Massage input 86 | supplementary_columns = supplementary_columns or [] 87 | active_variables = X.columns.difference(supplementary_columns, sort=False).tolist() 88 | sample_weight = np.ones(len(X)) if sample_weight is None else sample_weight 89 | sample_weight = sample_weight / sample_weight.sum() 90 | column_weight = np.ones(len(active_variables)) if column_weight is None else column_weight 91 | self.column_weight_ = column_weight 92 | 93 | # https://scikit-learn.org/stable/developers/develop.html#universal-attributes 94 | self.feature_names_in_ = active_variables 95 | self.n_features_in_ = len(active_variables) 96 | 97 | X_active = X[active_variables].to_numpy(dtype=np.float64, copy=self.copy) 98 | if supplementary_columns: 99 | X_sup = X[supplementary_columns].to_numpy(dtype=np.float64, copy=self.copy) 100 | 101 | # Scale data 102 | if self.rescale_with_mean or self.rescale_with_std: 103 | self.scaler_ = preprocessing.StandardScaler( 104 | copy=self.copy, 105 | with_mean=self.rescale_with_mean, 106 | with_std=self.rescale_with_std, 107 | ).fit(X_active, sample_weight=sample_weight) 108 | X_active = self.scaler_.transform(X_active) # TODO: maybe fit_transform is faster 109 | if supplementary_columns: 110 | X_sup = preprocessing.StandardScaler( 111 | copy=self.copy, 112 | with_mean=self.rescale_with_mean, 113 | with_std=self.rescale_with_std, 114 | ).fit_transform(X_sup) 115 | 116 | self._column_dist = pd.Series( 117 | (X_active**2 * sample_weight[:, np.newaxis]).sum(axis=0), 118 | index=active_variables, 119 | )
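# NOTE (annotation added for clarity; not in the original source): `_column_dist`
# holds each active column's weighted squared norm after scaling. It is reused by
# `column_correlations` and `column_cosine_similarities_` to turn the column
# coordinates into correlations and squared cosines.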
120 | if supplementary_columns: 121 | self._column_dist = pd.concat( 122 | ( 123 | self._column_dist, 124 | pd.Series( 125 | (X_sup**2 / len(X_sup)).sum(axis=0), 126 | index=supplementary_columns, 127 | ), 128 | ) 129 | ) 130 | 131 | self.svd_ = svd.compute_svd( 132 | X=X_active, 133 | n_components=self.n_components, 134 | n_iter=self.n_iter, 135 | random_state=self.random_state, 136 | engine=self.engine, 137 | row_weights=sample_weight, 138 | column_weights=column_weight, 139 | ) 140 | 141 | self.total_inertia_ = np.sum( 142 | np.square(X_active) * column_weight * sample_weight[:, np.newaxis] 143 | ) 144 | 145 | self.column_coordinates_ = pd.DataFrame( 146 | data=self.svd_.V.T * self.eigenvalues_**0.5, 147 | index=active_variables, 148 | ) 149 | if supplementary_columns: 150 | self.column_coordinates_ = pd.concat( 151 | [ 152 | self.column_coordinates_, 153 | pd.DataFrame( 154 | data=X_sup.T @ (self.svd_.U / len(self.svd_.U) ** 0.5), 155 | index=supplementary_columns, 156 | ), 157 | ] 158 | ) 159 | self.column_coordinates_.columns.name = "component" 160 | self.column_coordinates_.index.name = "variable" 161 | row_coords = pd.DataFrame( 162 | self.svd_.U * self.eigenvalues_**0.5, 163 | # HACK: there's a circular dependency between row_contributions_ 164 | # and active_row_coordinates in self.__init__ 165 | index=self.row_contributions_.index if hasattr(self, "row_contributions_") else None, 166 | ) 167 | row_coords.columns.name = "component" 168 | self.row_contributions_ = (row_coords**2 * sample_weight[:, np.newaxis]).div( 169 | self.eigenvalues_, axis=1 170 | ) 171 | self.row_contributions_.index = X.index 172 | 173 | return self 174 | 175 | @property 176 | @utils.check_is_fitted 177 | def eigenvalues_(self): 178 | """Returns the eigenvalues associated with each principal component.""" 179 | return np.square(self.svd_.s) 180 | 181 | def _scale(self, X): 182 | if not hasattr(self, "scaler_"): 183 | return X 184 | 185 | if sup_variables := X.columns.difference(self.feature_names_in_, sort=False).tolist(): 186 | X = np.concatenate( 187 | ( 188 | self.scaler_.transform(X[self.feature_names_in_].to_numpy()), 189 | preprocessing.StandardScaler( 190 | copy=self.copy, 191 | with_mean=self.rescale_with_mean, 192 | with_std=self.rescale_with_std, 193 | ).fit_transform(X[sup_variables]), 194 | ), 195 | axis=1, 196 | ) 197 | else: 198 | X = self.scaler_.transform(X.to_numpy()) 199 | 200 | return X 201 | 202 | @utils.check_is_dataframe_input 203 | @utils.check_is_fitted 204 | @select_active_variables 205 | def row_coordinates(self, X: pd.DataFrame): 206 | """Returns the row principal coordinates. 207 | 208 | The row principal coordinates are obtained by projecting `X` on the right eigenvectors. 209 | 210 | Synonyms 211 | -------- 212 | Row projections 213 | Factor scores 214 | Loadings 215 | 216 | """ 217 | 218 | index = X.index if isinstance(X, pd.DataFrame) else None 219 | X = self._scale(X) 220 | X = np.array(X, copy=self.copy) 221 | X *= self.column_weight_ 222 | 223 | coord = pd.DataFrame(data=X.dot(self.svd_.V.T), index=index) 224 | coord.columns.name = "component" 225 | return coord 226 | 227 | @utils.check_is_dataframe_input 228 | @utils.check_is_fitted 229 | def transform(self, X, as_array=False): 230 | """Computes the row principal coordinates of a dataset. 231 | 232 | Same as calling `row_coordinates`. This is just for compatibility with 233 | scikit-learn. 
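A minimal sketch (added for illustration; the data is made up):

>>> df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 1.0, 4.0]})
>>> pca = PCA(n_components=1).fit(df)
>>> pca.transform(df).shape
(3, 1)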
234 | 235 | """ 236 | self._check_input(X) 237 | rc = self.row_coordinates(X) 238 | return rc.to_numpy() if as_array else rc 239 | 240 | @utils.check_is_dataframe_input 241 | def fit_transform(self, X, y=None, as_array=False): 242 | """Fit and transform in one go. 243 | 244 | This method produces exactly the same result as calling `fit(X)` followed 245 | by `transform(X)`. It is provided as a convenience and for compatibility with 246 | scikit-learn pipelines; the row coordinates are obtained by projecting the input 247 | onto the right eigenvectors, just as in `transform`. 248 | 249 | """ 250 | self._check_input(X) 251 | self.fit(X) 252 | rc = self.row_coordinates(X) 253 | return rc.to_numpy() if as_array else rc 254 | 255 | @utils.check_is_dataframe_input 256 | @utils.check_is_fitted 257 | def inverse_transform(self, X, as_array=False): 258 | """Transforms row projections back to their original space. 259 | 260 | In other words, this returns a dataset whose transform would be X. 261 | 262 | """ 263 | 264 | X_inv = np.dot(X, self.svd_.V) 265 | 266 | if hasattr(self, "scaler_"): 267 | X_inv = self.scaler_.inverse_transform(X_inv) 268 | 269 | if as_array: 270 | return X_inv 271 | 272 | # Extract index 273 | index = X.index if isinstance(X, pd.DataFrame) else None 274 | return pd.DataFrame(data=X_inv, index=index) 275 | 276 | @utils.check_is_dataframe_input 277 | @utils.check_is_fitted 278 | def row_standard_coordinates(self, X: pd.DataFrame = None): 279 | """Returns the row standard coordinates. 280 | 281 | The row standard coordinates are obtained by dividing each row principal coordinate by its 282 | associated eigenvalue. 283 | 284 | """ 285 | return self.row_coordinates(X).div(self.eigenvalues_, axis="columns") 286 | 287 | @utils.check_is_dataframe_input 288 | @utils.check_is_fitted 289 | @select_active_variables 290 | def row_cosine_similarities(self, X): 291 | """Returns the cosine similarities between the rows and their principal components. 292 | 293 | The row cosine similarities are obtained by calculating the cosine of the angle formed by 294 | the row principal coordinates and the row principal components. This is calculated by 295 | squaring each row projection coordinate and dividing each squared coordinate by the sum of 296 | the squared coordinates, which results in a ratio between 0 and 1 representing 297 | the squared cosine. 298 | 299 | """ 300 | squared_coordinates = (np.square(self._scale(X)) * self.column_weight_).sum(axis=1) 301 | return (self.row_coordinates(X) ** 2).div(squared_coordinates, axis=0) 302 | 303 | @property 304 | @utils.check_is_fitted 305 | def column_correlations(self): 306 | """Calculate correlations between variables and components. 307 | 308 | The correlation between a variable and a component estimates the information they share. In 309 | the PCA framework, this correlation is called a loading. 310 | 311 | Note that the sum of the squared coefficients of correlation between a variable and all the 312 | components is equal to 1. As a consequence, the squared loadings are easier to interpret 313 | than the loadings (because the squared loadings give the proportion of the variance of the 314 | variables explained by the components).
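Concretely (a sketch of the relationship, added for clarity): the value reported for variable `j` and component `k` is `column_coordinates_.loc[j, k] / sqrt(d_j)`, where `d_j` is the weighted squared norm of the scaled variable `j` computed at fit time; for standardized active variables `d_j` equals 1, so coordinates and correlations coincide.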
315 | 316 | """ 317 | return self.column_coordinates_.div(self._column_dist**0.5, axis=0) 318 | 319 | @property 320 | @utils.check_is_fitted 321 | def column_cosine_similarities_(self): 322 | return self.column_correlations**2 323 | 324 | @property 325 | @utils.check_is_fitted 326 | def column_contributions_(self): 327 | return ( 328 | ((self.column_coordinates_.loc[self.feature_names_in_]) ** 2) 329 | * self.column_weight_[:, np.newaxis] 330 | ).div(self.eigenvalues_, axis=1) 331 | 332 | @utils.check_is_dataframe_input 333 | @utils.check_is_fitted 334 | def plot( 335 | self, 336 | X, 337 | x_component=0, 338 | y_component=1, 339 | color_rows_by=None, 340 | show_row_markers=True, 341 | show_column_markers=True, 342 | show_row_labels=False, 343 | show_column_labels=False, 344 | row_labels_column=None, 345 | ): 346 | row_params = { 347 | "tooltip": ( 348 | X.index.names 349 | if isinstance(X.index, pd.MultiIndex) 350 | else [X.index.name or "index"] # index is the default name 351 | ) 352 | + [ 353 | f"component {x_component}", 354 | f"component {y_component}", 355 | ] 356 | } 357 | if color_rows_by: 358 | row_params["color"] = color_rows_by 359 | 360 | eig = self._eigenvalues_summary.to_dict(orient="index") 361 | 362 | row_chart_markers = None 363 | row_chart_labels = None 364 | column_chart_markers = None 365 | column_chart_labels = None 366 | 367 | if show_row_markers or show_row_labels: 368 | row_coords = self.row_coordinates(X) 369 | row_coords.columns = [f"component {i}" for i in row_coords.columns] 370 | row_labels = ( 371 | pd.Series( 372 | row_coords.index.get_level_values( 373 | row_labels_column or row_coords.index.names[0] 374 | ), 375 | index=row_coords.index, 376 | ) 377 | if isinstance(row_coords.index, pd.MultiIndex) 378 | else pd.Series(row_coords.index, index=row_coords.index) 379 | ) 380 | 381 | row_chart = alt.Chart(row_coords.assign(label=row_labels).reset_index()).encode( 382 | alt.X( 383 | f"component {x_component}", 384 | scale=alt.Scale(zero=False), 385 | axis=alt.Axis( 386 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 387 | ), 388 | ), 389 | alt.Y( 390 | f"component {y_component}", 391 | scale=alt.Scale(zero=False), 392 | axis=alt.Axis( 393 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 394 | ), 395 | ), 396 | **row_params, 397 | ) 398 | row_chart_markers = row_chart.mark_circle(size=50 if show_row_markers else 0) 399 | if show_row_labels: 400 | row_chart_labels = row_chart.mark_text().encode(text="label:N") 401 | 402 | if show_column_markers or show_column_labels: 403 | column_coords = self.column_coordinates_.copy() 404 | column_coords.columns = [f"component {i}" for i in column_coords.columns] 405 | # Scale the column coordinates to the row coordinates 406 | column_coords = column_coords * row_coords.abs().max() 407 | column_labels = pd.Series(column_coords.index, index=column_coords.index) 408 | 409 | column_chart = alt.Chart( 410 | column_coords.assign(label=column_labels).reset_index() 411 | ).encode( 412 | alt.X(f"component {x_component}", scale=alt.Scale(zero=False)), 413 | alt.Y(f"component {y_component}", scale=alt.Scale(zero=False)), 414 | tooltip=["variable"], 415 | ) 416 | column_chart_markers = column_chart.mark_square( 417 | color="green", size=50 if show_column_markers else 0 418 | ) 419 | if show_column_labels: 420 | column_chart_labels = column_chart.mark_text().encode(text="label:N") 421 | 422 | charts = filter( 423 | None, 424 | ( 425 | row_chart_markers, 426 | 
row_chart_labels, 427 | column_chart_markers, 428 | column_chart_labels, 429 | ), 430 | ) 431 | 432 | return alt.layer(*charts).interactive() 433 | -------------------------------------------------------------------------------- /prince/plot.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | from scipy import linalg 7 | 8 | GRAY = OrderedDict([("light", "#bababa"), ("dark", "#404040")]) 9 | 10 | 11 | def stylize_axis(ax, grid=True): 12 | if grid: 13 | ax.grid() 14 | 15 | ax.xaxis.set_ticks_position("none") 16 | ax.yaxis.set_ticks_position("none") 17 | 18 | ax.axhline(y=0, linestyle="-", linewidth=1.2, color=GRAY["dark"], alpha=0.6) 19 | ax.axvline(x=0, linestyle="-", linewidth=1.2, color=GRAY["dark"], alpha=0.6) 20 | 21 | return ax 22 | 23 | 24 | def build_ellipse(X, Y): 25 | """Construct ellipse coordinates from two arrays of numbers. 26 | 27 | Args: 28 | X (1D array_like) 29 | Y (1D array_like) 30 | 31 | Returns: 32 | float: The mean of `X`. 33 | float: The mean of `Y`. 34 | float: The width of the ellipse. 35 | float: The height of the ellipse. 36 | float: The angle of orientation of the ellipse. 37 | 38 | """ 39 | x_mean = np.mean(X) 40 | y_mean = np.mean(Y) 41 | 42 | cov_matrix = np.cov(np.vstack((X, Y))) 43 | U, s, V = linalg.svd(cov_matrix, full_matrices=False) 44 | 45 | chi_95 = np.sqrt(4.61) # 90% quantile of the chi-square distribution 46 | width = np.sqrt(cov_matrix[0][0]) * chi_95 * 2 47 | height = np.sqrt(cov_matrix[1][1]) * chi_95 * 2 48 | 49 | eigenvector = V.T[0] 50 | angle = np.arctan(eigenvector[1] / eigenvector[0]) 51 | 52 | return x_mean, y_mean, width, height, angle 53 | -------------------------------------------------------------------------------- /prince/svd.py: -------------------------------------------------------------------------------- 1 | """Singular Value Decomposition (SVD)""" 2 | 3 | from __future__ import annotations 4 | 5 | import dataclasses 6 | 7 | try: 8 | import fbpca 9 | 10 | FBPCA_INSTALLED = True 11 | except ImportError: 12 | FBPCA_INSTALLED = False 13 | import numpy as np 14 | import scipy 15 | from sklearn.utils import extmath 16 | 17 | 18 | @dataclasses.dataclass 19 | class SVD: 20 | U: np.ndarray 21 | s: np.ndarray 22 | V: np.ndarray 23 | 24 | 25 | def compute_svd( 26 | X: np.ndarray, 27 | n_components: int, 28 | n_iter: int, 29 | engine: str, 30 | random_state: int | None = None, 31 | row_weights: np.ndarray | None = None, 32 | column_weights: np.ndarray | None = None, 33 | ) -> SVD: 34 | """Computes an SVD with k components.""" 35 | 36 | if row_weights is not None: 37 | X = X * np.sqrt(row_weights[:, np.newaxis]) # row-wise scaling 38 | if column_weights is not None: 39 | X = X * np.sqrt(column_weights) 40 | 41 | # Compute the SVD 42 | if engine == "fbpca": 43 | if FBPCA_INSTALLED: 44 | U, s, V = fbpca.pca(X, k=n_components, n_iter=n_iter) 45 | else: 46 | raise ValueError("fbpca is not installed; please install it if you want to use it") 47 | elif engine == "scipy": 48 | U, s, V = scipy.linalg.svd(X) 49 | U = U[:, :n_components] 50 | s = s[:n_components] 51 | V = V[:n_components, :] 52 | elif engine == "sklearn": 53 | U, s, V = extmath.randomized_svd( 54 | X, n_components=n_components, n_iter=n_iter, random_state=random_state 55 | ) 56 | else: 57 | raise ValueError("engine has to be one of ('fbpca', 'scipy', 'sklearn')") 58 | 59 | # U, V = extmath.svd_flip(U, V) 60 | 61 | if row_weights is not 
None: 62 | U = U / np.sqrt(row_weights)[:, np.newaxis] # row-wise scaling 63 | if column_weights is not None: 64 | V = V / np.sqrt(column_weights) 65 | 66 | return SVD(U, s, V) 67 | -------------------------------------------------------------------------------- /prince/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | 5 | import altair as alt 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.utils import validation 9 | 10 | 11 | def check_is_fitted(method): 12 | @functools.wraps(method) 13 | def _impl(self, *method_args, **method_kwargs): 14 | validation.check_is_fitted(self) 15 | return method(self, *method_args, **method_kwargs) 16 | 17 | return _impl 18 | 19 | 20 | def check_is_dataframe_input(func): 21 | @functools.wraps(func) 22 | def wrapper(*args, **kwargs): 23 | X = args[1] # Assuming the first argument is 'self' or an instance 24 | if not isinstance(X, pd.DataFrame): 25 | raise ValueError( 26 | f"The X argument must be a pandas DataFrame, but got {type(X).__name__}" 27 | ) 28 | return func(*args, **kwargs) 29 | 30 | return wrapper 31 | 32 | 33 | def make_labels_and_names(X): 34 | if isinstance(X, pd.DataFrame): 35 | row_label = X.index.name if X.index.name else "Rows" 36 | row_names = X.index.tolist() 37 | col_label = X.columns.name if X.columns.name else "Columns" 38 | col_names = X.columns.tolist() 39 | else: 40 | row_label = "Rows" 41 | row_names = list(range(X.shape[0])) 42 | col_label = "Columns" 43 | col_names = list(range(X.shape[1])) 44 | 45 | return row_label, row_names, col_label, col_names 46 | 47 | 48 | class EigenvaluesMixin: 49 | @property 50 | @check_is_fitted 51 | def percentage_of_variance_(self): 52 | """Returns the percentage of explained inertia per principal component.""" 53 | return 100 * self.eigenvalues_ / self.total_inertia_ 54 | 55 | @property 56 | @check_is_fitted 57 | def cumulative_percentage_of_variance_(self): 58 | """Returns the percentage of explained inertia per principal component.""" 59 | return np.cumsum(self.percentage_of_variance_) 60 | 61 | @property 62 | @check_is_fitted 63 | def _eigenvalues_summary(self): 64 | """Return a summary of the eigenvalues and their importance.""" 65 | return pd.DataFrame( 66 | { 67 | "eigenvalue": self.eigenvalues_, 68 | r"% of variance": self.percentage_of_variance_, 69 | r"% of variance (cumulative)": self.cumulative_percentage_of_variance_, 70 | }, 71 | index=pd.RangeIndex(0, len(self.eigenvalues_), name="component"), 72 | ) 73 | 74 | @property 75 | def eigenvalues_summary(self): 76 | """Return a summary of the eigenvalues and their importance.""" 77 | summary = self._eigenvalues_summary 78 | summary["% of variance"] /= 100 79 | summary["% of variance (cumulative)"] /= 100 80 | summary["eigenvalue"] = summary["eigenvalue"].map("{:,.3f}".format) 81 | summary["% of variance"] = summary["% of variance"].map("{:.2%}".format) 82 | summary["% of variance (cumulative)"] = summary["% of variance (cumulative)"].map( 83 | "{:.2%}".format 84 | ) 85 | summary.index.name = "component" 86 | return summary 87 | 88 | def scree_plot(self): 89 | """Scree plot. 
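For example (an illustrative sketch, not part of the original docstring): `prince.PCA(n_components=3).fit(df).scree_plot()` returns an `altair` bar chart with one bar per component, where `df` is the dataframe the estimator was fitted on.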
90 | 91 | References 92 | ---------- 93 | https://en.wikipedia.org/wiki/Scree_plot 94 | 95 | """ 96 | eig = self._eigenvalues_summary.reset_index() 97 | eig["component"] = eig["component"].astype(str) 98 | return ( 99 | alt.Chart( 100 | self._eigenvalues_summary.reset_index().assign( 101 | component=lambda x: x["component"].astype(str) 102 | ) 103 | ) 104 | .mark_bar(size=10) 105 | .encode(x="component", y="eigenvalue", tooltip=eig.columns.tolist()) 106 | ) 107 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "prince" 3 | version = "0.16.0" 4 | description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA" 5 | authors = ["Max Halford "] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10,<4.0" 10 | scikit-learn = "^1.5.1" 11 | pandas = "^2.2.0" 12 | altair = "^5.0.0" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | nbconvert = "^7.16.5" 16 | fbpca = "^1.0" 17 | pytest = "^8.3.4" 18 | ipykernel = "^6.13.0" 19 | rpy2 = "^3.5.2" 20 | ruff = "^0.8.5" 21 | xarray = "^2025.1.0" 22 | pre-commit = "^4.0.1" 23 | 24 | [tool.ruff] 25 | lint.select = ["E", "F", "I", "UP"] # https://beta.ruff.rs/docs/rules/ 26 | line-length = 100 27 | target-version = 'py310' 28 | lint.ignore = ["E501"] 29 | 30 | [tool.ruff.lint.isort] 31 | required-imports = ["from __future__ import annotations"] 32 | 33 | [build-system] 34 | requires = ["poetry-core>=1.0.0"] 35 | build-backend = "poetry.core.masonry.api" 36 | 37 | [tool.pytest.ini_options] 38 | addopts = [ 39 | "--verbose", 40 | "--doctest-modules", 41 | "--doctest-glob=*.md" 42 | ] 43 | doctest_optionflags = "NORMALIZE_WHITESPACE NUMBER ELLIPSIS" 44 | -------------------------------------------------------------------------------- /tests/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: prince-test 2 | Version: 0.0.0.1 3 | Title: Test dependencies 4 | Imports: 5 | FactoMineR 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import rpy2.rinterface_lib 6 | from rpy2.robjects import r as R 7 | 8 | 9 | def load_df_from_R(code): 10 | df = R(code) 11 | if isinstance(df.names, rpy2.rinterface_lib.sexp.NULLType): 12 | return pd.DataFrame(np.array(df)) 13 | return pd.DataFrame(np.array(df), index=df.names[0], columns=df.names[1]) 14 | -------------------------------------------------------------------------------- /tests/test_ca.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import tempfile 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | import rpy2.robjects as robjects 10 | import sklearn.utils.estimator_checks 11 | import sklearn.utils.validation 12 | from rpy2.robjects import r as R 13 | from scipy import sparse 14 | 15 | import prince 16 | from tests import load_df_from_R 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "sup_rows, sup_cols", 21 | [ 22 | pytest.param( 23 | sup_rows, 24 | sup_cols, 25 | id=":".join(["sup_rows" if sup_rows else "", "sup_cols" if sup_cols else ""]).strip( 26 | ":" 27 | ), 28 | ) 29 | for sup_rows in [False, True] 30 | for sup_cols in [False, 
True] 31 | ], 32 | ) 33 | class TestCA: 34 | _row_name = "row" 35 | _col_name = "col" 36 | 37 | @pytest.fixture(autouse=True) 38 | def _prepare(self, sup_rows, sup_cols): 39 | self.sup_rows = sup_rows 40 | self.sup_cols = sup_cols 41 | 42 | n_components = 5 43 | 44 | # Fit Prince 45 | self.dataset = prince.datasets.load_french_elections() 46 | active = self.dataset.copy() 47 | if sup_rows: 48 | active = active.drop("Île-de-France") 49 | if self.sup_cols: 50 | active = active.drop(columns=["Abstention", "Blank"]) 51 | self.ca = prince.CA(n_components=n_components) 52 | self.ca.fit(active) 53 | 54 | # Fit FactoMineR 55 | R("library('FactoMineR')") 56 | with tempfile.NamedTemporaryFile() as fp: 57 | self.dataset.to_csv(fp) 58 | R(f"dataset <- read.csv('{fp.name}', row.names=1)") 59 | 60 | args = f"dataset, ncp={n_components}, graph=F" 61 | if self.sup_cols: 62 | if sup_rows: 63 | R(f"ca <- CA({args}, col.sup=c(13, 14), row.sup=c(18))") 64 | else: 65 | R(f"ca <- CA({args}, col.sup=c(13, 14))") 66 | else: 67 | if sup_rows: 68 | R(f"ca <- CA({args}, row.sup=c(18))") 69 | else: 70 | R(f"ca <- CA({args})") 71 | 72 | def test_check_is_fitted(self): 73 | assert isinstance(self.ca, prince.CA) 74 | sklearn.utils.validation.check_is_fitted(self.ca) 75 | 76 | def test_svd_U(self): 77 | F = load_df_from_R("ca$svd$U").to_numpy() 78 | P = sparse.diags(self.ca.row_masses_.to_numpy() ** -0.5) @ self.ca.svd_.U 79 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 80 | 81 | def test_svd_V(self): 82 | F = load_df_from_R("ca$svd$V").to_numpy() 83 | P = sparse.diags(self.ca.col_masses_.to_numpy() ** -0.5) @ self.ca.svd_.V.T 84 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 85 | 86 | def test_total_inertia(self): 87 | F = robjects.r("sum(ca$eig[,1])")[0] 88 | P = self.ca.total_inertia_ 89 | assert math.isclose(F, P) 90 | 91 | def test_eigenvalues(self): 92 | F = load_df_from_R("ca$eig")[: self.ca.n_components] 93 | P = self.ca._eigenvalues_summary 94 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 95 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 96 | np.testing.assert_allclose( 97 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 98 | ) 99 | 100 | def test_row_coords(self, method_name="row_coordinates"): 101 | F = load_df_from_R(f"ca${self._row_name}$coord") 102 | if self.sup_rows: 103 | F = pd.concat((F, load_df_from_R(f"ca${self._row_name}.sup$coord"))) 104 | 105 | method = getattr(self.ca, method_name) 106 | P = method(self.dataset) 107 | 108 | np.testing.assert_allclose(F.abs(), P.abs()) 109 | 110 | def test_row_contrib(self): 111 | F = load_df_from_R(f"ca${self._row_name}$contrib") 112 | P = self.ca.row_contributions_ 113 | np.testing.assert_allclose(F, P * 100) 114 | 115 | def test_row_cosine_similarities(self): 116 | F = load_df_from_R(f"ca${self._row_name}$cos2") 117 | if self.sup_rows: 118 | F = pd.concat((F, load_df_from_R(f"ca${self._row_name}.sup$cos2"))) 119 | P = self.ca.row_cosine_similarities(self.dataset) 120 | np.testing.assert_allclose(F, P) 121 | 122 | def test_col_coords(self): 123 | F = load_df_from_R(f"ca${self._col_name}$coord") 124 | if self.sup_cols: 125 | F = pd.concat((F, load_df_from_R(f"ca${self._col_name}.sup$coord"))) 126 | P = self.ca.column_coordinates(self.dataset) 127 | np.testing.assert_allclose(F.abs(), P.abs()) 128 | 129 | def test_col_contrib(self): 130 | F = load_df_from_R(f"ca${self._col_name}$contrib") 131 | P = self.ca.column_contributions_ 132 | np.testing.assert_allclose(F, P * 100) 133 | 134 | 
def test_col_cos2(self): 135 | F = load_df_from_R(f"ca${self._col_name}$cos2") 136 | if self.sup_cols: 137 | F = pd.concat((F, load_df_from_R(f"ca${self._col_name}.sup$cos2"))) 138 | P = self.ca.column_cosine_similarities(self.dataset) 139 | np.testing.assert_allclose(F, P) 140 | -------------------------------------------------------------------------------- /tests/test_famd.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | import numpy as np 6 | import pytest 7 | import sklearn.utils.estimator_checks 8 | import sklearn.utils.validation 9 | from rpy2.robjects import r as R 10 | 11 | import prince 12 | from tests import load_df_from_R 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "sup_rows, sup_cols", 17 | [ 18 | pytest.param( 19 | sup_rows, 20 | sup_cols, 21 | id=":".join(["sup_rows" if sup_rows else "", "sup_cols" if sup_cols else ""]).strip( 22 | ":" 23 | ), 24 | ) 25 | for sup_rows in [False] 26 | for sup_cols in [False] 27 | ], 28 | ) 29 | class TestFAMD: 30 | _row_name = "row" 31 | _col_name = "col" 32 | 33 | @pytest.fixture(autouse=True) 34 | def _prepare(self, sup_rows, sup_cols): 35 | self.sup_rows = sup_rows 36 | self.sup_cols = sup_cols 37 | 38 | n_components = 5 39 | 40 | # Fit Prince 41 | self.dataset = prince.datasets.load_beers().head(200) 42 | active = self.dataset.copy() 43 | self.famd = prince.FAMD(n_components=n_components, engine="scipy") 44 | self.famd.fit(active) 45 | 46 | # Fit FactoMineR 47 | R("library('FactoMineR')") 48 | with tempfile.NamedTemporaryFile() as fp: 49 | self.dataset.to_csv(fp) 50 | R(f"dataset <- read.csv('{fp.name}', row.names=c(1))") 51 | R("famd <- FAMD(dataset, graph=F)") 52 | 53 | def test_check_is_fitted(self): 54 | assert isinstance(self.famd, prince.FAMD) 55 | sklearn.utils.validation.check_is_fitted(self.famd) 56 | 57 | def test_num_cols(self): 58 | assert sorted(self.famd.num_cols_) == [ 59 | "alcohol_by_volume", 60 | "final_gravity", 61 | "international_bitterness_units", 62 | "standard_reference_method", 63 | ] 64 | 65 | def test_cat_cols(self): 66 | assert sorted(self.famd.cat_cols_) == ["is_organic", "style"] 67 | 68 | def test_eigenvalues(self): 69 | F = load_df_from_R("famd$eig")[: self.famd.n_components] 70 | P = self.famd._eigenvalues_summary 71 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 72 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 73 | np.testing.assert_allclose( 74 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 75 | ) 76 | 77 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 78 | def test_row_coords(self, method_name): 79 | method = getattr(self.famd, method_name) 80 | F = load_df_from_R("famd$ind$coord") 81 | P = method(self.dataset) 82 | np.testing.assert_allclose(F.abs(), P.abs()) 83 | 84 | def test_row_contrib(self): 85 | F = load_df_from_R("famd$ind$contrib") 86 | P = self.famd.row_contributions_ 87 | np.testing.assert_allclose(F, P * 100) 88 | 89 | def test_col_coords(self): 90 | F = load_df_from_R("famd$var$coord") 91 | P = self.famd.column_coordinates_ 92 | np.testing.assert_allclose(F.abs(), P.abs()) 93 | 94 | def test_col_contrib(self): 95 | F = load_df_from_R("famd$var$contrib") 96 | P = self.famd.column_contributions_ 97 | np.testing.assert_allclose(F, P * 100) 98 | 99 | 100 | def test_issue_169(): 101 | """ 102 | 103 | https://github.com/MaxHalford/prince/issues/169 104 | 105 | >>> import pandas as pd 106 
| >>> from prince import FAMD 107 | >>> df = pd.DataFrame({'var1':['c', 'a', 'b','c'], 'var2':['x','y','y','z'],'var2': [0.,10.,30.4,0.]}) 108 | 109 | >>> famd = FAMD(n_components=2, random_state=42) 110 | >>> famd = famd.fit(df[:3]) 111 | 112 | >>> famd.transform(df[0:3]) 113 | component 0 1 114 | 0 -1.303760 -0.658334 115 | 1 -0.335621 0.981047 116 | 2 1.639381 -0.322713 117 | 118 | >>> famd.transform(df[0:2]) 119 | component 0 1 120 | 0 -1.000920 -0.669274 121 | 1 -0.092001 0.669274 122 | 123 | >>> famd.transform(df[3:]).round(6) 124 | component 0 1 125 | 3 -0.869173 -0.0 126 | 127 | """ 128 | -------------------------------------------------------------------------------- /tests/test_gpa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import prince 9 | 10 | 11 | class TestGPA(unittest.TestCase): 12 | def setUp(self): 13 | # Create a list of 2-D circles with different locations and rotations 14 | n_shapes = 4 15 | n_points = 12 16 | n_dims = 2 17 | 18 | shape_sizes = np.arange(1, n_shapes + 1) 19 | shape_angle_offsets = 10 * np.arange(n_shapes) 20 | shape_center_offsets = np.tile(np.arange(n_shapes), (n_dims, 1)) 21 | 22 | base_angles = np.linspace(0, 2 * np.pi, num=n_points, endpoint=False) 23 | # Size (n_shapes, n_points) 24 | angles = base_angles[np.newaxis, :] + shape_angle_offsets[:, np.newaxis] 25 | 26 | # Calculate along dimensions 27 | x = np.cos(angles) * shape_sizes[:, np.newaxis] + shape_center_offsets[0][:, np.newaxis] 28 | y = np.sin(angles) * shape_sizes[:, np.newaxis] + shape_center_offsets[1][:, np.newaxis] 29 | 30 | self.shapes = np.stack([x, y], axis=-1) 31 | 32 | def test_fit(self): 33 | gpa = prince.GPA() 34 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA) 35 | 36 | def test_fit_random(self): 37 | gpa = prince.GPA(init="random") 38 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA) 39 | 40 | def test_fit_mean(self): 41 | gpa = prince.GPA(init="mean") 42 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA) 43 | 44 | def test_fit_bad_init(self): 45 | gpa = prince.GPA(init="bad init type") 46 | 47 | with self.assertRaises(ValueError): 48 | gpa.fit(self.shapes) 49 | 50 | def test_fit_bad_input_size(self): 51 | gpa = prince.GPA() 52 | 53 | with self.assertRaises(ValueError): 54 | gpa.fit(self.shapes[0]) 55 | 56 | def test_transform(self): 57 | gpa = prince.GPA(copy=True) 58 | aligned_shapes = gpa.fit(self.shapes).transform(self.shapes) 59 | self.assertIsInstance(aligned_shapes, np.ndarray) 60 | self.assertEqual(self.shapes.shape, aligned_shapes.shape) 61 | 62 | def test_fit_transform_equal(self): 63 | """In our specific case of all-same-shape circles, the shapes should 64 | align perfectly.""" 65 | gpa = prince.GPA() 66 | aligned_shapes = gpa.fit_transform(self.shapes) 67 | self.assertIsInstance(aligned_shapes, np.ndarray) 68 | np.testing.assert_array_almost_equal(aligned_shapes[:-1], aligned_shapes[1:]) 69 | 70 | def test_fit_transform_single(self): 71 | """Aligning a single shape should return the same shape, just normalized.""" 72 | gpa = prince.GPA() 73 | shapes = self.shapes[0:1] 74 | aligned_shapes = gpa.fit_transform(shapes) 75 | np.testing.assert_array_almost_equal(shapes / np.linalg.norm(shapes), aligned_shapes) 76 | 77 | def test_copy(self): 78 | shapes_copy = np.copy(self.shapes) 79 | 80 | gpa = prince.GPA(copy=True) 81 | gpa.fit(shapes_copy) 82 | 
np.testing.assert_array_equal(self.shapes, shapes_copy) 83 | 84 | gpa = prince.GPA(copy=False) 85 | gpa.fit(shapes_copy) 86 | self.assertRaises(AssertionError, np.testing.assert_array_equal, self.shapes, shapes_copy) 87 | 88 | def test_xarray(self): 89 | points = pd.DataFrame( 90 | data=[ 91 | [0, 0, 0, 0], 92 | [0, 2, 0, 1], 93 | [1, 0, 0, 2], 94 | [3, 2, 1, 0], 95 | [1, 2, 1, 1], 96 | [3, 3, 1, 2], 97 | [0, 0, 2, 0], 98 | [0, 4, 2, 1], 99 | [2, 0, 2, 2], 100 | ], 101 | columns=["x", "y", "shape", "point"], 102 | ).astype({"x": float, "y": float}) 103 | 104 | ds = points.set_index(["shape", "point"]).to_xarray() 105 | da = ds.to_stacked_array("xy", ["shape", "point"]) 106 | shapes = da.values 107 | 108 | gpa = prince.GPA() 109 | aligned_shapes = gpa.fit_transform(shapes) 110 | da.values = aligned_shapes 111 | da.to_unstacked_dataset("xy").to_dataframe().reset_index() 112 | -------------------------------------------------------------------------------- /tests/test_mca.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | from rpy2.robjects import r as R 9 | 10 | import prince 11 | from tests import load_df_from_R 12 | from tests.test_ca import TestCA as _TestCA 13 | 14 | 15 | class TestMCA(_TestCA): 16 | _row_name = "ind" 17 | _col_name = "var" 18 | 19 | @pytest.fixture(autouse=True) 20 | def _prepare(self, sup_rows, sup_cols): 21 | self.sup_rows = sup_rows 22 | self.sup_cols = sup_cols 23 | 24 | n_components = 5 25 | n_active_rows = 1_000 26 | 27 | # Fit Prince 28 | self.dataset = prince.datasets.load_hearthstone_cards() 29 | active = self.dataset.copy() 30 | if self.sup_rows: 31 | active = active[:n_active_rows] 32 | if self.sup_cols: 33 | active = active.drop(columns=["type_or_school"]) 34 | self.ca = prince.MCA(n_components=n_components, engine="scipy") 35 | self.ca.fit(active) 36 | 37 | # Fit FactoMineR 38 | R("library('FactoMineR')") 39 | with tempfile.NamedTemporaryFile() as fp: 40 | self.dataset.to_csv(fp) 41 | R(f"dataset <- read.csv('{fp.name}')[,-1]") 42 | 43 | args = f"dataset, ncp={n_components}, graph=F" 44 | if self.sup_cols: 45 | if self.sup_rows: 46 | R( 47 | f"ca <- MCA({args}, quali.sup=c(4), ind.sup=c({n_active_rows + 1}:nrow(dataset)))" 48 | ) 49 | else: 50 | R(f"ca <- MCA({args}, quali.sup=c(4))") 51 | else: 52 | if self.sup_rows: 53 | R(f"ca <- MCA({args}, ind.sup=c({n_active_rows + 1}:nrow(dataset)))") 54 | else: 55 | R(f"ca <- MCA({args})") 56 | 57 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 58 | def test_row_coords(self, method_name): 59 | super().test_row_coords(method_name=method_name) 60 | 61 | def test_col_coords(self): 62 | if self.sup_cols: 63 | F = load_df_from_R("ca$var$coord") 64 | if self.sup_cols: 65 | F = pd.concat((F, load_df_from_R("ca$quali.sup$coord"))) 66 | P = self.ca.column_coordinates(self.dataset) 67 | # Prince adds a prefix to each column. 
We need to remove it in order to align the rows 68 | # of the two dataframes 69 | P.index = [idx.split("__", 1)[1] for idx in P.index] 70 | np.testing.assert_allclose(F.abs(), P.abs().loc[F.index]) 71 | else: 72 | super().test_col_coords() 73 | 74 | def test_col_cos2(self): 75 | if self.sup_cols: 76 | F = load_df_from_R("ca$var$cos2") 77 | if self.sup_cols: 78 | F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2"))) 79 | P = self.ca.column_cosine_similarities(self.dataset) 80 | # Prince adds a prefix to each column. We need to remove it in order to align the rows 81 | # of the two dataframes 82 | P.index = [idx.split("__", 1)[1] for idx in P.index] 83 | np.testing.assert_allclose(F, P.loc[F.index]) 84 | else: 85 | super().test_col_cos2() 86 | 87 | 88 | def test_with_and_without_one_hot(): 89 | """ 90 | 91 | >>> df = pd.DataFrame({ 92 | ... "foo": [1, 2, 3, 3, 5], 93 | ... "bar": ["a", "b", "c", "b", "e"], 94 | ... }) 95 | >>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy") 96 | >>> mca = mca.fit(df) 97 | >>> coords = mca.transform(df) 98 | >>> assert coords.shape == (5, 2) 99 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP 100 | 0 1 101 | 0 0.00 2.0 102 | 1 0.65 0.5 103 | 2 0.65 0.5 104 | 3 0.65 0.5 105 | 4 1.94 0.5 106 | 107 | >>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy") 108 | >>> one_hot = pd.get_dummies(df, columns=['foo', 'bar']) 109 | >>> mca = mca.fit(one_hot) 110 | >>> coords = mca.transform(one_hot) 111 | >>> assert coords.shape == (5, 2) 112 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP 113 | 0 1 114 | 0 0.00 1.0 115 | 1 0.65 0.5 116 | 2 0.65 0.5 117 | 3 0.65 0.5 118 | 4 1.94 0.5 119 | 120 | """ 121 | 122 | 123 | def test_issue_131(): 124 | """ 125 | 126 | https://github.com/MaxHalford/prince/issues/131#issuecomment-1591426031 127 | 128 | >>> df = pd.DataFrame({ 129 | ... "foo": [1, 2, 3, 3, 5], 130 | ... "bar": ["a", "b", "c", "b", "e"], 131 | ... }) 132 | >>> mca = prince.MCA(engine="scipy") 133 | >>> mca = mca.fit(df) 134 | >>> coords = mca.transform(df) 135 | >>> assert coords.shape == (5, 2) 136 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP 137 | 0 1 138 | 0 0.00 2.0 139 | 1 0.65 0.5 140 | 2 0.65 0.5 141 | 3 0.65 0.5 142 | 4 1.94 0.5 143 | 144 | >>> mca.K_, mca.J_ 145 | (2, 8) 146 | 147 | """ 148 | 149 | 150 | def test_issue_171(): 151 | """ 152 | 153 | https://github.com/MaxHalford/prince/issues/171 154 | 155 | >>> from sklearn import impute 156 | >>> from sklearn import pipeline 157 | 158 | >>> rng = np.random.RandomState(0) 159 | >>> test_data = pd.DataFrame(data=rng.random((10, 5))) 160 | >>> test = pipeline.Pipeline(steps=[ 161 | ... ('impute', impute.SimpleImputer()), # would break the pipeline since it returns an ndarray 162 | ... ('mca', prince.PCA()), 163 | ... ]) 164 | >>> _ = test[0].set_output(transform='pandas') 165 | >>> test.fit_transform(test_data) 166 | component 0 1 167 | 0 -0.392617 0.296831 168 | 1 0.119661 -1.660653 169 | 2 -1.541581 -0.826863 170 | 3 3.105498 -0.538801 171 | 4 -2.439259 -0.343292 172 | 5 1.129341 -0.533576 173 | 6 -1.077436 0.899673 174 | 7 0.020571 -0.941029 175 | 8 1.498005 1.566376 176 | 9 -0.422184 2.081334 177 | 178 | """ 179 | 180 | 181 | def test_type_doesnt_matter(): 182 | """ 183 | 184 | Checks that the type of the columns doesn't affect the result. 
185 | 186 | """ 187 | outputs = [] 188 | dataset = prince.datasets.load_hearthstone_cards().head(100) 189 | for col in dataset.columns: 190 | labels, levels = pd.factorize(dataset[col]) 191 | dataset[col] = labels 192 | for typ in ("int", "float", "str", "category"): 193 | dataset = dataset.astype(typ) 194 | mca = prince.MCA(n_components=2, engine="scipy") 195 | mca = mca.fit(dataset) 196 | outputs.append(mca.transform(dataset).abs()) 197 | 198 | for i in range(len(outputs) - 1): 199 | np.testing.assert_allclose(outputs[i], outputs[i + 1]) 200 | 201 | 202 | issue_161_data = """ 203 | ,category,userid,location,applicationname,browser\n 204 | 0,Portal Login,a@b.com,"San Jose, CA, United States",A,Chrome\n 205 | 1,Application Access,b@b.com,"San Jose, CA, United States",B,Other\n 206 | 2,Application Access,a@b.com,"San Jose, CA, United States",C,Other\n 207 | 3,Portal Login,c@b.com,"San Diego, CA, United States",A,Chrome\n 208 | """ 209 | 210 | 211 | def test_issue_161(): 212 | """ 213 | 214 | https://github.com/MaxHalford/prince/issues/161 215 | 216 | >>> import io 217 | >>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0) 218 | 219 | >>> mca = prince.MCA( 220 | ... n_components=10, 221 | ... n_iter=3, 222 | ... copy=True, 223 | ... check_input=True, 224 | ... engine='sklearn', 225 | ... random_state=42 226 | ... ) 227 | >>> mca = mca.fit(data[:3]) 228 | 229 | >>> mca.eigenvalues_summary 230 | eigenvalue % of variance % of variance (cumulative) 231 | component 232 | 0 0.673 67.32% 67.32% 233 | 1 0.327 32.68% 100.00% 234 | 235 | >>> mca.row_coordinates(data[:3]) 236 | 0 1 237 | 0 1.120811 -0.209242 238 | 1 -0.820491 -0.571660 239 | 2 -0.300320 0.780902 240 | 241 | >>> mca.transform(data[3:]) 242 | 0 1 243 | 3 1.664888 -0.640285 244 | 245 | """ 246 | 247 | 248 | def test_abdi_2007_correction(): 249 | """ 250 | 251 | >>> wines = prince.datasets.load_burgundy_wines() 252 | >>> wines = wines.drop(columns=["Oak type"], level=0) 253 | 254 | >>> mca = prince.MCA(n_components=4, correction=None) 255 | >>> mca = mca.fit(wines) 256 | >>> mca.eigenvalues_.round(4).tolist() 257 | [0.8532, 0.2, 0.1151, 0.0317] 258 | >>> mca.percentage_of_variance_.round(3).tolist() 259 | [71.101, 16.667, 9.593, 2.64] 260 | 261 | >>> mca = prince.MCA(n_components=4, correction="benzecri") 262 | >>> mca = mca.fit(wines) 263 | >>> mca.eigenvalues_.round(4).tolist() 264 | [0.7004, 0.0123, 0.0003, 0.0] 265 | >>> mca.percentage_of_variance_.round(3).tolist() 266 | [98.229, 1.731, 0.04, 0.0] 267 | 268 | >>> mca = prince.MCA(n_components=4, correction="greenacre") 269 | >>> mca = mca.fit(wines) 270 | >>> mca.eigenvalues_.round(4).tolist() 271 | [0.7004, 0.0123, 0.0003, 0.0] 272 | >>> mca.percentage_of_variance_.round(3).tolist() 273 | [95.189, 1.678, 0.038, 0.0] 274 | 275 | """ 276 | -------------------------------------------------------------------------------- /tests/test_mfa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import tempfile 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | import rpy2.robjects as robjects 10 | import sklearn.utils.estimator_checks 11 | import sklearn.utils.validation 12 | from rpy2.robjects import r as R 13 | 14 | import prince 15 | from tests import load_df_from_R 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "sup_rows, sup_groups", 20 | [ 21 | pytest.param(sup_rows, sup_groups, id=f"{sup_rows=}:{sup_groups=}") 22 | for sup_rows in [False, True] 23 | for 
sup_groups in [False, True] 24 | ], 25 | ) 26 | class TestMFA: 27 | _row_name = "row" 28 | _col_name = "col" 29 | 30 | @pytest.fixture(autouse=True) 31 | def _prepare(self, sup_rows, sup_groups): 32 | self.sup_rows = sup_rows 33 | self.sup_groups = sup_groups 34 | 35 | n_components = 3 36 | 37 | # Fit Prince 38 | self.dataset = prince.datasets.load_premier_league() 39 | active = self.dataset.copy() 40 | if self.sup_rows: 41 | active = active.drop(index=["Manchester City", "Manchester United"]) 42 | supplementary_groups = ["2023-24"] if self.sup_groups else [] 43 | self.groups = self.dataset.columns.levels[0].tolist() 44 | self.mfa = prince.MFA(n_components=n_components) 45 | self.mfa.fit(active, groups=self.groups, supplementary_groups=supplementary_groups) 46 | 47 | # Fit FactoMineR 48 | R("library('FactoMineR')") 49 | with tempfile.NamedTemporaryFile() as fp: 50 | dataset = self.dataset.copy() 51 | dataset.columns = [" ".join(parts) for parts in dataset.columns] 52 | dataset.to_csv(fp, index=False) 53 | R(f"dataset <- read.csv('{fp.name}')") 54 | 55 | args = "dataset, group=c(6, 6, 6), graph=F" 56 | if self.sup_rows: 57 | args += ", ind.sup=c(9:10)" 58 | if self.sup_groups: 59 | args += ", num.group.sup=c(3)" 60 | 61 | R(f"mfa <- MFA({args})") 62 | 63 | def test_check_is_fitted(self): 64 | assert isinstance(self.mfa, prince.MFA) 65 | sklearn.utils.validation.check_is_fitted(self.mfa) 66 | 67 | def test_total_inertia(self): 68 | F = robjects.r("sum(mfa$eig[,1])")[0] 69 | P = self.mfa.total_inertia_ 70 | assert math.isclose(F, P) 71 | 72 | def test_eigenvalues(self): 73 | F = load_df_from_R("mfa$eig")[: self.mfa.n_components] 74 | P = self.mfa._eigenvalues_summary 75 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 76 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 77 | np.testing.assert_allclose( 78 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 79 | ) 80 | 81 | def test_group_eigenvalues(self): 82 | for i, group in enumerate(self.groups, start=1): 83 | F = load_df_from_R(f"mfa$separate.analyses$Gr{i}$eig")[: self.mfa.n_components] 84 | P = self.mfa[group]._eigenvalues_summary 85 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 86 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 87 | np.testing.assert_allclose( 88 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 89 | ) 90 | 91 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 92 | def test_row_coords(self, method_name): 93 | method = getattr(self.mfa, method_name) 94 | F = load_df_from_R("mfa$ind$coord") 95 | P = method(self.dataset) 96 | if self.sup_rows: 97 | F = pd.concat((F, load_df_from_R("mfa$ind.sup$coord"))) 98 | # Move supplementary rows to the end 99 | P = pd.concat( 100 | [ 101 | P.loc[P.index.difference(["Manchester City", "Manchester United"])], 102 | P.loc[["Manchester City", "Manchester United"]], 103 | ] 104 | ) 105 | F = F.iloc[:, : self.mfa.n_components] 106 | np.testing.assert_allclose(F.abs(), P.abs()) 107 | 108 | def test_row_contrib(self): 109 | F = load_df_from_R("mfa$ind$contrib").iloc[:, : self.mfa.n_components] 110 | P = self.mfa.row_contributions_ 111 | np.testing.assert_allclose(F, P * 100) 112 | -------------------------------------------------------------------------------- /tests/test_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | import 
numpy as np 6 | import pandas as pd 7 | import pytest 8 | import rpy2.robjects as robjects 9 | import sklearn.utils.estimator_checks 10 | import sklearn.utils.validation 11 | from rpy2.robjects import numpy2ri 12 | from sklearn import decomposition, pipeline, preprocessing 13 | 14 | import prince 15 | from tests import load_df_from_R 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "sup_rows, sup_cols, scale, sample_weights, column_weights", 20 | [ 21 | pytest.param( 22 | sup_rows, 23 | sup_cols, 24 | scale, 25 | sample_weights, 26 | column_weights, 27 | id=f"{sup_rows=}:{sup_cols=}:{scale=}:{sample_weights=}:{column_weights=}", 28 | ) 29 | for sup_rows in [False, True] 30 | for sup_cols in [False, True] 31 | for scale in [False, True] 32 | for sample_weights in [False, True] 33 | for column_weights in [False, True] 34 | ], 35 | ) 36 | class TestPCA: 37 | @pytest.fixture(autouse=True) 38 | def _prepare(self, sup_rows, sup_cols, scale, sample_weights, column_weights): 39 | self.sup_rows = sup_rows 40 | self.sup_cols = sup_cols 41 | self.scale = scale 42 | 43 | n_components = 5 44 | 45 | # Fit Prince 46 | self.dataset = prince.datasets.load_decathlon() 47 | self.active = self.dataset.copy() 48 | if self.sup_rows: 49 | self.active = self.active.query('competition == "Decastar"') 50 | self.sample_weights = ( 51 | np.random.default_rng().dirichlet([1] * len(self.active)) if sample_weights else None 52 | ) 53 | supplementary_columns = ["rank", "points"] if self.sup_cols else [] 54 | self.column_weights = ( 55 | np.random.default_rng().random( 56 | len(self.active.columns.difference(supplementary_columns)) 57 | ) 58 | if column_weights 59 | else None 60 | ) 61 | self.pca = prince.PCA(n_components=n_components, rescale_with_std=self.scale) 62 | self.pca.fit( 63 | self.active, 64 | sample_weight=self.sample_weights, 65 | column_weight=self.column_weights, 66 | supplementary_columns=supplementary_columns, 67 | ) 68 | 69 | # scikit-learn 70 | if self.scale: 71 | self.sk_pca = pipeline.make_pipeline( 72 | preprocessing.StandardScaler(), 73 | decomposition.PCA(n_components=n_components), 74 | ) 75 | else: 76 | self.sk_pca = pipeline.make_pipeline( 77 | decomposition.PCA(n_components=n_components), 78 | ) 79 | # sklearn's PCA doesn't support sample weights 80 | self.sk_pca.fit(self.active[self.pca.feature_names_in_]) 81 | 82 | # Fit FactoMineR 83 | robjects.r( 84 | """ 85 | library('FactoMineR') 86 | 87 | data(decathlon) 88 | decathlon <- subset(decathlon, select = -c(Competition)) 89 | """ 90 | ) 91 | 92 | args = f"decathlon, ncp={n_components}, graph=F" 93 | if sample_weights: 94 | robjects.r.assign("row.w", numpy2ri.py2rpy(self.sample_weights)) 95 | robjects.r("row.w <- as.vector(row.w)") 96 | args += ", row.w=row.w" 97 | if column_weights: 98 | robjects.r.assign("col.w", numpy2ri.py2rpy(self.column_weights)) 99 | robjects.r("col.w <- as.vector(col.w)") 100 | args += ", col.w=col.w" 101 | if not self.scale: 102 | args += ", scale.unit=F" 103 | if self.sup_cols: 104 | if self.sup_rows: 105 | robjects.r(f"pca = PCA({args}, quanti.sup=c(11, 12), ind.sup=c(14:41))") 106 | else: 107 | robjects.r(f"pca = PCA({args}, quanti.sup=c(11, 12))") 108 | else: 109 | if self.sup_rows: 110 | robjects.r(f"pca = PCA({args}, ind.sup=c(14:41))") 111 | else: 112 | robjects.r(f"pca = PCA({args})") 113 | 114 | def test_check_is_fitted(self): 115 | assert isinstance(self.pca, prince.PCA) 116 | sklearn.utils.validation.check_is_fitted(self.pca) 117 | 118 | def test_total_inertia(self): 119 | F = robjects.r("sum(pca$eig[,1])")[0] 
120 | P = self.pca.total_inertia_ 121 | assert math.isclose(F, P) 122 | 123 | def test_eigenvalues(self): 124 | P = self.pca._eigenvalues_summary 125 | # Test against FactoMineR 126 | F = load_df_from_R("pca$eig")[: self.pca.n_components] 127 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 128 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 129 | np.testing.assert_allclose( 130 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 131 | ) 132 | # Test against scikit-learn 133 | if self.sample_weights is None and self.column_weights is None: 134 | n = len(self.active) 135 | S = self.sk_pca[-1].explained_variance_ * (n - 1) / n 136 | np.testing.assert_allclose(P["eigenvalue"], S) 137 | np.testing.assert_allclose( 138 | P["% of variance"], self.sk_pca[-1].explained_variance_ratio_ * 100 139 | ) 140 | 141 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 142 | def test_row_coords(self, method_name): 143 | method = getattr(self.pca, method_name) 144 | P = method(self.dataset) 145 | # Test against FactoMineR 146 | F = load_df_from_R("pca$ind$coord") 147 | if self.sup_rows: 148 | F = pd.concat((F, load_df_from_R("pca$ind.sup$coord"))) 149 | np.testing.assert_allclose(F.abs(), P.abs()) 150 | # Test against scikit-learn 151 | if self.sample_weights is None and self.column_weights is None: 152 | S = self.sk_pca.transform(self.dataset[self.pca.feature_names_in_]) 153 | np.testing.assert_allclose(np.abs(S), P.abs()) 154 | 155 | def test_row_cosine_similarities(self): 156 | F = load_df_from_R("pca$ind$cos2") 157 | if self.sup_rows: 158 | F = pd.concat((F, load_df_from_R("pca$ind.sup$cos2"))) 159 | P = self.pca.row_cosine_similarities(self.dataset) 160 | np.testing.assert_allclose(F, P) 161 | 162 | def test_row_contrib(self): 163 | F = load_df_from_R("pca$ind$contrib") 164 | P = self.pca.row_contributions_ 165 | np.testing.assert_allclose(F, P * 100) 166 | 167 | def test_col_coords(self): 168 | F = load_df_from_R("pca$var$coord") 169 | P = self.pca.column_coordinates_ 170 | if self.sup_cols: 171 | P = P.drop(["rank", "points"]) 172 | np.testing.assert_allclose(F.abs(), P.abs()) 173 | 174 | def test_col_cos2(self): 175 | F = load_df_from_R("pca$var$cos2") 176 | P = self.pca.column_cosine_similarities_ 177 | if self.sup_cols: 178 | P = P.drop(["rank", "points"]) 179 | np.testing.assert_allclose(F, P) 180 | 181 | def test_col_contrib(self): 182 | F = load_df_from_R("pca$var$contrib") 183 | P = self.pca.column_contributions_ 184 | np.testing.assert_allclose(F, P * 100) 185 | -------------------------------------------------------------------------------- /tests/test_svd.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | import rpy2.robjects as robjects 6 | from rpy2.robjects import numpy2ri 7 | 8 | from prince import svd 9 | from tests import load_df_from_R 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "n_components, are_rows_weighted, are_columns_weighted", 14 | [ 15 | pytest.param( 16 | n_components, 17 | are_rows_weighted, 18 | are_columns_weighted, 19 | id=f"{n_components=}:{are_rows_weighted=}:{are_columns_weighted=}", 20 | ) 21 | for n_components in [1, 3, 10] 22 | for are_rows_weighted in [False, True] 23 | for are_columns_weighted in [False, True] 24 | ], 25 | ) 26 | class TestSVD: 27 | @pytest.fixture(autouse=True) 28 | def _prepare(self, n_components, are_rows_weighted, 
are_columns_weighted): 29 | self.n_components = n_components 30 | self.are_rows_weighted = are_rows_weighted 31 | self.are_columns_weighted = are_columns_weighted 32 | 33 | self.dataset = np.random.rand(100, 10) 34 | self.row_weights = np.random.rand(100) 35 | self.row_weights /= self.row_weights.sum() 36 | self.column_weights = np.random.rand(10) 37 | 38 | # Fit Prince 39 | self.svd = svd.compute_svd( 40 | X=self.dataset, 41 | row_weights=self.row_weights if are_rows_weighted else None, 42 | column_weights=self.column_weights if are_columns_weighted else None, 43 | n_components=n_components, 44 | n_iter=3, 45 | random_state=42, 46 | engine="scipy", 47 | ) 48 | 49 | # Fit FactoMineR 50 | robjects.r("library('FactoMineR')") 51 | robjects.r.assign("X", numpy2ri.py2rpy(self.dataset)) 52 | robjects.r.assign("row.w", numpy2ri.py2rpy(self.row_weights)) 53 | robjects.r.assign("col.w", numpy2ri.py2rpy(self.column_weights)) 54 | robjects.r("row.w <- as.vector(row.w)") 55 | robjects.r("col.w <- as.vector(col.w)") 56 | args = f"X, ncp={n_components}" 57 | if are_rows_weighted: 58 | args += ", row.w=row.w" 59 | if are_columns_weighted: 60 | args += ", col.w=col.w" 61 | robjects.r(f"svd = svd.triplet({args})") 62 | 63 | def test_U(self): 64 | assert self.svd.U.shape == (100, self.n_components) 65 | if self.are_rows_weighted: 66 | P = self.svd.U 67 | F = load_df_from_R("svd$U") 68 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 69 | 70 | def test_s(self): 71 | assert self.svd.s.shape == (self.n_components,) 72 | if self.are_rows_weighted: 73 | P = self.svd.s 74 | F = robjects.r("svd$vs")[: self.n_components] 75 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 76 | 77 | def test_V(self): 78 | assert self.svd.V.shape == (self.n_components, 10) 79 | P = self.svd.V 80 | F = load_df_from_R("svd$V").T 81 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 82 | --------------------------------------------------------------------------------
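For reference, the call pattern exercised by TestSVD above boils down to the following minimal standalone sketch. The 100x10 random matrix, the keyword arguments, and the U/s/V shape assertions all mirror the fixture; either weight argument may be passed as None to disable that weighting.

import numpy as np

from prince import svd

X = np.random.rand(100, 10)
row_weights = np.random.rand(100)
row_weights /= row_weights.sum()  # the fixture normalises row weights to sum to 1
column_weights = np.random.rand(10)

result = svd.compute_svd(
    X=X,
    row_weights=row_weights,        # or None to leave rows unweighted
    column_weights=column_weights,  # or None to leave columns unweighted
    n_components=3,
    n_iter=3,
    random_state=42,
    engine="scipy",
)

assert result.U.shape == (100, 3)  # left singular vectors
assert result.s.shape == (3,)      # singular values
assert result.V.shape == (3, 10)   # right singular vectors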