├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── actions
│   │   └── install-env
│   │       └── action.yml
│   └── workflows
│       ├── code-quality.yml
│       ├── hugo.yml
│       └── unit-tests.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── .pylintrc
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
│   ├── .hugo_build.lock
│   ├── archetypes
│   │   └── default.md
│   ├── config.toml
│   ├── content
│   │   ├── _index.md
│   │   ├── ca.ipynb
│   │   ├── famd.ipynb
│   │   ├── faq.ipynb
│   │   ├── gpa.ipynb
│   │   ├── mca.ipynb
│   │   ├── mfa.ipynb
│   │   └── pca.ipynb
│   ├── layouts
│   │   └── _default
│   │       └── _markup
│   │           └── render-codeblock-mermaid.html
│   ├── static
│   │   ├── favicon.ico
│   │   └── images
│   │       ├── favicon.png
│   │       └── logo.png
│   └── themes
│       └── hugo-bearblog
│           ├── archetypes
│           │   ├── blog.md
│           │   └── default.md
│           ├── layouts
│           │   ├── 404.html
│           │   ├── _default
│           │   │   ├── baseof.html
│           │   │   ├── list.html
│           │   │   └── single.html
│           │   ├── index.html
│           │   ├── partials
│           │   │   ├── custom_body.html
│           │   │   ├── custom_head.html
│           │   │   ├── favicon.html
│           │   │   ├── footer.html
│           │   │   ├── header.html
│           │   │   ├── nav.html
│           │   │   ├── seo_tags.html
│           │   │   └── style.html
│           │   └── robots.txt
│           └── theme.toml
├── figures
│   ├── decastar.svg
│   └── decastar_bis.svg
├── poetry.lock
├── prince
│   ├── __init__.py
│   ├── ca.py
│   ├── datasets.py
│   ├── datasets
│   │   ├── 02-resultats-par-region.csv
│   │   ├── beers.csv.zip
│   │   ├── decathlon.csv
│   │   ├── hearthstone_cards.csv
│   │   ├── per-capita-energy-stacked.csv
│   │   ├── premier_league.csv
│   │   ├── punctuation_marks.csv
│   │   └── resultats-par-departement.csv
│   ├── famd.py
│   ├── gpa.py
│   ├── mca.py
│   ├── mfa.py
│   ├── pca.py
│   ├── plot.py
│   ├── svd.py
│   └── utils.py
├── pyproject.toml
└── tests
    ├── DESCRIPTION
    ├── __init__.py
    ├── test_ca.py
    ├── test_famd.py
    ├── test_gpa.py
    ├── test_mca.py
    ├── test_mfa.py
    ├── test_pca.py
    └── test_svd.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.R linguist-vendored
2 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: MaxHalford
2 |
--------------------------------------------------------------------------------
/.github/actions/install-env/action.yml:
--------------------------------------------------------------------------------
1 | name: Install env
2 | runs:
3 | using: "composite"
4 | steps:
5 | - name: Check out repository
6 | uses: actions/checkout@v3
7 |
8 | - name: Install R
9 | uses: r-lib/actions/setup-r@v2
10 |
11 | - name: Install R packages
12 | uses: r-lib/actions/setup-r-dependencies@v2
13 | with:
14 | cache-version: 1
15 | working-directory: tests
16 |
17 | - name: Set up Python
18 | id: set-up-python
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: "3.11"
22 |
23 | - name: Load cached Poetry installation
24 | uses: actions/cache@v3
25 | with:
26 | path: ~/.local
27 | key: poetry-0
28 |
29 | - name: Install poetry
30 | uses: snok/install-poetry@v1
31 | with:
32 | virtualenvs-create: true
33 | virtualenvs-in-project: true
34 | installer-parallel: true
35 |
36 | - name: Load cached virtual env
     id: cached-poetry-dependencies
37 | uses: actions/cache@v3
38 | with:
39 | path: .venv
40 | key: venv-${{ runner.os }}-${{ steps.set-up-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
41 |
42 | - name: Install dependencies
43 | shell: bash
44 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
45 | run: poetry install --no-interaction --no-root
46 |
47 | - name: Install project
48 | shell: bash
49 | run: poetry install --no-interaction
50 |
51 | - name: Activate environment
52 | shell: bash
53 | run: source $VENV
54 |
--------------------------------------------------------------------------------
/.github/workflows/code-quality.yml:
--------------------------------------------------------------------------------
1 | name: Code quality
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - "*"
7 | push:
8 | branches:
9 | - master
10 |
11 | jobs:
12 | run:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v3
16 | - uses: ./.github/actions/install-env
17 | - run: poetry run pre-commit run --all-files
18 |
--------------------------------------------------------------------------------
/.github/workflows/hugo.yml:
--------------------------------------------------------------------------------
1 | # Sample workflow for building and deploying a Hugo site to GitHub Pages
2 | name: Deploy Hugo site to Pages
3 |
4 | on:
5 | # Allows you to run this workflow manually from the Actions tab
6 | workflow_dispatch:
7 |
8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
9 | permissions:
10 | contents: read
11 | pages: write
12 | id-token: write
13 |
14 | # Allow one concurrent deployment
15 | concurrency:
16 | group: "pages"
17 | cancel-in-progress: true
18 |
19 | # Default to bash
20 | defaults:
21 | run:
22 | shell: bash
23 |
24 | jobs:
25 | # Build job
26 | build:
27 | runs-on: ubuntu-latest
28 | env:
29 | HUGO_VERSION: 0.144.2
30 | steps:
31 | - name: Install Hugo CLI
32 | run: |
33 | wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \
34 | && sudo dpkg -i ${{ runner.temp }}/hugo.deb
35 |
36 | - name: Install Dart Sass
37 | run: sudo snap install dart-sass
38 |
39 | - name: Checkout
40 | uses: actions/checkout@v4
41 | with:
42 | submodules: recursive
43 | fetch-depth: 0
44 |
45 | - name: Install environment
46 | uses: ./.github/actions/install-env
47 |
48 | - name: Execute notebooks
49 | run: poetry run jupyter nbconvert --execute --to notebook --inplace docs/content/*.ipynb
50 |
51 | - name: Convert notebooks
52 | run: poetry run jupyter nbconvert --to markdown docs/content/*.ipynb
53 |
54 | - name: Clean MarkDown
55 | run: (for f in docs/content/*.md; do sed -e '/"
--------------------------------------------------------------------------------
/docs/content/gpa.ipynb:
--------------------------------------------------------------------------------
208 | ],
209 | "text/plain": [
210 | "alt.Chart(...)"
211 | ]
212 | },
213 | "execution_count": 2,
214 | "metadata": {},
215 | "output_type": "execute_result"
216 | }
217 | ],
218 | "source": [
219 | "import altair as alt\n",
220 | "\n",
221 | "alt.Chart(points).mark_line(opacity=0.5).encode(\n",
222 | " x='x',\n",
223 | " y='y',\n",
224 | " detail='shape',\n",
225 | " color='shape:N'\n",
226 | ")"
227 | ]
228 | },
229 | {
230 | "attachments": {},
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "The dataframe of points has to be converted to a 3D numpy array of shape `(shapes, points, dims)`. There are many ways to do this. Here, we use xarray as a helper package."
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 3,
240 | "metadata": {
241 | "execution": {
242 | "iopub.execute_input": "2024-09-07T18:18:01.756840Z",
243 | "iopub.status.busy": "2024-09-07T18:18:01.756743Z",
244 | "iopub.status.idle": "2024-09-07T18:18:01.807548Z",
245 | "shell.execute_reply": "2024-09-07T18:18:01.807313Z"
246 | }
247 | },
248 | "outputs": [
249 | {
250 | "data": {
251 | "text/plain": [
252 | "(3, 3, 2)"
253 | ]
254 | },
255 | "execution_count": 3,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "ds = points.set_index(['shape', 'point']).to_xarray()\n",
262 | "da = ds.to_stacked_array('xy', ['shape', 'point'])\n",
263 | "shapes = da.values\n",
264 | "shapes.shape"
265 | ]
266 | },
267 | {
268 | "attachments": {},
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "This can also be done in NumPy:"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 4,
278 | "metadata": {
279 | "execution": {
280 | "iopub.execute_input": "2024-09-07T18:18:01.809002Z",
281 | "iopub.status.busy": "2024-09-07T18:18:01.808906Z",
282 | "iopub.status.idle": "2024-09-07T18:18:01.818337Z",
283 | "shell.execute_reply": "2024-09-07T18:18:01.818121Z"
284 | }
285 | },
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "(3, 3, 2)"
291 | ]
292 | },
293 | "execution_count": 4,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "import numpy as np\n",
300 | "\n",
301 | "gb = points.groupby('shape')\n",
302 | "np.stack([gb.get_group(g)[['x', 'y']] for g in gb.groups]).shape"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 5,
308 | "metadata": {
309 | "execution": {
310 | "iopub.execute_input": "2024-09-07T18:18:01.819667Z",
311 | "iopub.status.busy": "2024-09-07T18:18:01.819581Z",
312 | "iopub.status.idle": "2024-09-07T18:18:01.826748Z",
313 | "shell.execute_reply": "2024-09-07T18:18:01.826491Z"
314 | }
315 | },
316 | "outputs": [
317 | {
318 | "data": {
319 | "text/plain": [
320 | "array([[[0., 0.],\n",
321 | " [0., 2.],\n",
322 | " [1., 0.]],\n",
323 | "\n",
324 | " [[3., 2.],\n",
325 | " [1., 2.],\n",
326 | " [3., 3.]],\n",
327 | "\n",
328 | " [[0., 0.],\n",
329 | " [0., 4.],\n",
330 | " [2., 0.]]])"
331 | ]
332 | },
333 | "execution_count": 5,
334 | "metadata": {},
335 | "output_type": "execute_result"
336 | }
337 | ],
338 | "source": [
339 | "shapes"
340 | ]
341 | },
342 | {
343 | "attachments": {},
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "The shapes can now be aligned."
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 6,
353 | "metadata": {
354 | "execution": {
355 | "iopub.execute_input": "2024-09-07T18:18:01.828533Z",
356 | "iopub.status.busy": "2024-09-07T18:18:01.828396Z",
357 | "iopub.status.idle": "2024-09-07T18:18:02.157698Z",
358 | "shell.execute_reply": "2024-09-07T18:18:02.157289Z"
359 | }
360 | },
361 | "outputs": [],
362 | "source": [
363 | "import prince\n",
364 | "\n",
365 | "gpa = prince.GPA()\n",
366 | "aligned_shapes = gpa.fit_transform(shapes)"
367 | ]
368 | },
369 | {
370 | "attachments": {},
371 | "cell_type": "markdown",
372 | "metadata": {},
373 | "source": [
374 | "We then convert the 3D numpy array to a dataframe (using `xarray`) for plotting."
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 7,
380 | "metadata": {
381 | "execution": {
382 | "iopub.execute_input": "2024-09-07T18:18:02.159473Z",
383 | "iopub.status.busy": "2024-09-07T18:18:02.159364Z",
384 | "iopub.status.idle": "2024-09-07T18:18:02.187045Z",
385 | "shell.execute_reply": "2024-09-07T18:18:02.186796Z"
386 | }
387 | },
388 | "outputs": [
389 | {
390 | "data": {
391 | "text/html": [
392 | "\n",
393 | "
394 | ""
447 | ],
448 | "text/plain": [
449 | "alt.Chart(...)"
450 | ]
451 | },
452 | "execution_count": 7,
453 | "metadata": {},
454 | "output_type": "execute_result"
455 | }
456 | ],
457 | "source": [
458 | "da.values = aligned_shapes\n",
459 | "aligned_points = da.to_unstacked_dataset('xy').to_dataframe().reset_index()\n",
460 | "\n",
461 | "alt.Chart(aligned_points).mark_line(opacity=0.5).encode(\n",
462 | " x='x',\n",
463 | " y='y',\n",
464 | " detail='shape',\n",
465 | " color='shape:N'\n",
466 | ")"
467 | ]
468 | },
469 | {
470 | "attachments": {},
471 | "cell_type": "markdown",
472 | "metadata": {},
473 | "source": [
474 | "The triangles were all the same shape, so they are now perfectly aligned."
475 | ]
476 | }
477 | ],
478 | "metadata": {
479 | "kernelspec": {
480 | "display_name": ".venv",
481 | "language": "python",
482 | "name": "python3"
483 | },
484 | "language_info": {
485 | "codemirror_mode": {
486 | "name": "ipython",
487 | "version": 3
488 | },
489 | "file_extension": ".py",
490 | "mimetype": "text/x-python",
491 | "name": "python",
492 | "nbconvert_exporter": "python",
493 | "pygments_lexer": "ipython3",
494 | "version": "3.11.4"
495 | },
496 | "vscode": {
497 | "interpreter": {
498 | "hash": "441c2ec70d9faeb70e7723f55150c6260f4a26a9c828b90915d3399002e14f43"
499 | }
500 | }
501 | },
502 | "nbformat": 4,
503 | "nbformat_minor": 2
504 | }
505 |
--------------------------------------------------------------------------------
/docs/layouts/_default/_markup/render-codeblock-mermaid.html:
--------------------------------------------------------------------------------
1 |
2 | {{- .Inner | safeHTML }}
3 |
4 | {{ .Page.Store.Set "hasMermaid" true }}
5 |
--------------------------------------------------------------------------------
/docs/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/favicon.ico
--------------------------------------------------------------------------------
/docs/static/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/images/favicon.png
--------------------------------------------------------------------------------
/docs/static/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/images/logo.png
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/archetypes/blog.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "{{ replace .Name "-" " " | title }}"
3 | date = "{{ .Date }}"
4 |
5 | #
6 | # description is optional
7 | #
8 | # description = "An optional description for SEO. If not provided, an automatically created summary will be used."
9 |
10 | tags = [{{ range $plural, $terms := .Site.Taxonomies }}{{ range $term, $val := $terms }}"{{ printf "%s" $term }}",{{ end }}{{ end }}]
11 | +++
12 |
13 | This is a page about »{{ replace .Name "-" " " | title }}«.
14 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/archetypes/default.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "{{ replace .Name "-" " " | title }}"
3 | date = "{{ .Date }}"
4 |
5 | #
6 | # Set menu to "main" to add this page to
7 | # the main menu on top of the page
8 | #
9 | menu = "main"
10 |
11 | #
12 | # description is optional
13 | #
14 | # description = "An optional description for SEO. If not provided, an automatically created summary will be used."
15 |
16 | #
17 | # tags are optional
18 | #
19 | # tags = [{{ range $plural, $terms := .Site.Taxonomies }}{{ range $term, $val := $terms }}"{{ printf "%s" $term }}",{{ end }}{{ end }}]
20 | +++
21 |
22 | This is a page about »{{ replace .Name "-" " " | title }}«.
23 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/404.html:
--------------------------------------------------------------------------------
1 | {{ define "title" }}404{{ end }}
2 |
3 | {{ define "main" }}
4 | 404
5 | ʕノ•ᴥ•ʔノ ︵ ┻━┻
6 | {{ end }}
7 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/_default/baseof.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | {{- partial "favicon.html" . -}}
8 |
9 | {{- block "title" . }}{{ with .Title }}{{ . }} | {{ end }}{{ .Site.Title
10 | }}{{- end }}
11 |
12 |
13 | {{- partial "seo_tags.html" . -}}
14 |
15 |
16 | {{ with .OutputFormats.Get "rss" -}} {{ printf `
17 |
18 | ` .Rel .MediaType.Type .Permalink $.Site.Title | safeHTML }} {{ end -}} {{-
19 | partial "style.html" . -}}
20 |
21 |
24 | {{- partial "custom_head.html" . -}} {{- if not (eq hugo.Environment
25 | "development") -}}
26 |
31 | {{- end -}}
32 |
33 |
34 |
35 | {{- partial "header.html" . -}}
36 | {{- block "main" . }}{{- end }}
37 |
38 |
39 |
42 | {{- partial "custom_body.html" . -}}
43 |
44 |
45 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/_default/list.html:
--------------------------------------------------------------------------------
1 | {{ define "main" }}
2 |
3 | {{ if .Data.Singular }}
4 | Filtering for "{{ .Title }}"
5 |
6 | Remove filter
7 |
8 | {{ end }}
9 |
10 | {{ range .Pages }}
11 | -
12 |
13 |
14 |
17 |
18 |
19 | {{ .Title }}
20 |
21 | {{ else }}
22 | -
23 | No posts yet
24 |
25 | {{ end }}
26 |
27 | {{ if .Data.Singular }}
28 | {{else}}
29 |
30 |
35 |
36 | {{ end }}
37 |
38 | {{ end }}
39 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/_default/single.html:
--------------------------------------------------------------------------------
1 | {{ define "main" }}
2 | {{ if eq .Type "blog" }}{{ if not .Params.menu }}
3 |
4 |
5 |
8 |
9 |
10 | {{ end }}{{ end }}
11 |
12 | {{ .Title }}
13 | {{ if and (gt .WordCount 400 ) (.Params.toc) }}
14 | Table of contents
15 | {{.TableOfContents}}
16 | {{ end }}
17 | {{ .Content }}
18 |
19 |
20 | {{ range (.GetTerms "tags") }}
21 | #{{ .LinkTitle }}
22 | {{ end }}
23 |
24 | {{ end }}
25 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/index.html:
--------------------------------------------------------------------------------
1 | {{ define "main" }}
2 | {{ .Content }}
3 |
4 | {{ if .Page.Store.Get "hasMermaid" }}
5 |
6 |
9 | {{ end }}
10 |
11 | {{ end }}
12 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/custom_body.html:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/custom_head.html:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/favicon.html:
--------------------------------------------------------------------------------
1 | {{ with .Site.Params.favicon }}
2 | {{ end }}
3 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/footer.html:
--------------------------------------------------------------------------------
1 | {{ if ne .Site.Params.hideMadeWithLine true }}Made with Hugo ʕ•ᴥ•ʔ Bear{{ end }}
2 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/header.html:
--------------------------------------------------------------------------------
1 |
2 | {{ .Site.Title }} foo
3 |
4 |

5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/nav.html:
--------------------------------------------------------------------------------
1 | {{ range .Site.Menus.main }}
2 | {{ index .Page.Aliases 0 | upper }}
3 | {{ end }}
4 | GitHub
5 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/seo_tags.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | {{ template "_internal/opengraph.html" . }}
8 |
9 |
10 | {{ template "_internal/twitter_cards.html" . }}
11 |
12 |
13 | {{ template "_internal/schema.html" . }}
14 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/partials/style.html:
--------------------------------------------------------------------------------
1 |
143 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/layouts/robots.txt:
--------------------------------------------------------------------------------
1 | User-Agent: *
2 | Sitemap: {{ "sitemap.xml" | absURL }}
3 |
--------------------------------------------------------------------------------
/docs/themes/hugo-bearblog/theme.toml:
--------------------------------------------------------------------------------
1 | # theme.toml template for a Hugo theme
2 | # See https://github.com/gohugoio/hugoThemes#themetoml for an example
3 |
4 | name = "Hugo Bear Blog"
5 | license = "MIT"
6 | licenselink = "https://github.com/janraasch/hugo-bearblog/blob/master/LICENSE"
7 | description = "A Hugo theme based on »Bear Blog«. Free, no-nonsense, super-fast blogging. »Bear Blog« now includes a dark color scheme to support dark mode!"
8 | homepage = "https://github.com/janraasch/hugo-bearblog"
9 | demosite = "https://janraasch.github.io/hugo-bearblog/"
10 | tags = ["blog", "responsive", "minimal", "seo", "clean", "simple", "light", "minimalist", "mobile", "fast", "white", "minimalistic", "reading", "dark mode"]
11 | features = ["favicon", "seo", "no stylesheets", "no javascript", "rss", "dark mode"]
12 | min_version = "0.73.0"
13 | # https://gohugo.io/content-management/taxonomies#default-taxonomies
14 | # https://gohugo.io/templates/taxonomy-templates/#example-list-tags-in-a-single-page-template
15 | # https://gohugo.io/templates/taxonomy-templates/#example-list-all-site-tags
16 |
17 | [author]
18 | name = "Jan Raasch"
19 | homepage = "https://www.janraasch.com"
20 |
21 | # If porting an existing theme
22 | [original]
23 | name = "ʕ•ᴥ•ʔ Bear Blog"
24 | homepage = "https://bearblog.dev"
25 | repo = "https://github.com/HermanMartinus/bearblog"
26 |
--------------------------------------------------------------------------------
/figures/decastar.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/prince/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import importlib.metadata
4 |
5 | from . import datasets
6 | from .ca import CA
7 | from .famd import FAMD
8 | from .gpa import GPA
9 | from .mca import MCA
10 | from .mfa import MFA
11 | from .pca import PCA
12 |
13 | __version__ = importlib.metadata.version("prince")
14 | __all__ = ["CA", "FAMD", "MCA", "MFA", "PCA", "GPA", "datasets"]
15 |
--------------------------------------------------------------------------------
/prince/ca.py:
--------------------------------------------------------------------------------
1 | """Correspondence Analysis (CA)"""
2 |
3 | from __future__ import annotations
4 |
5 | import functools
6 |
7 | import altair as alt
8 | import numpy as np
9 | import pandas as pd
10 | from scipy import sparse
11 | from sklearn.utils import check_array
12 |
13 | from prince import svd, utils
14 |
15 |
16 | def select_active_columns(method):
17 | @functools.wraps(method)
18 | def _impl(self, X=None, *method_args, **method_kwargs):
19 | if hasattr(self, "active_cols_") and isinstance(X, pd.DataFrame):
20 | return method(self, X[self.active_cols_], *method_args, **method_kwargs)
21 | return method(self, X, *method_args, **method_kwargs)
22 |
23 | return _impl
24 |
25 |
26 | def select_active_rows(method):
27 | @functools.wraps(method)
28 | def _impl(self, X=None, *method_args, **method_kwargs):
29 | if hasattr(self, "active_rows_") and isinstance(X, pd.DataFrame):
30 | return method(self, X.loc[self.active_rows_], *method_args, **method_kwargs)
31 | return method(self, X, *method_args, **method_kwargs)
32 |
33 | return _impl
34 |
35 |
36 | class CA(utils.EigenvaluesMixin):
37 | def __init__(
38 | self,
39 | n_components=2,
40 | n_iter=10,
41 | copy=True,
42 | check_input=True,
43 | random_state=None,
44 | engine="sklearn",
45 | ):
46 | self.n_components = n_components
47 | self.n_iter = n_iter
48 | self.copy = copy
49 | self.check_input = check_input
50 | self.random_state = random_state
51 | self.engine = engine
52 |
53 | @utils.check_is_dataframe_input
54 | def fit(self, X, y=None):
55 | # Check input
56 | if self.check_input:
57 | check_array(X)
58 |
59 | # Check all values are positive
60 | if (X < 0).any().any():
61 | raise ValueError("All values in X should be positive")
62 |
63 | _, row_names, _, col_names = utils.make_labels_and_names(X)
64 |
65 | if isinstance(X, pd.DataFrame):
66 | X = X.to_numpy()
67 |
68 | if self.copy:
69 | X = np.copy(X)
70 |
71 | # Compute the correspondence matrix which contains the relative frequencies
72 | X = X.astype(float) / np.sum(X)
73 |
74 | # Compute row and column masses
75 | self.row_masses_ = pd.Series(X.sum(axis=1), index=row_names)
76 | self.col_masses_ = pd.Series(X.sum(axis=0), index=col_names)
77 |
78 | self.active_rows_ = self.row_masses_.index.unique()
79 | self.active_cols_ = self.col_masses_.index.unique()
80 |
81 | # Compute standardised residuals
82 | r = self.row_masses_.to_numpy()
83 | c = self.col_masses_.to_numpy()
84 | S = sparse.diags(r**-0.5) @ (X - np.outer(r, c)) @ sparse.diags(c**-0.5)
85 |
86 | # Compute SVD on the standardised residuals
87 | self.svd_ = svd.compute_svd(
88 | X=S,
89 | n_components=min(self.n_components, min(X.shape) - 1),
90 | n_iter=self.n_iter,
91 | random_state=self.random_state,
92 | engine=self.engine,
93 | )
94 |
95 | # Compute total inertia
96 | self.total_inertia_ = np.einsum("ij,ji->", S, S.T)
97 |
98 | self.row_contributions_ = pd.DataFrame(
99 | sparse.diags(self.row_masses_.values)
100 | @ np.divide(
101 | # Same as row_coordinates(X)
102 | (
103 | sparse.diags(self.row_masses_.values**-0.5)
104 | @ self.svd_.U
105 | @ sparse.diags(self.svd_.s)
106 | )
107 | ** 2,
108 | self.eigenvalues_,
109 | out=np.zeros((len(self.row_masses_), len(self.eigenvalues_))),
110 | where=self.eigenvalues_ > 0,
111 | ),
112 | index=self.row_masses_.index,
113 | )
114 |
115 | self.column_contributions_ = pd.DataFrame(
116 | sparse.diags(self.col_masses_.values)
117 | @ np.divide(
118 | # Same as col_coordinates(X)
119 | (
120 | sparse.diags(self.col_masses_.values**-0.5)
121 | @ self.svd_.V.T
122 | @ sparse.diags(self.svd_.s)
123 | )
124 | ** 2,
125 | self.eigenvalues_,
126 | out=np.zeros((len(self.col_masses_), len(self.eigenvalues_))),
127 | where=self.eigenvalues_ > 0,
128 | ),
129 | index=self.col_masses_.index,
130 | )
131 |
132 | return self
133 |
134 | @property
135 | @utils.check_is_fitted
136 | def eigenvalues_(self):
137 | """Returns the eigenvalues associated with each principal component."""
138 | return np.square(self.svd_.s)
139 |
140 | @utils.check_is_dataframe_input
141 | @select_active_columns
142 | def row_coordinates(self, X):
143 | """The row principal coordinates."""
144 |
145 | _, row_names, _, _ = utils.make_labels_and_names(X)
146 | index_name = X.index.name
147 |
148 | if isinstance(X, pd.DataFrame):
149 | try:
150 | X = X.sparse.to_coo().astype(float)
151 | except AttributeError:
152 | X = X.to_numpy()
153 |
154 | if self.copy:
155 | X = X.copy()
156 |
157 | # Normalise the rows so that they sum up to 1
158 | if isinstance(X, np.ndarray):
159 | X = X / X.sum(axis=1)[:, None]
160 | else:
161 | X = X / X.sum(axis=1)
162 |
163 | return pd.DataFrame(
164 | data=X @ sparse.diags(self.col_masses_.to_numpy() ** -0.5) @ self.svd_.V.T,
165 | index=pd.Index(row_names, name=index_name),
166 | )
167 |
168 | @utils.check_is_dataframe_input
169 | @select_active_columns
170 | def row_cosine_similarities(self, X):
171 | """Return the cos2 for each row against the dimensions.
172 |
173 | The cos2 value gives an indicator of the accuracy of the row projection on the dimension.
174 |
175 | Values above 0.5 usually mean that the row is well projected onto that dimension. It is often
176 | used to identify which factor/dimension is important for a given element, as the cos2 can be interpreted as
177 | the proportion of the element's variance attributed to a particular factor.
178 |
179 | """
180 | F = self.row_coordinates(X)
181 | return self._row_cosine_similarities(X, F)
182 |
183 | @select_active_columns
184 | def _row_cosine_similarities(self, X, F):
185 | # Active
186 | X_act = X.loc[self.active_rows_]
187 | X_act = X_act / X_act.sum().sum()
188 | marge_col = X_act.sum(axis=0)
189 | Tc = X_act.div(X_act.sum(axis=1), axis=0).div(marge_col, axis=1) - 1
190 | dist2_row = (Tc**2).mul(marge_col, axis=1).sum(axis=1)
191 |
192 | # Supplementary
193 | X_sup = X.loc[X.index.difference(self.active_rows_, sort=False)]
194 | X_sup = X_sup.div(X_sup.sum(axis=1), axis=0)
195 | dist2_row_sup = ((X_sup - marge_col) ** 2).div(marge_col, axis=1).sum(axis=1)
196 |
197 | dist2_row = pd.concat((dist2_row, dist2_row_sup))
198 |
199 | # Can't use pandas.div method because it doesn't support duplicate indices
200 | return F**2 / dist2_row.to_numpy()[:, None]
201 |
202 | @utils.check_is_dataframe_input
203 | @select_active_rows
204 | def column_coordinates(self, X):
205 | """The column principal coordinates."""
206 |
207 | _, _, _, col_names = utils.make_labels_and_names(X)
208 | index_name = X.columns.name
209 |
210 | if isinstance(X, pd.DataFrame):
211 | is_sparse = X.dtypes.apply(lambda dtype: isinstance(dtype, pd.SparseDtype)).all()
212 | if is_sparse:
213 | X = X.sparse.to_coo()
214 | else:
215 | X = X.to_numpy()
216 |
217 | if self.copy:
218 | X = X.copy()
219 |
220 | # Transpose and make sure the rows sum up to 1
221 | if isinstance(X, np.ndarray):
222 | X = X.T / X.T.sum(axis=1)[:, None]
223 | else:
224 | X = X.T / X.T.sum(axis=1)
225 |
226 | return pd.DataFrame(
227 | data=X @ sparse.diags(self.row_masses_.to_numpy() ** -0.5) @ self.svd_.U,
228 | index=pd.Index(col_names, name=index_name),
229 | )
230 |
231 | @utils.check_is_dataframe_input
232 | @select_active_rows
233 | def column_cosine_similarities(self, X):
234 | """Return the cos2 for each column against the dimensions.
235 |
236 | The cos2 value gives an indicator of the accuracy of the column projection on the dimension.
237 |
238 | Values above 0.5 usually mean that the column is well projected onto that dimension. It is often
239 | used to identify which factor/dimension is important for a given element, as the cos2 can be interpreted as
240 | the proportion of the element's variance attributed to a particular factor.
241 | """
242 | G = self.column_coordinates(X)
243 | return self._column_cosine_similarities(X, G)
244 |
245 | @select_active_rows
246 | def _column_cosine_similarities(self, X, G):
247 | # Active
248 | X_act = X[self.active_cols_]
249 | X_act = X_act / X_act.sum().sum()
250 | marge_row = X_act.sum(axis=1)
251 | Tc = X_act.div(marge_row, axis=0).div(X_act.sum(axis=0), axis=1) - 1
252 | dist2_col = (Tc**2).mul(marge_row, axis=0).sum(axis=0)
253 |
254 | # Supplementary
255 | X_sup = X[X.columns.difference(self.active_cols_, sort=False)]
256 | X_sup = X_sup.div(X_sup.sum(axis=0), axis=1)
257 | dist2_col_sup = ((X_sup.sub(marge_row, axis=0)) ** 2).div(marge_row, axis=0).sum(axis=0)
258 |
259 | dist2_col = pd.concat((dist2_col, dist2_col_sup))
260 | return (G**2).div(dist2_col, axis=0)
261 |
262 | @utils.check_is_dataframe_input
263 | @utils.check_is_fitted
264 | def plot(
265 | self,
266 | X,
267 | x_component=0,
268 | y_component=1,
269 | show_row_markers=True,
270 | show_column_markers=True,
271 | show_row_labels=False,
272 | show_column_labels=False,
273 | ):
274 | eig = self._eigenvalues_summary.to_dict(orient="index")
275 |
276 | row_chart_markers = None
277 | row_chart_labels = None
278 | column_chart_markers = None
279 | column_chart_labels = None
280 |
281 | if show_row_markers or show_row_labels:
282 | row_coords = self.row_coordinates(X)
283 | row_coords.columns = [f"component {i}" for i in row_coords.columns]
284 | row_coords = row_coords.assign(
285 | variable=row_coords.index.name or "row",
286 | value=row_coords.index.astype(str),
287 | )
288 | row_labels = pd.Series(row_coords.index, index=row_coords.index)
289 | row_chart = alt.Chart(row_coords.assign(label=row_labels)).encode(
290 | x=alt.X(
291 | f"component {x_component}",
292 | scale=alt.Scale(zero=False),
293 | axis=alt.Axis(
294 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}"
295 | ),
296 | ),
297 | y=alt.Y(
298 | f"component {y_component}",
299 | scale=alt.Scale(zero=False),
300 | axis=alt.Axis(
301 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}"
302 | ),
303 | ),
304 | )
305 | row_chart_markers = row_chart.mark_circle(size=50 if show_row_markers else 0).encode(
306 | color="variable",
307 | tooltip=[
308 | "variable",
309 | "value",
310 | f"component {x_component}",
311 | f"component {y_component}",
312 | ],
313 | )
314 | if show_row_labels:
315 | row_chart_labels = row_chart.mark_text().encode(text="label:N")
316 |
317 | if show_column_markers or show_column_labels:
318 | column_coords = self.column_coordinates(X)
319 | column_coords.columns = [f"component {i}" for i in column_coords.columns]
320 | column_coords = column_coords.assign(
321 | variable=column_coords.index.name or "column",
322 | value=column_coords.index.astype(str),
323 | )
324 | column_labels = pd.Series(column_coords.index, index=column_coords.index)
325 | column_chart = alt.Chart(column_coords.assign(label=column_labels)).encode(
326 | x=alt.X(
327 | f"component {x_component}",
328 | scale=alt.Scale(zero=False),
329 | axis=alt.Axis(
330 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}"
331 | ),
332 | ),
333 | y=alt.Y(
334 | f"component {y_component}",
335 | scale=alt.Scale(zero=False),
336 | axis=alt.Axis(
337 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}"
338 | ),
339 | ),
340 | )
341 | column_chart_markers = column_chart.mark_circle(
342 | size=50 if show_column_markers else 0
343 | ).encode(
344 | color="variable",
345 | tooltip=[
346 | "variable",
347 | "value",
348 | f"component {x_component}",
349 | f"component {y_component}",
350 | ],
351 | )
352 | if show_column_labels:
353 | column_chart_labels = column_chart.mark_text().encode(text="label:N")
354 |
355 | charts = filter(
356 | None,
357 | (
358 | row_chart_markers,
359 | row_chart_labels,
360 | column_chart_markers,
361 | column_chart_labels,
362 | ),
363 | )
364 |
365 | return alt.layer(*charts).interactive()
366 |
--------------------------------------------------------------------------------
/prince/datasets.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pathlib
4 |
5 | import pandas as pd
6 |
7 | DATASETS_DIR = pathlib.Path(__file__).parent / "datasets"
8 |
9 |
10 | def load_energy_mix(year=2019, normalize=True):
11 | """Per capita energy mix by country, for a given year.
12 |
13 | Each row corresponds to a country. There is one column for each energy source.
14 | A value corresponds to the average energy consumption of a source per capita.
15 | For instance, in France in 2019, the average citizen consumed 15,186 kWh of nuclear energy.
16 |
17 | This data comes from https://ourworldindata.org/energy-mix
18 |
19 | Parameters
20 | ----------
21 | year
22 | The year for which to load the data.
23 | normalize
24 | Whether or not to normalize the kWh by country.
25 |
26 | """
27 |
28 | df = (
29 | pd.read_csv(DATASETS_DIR / "per-capita-energy-stacked.csv")
30 | .query("Year == @year")
31 | .query("Entity not in ['Africa', 'Europe', 'North America', 'World']")
32 | .drop(columns=["Code", "Year"])
33 | .rename(columns={"Entity": "Country"})
34 | .rename(columns=lambda x: x.replace(" per capita (kWh)", "").lower())
35 | .set_index(["continent", "country"])
36 | )
37 | if normalize:
38 | return df.div(df.sum(axis="columns"), axis="rows")
39 | return df
40 |
41 |
42 | def load_decathlon():
43 | """The Decathlon dataset from FactoMineR."""
44 | decathlon = pd.read_csv(DATASETS_DIR / "decathlon.csv")
45 | decathlon.columns = ["athlete", *map(str.lower, decathlon.columns[1:])]
46 | decathlon.athlete = decathlon.athlete.apply(str.title)
47 | decathlon = decathlon.set_index(["competition", "athlete"])
48 | return decathlon
49 |
50 |
51 | def load_french_elections():
52 | """Voting data for the 2022 French elections, by region.
53 |
54 | The [original dataset](https://www.data.gouv.fr/fr/datasets/resultats-du-premier-tour-de-lelection-presidentielle-2022-par-commune-et-par-departement/#resources)
55 | has been transformed into a contingency matrix. The latter tallies the number of votes for the
56 | 12 candidates across all 18 regions. The numbers of blank votes and abstentions are also recorded.
57 | More information about these regions, including a map, can be found
58 | [on Wikipedia](https://www.wikiwand.com/fr/Région_française).
59 |
60 | """
61 | dataset = pd.read_csv(DATASETS_DIR / "02-resultats-par-region.csv")
62 | cont = dataset.pivot(index="reg_name", columns="cand_nom", values="cand_nb_voix")
63 | cont["Abstention"] = dataset.groupby("reg_name")["abstention_nb"].min()
64 | cont["Blank"] = dataset.groupby("reg_name")["blancs_nb"].min()
65 | cont.columns = [c.title() for c in cont.columns]
66 | cont.index.name = "region"
67 | cont.columns.name = "candidate"
68 | return cont
69 |
70 |
71 | def load_punctuation_marks():
72 | """Punctuation marks of six French writers."""
73 | return pd.read_csv(DATASETS_DIR / "punctuation_marks.csv", index_col="author")
74 |
75 |
76 | def load_hearthstone_cards():
77 | """Hearthstone standard cards.
78 |
79 | Source: https://gist.github.com/MaxHalford/32ed2c80672d7391ec5b4e6f291f14c1
80 |
81 | """
82 | return pd.read_csv(DATASETS_DIR / "hearthstone_cards.csv", index_col="id")
83 |
84 |
85 | def load_burgundy_wines():
86 | """Burgundy wines dataset.
87 |
88 | Source: https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf
89 |
90 | """
91 | wines = pd.DataFrame(
92 | data=[
93 | ["Yes", "No", "No", "Yes", "No", "No", "No", "No", "No", "No"],
94 | ["No", "Maybe", "Yes", "No", "Yes", "Maybe", "Yes", "No", "Yes", "Yes"],
95 | ["No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes"],
96 | ["No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"],
97 | ["Yes", "No", "No", "Yes", "No", "No", "No", "Yes", "No", "No"],
98 | ["Yes", "Maybe", "No", "Yes", "No", "Maybe", "No", "Yes", "No", "No"],
99 | ],
100 | columns=pd.MultiIndex.from_tuples(
101 | [
102 | ("Expert 1", "Fruity"),
103 | ("Expert 1", "Woody"),
104 | ("Expert 1", "Coffee"),
105 | ("Expert 2", "Red fruit"),
106 | ("Expert 2", "Roasted"),
107 | ("Expert 2", "Vanillin"),
108 | ("Expert 2", "Woody"),
109 | ("Expert 3", "Fruity"),
110 | ("Expert 3", "Butter"),
111 | ("Expert 3", "Woody"),
112 | ],
113 | names=("expert", "aspect"),
114 | ),
115 | index=[f"Wine {i + 1}" for i in range(6)],
116 | )
117 | wines.insert(0, "Oak type", [1, 2, 2, 2, 1, 1])
118 | return wines
119 |
120 |
121 | def load_beers():
122 | """Beers dataset.
123 |
124 | The data is taken from https://github.com/philipperemy/beer-dataset.
125 |
126 | """
127 | return pd.read_csv(DATASETS_DIR / "beers.csv.zip", index_col="name")
128 |
129 |
130 | def load_premier_league():
131 | """Premier League dataset.
132 |
133 | The data is taken from Wikipedia, using pd.read_html.
134 |
135 | """
136 | return pd.read_csv(DATASETS_DIR / "premier_league.csv", index_col=0, header=[0, 1])
137 |
--------------------------------------------------------------------------------
/prince/datasets/beers.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/prince/datasets/beers.csv.zip
--------------------------------------------------------------------------------
/prince/datasets/decathlon.csv:
--------------------------------------------------------------------------------
1 | "","100m","Long.jump","Shot.put","High.jump","400m","110m.hurdle","Discus","Pole.vault","Javeline","1500m","Rank","Points","Competition"
2 | "SEBRLE",11.04,7.58,14.83,2.07,49.81,14.69,43.75,5.02,63.19,291.7,1,8217,"Decastar"
3 | "CLAY",10.76,7.4,14.26,1.86,49.37,14.05,50.72,4.92,60.15,301.5,2,8122,"Decastar"
4 | "KARPOV",11.02,7.3,14.77,2.04,48.37,14.09,48.95,4.92,50.31,300.2,3,8099,"Decastar"
5 | "BERNARD",11.02,7.23,14.25,1.92,48.93,14.99,40.87,5.32,62.77,280.1,4,8067,"Decastar"
6 | "YURKOV",11.34,7.09,15.19,2.1,50.42,15.31,46.26,4.72,63.44,276.4,5,8036,"Decastar"
7 | "WARNERS",11.11,7.6,14.31,1.98,48.68,14.23,41.1,4.92,51.77,278.1,6,8030,"Decastar"
8 | "ZSIVOCZKY",11.13,7.3,13.48,2.01,48.62,14.17,45.67,4.42,55.37,268,7,8004,"Decastar"
9 | "McMULLEN",10.83,7.31,13.76,2.13,49.91,14.38,44.41,4.42,56.37,285.1,8,7995,"Decastar"
10 | "MARTINEAU",11.64,6.81,14.57,1.95,50.14,14.93,47.6,4.92,52.33,262.1,9,7802,"Decastar"
11 | "HERNU",11.37,7.56,14.41,1.86,51.1,15.06,44.99,4.82,57.19,285.1,10,7733,"Decastar"
12 | "BARRAS",11.33,6.97,14.09,1.95,49.48,14.48,42.1,4.72,55.4,282,11,7708,"Decastar"
13 | "NOOL",11.33,7.27,12.68,1.98,49.2,15.29,37.92,4.62,57.44,266.6,12,7651,"Decastar"
14 | "BOURGUIGNON",11.36,6.8,13.46,1.86,51.16,15.67,40.49,5.02,54.68,291.7,13,7313,"Decastar"
15 | "Sebrle",10.85,7.84,16.36,2.12,48.36,14.05,48.72,5,70.52,280.01,1,8893,"OlympicG"
16 | "Clay",10.44,7.96,15.23,2.06,49.19,14.13,50.11,4.9,69.71,282,2,8820,"OlympicG"
17 | "Karpov",10.5,7.81,15.93,2.09,46.81,13.97,51.65,4.6,55.54,278.11,3,8725,"OlympicG"
18 | "Macey",10.89,7.47,15.73,2.15,48.97,14.56,48.34,4.4,58.46,265.42,4,8414,"OlympicG"
19 | "Warners",10.62,7.74,14.48,1.97,47.97,14.01,43.73,4.9,55.39,278.05,5,8343,"OlympicG"
20 | "Zsivoczky",10.91,7.14,15.31,2.12,49.4,14.95,45.62,4.7,63.45,269.54,6,8287,"OlympicG"
21 | "Hernu",10.97,7.19,14.65,2.03,48.73,14.25,44.72,4.8,57.76,264.35,7,8237,"OlympicG"
22 | "Nool",10.8,7.53,14.26,1.88,48.81,14.8,42.05,5.4,61.33,276.33,8,8235,"OlympicG"
23 | "Bernard",10.69,7.48,14.8,2.12,49.13,14.17,44.75,4.4,55.27,276.31,9,8225,"OlympicG"
24 | "Schwarzl",10.98,7.49,14.01,1.94,49.76,14.25,42.43,5.1,56.32,273.56,10,8102,"OlympicG"
25 | "Pogorelov",10.95,7.31,15.1,2.06,50.79,14.21,44.6,5,53.45,287.63,11,8084,"OlympicG"
26 | "Schoenbeck",10.9,7.3,14.77,1.88,50.3,14.34,44.41,5,60.89,278.82,12,8077,"OlympicG"
27 | "Barras",11.14,6.99,14.91,1.94,49.41,14.37,44.83,4.6,64.55,267.09,13,8067,"OlympicG"
28 | "Smith",10.85,6.81,15.24,1.91,49.27,14.01,49.02,4.2,61.52,272.74,14,8023,"OlympicG"
29 | "Averyanov",10.55,7.34,14.44,1.94,49.72,14.39,39.88,4.8,54.51,271.02,15,8021,"OlympicG"
30 | "Ojaniemi",10.68,7.5,14.97,1.94,49.12,15.01,40.35,4.6,59.26,275.71,16,8006,"OlympicG"
31 | "Smirnov",10.89,7.07,13.88,1.94,49.11,14.77,42.47,4.7,60.88,263.31,17,7993,"OlympicG"
32 | "Qi",11.06,7.34,13.55,1.97,49.65,14.78,45.13,4.5,60.79,272.63,18,7934,"OlympicG"
33 | "Drews",10.87,7.38,13.07,1.88,48.51,14.01,40.11,5,51.53,274.21,19,7926,"OlympicG"
34 | "Parkhomenko",11.14,6.61,15.69,2.03,51.04,14.88,41.9,4.8,65.82,277.94,20,7918,"OlympicG"
35 | "Terek",10.92,6.94,15.15,1.94,49.56,15.12,45.62,5.3,50.62,290.36,21,7893,"OlympicG"
36 | "Gomez",11.08,7.26,14.57,1.85,48.61,14.41,40.95,4.4,60.71,269.7,22,7865,"OlympicG"
37 | "Turi",11.08,6.91,13.62,2.03,51.67,14.26,39.83,4.8,59.34,290.01,23,7708,"OlympicG"
38 | "Lorenzo",11.1,7.03,13.22,1.85,49.34,15.38,40.22,4.5,58.36,263.08,24,7592,"OlympicG"
39 | "Karlivans",11.33,7.26,13.3,1.97,50.54,14.98,43.34,4.5,52.92,278.67,25,7583,"OlympicG"
40 | "Korkizoglou",10.86,7.07,14.81,1.94,51.16,14.96,46.07,4.7,53.05,317,26,7573,"OlympicG"
41 | "Uldal",11.23,6.99,13.53,1.85,50.95,15.09,43.01,4.5,60,281.7,27,7495,"OlympicG"
42 | "Casarsa",11.36,6.68,14.92,1.94,53.2,15.39,48.66,4.4,58.62,296.12,28,7404,"OlympicG"
43 |
--------------------------------------------------------------------------------
/prince/datasets/premier_league.csv:
--------------------------------------------------------------------------------
1 | ,2021-22,2021-22,2021-22,2021-22,2021-22,2021-22,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24
2 | ,W,D,L,GF,GA,Pts,W,D,L,GF,GA,Pts,W,D,L,GF,GA,Pts
3 | Team,,,,,,,,,,,,,,,,,,
4 | Arsenal,22,3,13,61,48,69,26,6,6,88,43,84,28,5,5,91,29,89
5 | Aston Villa,13,6,19,52,54,45,18,7,13,51,46,61,20,8,10,76,61,68
6 | Brentford,13,7,18,48,56,46,15,14,9,58,46,59,10,9,19,56,65,39
7 | Brighton & Hove Albion,12,15,11,42,44,51,18,8,12,72,53,62,12,12,14,55,62,48
8 | Chelsea,21,11,6,76,33,74,11,11,16,38,47,44,18,9,11,77,63,63
9 | Crystal Palace,11,15,12,50,46,48,11,12,15,40,49,45,13,10,15,57,58,49
10 | Everton,11,6,21,43,66,39,8,12,18,34,57,36,13,9,16,40,51,40
11 | Liverpool,28,8,2,94,26,92,19,10,9,75,47,67,24,10,4,86,41,82
12 | Manchester City,29,6,3,99,26,93,28,5,5,94,33,89,28,7,3,96,34,91
13 | Manchester United,16,10,12,57,57,58,23,6,9,58,43,75,18,6,14,57,58,60
14 | Newcastle United,13,10,15,44,62,49,19,14,5,68,33,71,18,6,14,85,62,60
15 | Tottenham Hotspur,22,5,11,69,40,71,18,6,14,70,63,60,20,6,12,74,61,66
16 | West Ham United,16,8,14,60,51,56,11,7,20,42,55,40,14,10,14,60,74,52
17 | Wolverhampton Wanderers,15,6,17,38,43,51,11,8,19,31,58,41,13,7,18,50,65,46
18 |
--------------------------------------------------------------------------------
/prince/datasets/punctuation_marks.csv:
--------------------------------------------------------------------------------
1 | "author","period","comma","other"
2 | "Rousseau",7836,13112,6026
3 | "Chateaubriand",53655,102383,42413
4 | "Hugo",115615,184541,59226
5 | "Zola",161926,340479,62754
6 | "Proust",38177,105101,12670
7 | "Giraudoux",46371,58367,14299
8 |
--------------------------------------------------------------------------------
/prince/famd.py:
--------------------------------------------------------------------------------
1 | """Factor Analysis of Mixed Data (FAMD)"""
2 |
3 | from __future__ import annotations
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import sklearn.utils
8 | from sklearn import preprocessing
9 |
10 | from prince import pca, utils
11 |
12 |
13 | class FAMD(pca.PCA):
14 | def __init__(
15 | self,
16 | n_components=2,
17 | n_iter=3,
18 | copy=True,
19 | check_input=True,
20 | random_state=None,
21 | engine="sklearn",
22 | handle_unknown="error",
23 | ):
24 | super().__init__(
25 | rescale_with_mean=True,
26 | rescale_with_std=False,
27 | n_components=n_components,
28 | n_iter=n_iter,
29 | copy=copy,
30 | check_input=check_input,
31 | random_state=random_state,
32 | engine=engine,
33 | )
34 | self.handle_unknown = handle_unknown
35 |
36 | def _check_input(self, X):
37 | if self.check_input:
38 | sklearn.utils.check_array(X, dtype=[str, "numeric"])
39 |
40 | @utils.check_is_dataframe_input
41 | def fit(self, X, y=None):
42 | # Separate numerical columns from categorical columns
43 | self.num_cols_ = X.select_dtypes(include=["float"]).columns.tolist()
44 | if not self.num_cols_:
45 | raise ValueError("All variables are qualitative: MCA should be used")
46 | self.cat_cols_ = X.columns.difference(self.num_cols_).tolist()
47 | if not self.cat_cols_:
48 | raise ValueError("All variables are quantitative: PCA should be used")
49 |
50 | # Preprocess numerical columns
51 | X_num = X[self.num_cols_].copy()
52 | self.num_scaler_ = preprocessing.StandardScaler().fit(X_num)
53 | X_num[:] = self.num_scaler_.transform(X_num)
54 |
55 | # Preprocess categorical columns
56 | X_cat = X[self.cat_cols_]
57 | self.cat_scaler_ = preprocessing.OneHotEncoder(handle_unknown=self.handle_unknown).fit(
58 | X_cat
59 | )
60 | X_cat_oh = pd.DataFrame.sparse.from_spmatrix(
61 | self.cat_scaler_.transform(X_cat),
62 | index=X_cat.index,
63 | columns=self.cat_scaler_.get_feature_names_out(self.cat_cols_),
64 | )
65 | prop = X_cat_oh.sum() / X_cat_oh.sum().sum() * 2
66 | X_cat_oh_norm = X_cat_oh.sub(X_cat_oh.mean(axis="rows")).div(prop**0.5, axis="columns")
67 |
68 | # PCA.fit doesn't work with sparse matrices. Well, it accepts them, but it densifies them.
69 | # We pre-densify them here to avoid a warning.
70 | # TODO: In the future, PCA should be able to handle sparse matrices.
71 | X_cat_oh_norm = X_cat_oh_norm.sparse.to_dense()
72 |
73 | Z = pd.concat([X_num, X_cat_oh_norm], axis=1)
74 | super().fit(Z)
75 |
76 | # Determine column_coordinates_
77 | # This is based on line 184 in FactoMineR's famd.R file
78 | rc = self.row_coordinates(X)
79 | weights = np.ones(len(X_cat_oh)) / len(X_cat_oh)
80 | norm = (rc**2).multiply(weights, axis=0).sum()
81 | eta2 = pd.DataFrame(index=rc.columns)
82 | for i, col in enumerate(self.cat_cols_):
83 | # TODO: there must be a better way to select a subset of the one-hot encoded matrix
84 | tt = X_cat_oh[[f"{col}_{i}" for i in self.cat_scaler_.categories_[i]]]
85 | ni = (tt / len(tt)).sum()
86 | eta2[col] = (
87 | rc.apply(lambda x: (tt.multiply(x * weights, axis=0).sum() ** 2 / ni).sum()) / norm
88 | ).values
89 | self.column_coordinates_ = pd.concat(
90 | [self.column_coordinates_.loc[self.num_cols_] ** 2, eta2.T]
91 | )
92 | self.column_coordinates_.columns.name = "component"
93 | self.column_coordinates_.index.name = "variable"
94 |
95 | return self
96 |
97 | @utils.check_is_dataframe_input
98 | @utils.check_is_fitted
99 | def row_coordinates(self, X):
100 | # Separate numerical columns from categorical columns
101 | X_num = X[self.num_cols_].copy()
102 | X_cat = X[self.cat_cols_]
103 |
104 | # Preprocess numerical columns
105 | X_num[:] = self.num_scaler_.transform(X_num)
106 |
107 | # Preprocess categorical columns
108 | X_cat = pd.DataFrame.sparse.from_spmatrix(
109 | self.cat_scaler_.transform(X_cat),
110 | index=X_cat.index,
111 | columns=self.cat_scaler_.get_feature_names_out(self.cat_cols_),
112 | )
113 | prop = X_cat.sum() / X_cat.sum().sum() * 2
114 | X_cat = X_cat.sub(X_cat.mean(axis="rows")).div(prop**0.5, axis="columns")
115 |
116 | Z = pd.concat([X_num, X_cat.sparse.to_dense()], axis=1).fillna(0.0)
117 |
118 | return super().row_coordinates(Z)
119 |
120 | @utils.check_is_dataframe_input
121 | @utils.check_is_fitted
122 | def inverse_transform(self, X):
123 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet")
124 |
125 | @utils.check_is_dataframe_input
126 | @utils.check_is_fitted
127 | def row_standard_coordinates(self, X):
128 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet")
129 |
130 | @utils.check_is_dataframe_input
131 | @utils.check_is_fitted
132 | def row_cosine_similarities(self, X):
133 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet")
134 |
135 | @utils.check_is_dataframe_input
136 | @utils.check_is_fitted
137 | def column_correlations(self, X):
138 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet")
139 |
140 | @utils.check_is_dataframe_input
141 | @utils.check_is_fitted
142 | def column_cosine_similarities_(self, X):
143 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet")
144 |
145 | @property
146 | def column_contributions_(self):
147 | return self.column_coordinates_ / self.eigenvalues_
148 |
--------------------------------------------------------------------------------
/prince/gpa.py:
--------------------------------------------------------------------------------
1 | """Generalized Procrustes Analysis (GPA)"""
2 |
3 | from __future__ import annotations
4 |
5 | import numpy as np
6 | from scipy.linalg import orthogonal_procrustes
7 | from scipy.spatial import procrustes
8 | from sklearn import base
9 | from sklearn import utils as sk_utils
10 |
11 | from prince import utils
12 |
13 |
14 | class GPA(base.BaseEstimator, base.TransformerMixin):
15 | """Generalized Procrustes Analysis (GPA).
16 |
17 | Algorithm outline:
18 |
19 | 1. Choose a reference shape.
20 | 2. Apply Procrustes Analysis to superimpose all shapes to the reference shape.
21 | 3. Compute the mean shape of the superimposed shapes.
22 | 4. Repeat steps 2 and 3 until convergence.
23 |
24 | Parameters
25 | ----------
26 | max_iter
27 | The maximum number of Procrustes analysis iterations.
28 | tol
29 | The tolerance for the optimization; stops if the Procrustes distance decreases by less than
30 | or equal to `tol` between iterations.
31 | init
32 | Method for initializing reference shape.
33 | - 'random' : choose reference shape from shape list
34 | - 'mean' : initialize reference shape as mean of shape list
35 | scale
36 | Whether to compute transformations with a scale component.
37 | copy
38 | Whether to copy data or perform the computations inplace. If False, data passed to fit are
39 | overwritten and running fit(X).transform(X) will not yield the expected results;
40 | use fit_transform(X) instead.
41 | check_input
42 | Whether to check the consistency of the inputs.
43 | random_state
44 | Determines random number generation for initialization when `init=='random'`.
45 |
46 | References
47 | ----------
48 | https://wikipedia.org/wiki/Generalized_Procrustes_analysis
49 | https://medium.com/@olga_kravchenko/generalized-procrustes-analysis-with-python-numpy-c571e8e8a421
50 |
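Examples
--------
A minimal usage sketch, reusing two of the triangle coordinates from the GPA guide.
The input is a 3D array of shape `(n_shapes, n_points, n_dim)`:

>>> import numpy as np
>>> import prince
>>> shapes = np.array([
...     [[0.0, 0.0], [0.0, 2.0], [1.0, 0.0]],
...     [[3.0, 2.0], [1.0, 2.0], [3.0, 3.0]],
... ])
>>> gpa = prince.GPA(init="mean")
>>> aligned = gpa.fit_transform(shapes)
>>> aligned.shape
(2, 3, 2)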
51 | """
52 |
53 | def __init__(
54 | self,
55 | max_iter=10,
56 | tol=1e-4,
57 | init="random",
58 | scale=True,
59 | copy=True,
60 | check_input=True,
61 | random_state=None,
62 | ):
63 | self.max_iter = max_iter
64 | self.tol = tol
65 | self.init = init
66 | self.scale = scale
67 | self.copy = copy
68 | self.check_input = check_input
69 | self.random_state = random_state
70 |
71 | def fit(self, X, y=None):
72 | """Fit the model with X.
73 |
74 | The algorithm naturally fits and transforms at the same time, so this
75 | simply calls ``.fit_transform``
76 |
77 | Parameters:
78 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of
79 | shapes to match to each other.
80 | y: Ignored
81 |
82 | Returns:
83 | self (object): The instance itself
84 | """
85 | self.fit_transform(X)
86 |
87 | return self
88 |
89 | @utils.check_is_fitted
90 | def transform(self, X):
91 | """Align X to the reference shape.
92 |
93 | Parameters:
94 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of
95 | shapes to align to the reference shape.
96 |
97 | Returns:
98 | X_new (array-like of shape (n_shapes, n_points, n_dim)): Matrix of
99 | aligned shapes
100 | """
101 | self._check_is_fitted()
102 | if self.check_input:
103 | self._check_input(X)
104 |
105 | X_new = np.empty(X.shape)
106 | for shape_idx in range(X.shape[0]):
107 | _, X_new[shape_idx], _ = procrustes(self.reference_shape, X[shape_idx])
108 |
109 | return X_new
110 |
111 | def fit_transform(self, X, y=None):
112 | """Fit the model with X and return the aligned shapes.
113 |
114 | Parameters:
115 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of
116 | shapes to match to each other.
117 | y: Ignored
118 |
119 | Returns:
120 | X_new (array-like of shape (n_shapes, n_points, n_dim)): Matrix X
121 | of aligned shapes
122 | """
123 |
124 | # Check input
125 | if self.check_input:
126 | self._check_input(X)
127 |
128 | # Copy data
129 | if self.copy:
130 | X = np.array(X, copy=True)
131 |
132 | # scikit-learn SLEP010
133 | n_shapes, n_points, n_dim = X.shape
134 | self.n_features_in_ = n_dim
135 |
136 | # Pick reference shape
137 | if self.init == "random":
138 | random_state = sk_utils.check_random_state(self.random_state)
139 | ref_shape_idx = random_state.randint(X.shape[0])
140 | reference_shape = X[ref_shape_idx].copy()
141 | elif self.init == "mean":
142 | reference_shape = X.mean(axis=0)
143 | else:
144 | raise ValueError("init method must be one of ('random', 'mean')")
145 |
146 | for iter_idx in range(self.max_iter):
147 | # Align each shape to reference shape
148 | for shape_idx in range(X.shape[0]):
149 | if self.scale:
150 | _, X[shape_idx], _ = procrustes(reference_shape, X[shape_idx])
151 | else:
152 | _, X[shape_idx] = unscaled_procrustes(reference_shape, X[shape_idx])
153 |
154 | # Compute diagnostics
155 | mean_shape = X.mean(axis=0)
156 | procrustes_distance = np.linalg.norm(reference_shape - mean_shape)
157 |
158 | # Update reference shape
159 | reference_shape = mean_shape
160 |
161 | # Check for convergence
162 | if procrustes_distance <= self.tol:
163 | break
164 |
165 | # Store properties
166 | self._reference_shape = reference_shape
167 |
168 | # Return the aligned shapes
169 | return X
170 |
171 | def _check_input(self, X):
172 | sk_utils.check_array(X, allow_nd=True)
173 | if X.ndim != 3:
174 | raise ValueError("Expected 3-dimensional input of (n_shapes, n_points, n_dim)")
175 |
176 | def _check_is_fitted(self):
177 | sk_utils.validation.check_is_fitted(self, "_reference_shape")
178 |
179 | @property
180 | def reference_shape(self):
181 | """Returns the final reference shape."""
182 | self._check_is_fitted()
183 | return self._reference_shape
184 |
185 |
186 | def unscaled_procrustes(reference, data):
187 | """Fit `data` to `reference` using procrustes analysis without scaling.
188 | Uses translation (mean-centering), reflection, and orthogonal rotation.
189 |
190 | Parameters:
191 | reference (array-like of shape (n_points, n_dim)): reference shape to
192 | fit `data` to
193 | data (array-like of shape (n_points, n_dim)): shape to align to
194 | `reference`
195 |
196 | Returns:
197 | reference_centered (np.ndarray of shape (n_points, n_dim)): 0-centered
198 | `reference` shape
199 | data_aligned (np.ndarray of shape (n_points, n_dim)): `data` aligned to
200 | the reference shape
201 | """
202 | # Convert inputs to np.ndarray types
203 | reference = np.array(reference, dtype=np.double)
204 | data = np.array(data, dtype=np.double)
205 |
206 | # Translate data to the origin
207 | reference_centered = reference - reference.mean(axis=0)
208 | data_centered = data - data.mean(axis=0)
209 |
210 | # Rotate / reflect data to match reference
211 |     # i.e. find the orthogonal matrix R that minimizes the disparity
212 | R, _ = orthogonal_procrustes(data_centered, reference_centered)
213 | data_aligned = data_centered @ R
214 |
215 | return reference_centered, data_aligned
216 |
--------------------------------------------------------------------------------
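[Usage note for prince/gpa.py above, not part of the repository: a minimal sketch of the GPA class, assuming the made-up triangle data below.]

    import numpy as np
    import prince

    # Three noisy copies of a triangle, stacked as (n_shapes, n_points, n_dim)
    rng = np.random.default_rng(0)
    base = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    shapes = np.stack([base + rng.normal(scale=0.05, size=base.shape) for _ in range(3)])

    gpa = prince.GPA(init="mean")
    aligned = gpa.fit_transform(shapes)  # same (n_shapes, n_points, n_dim) layout as the input
    reference = gpa.reference_shape      # the consensus shape the inputs were aligned to
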
/prince/mca.py:
--------------------------------------------------------------------------------
1 | """Multiple Correspondence Analysis (MCA)"""
2 |
3 | from __future__ import annotations
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import sklearn.base
8 | import sklearn.preprocessing
9 | import sklearn.utils
10 |
11 | from prince import utils
12 |
13 | from . import ca
14 |
15 |
16 | class MCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, ca.CA):
17 | def __init__(
18 | self,
19 | n_components=2,
20 | n_iter=10,
21 | copy=True,
22 | check_input=True,
23 | random_state=None,
24 | engine="sklearn",
25 | one_hot=True,
26 | correction=None,
27 | ):
28 | if correction is not None:
29 | if correction not in {"benzecri", "greenacre"}:
30 | raise ValueError("correction must be either 'benzecri' or 'greenacre' if provided.")
31 | if not one_hot:
32 | raise ValueError(
33 | "correction can only be applied when one_hot is True. This is because the "
34 | "number of original variables is needed to apply the correction."
35 | )
36 |
37 | super().__init__(
38 | n_components=n_components,
39 | n_iter=n_iter,
40 | copy=copy,
41 | check_input=check_input,
42 | random_state=random_state,
43 | engine=engine,
44 | )
45 | self.one_hot = one_hot
46 | self.correction = correction
47 |
48 | def _prepare(self, X):
49 | if self.one_hot:
50 | X = pd.get_dummies(X, columns=X.columns, prefix_sep="__")
51 | if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None:
52 | X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False)
53 | return X
54 |
55 | def get_feature_names_out(self, input_features=None):
56 | return np.arange(self.n_components_)
57 |
58 | @property
59 | def eigenvalues_(self):
60 | """Returns the eigenvalues associated with each principal component."""
61 | eigenvalues = super().eigenvalues_
62 | # Benzécri and Greenacre corrections
63 | if self.correction in {"benzecri", "greenacre"}:
64 | K = self.K_
65 | return np.array(
66 | [(K / (K - 1) * (eig - 1 / K)) ** 2 if eig > 1 / K else 0 for eig in eigenvalues]
67 | )
68 | return eigenvalues
69 |
70 | @property
71 | @utils.check_is_fitted
72 | def percentage_of_variance_(self):
73 | """Returns the percentage of explained inertia per principal component."""
74 | # Benzécri correction
75 | if self.correction == "benzecri":
76 | eigenvalues = self.eigenvalues_
77 | return 100 * eigenvalues / eigenvalues.sum()
78 | # Greenacre correction
79 | if self.correction == "greenacre":
80 | eigenvalues = super().eigenvalues_
81 | benzecris = self.eigenvalues_
82 | K, J = (self.K_, self.J_)
83 | average_inertia = (K / (K - 1)) * ((eigenvalues**2).sum() - (J - K) / K**2)
84 | return 100 * benzecris / average_inertia
85 | # No correction
86 | return super().percentage_of_variance_
87 |
88 | @utils.check_is_dataframe_input
89 | def fit(self, X, y=None):
90 | """Fit the MCA for the dataframe X.
91 |
92 |         The MCA is computed on the indicator matrix (i.e. `pd.get_dummies(X)`). If the columns are already
93 |         in indicator matrix format, pass `one_hot=False` so they are not encoded again. The Benzécri and
94 |         Greenacre corrections require `one_hot=True`, as they need the number of original variables.
95 |
96 | """
97 |
98 | if self.check_input:
99 | sklearn.utils.check_array(X, dtype=[str, "numeric"])
100 |
101 | # K is the number of actual variables, to apply the Benzécri correction
102 | self.K_ = X.shape[1]
103 |
104 | # One-hot encode the data
105 | one_hot = self._prepare(X)
106 | self.one_hot_columns_ = one_hot.columns
107 |
108 | # We need the number of columns to apply the Greenacre correction
109 | self.J_ = one_hot.shape[1]
110 |
111 | # Apply CA to the indicator matrix
112 | super().fit(one_hot)
113 |
114 | return self
115 |
116 | @utils.check_is_dataframe_input
117 | @utils.check_is_fitted
118 | def row_coordinates(self, X):
119 | return super().row_coordinates(self._prepare(X))
120 |
121 | @utils.check_is_dataframe_input
122 | @utils.check_is_fitted
123 | def row_cosine_similarities(self, X):
124 | oh = self._prepare(X)
125 | return super()._row_cosine_similarities(X=oh, F=super().row_coordinates(oh))
126 |
127 | @utils.check_is_dataframe_input
128 | @utils.check_is_fitted
129 | def column_coordinates(self, X):
130 | return super().column_coordinates(self._prepare(X))
131 |
132 | @utils.check_is_dataframe_input
133 | @utils.check_is_fitted
134 | def column_cosine_similarities(self, X):
135 | oh = self._prepare(X)
136 | return super()._column_cosine_similarities(X=oh, G=super().column_coordinates(oh))
137 |
138 | @utils.check_is_dataframe_input
139 | @utils.check_is_fitted
140 | def transform(self, X):
141 | """Computes the row principal coordinates of a dataset."""
142 | if self.check_input:
143 | sklearn.utils.check_array(X, dtype=[str, "numeric"])
144 | return self.row_coordinates(X)
145 |
--------------------------------------------------------------------------------
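[Usage note for prince/mca.py above, not part of the repository: a minimal sketch of the MCA class on a made-up categorical DataFrame; the Benzécri or Greenacre corrections can be enabled via the `correction` parameter.]

    import pandas as pd
    import prince

    df = pd.DataFrame({
        "colour": ["red", "blue", "red", "green", "blue"],
        "size": ["S", "M", "M", "L", "S"],
    })
    mca = prince.MCA(n_components=2, random_state=42)
    mca = mca.fit(df)                 # one-hot encodes df internally (one_hot=True by default)
    mca.eigenvalues_summary           # eigenvalues and % of variance per component
    coords = mca.row_coordinates(df)  # equivalent to mca.transform(df)
    cols = mca.column_coordinates(df)
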
/prince/mfa.py:
--------------------------------------------------------------------------------
1 | """Multiple Factor Analysis (MFA)"""
2 |
3 | from __future__ import annotations
4 |
5 | import collections
6 |
7 | import altair as alt
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from prince import pca, utils
12 |
13 |
14 | class MFA(pca.PCA, collections.UserDict):
15 | def __init__(
16 | self,
17 | n_components=2,
18 | n_iter=3,
19 | copy=True,
20 | check_input=True,
21 | random_state=None,
22 | engine="sklearn",
23 | ):
24 | super().__init__(
25 | rescale_with_mean=True,
26 | rescale_with_std=True,
27 | n_components=n_components,
28 | n_iter=n_iter,
29 | copy=copy,
30 | check_input=check_input,
31 | random_state=random_state,
32 | engine=engine,
33 | )
34 | collections.UserDict.__init__(self)
35 |
36 | @utils.check_is_dataframe_input
37 | def fit(self, X, y=None, groups=None, supplementary_groups=None):
38 | # Checks groups are provided
39 | self.groups_ = self._determine_groups(X, groups)
40 | if supplementary_groups is not None:
41 | for group in supplementary_groups:
42 | if group not in self.groups_:
43 | raise ValueError(f"Supplementary group '{group}' is not in the groups")
44 | self.supplementary_groups_ = supplementary_groups
45 |
46 | # Check group types are consistent
47 | self.all_nums_ = {}
48 | for group, cols in sorted(self.groups_.items()):
49 | all_num = all(pd.api.types.is_numeric_dtype(X[c]) for c in cols)
50 | all_cat = all(pd.api.types.is_string_dtype(X[c]) for c in cols)
51 | if not (all_num or all_cat):
52 | raise ValueError(f'Not all columns in "{group}" group are of the same type')
53 | self.all_nums_[group] = all_num
54 |
55 | # Run a factor analysis in each group
56 | for group, cols in sorted(self.groups_.items()):
57 | if self.all_nums_[group]:
58 | fa = pca.PCA(
59 | rescale_with_mean=True,
60 | rescale_with_std=True,
61 | n_components=self.n_components,
62 | n_iter=self.n_iter,
63 | copy=True,
64 | random_state=self.random_state,
65 | engine=self.engine,
66 | )
67 | else:
68 | raise NotImplementedError("Groups of non-numerical variables are not supported yet")
69 | self[group] = fa.fit(X.loc[:, cols])
70 |
71 | # Fit the global PCA
72 | Z = self._build_Z(X)
73 | column_weights = np.array(
74 | [
75 | 1 / self[group].eigenvalues_[0]
76 | for group, cols in self.groups_.items()
77 | for _ in cols
78 | if group not in getattr(self, "supplementary_groups_", [])
79 | ]
80 | )
81 | super().fit(
82 | Z,
83 | column_weight=column_weights,
84 | supplementary_columns=[
85 | column
86 | for group in getattr(self, "supplementary_groups_", [])
87 | for column in self.groups_[group]
88 | ],
89 | )
90 |
91 | return self
92 |
93 | def _determine_groups(self, X: pd.DataFrame, groups: dict | list | None) -> dict:
94 | if groups is None:
95 | if isinstance(X.columns, pd.MultiIndex):
96 | groups = X.columns.get_level_values(0).unique().tolist()
97 | else:
98 | raise ValueError("Groups have to be specified")
99 |
100 | if isinstance(groups, list):
101 | if not isinstance(X.columns, pd.MultiIndex):
102 | raise ValueError(
103 | "X has to have MultiIndex columns if groups are provided as a list"
104 | )
105 | groups = {
106 | group: [
107 | (group, column)
108 | for column in X.columns.get_level_values(1)[
109 | X.columns.get_level_values(0) == group
110 | ]
111 | ]
112 | for group in groups
113 | }
114 | return groups
115 |
116 | def _build_Z(self, X):
117 | return pd.concat(
118 | (X[cols] for _, cols in self.groups_.items()),
119 | axis="columns",
120 | )
121 |
122 | @utils.check_is_dataframe_input
123 | @utils.check_is_fitted
124 | def row_coordinates(self, X):
125 | """Returns the row principal coordinates."""
126 | Z = self._build_Z(X)
127 | return super().row_coordinates(Z)
128 |
129 | @utils.check_is_dataframe_input
130 | @utils.check_is_fitted
131 | def partial_row_coordinates(self, X):
132 | """Returns the partial row principal coordinates."""
133 | Z = self._build_Z(X)
134 | coords = []
135 | for _, names in self.groups_.items():
136 | partial_coords = pd.DataFrame(0.0, index=Z.index, columns=Z.columns)
137 | partial_coords.loc[:, names] = (Z[names] - Z[names].mean()) / Z[names].std(ddof=0)
138 | partial_coords = partial_coords * self.column_weight_
139 | partial_coords = (len(self.groups_) * partial_coords).dot(self.svd_.V.T)
140 | coords.append(partial_coords)
141 | coords = pd.concat(coords, axis=1, keys=self.groups_.keys())
142 | coords.columns.name = "component"
143 | return coords
144 |
145 | @utils.check_is_dataframe_input
146 | @utils.check_is_fitted
147 | def column_coordinates(self, X):
148 | Z = self._build_Z(X)
149 | return super().column_coordinates(Z)
150 |
151 | @utils.check_is_dataframe_input
152 | @utils.check_is_fitted
153 | def inverse_transform(self, X):
154 | raise NotImplementedError("MFA inherits from PCA, but this method is not implemented yet")
155 |
156 | @utils.check_is_dataframe_input
157 | @utils.check_is_fitted
158 | def row_standard_coordinates(self, X):
159 | Z = self._build_Z(X)
160 | return super().row_standard_coordinates(Z)
161 |
162 | @utils.check_is_dataframe_input
163 | @utils.check_is_fitted
164 | def row_cosine_similarities(self, X):
165 | Z = self._build_Z(X)
166 | return super().row_cosine_similarities(Z)
167 |
168 | @utils.check_is_dataframe_input
169 | @utils.check_is_fitted
170 | def column_cosine_similarities_(self, X):
171 | Z = self._build_Z(X)
172 | return super().column_cosine_similarities_(Z)
173 |
174 | @utils.check_is_dataframe_input
175 | @utils.check_is_fitted
176 | def plot(self, X, x_component=0, y_component=1, show_partial_rows=False, **params):
177 | index_name = X.index.name or "index"
178 |
179 | params["tooltip"] = (
180 | X.index.names if isinstance(X.index, pd.MultiIndex) else [index_name]
181 | ) + [
182 | "group",
183 | f"component {x_component}",
184 | f"component {y_component}",
185 | ]
186 |
187 | eig = self._eigenvalues_summary.to_dict(orient="index")
188 |
189 | row_plot = None
190 | partial_row_plot = None
191 | edges_plot = None
192 |
193 | # Barycenters
194 | row_coords = self.row_coordinates(X)
195 | row_coords.columns = [f"component {i}" for i in row_coords.columns]
196 | row_coords = row_coords.reset_index()
197 | row_coords["group"] = "Global"
198 | if show_partial_rows:
199 | params["color"] = "group:N"
200 | row_plot = (
201 | alt.Chart(row_coords)
202 | .mark_point(filled=True, size=50)
203 | .encode(
204 | alt.X(
205 | f"component {x_component}",
206 | scale=alt.Scale(zero=False),
207 | axis=alt.Axis(
208 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}"
209 | ),
210 | ),
211 | alt.Y(
212 | f"component {y_component}",
213 | scale=alt.Scale(zero=False),
214 | axis=alt.Axis(
215 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}"
216 | ),
217 | ),
218 | **params,
219 | )
220 | )
221 |
222 | # Partial row coordinates
223 | if show_partial_rows:
224 | partial_row_coords = self.partial_row_coordinates(X).stack(level=0, future_stack=True)
225 | partial_row_coords.columns = [f"component {i}" for i in partial_row_coords.columns]
226 | partial_row_coords = partial_row_coords.reset_index(names=[index_name, "group"])
227 |
228 | partial_row_plot = (
229 | alt.Chart(partial_row_coords)
230 | .mark_point(shape="circle")
231 | .encode(
232 | alt.X(f"component {x_component}", scale=alt.Scale(zero=False)),
233 | alt.Y(f"component {y_component}", scale=alt.Scale(zero=False)),
234 | **params,
235 | )
236 | )
237 |
238 | # Edges to connect the main markers to the partial markers
239 | if show_partial_rows:
240 | edges = pd.merge(
241 | left=row_coords[
242 | [index_name, f"component {x_component}", f"component {y_component}"]
243 | ],
244 | right=partial_row_coords[
245 | [index_name, f"component {x_component}", f"component {y_component}", "group"]
246 | ],
247 | on=index_name,
248 | suffixes=("_global", "_partial"),
249 | )
250 | edges_plot = (
251 | alt.Chart(edges)
252 | .mark_line(opacity=0.7)
253 | .encode(
254 | x=f"component {x_component}_global:Q",
255 | y=f"component {y_component}_global:Q",
256 | x2=f"component {x_component}_partial:Q",
257 | y2=f"component {y_component}_partial:Q",
258 | color="group:N",
259 | strokeDash=alt.value([2, 2]),
260 | )
261 | )
262 |
263 | charts = filter(
264 | None,
265 | (row_plot, partial_row_plot, edges_plot),
266 | )
267 |
268 | return alt.layer(*charts).interactive()
269 |
--------------------------------------------------------------------------------
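[Usage note for prince/mfa.py above, not part of the repository: a minimal sketch of the MFA class on a made-up DataFrame whose column MultiIndex encodes the groups; only all-numeric groups are supported, since the code raises NotImplementedError otherwise.]

    import pandas as pd
    import prince

    df = pd.DataFrame({
        ("physical", "height"): [180.0, 175.0, 190.0, 165.0, 172.0],
        ("physical", "weight"): [80.0, 70.0, 95.0, 60.0, 68.0],
        ("performance", "speed"): [30.0, 32.0, 28.0, 33.0, 31.0],
        ("performance", "stamina"): [60.0, 55.0, 70.0, 50.0, 58.0],
    })
    mfa = prince.MFA(n_components=2)
    mfa = mfa.fit(df, groups=["physical", "performance"])
    mfa.row_coordinates(df)          # global row coordinates
    mfa.partial_row_coordinates(df)  # one set of coordinates per group
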
/prince/pca.py:
--------------------------------------------------------------------------------
1 | """Principal Component Analysis (PCA)"""
2 |
3 | from __future__ import annotations
4 |
5 | import functools
6 |
7 | import altair as alt
8 | import numpy as np
9 | import pandas as pd
10 | import sklearn.base
11 | import sklearn.utils
12 | from sklearn import preprocessing
13 |
14 | from prince import svd, utils
15 |
16 |
17 | def select_active_variables(method):
18 | @functools.wraps(method)
19 | def _impl(self, X=None, *method_args, **method_kwargs):
20 | if hasattr(self, "feature_names_in_") and isinstance(X, pd.DataFrame):
21 | return method(self, X[self.feature_names_in_], *method_args, **method_kwargs)
22 | return method(self, X, *method_args, **method_kwargs)
23 |
24 | return _impl
25 |
26 |
27 | class PCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, utils.EigenvaluesMixin):
28 | """Principal Component Analysis (PCA).
29 |
30 | Parameters
31 | ----------
32 | rescale_with_mean
33 | Whether or not to subtract each column's mean before performing SVD.
34 | rescale_with_std
35 | Whether or not to standardize each column before performing SVD.
36 | n_components
37 | The number of principal components to compute.
38 | n_iter
39 | The number of iterations used for computing the SVD.
40 | copy
41 |         Whether or not to perform the computations in place.
42 | check_input
43 | Whether to check the coherence of the inputs or not.
44 |
45 | """
46 |
47 | def __init__(
48 | self,
49 | rescale_with_mean=True,
50 | rescale_with_std=True,
51 | n_components=2,
52 | n_iter=3,
53 | copy=True,
54 | check_input=True,
55 | random_state=None,
56 | engine="sklearn",
57 | ):
58 | self.n_components = n_components
59 | self.n_iter = n_iter
60 | self.rescale_with_mean = rescale_with_mean
61 | self.rescale_with_std = rescale_with_std
62 | self.copy = copy
63 | self.check_input = check_input
64 | self.random_state = random_state
65 | self.engine = engine
66 |
67 | def _check_input(self, X):
68 | if self.check_input:
69 | sklearn.utils.check_array(X)
70 |
71 | def get_feature_names_out(self, input_features=None):
72 | return np.arange(self.n_components_)
73 |
74 | @utils.check_is_dataframe_input
75 | def fit(
76 | self,
77 | X,
78 | y=None,
79 | sample_weight=None,
80 | column_weight=None,
81 | supplementary_columns=None,
82 | ):
83 | self._check_input(X)
84 |
85 | # Massage input
86 | supplementary_columns = supplementary_columns or []
87 | active_variables = X.columns.difference(supplementary_columns, sort=False).tolist()
88 | sample_weight = np.ones(len(X)) if sample_weight is None else sample_weight
89 | sample_weight = sample_weight / sample_weight.sum()
90 | column_weight = np.ones(len(active_variables)) if column_weight is None else column_weight
91 | self.column_weight_ = column_weight
92 |
93 | # https://scikit-learn.org/stable/developers/develop.html#universal-attributes
94 | self.feature_names_in_ = active_variables
95 | self.n_features_in_ = len(active_variables)
96 |
97 | X_active = X[active_variables].to_numpy(dtype=np.float64, copy=self.copy)
98 | if supplementary_columns:
99 | X_sup = X[supplementary_columns].to_numpy(dtype=np.float64, copy=self.copy)
100 |
101 |         # Scale data
102 | if self.rescale_with_mean or self.rescale_with_std:
103 | self.scaler_ = preprocessing.StandardScaler(
104 | copy=self.copy,
105 | with_mean=self.rescale_with_mean,
106 | with_std=self.rescale_with_std,
107 | ).fit(X_active, sample_weight=sample_weight)
108 | X_active = self.scaler_.transform(X_active) # TODO: maybe fit_transform is faster
109 | if supplementary_columns:
110 | X_sup = preprocessing.StandardScaler(
111 | copy=self.copy,
112 | with_mean=self.rescale_with_mean,
113 | with_std=self.rescale_with_std,
114 | ).fit_transform(X_sup)
115 |
116 | self._column_dist = pd.Series(
117 | (X_active**2 * sample_weight[:, np.newaxis]).sum(axis=0),
118 | index=active_variables,
119 | )
120 | if supplementary_columns:
121 | self._column_dist = pd.concat(
122 | (
123 | self._column_dist,
124 | pd.Series(
125 | (X_sup**2 / len(X_sup)).sum(axis=0),
126 | index=supplementary_columns,
127 | ),
128 | )
129 | )
130 |
131 | self.svd_ = svd.compute_svd(
132 | X=X_active,
133 | n_components=self.n_components,
134 | n_iter=self.n_iter,
135 | random_state=self.random_state,
136 | engine=self.engine,
137 | row_weights=sample_weight,
138 | column_weights=column_weight,
139 | )
140 |
141 | self.total_inertia_ = np.sum(
142 | np.square(X_active) * column_weight * sample_weight[:, np.newaxis]
143 | )
144 |
145 | self.column_coordinates_ = pd.DataFrame(
146 | data=self.svd_.V.T * self.eigenvalues_**0.5,
147 | index=active_variables,
148 | )
149 | if supplementary_columns:
150 | self.column_coordinates_ = pd.concat(
151 | [
152 | self.column_coordinates_,
153 | pd.DataFrame(
154 | data=X_sup.T @ (self.svd_.U / len(self.svd_.U) ** 0.5),
155 | index=supplementary_columns,
156 | ),
157 | ]
158 | )
159 | self.column_coordinates_.columns.name = "component"
160 | self.column_coordinates_.index.name = "variable"
161 | row_coords = pd.DataFrame(
162 | self.svd_.U * self.eigenvalues_**0.5,
163 | # HACK: there's a circular dependency between row_contributions_
164 | # and active_row_coordinates in self.__init__
165 | index=self.row_contributions_.index if hasattr(self, "row_contributions_") else None,
166 | )
167 | row_coords.columns.name = "component"
168 | self.row_contributions_ = (row_coords**2 * sample_weight[:, np.newaxis]).div(
169 | self.eigenvalues_, axis=1
170 | )
171 | self.row_contributions_.index = X.index
172 |
173 | return self
174 |
175 | @property
176 | @utils.check_is_fitted
177 | def eigenvalues_(self):
178 | """Returns the eigenvalues associated with each principal component."""
179 | return np.square(self.svd_.s)
180 |
181 | def _scale(self, X):
182 | if not hasattr(self, "scaler_"):
183 | return X
184 |
185 | if sup_variables := X.columns.difference(self.feature_names_in_, sort=False).tolist():
186 | X = np.concatenate(
187 | (
188 | self.scaler_.transform(X[self.feature_names_in_].to_numpy()),
189 | preprocessing.StandardScaler(
190 | copy=self.copy,
191 | with_mean=self.rescale_with_mean,
192 | with_std=self.rescale_with_std,
193 | ).fit_transform(X[sup_variables]),
194 | ),
195 | axis=1,
196 | )
197 | else:
198 | X = self.scaler_.transform(X.to_numpy())
199 |
200 | return X
201 |
202 | @utils.check_is_dataframe_input
203 | @utils.check_is_fitted
204 | @select_active_variables
205 | def row_coordinates(self, X: pd.DataFrame):
206 | """Returns the row principal coordinates.
207 |
208 | The row principal coordinates are obtained by projecting `X` on the right eigenvectors.
209 |
210 | Synonyms
211 | --------
212 | Row projections
213 | Factor scores
214 | Loadings
215 |
216 | """
217 |
218 | index = X.index if isinstance(X, pd.DataFrame) else None
219 | X = self._scale(X)
220 | X = np.array(X, copy=self.copy)
221 | X *= self.column_weight_
222 |
223 | coord = pd.DataFrame(data=X.dot(self.svd_.V.T), index=index)
224 | coord.columns.name = "component"
225 | return coord
226 |
227 | @utils.check_is_dataframe_input
228 | @utils.check_is_fitted
229 | def transform(self, X, as_array=False):
230 | """Computes the row principal coordinates of a dataset.
231 |
232 | Same as calling `row_coordinates`. This is just for compatibility with
233 | scikit-learn.
234 |
235 | """
236 | self._check_input(X)
237 | rc = self.row_coordinates(X)
238 | return rc.to_numpy() if as_array else rc
239 |
240 | @utils.check_is_dataframe_input
241 | def fit_transform(self, X, y=None, as_array=False):
242 | """A faster way to fit/transform.
243 |
244 |         This method produces exactly the same result as calling `fit(X)` followed
245 | by `transform(X)`. It is however much faster, as it avoids a matrix multiplication
246 | between the input data and the right eigenvectors. The row coordinates are instead obtained
247 | directly from the left eigenvectors.
248 |
249 | """
250 | self._check_input(X)
251 | self.fit(X)
252 | rc = self.row_coordinates(X)
253 | return rc.to_numpy() if as_array else rc
254 |
255 | @utils.check_is_dataframe_input
256 | @utils.check_is_fitted
257 | def inverse_transform(self, X, as_array=False):
258 | """Transforms row projections back to their original space.
259 |
260 | In other words, return a dataset whose transform would be X.
261 |
262 | """
263 |
264 | X_inv = np.dot(X, self.svd_.V)
265 |
266 | if hasattr(self, "scaler_"):
267 | X_inv = self.scaler_.inverse_transform(X_inv)
268 |
269 | if as_array:
270 | return X_inv
271 |
272 | # Extract index
273 | index = X.index if isinstance(X, pd.DataFrame) else None
274 | return pd.DataFrame(data=X_inv, index=index)
275 |
276 | @utils.check_is_dataframe_input
277 | @utils.check_is_fitted
278 | def row_standard_coordinates(self, X: pd.DataFrame = None):
279 | """Returns the row standard coordinates.
280 |
281 |         The row standard coordinates are obtained by dividing each row principal coordinate by its
282 | associated eigenvalue.
283 |
284 | """
285 | return self.row_coordinates(X).div(self.eigenvalues_, axis="columns")
286 |
287 | @utils.check_is_dataframe_input
288 | @utils.check_is_fitted
289 | @select_active_variables
290 | def row_cosine_similarities(self, X):
291 | """Returns the cosine similarities between the rows and their principal components.
292 |
293 | The row cosine similarities are obtained by calculating the cosine of the angle shaped by
294 | the row principal coordinates and the row principal components. This is calculated by
295 | squaring each row projection coordinate and dividing each squared coordinate by the sum of
296 | the squared coordinates, which results in a ratio comprised between 0 and 1 representing
297 | the squared cosine.
298 |
299 | """
300 | squared_coordinates = (np.square(self._scale(X)) * self.column_weight_).sum(axis=1)
301 | return (self.row_coordinates(X) ** 2).div(squared_coordinates, axis=0)
302 |
303 | @property
304 | @utils.check_is_fitted
305 | def column_correlations(self):
306 | """Calculate correlations between variables and components.
307 |
308 | The correlation between a variable and a component estimates the information they share. In
309 | the PCA framework, this correlation is called a loading.
310 |
311 | Note that the sum of the squared coefficients of correlation between a variable and all the
312 | components is equal to 1. As a consequence, the squared loadings are easier to interpret
313 | than the loadings (because the squared loadings give the proportion of the variance of the
314 | variables explained by the components).
315 |
316 | """
317 | return self.column_coordinates_.div(self._column_dist**0.5, axis=0)
318 |
319 | @property
320 | @utils.check_is_fitted
321 | def column_cosine_similarities_(self):
322 | return self.column_correlations**2
323 |
324 | @property
325 | @utils.check_is_fitted
326 | def column_contributions_(self):
327 | return (
328 | ((self.column_coordinates_.loc[self.feature_names_in_]) ** 2)
329 | * self.column_weight_[:, np.newaxis]
330 | ).div(self.eigenvalues_, axis=1)
331 |
332 | @utils.check_is_dataframe_input
333 | @utils.check_is_fitted
334 | def plot(
335 | self,
336 | X,
337 | x_component=0,
338 | y_component=1,
339 | color_rows_by=None,
340 | show_row_markers=True,
341 | show_column_markers=True,
342 | show_row_labels=False,
343 | show_column_labels=False,
344 | row_labels_column=None,
345 | ):
346 | row_params = {
347 | "tooltip": (
348 | X.index.names
349 | if isinstance(X.index, pd.MultiIndex)
350 | else [X.index.name or "index"] # index is the default name
351 | )
352 | + [
353 | f"component {x_component}",
354 | f"component {y_component}",
355 | ]
356 | }
357 | if color_rows_by:
358 | row_params["color"] = color_rows_by
359 |
360 | eig = self._eigenvalues_summary.to_dict(orient="index")
361 |
362 | row_chart_markers = None
363 | row_chart_labels = None
364 | column_chart_markers = None
365 | column_chart_labels = None
366 |
367 | if show_row_markers or show_row_labels:
368 | row_coords = self.row_coordinates(X)
369 | row_coords.columns = [f"component {i}" for i in row_coords.columns]
370 | row_labels = (
371 | pd.Series(
372 | row_coords.index.get_level_values(
373 | row_labels_column or row_coords.index.names[0]
374 | ),
375 | index=row_coords.index,
376 | )
377 | if isinstance(row_coords.index, pd.MultiIndex)
378 | else pd.Series(row_coords.index, index=row_coords.index)
379 | )
380 |
381 | row_chart = alt.Chart(row_coords.assign(label=row_labels).reset_index()).encode(
382 | alt.X(
383 | f"component {x_component}",
384 | scale=alt.Scale(zero=False),
385 | axis=alt.Axis(
386 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}"
387 | ),
388 | ),
389 | alt.Y(
390 | f"component {y_component}",
391 | scale=alt.Scale(zero=False),
392 | axis=alt.Axis(
393 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}"
394 | ),
395 | ),
396 | **row_params,
397 | )
398 | row_chart_markers = row_chart.mark_circle(size=50 if show_row_markers else 0)
399 | if show_row_labels:
400 | row_chart_labels = row_chart.mark_text().encode(text="label:N")
401 |
402 | if show_column_markers or show_column_labels:
403 | column_coords = self.column_coordinates_.copy()
404 | column_coords.columns = [f"component {i}" for i in column_coords.columns]
405 | # Scale the column coordinates to the row coordinates
406 | column_coords = column_coords * row_coords.abs().max()
407 | column_labels = pd.Series(column_coords.index, index=column_coords.index)
408 |
409 | column_chart = alt.Chart(
410 | column_coords.assign(label=column_labels).reset_index()
411 | ).encode(
412 | alt.X(f"component {x_component}", scale=alt.Scale(zero=False)),
413 | alt.Y(f"component {y_component}", scale=alt.Scale(zero=False)),
414 | tooltip=["variable"],
415 | )
416 | column_chart_markers = column_chart.mark_square(
417 | color="green", size=50 if show_column_markers else 0
418 | )
419 | if show_column_labels:
420 | column_chart_labels = column_chart.mark_text().encode(text="label:N")
421 |
422 | charts = filter(
423 | None,
424 | (
425 | row_chart_markers,
426 | row_chart_labels,
427 | column_chart_markers,
428 | column_chart_labels,
429 | ),
430 | )
431 |
432 | return alt.layer(*charts).interactive()
433 |
--------------------------------------------------------------------------------
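[Usage note for prince/pca.py above, not part of the repository: a minimal sketch of the PCA class on a small made-up numeric DataFrame.]

    import pandas as pd
    import prince

    df = pd.DataFrame({
        "x": [1.0, 2.0, 3.0, 4.0, 5.0],
        "y": [2.0, 1.0, 4.0, 3.0, 5.0],
        "z": [0.5, 0.9, 0.1, 0.7, 0.3],
    })
    pca = prince.PCA(n_components=2, random_state=42)
    pca = pca.fit(df)
    pca.eigenvalues_summary        # formatted eigenvalues and % of variance
    coords = pca.transform(df)     # row principal coordinates, same as pca.row_coordinates(df)
    pca.column_correlations        # correlations (loadings) between variables and components
    chart = pca.plot(df)           # interactive Altair chart of rows and columns
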
/prince/plot.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from collections import OrderedDict
4 |
5 | import numpy as np
6 | from scipy import linalg
7 |
8 | GRAY = OrderedDict([("light", "#bababa"), ("dark", "#404040")])
9 |
10 |
11 | def stylize_axis(ax, grid=True):
12 | if grid:
13 | ax.grid()
14 |
15 | ax.xaxis.set_ticks_position("none")
16 | ax.yaxis.set_ticks_position("none")
17 |
18 | ax.axhline(y=0, linestyle="-", linewidth=1.2, color=GRAY["dark"], alpha=0.6)
19 | ax.axvline(x=0, linestyle="-", linewidth=1.2, color=GRAY["dark"], alpha=0.6)
20 |
21 | return ax
22 |
23 |
24 | def build_ellipse(X, Y):
25 | """Construct ellipse coordinates from two arrays of numbers.
26 |
27 | Args:
28 | X (1D array_like)
29 | Y (1D array_like)
30 |
31 | Returns:
32 | float: The mean of `X`.
33 | float: The mean of `Y`.
34 | float: The width of the ellipse.
35 | float: The height of the ellipse.
36 | float: The angle of orientation of the ellipse.
37 |
38 | """
39 | x_mean = np.mean(X)
40 | y_mean = np.mean(Y)
41 |
42 | cov_matrix = np.cov(np.vstack((X, Y)))
43 | U, s, V = linalg.svd(cov_matrix, full_matrices=False)
44 |
45 |     chi_95 = np.sqrt(4.61)  # square root of the 90% quantile of the chi-square distribution with 2 degrees of freedom
46 | width = np.sqrt(cov_matrix[0][0]) * chi_95 * 2
47 | height = np.sqrt(cov_matrix[1][1]) * chi_95 * 2
48 |
49 | eigenvector = V.T[0]
50 | angle = np.arctan(eigenvector[1] / eigenvector[0])
51 |
52 | return x_mean, y_mean, width, height, angle
53 |
--------------------------------------------------------------------------------
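[Usage note for prince/plot.py above, not part of the repository: a minimal sketch of `build_ellipse` on made-up correlated data; note the returned angle is in radians.]

    import numpy as np
    from prince import plot

    rng = np.random.default_rng(0)
    X = rng.normal(size=200)
    Y = 0.6 * X + rng.normal(scale=0.4, size=200)
    x_mean, y_mean, width, height, angle = plot.build_ellipse(X, Y)
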
/prince/svd.py:
--------------------------------------------------------------------------------
1 | """Singular Value Decomposition (SVD)"""
2 |
3 | from __future__ import annotations
4 |
5 | import dataclasses
6 |
7 | try:
8 | import fbpca
9 |
10 | FBPCA_INSTALLED = True
11 | except ImportError:
12 | FBPCA_INSTALLED = False
13 | import numpy as np
14 | import scipy
15 | from sklearn.utils import extmath
16 |
17 |
18 | @dataclasses.dataclass
19 | class SVD:
20 | U: np.ndarray
21 | s: np.ndarray
22 | V: np.ndarray
23 |
24 |
25 | def compute_svd(
26 | X: np.ndarray,
27 | n_components: int,
28 | n_iter: int,
29 | engine: str,
30 | random_state: int | None = None,
31 | row_weights: np.ndarray | None = None,
32 | column_weights: np.ndarray | None = None,
33 | ) -> SVD:
34 | """Computes an SVD with k components."""
35 |
36 | if row_weights is not None:
37 | X = X * np.sqrt(row_weights[:, np.newaxis]) # row-wise scaling
38 | if column_weights is not None:
39 | X = X * np.sqrt(column_weights)
40 |
41 | # Compute the SVD
42 | if engine == "fbpca":
43 | if FBPCA_INSTALLED:
44 | U, s, V = fbpca.pca(X, k=n_components, n_iter=n_iter)
45 | else:
46 | raise ValueError("fbpca is not installed; please install it if you want to use it")
47 | elif engine == "scipy":
48 | U, s, V = scipy.linalg.svd(X)
49 | U = U[:, :n_components]
50 | s = s[:n_components]
51 | V = V[:n_components, :]
52 | elif engine == "sklearn":
53 | U, s, V = extmath.randomized_svd(
54 | X, n_components=n_components, n_iter=n_iter, random_state=random_state
55 | )
56 | else:
57 | raise ValueError("engine has to be one of ('fbpca', 'scipy', 'sklearn')")
58 |
59 | # U, V = extmath.svd_flip(U, V)
60 |
61 | if row_weights is not None:
62 | U = U / np.sqrt(row_weights)[:, np.newaxis] # row-wise scaling
63 | if column_weights is not None:
64 | V = V / np.sqrt(column_weights)
65 |
66 | return SVD(U, s, V)
67 |
--------------------------------------------------------------------------------
/prince/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import functools
4 |
5 | import altair as alt
6 | import numpy as np
7 | import pandas as pd
8 | from sklearn.utils import validation
9 |
10 |
11 | def check_is_fitted(method):
12 | @functools.wraps(method)
13 | def _impl(self, *method_args, **method_kwargs):
14 | validation.check_is_fitted(self)
15 | return method(self, *method_args, **method_kwargs)
16 |
17 | return _impl
18 |
19 |
20 | def check_is_dataframe_input(func):
21 | @functools.wraps(func)
22 | def wrapper(*args, **kwargs):
23 |         X = args[1]  # args[0] is 'self', so X is the first positional argument after it
24 | if not isinstance(X, pd.DataFrame):
25 | raise ValueError(
26 | f"The X argument must be a pandas DataFrame, but got {type(X).__name__}"
27 | )
28 | return func(*args, **kwargs)
29 |
30 | return wrapper
31 |
32 |
33 | def make_labels_and_names(X):
34 | if isinstance(X, pd.DataFrame):
35 | row_label = X.index.name if X.index.name else "Rows"
36 | row_names = X.index.tolist()
37 | col_label = X.columns.name if X.columns.name else "Columns"
38 | col_names = X.columns.tolist()
39 | else:
40 | row_label = "Rows"
41 | row_names = list(range(X.shape[0]))
42 | col_label = "Columns"
43 | col_names = list(range(X.shape[1]))
44 |
45 | return row_label, row_names, col_label, col_names
46 |
47 |
48 | class EigenvaluesMixin:
49 | @property
50 | @check_is_fitted
51 | def percentage_of_variance_(self):
52 | """Returns the percentage of explained inertia per principal component."""
53 | return 100 * self.eigenvalues_ / self.total_inertia_
54 |
55 | @property
56 | @check_is_fitted
57 | def cumulative_percentage_of_variance_(self):
58 |         """Returns the cumulative percentage of explained inertia per principal component."""
59 | return np.cumsum(self.percentage_of_variance_)
60 |
61 | @property
62 | @check_is_fitted
63 | def _eigenvalues_summary(self):
64 | """Return a summary of the eigenvalues and their importance."""
65 | return pd.DataFrame(
66 | {
67 | "eigenvalue": self.eigenvalues_,
68 | r"% of variance": self.percentage_of_variance_,
69 | r"% of variance (cumulative)": self.cumulative_percentage_of_variance_,
70 | },
71 | index=pd.RangeIndex(0, len(self.eigenvalues_), name="component"),
72 | )
73 |
74 | @property
75 | def eigenvalues_summary(self):
76 | """Return a summary of the eigenvalues and their importance."""
77 | summary = self._eigenvalues_summary
78 | summary["% of variance"] /= 100
79 | summary["% of variance (cumulative)"] /= 100
80 | summary["eigenvalue"] = summary["eigenvalue"].map("{:,.3f}".format)
81 | summary["% of variance"] = summary["% of variance"].map("{:.2%}".format)
82 | summary["% of variance (cumulative)"] = summary["% of variance (cumulative)"].map(
83 | "{:.2%}".format
84 | )
85 | summary.index.name = "component"
86 | return summary
87 |
88 | def scree_plot(self):
89 | """Scree plot.
90 |
91 | References
92 | ----------
93 | https://en.wikipedia.org/wiki/Scree_plot
94 |
95 | """
96 | eig = self._eigenvalues_summary.reset_index()
97 | eig["component"] = eig["component"].astype(str)
98 | return (
99 | alt.Chart(
100 | self._eigenvalues_summary.reset_index().assign(
101 | component=lambda x: x["component"].astype(str)
102 | )
103 | )
104 | .mark_bar(size=10)
105 | .encode(x="component", y="eigenvalue", tooltip=eig.columns.tolist())
106 | )
107 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "prince"
3 | version = "0.16.0"
4 | description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA"
5 | authors = ["Max Halford "]
6 | license = "MIT"
7 |
8 | [tool.poetry.dependencies]
9 | python = ">=3.10,<4.0"
10 | scikit-learn = "^1.5.1"
11 | pandas = "^2.2.0"
12 | altair = "^5.0.0"
13 |
14 | [tool.poetry.group.dev.dependencies]
15 | nbconvert = "^7.16.5"
16 | fbpca = "^1.0"
17 | pytest = "^8.3.4"
18 | ipykernel = "^6.13.0"
19 | rpy2 = "^3.5.2"
20 | ruff = "^0.8.5"
21 | xarray = "^2025.1.0"
22 | pre-commit = "^4.0.1"
23 |
24 | [tool.ruff]
25 | lint.select = ["E", "F", "I", "UP"] # https://beta.ruff.rs/docs/rules/
26 | line-length = 100
27 | target-version = 'py310'
28 | lint.ignore = ["E501"]
29 |
30 | [tool.ruff.lint.isort]
31 | required-imports = ["from __future__ import annotations"]
32 |
33 | [build-system]
34 | requires = ["poetry-core>=1.0.0"]
35 | build-backend = "poetry.core.masonry.api"
36 |
37 | [tool.pytest.ini_options]
38 | addopts = [
39 | "--verbose",
40 | "--doctest-modules",
41 | "--doctest-glob=*.md"
42 | ]
43 | doctest_optionflags = "NORMALIZE_WHITESPACE NUMBER ELLIPSIS"
44 |
--------------------------------------------------------------------------------
/tests/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: prince-test
2 | Version: 0.0.0.1
3 | Title: Test dependencies
4 | Imports:
5 | FactoMineR
6 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import rpy2.rinterface_lib
6 | from rpy2.robjects import r as R
7 |
8 |
9 | def load_df_from_R(code):
10 | df = R(code)
11 | if isinstance(df.names, rpy2.rinterface_lib.sexp.NULLType):
12 | return pd.DataFrame(np.array(df))
13 | return pd.DataFrame(np.array(df), index=df.names[0], columns=df.names[1])
14 |
--------------------------------------------------------------------------------
/tests/test_ca.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import math
4 | import tempfile
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import pytest
9 | import rpy2.robjects as robjects
10 | import sklearn.utils.estimator_checks
11 | import sklearn.utils.validation
12 | from rpy2.robjects import r as R
13 | from scipy import sparse
14 |
15 | import prince
16 | from tests import load_df_from_R
17 |
18 |
19 | @pytest.mark.parametrize(
20 | "sup_rows, sup_cols",
21 | [
22 | pytest.param(
23 | sup_rows,
24 | sup_cols,
25 | id=":".join(["sup_rows" if sup_rows else "", "sup_cols" if sup_cols else ""]).strip(
26 | ":"
27 | ),
28 | )
29 | for sup_rows in [False, True]
30 | for sup_cols in [False, True]
31 | ],
32 | )
33 | class TestCA:
34 | _row_name = "row"
35 | _col_name = "col"
36 |
37 | @pytest.fixture(autouse=True)
38 | def _prepare(self, sup_rows, sup_cols):
39 | self.sup_rows = sup_rows
40 | self.sup_cols = sup_cols
41 |
42 | n_components = 5
43 |
44 | # Fit Prince
45 | self.dataset = prince.datasets.load_french_elections()
46 | active = self.dataset.copy()
47 | if sup_rows:
48 | active = active.drop("Île-de-France")
49 | if self.sup_cols:
50 | active = active.drop(columns=["Abstention", "Blank"])
51 | self.ca = prince.CA(n_components=n_components)
52 | self.ca.fit(active)
53 |
54 | # Fit FactoMineR
55 | R("library('FactoMineR')")
56 | with tempfile.NamedTemporaryFile() as fp:
57 | self.dataset.to_csv(fp)
58 | R(f"dataset <- read.csv('{fp.name}', row.names=1)")
59 |
60 | args = f"dataset, ncp={n_components}, graph=F"
61 | if self.sup_cols:
62 | if sup_rows:
63 | R(f"ca <- CA({args}, col.sup=c(13, 14), row.sup=c(18))")
64 | else:
65 | R(f"ca <- CA({args}, col.sup=c(13, 14))")
66 | else:
67 | if sup_rows:
68 | R(f"ca <- CA({args}, row.sup=c(18))")
69 | else:
70 | R(f"ca <- CA({args})")
71 |
72 | def test_check_is_fitted(self):
73 | assert isinstance(self.ca, prince.CA)
74 | sklearn.utils.validation.check_is_fitted(self.ca)
75 |
76 | def test_svd_U(self):
77 | F = load_df_from_R("ca$svd$U").to_numpy()
78 | P = sparse.diags(self.ca.row_masses_.to_numpy() ** -0.5) @ self.ca.svd_.U
79 | np.testing.assert_allclose(np.abs(F), np.abs(P))
80 |
81 | def test_svd_V(self):
82 | F = load_df_from_R("ca$svd$V").to_numpy()
83 | P = sparse.diags(self.ca.col_masses_.to_numpy() ** -0.5) @ self.ca.svd_.V.T
84 | np.testing.assert_allclose(np.abs(F), np.abs(P))
85 |
86 | def test_total_inertia(self):
87 | F = robjects.r("sum(ca$eig[,1])")[0]
88 | P = self.ca.total_inertia_
89 | assert math.isclose(F, P)
90 |
91 | def test_eigenvalues(self):
92 | F = load_df_from_R("ca$eig")[: self.ca.n_components]
93 | P = self.ca._eigenvalues_summary
94 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"])
95 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"])
96 | np.testing.assert_allclose(
97 | F["cumulative percentage of variance"], P["% of variance (cumulative)"]
98 | )
99 |
100 | def test_row_coords(self, method_name="row_coordinates"):
101 | F = load_df_from_R(f"ca${self._row_name}$coord")
102 | if self.sup_rows:
103 | F = pd.concat((F, load_df_from_R(f"ca${self._row_name}.sup$coord")))
104 |
105 | method = getattr(self.ca, method_name)
106 | P = method(self.dataset)
107 |
108 | np.testing.assert_allclose(F.abs(), P.abs())
109 |
110 | def test_row_contrib(self):
111 | F = load_df_from_R(f"ca${self._row_name}$contrib")
112 | P = self.ca.row_contributions_
113 | np.testing.assert_allclose(F, P * 100)
114 |
115 | def test_row_cosine_similarities(self):
116 | F = load_df_from_R(f"ca${self._row_name}$cos2")
117 | if self.sup_rows:
118 | F = pd.concat((F, load_df_from_R(f"ca${self._row_name}.sup$cos2")))
119 | P = self.ca.row_cosine_similarities(self.dataset)
120 | np.testing.assert_allclose(F, P)
121 |
122 | def test_col_coords(self):
123 | F = load_df_from_R(f"ca${self._col_name}$coord")
124 | if self.sup_cols:
125 | F = pd.concat((F, load_df_from_R(f"ca${self._col_name}.sup$coord")))
126 | P = self.ca.column_coordinates(self.dataset)
127 | np.testing.assert_allclose(F.abs(), P.abs())
128 |
129 | def test_col_contrib(self):
130 | F = load_df_from_R(f"ca${self._col_name}$contrib")
131 | P = self.ca.column_contributions_
132 | np.testing.assert_allclose(F, P * 100)
133 |
134 | def test_col_cos2(self):
135 | F = load_df_from_R(f"ca${self._col_name}$cos2")
136 | if self.sup_cols:
137 | F = pd.concat((F, load_df_from_R(f"ca${self._col_name}.sup$cos2")))
138 | P = self.ca.column_cosine_similarities(self.dataset)
139 | np.testing.assert_allclose(F, P)
140 |
--------------------------------------------------------------------------------
/tests/test_famd.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 |
5 | import numpy as np
6 | import pytest
7 | import sklearn.utils.estimator_checks
8 | import sklearn.utils.validation
9 | from rpy2.robjects import r as R
10 |
11 | import prince
12 | from tests import load_df_from_R
13 |
14 |
15 | @pytest.mark.parametrize(
16 | "sup_rows, sup_cols",
17 | [
18 | pytest.param(
19 | sup_rows,
20 | sup_cols,
21 | id=":".join(["sup_rows" if sup_rows else "", "sup_cols" if sup_cols else ""]).strip(
22 | ":"
23 | ),
24 | )
25 | for sup_rows in [False]
26 | for sup_cols in [False]
27 | ],
28 | )
29 | class TestFAMD:
30 | _row_name = "row"
31 | _col_name = "col"
32 |
33 | @pytest.fixture(autouse=True)
34 | def _prepare(self, sup_rows, sup_cols):
35 | self.sup_rows = sup_rows
36 | self.sup_cols = sup_cols
37 |
38 | n_components = 5
39 |
40 | # Fit Prince
41 | self.dataset = prince.datasets.load_beers().head(200)
42 | active = self.dataset.copy()
43 | self.famd = prince.FAMD(n_components=n_components, engine="scipy")
44 | self.famd.fit(active)
45 |
46 | # Fit FactoMineR
47 | R("library('FactoMineR')")
48 | with tempfile.NamedTemporaryFile() as fp:
49 | self.dataset.to_csv(fp)
50 | R(f"dataset <- read.csv('{fp.name}', row.names=c(1))")
51 | R("famd <- FAMD(dataset, graph=F)")
52 |
53 | def test_check_is_fitted(self):
54 | assert isinstance(self.famd, prince.FAMD)
55 | sklearn.utils.validation.check_is_fitted(self.famd)
56 |
57 | def test_num_cols(self):
58 | assert sorted(self.famd.num_cols_) == [
59 | "alcohol_by_volume",
60 | "final_gravity",
61 | "international_bitterness_units",
62 | "standard_reference_method",
63 | ]
64 |
65 | def test_cat_cols(self):
66 | assert sorted(self.famd.cat_cols_) == ["is_organic", "style"]
67 |
68 | def test_eigenvalues(self):
69 | F = load_df_from_R("famd$eig")[: self.famd.n_components]
70 | P = self.famd._eigenvalues_summary
71 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"])
72 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"])
73 | np.testing.assert_allclose(
74 | F["cumulative percentage of variance"], P["% of variance (cumulative)"]
75 | )
76 |
77 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform"))
78 | def test_row_coords(self, method_name):
79 | method = getattr(self.famd, method_name)
80 | F = load_df_from_R("famd$ind$coord")
81 | P = method(self.dataset)
82 | np.testing.assert_allclose(F.abs(), P.abs())
83 |
84 | def test_row_contrib(self):
85 | F = load_df_from_R("famd$ind$contrib")
86 | P = self.famd.row_contributions_
87 | np.testing.assert_allclose(F, P * 100)
88 |
89 | def test_col_coords(self):
90 | F = load_df_from_R("famd$var$coord")
91 | P = self.famd.column_coordinates_
92 | np.testing.assert_allclose(F.abs(), P.abs())
93 |
94 | def test_col_contrib(self):
95 | F = load_df_from_R("famd$var$contrib")
96 | P = self.famd.column_contributions_
97 | np.testing.assert_allclose(F, P * 100)
98 |
99 |
100 | def test_issue_169():
101 | """
102 |
103 | https://github.com/MaxHalford/prince/issues/169
104 |
105 | >>> import pandas as pd
106 | >>> from prince import FAMD
107 | >>> df = pd.DataFrame({'var1':['c', 'a', 'b','c'], 'var2':['x','y','y','z'],'var2': [0.,10.,30.4,0.]})
108 |
109 | >>> famd = FAMD(n_components=2, random_state=42)
110 | >>> famd = famd.fit(df[:3])
111 |
112 | >>> famd.transform(df[0:3])
113 | component 0 1
114 | 0 -1.303760 -0.658334
115 | 1 -0.335621 0.981047
116 | 2 1.639381 -0.322713
117 |
118 | >>> famd.transform(df[0:2])
119 | component 0 1
120 | 0 -1.000920 -0.669274
121 | 1 -0.092001 0.669274
122 |
123 | >>> famd.transform(df[3:]).round(6)
124 | component 0 1
125 | 3 -0.869173 -0.0
126 |
127 | """
128 |
--------------------------------------------------------------------------------
/tests/test_gpa.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import unittest
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 | import prince
9 |
10 |
11 | class TestGPA(unittest.TestCase):
12 | def setUp(self):
13 | # Create a list of 2-D circles with different locations and rotations
14 | n_shapes = 4
15 | n_points = 12
16 | n_dims = 2
17 |
18 | shape_sizes = np.arange(1, n_shapes + 1)
19 | shape_angle_offsets = 10 * np.arange(n_shapes)
20 | shape_center_offsets = np.tile(np.arange(n_shapes), (n_dims, 1))
21 |
22 | base_angles = np.linspace(0, 2 * np.pi, num=n_points, endpoint=False)
23 | # Size (n_shapes, n_points)
24 | angles = base_angles[np.newaxis, :] + shape_angle_offsets[:, np.newaxis]
25 |
26 | # Calculate along dimensions
27 | x = np.cos(angles) * shape_sizes[:, np.newaxis] + shape_center_offsets[0][:, np.newaxis]
28 | y = np.sin(angles) * shape_sizes[:, np.newaxis] + shape_center_offsets[1][:, np.newaxis]
29 |
30 | self.shapes = np.stack([x, y], axis=-1)
31 |
32 | def test_fit(self):
33 | gpa = prince.GPA()
34 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA)
35 |
36 | def test_fit_random(self):
37 | gpa = prince.GPA(init="random")
38 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA)
39 |
40 | def test_fit_mean(self):
41 | gpa = prince.GPA(init="mean")
42 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA)
43 |
44 | def test_fit_bad_init(self):
45 | gpa = prince.GPA(init="bad init type")
46 |
47 | with self.assertRaises(ValueError):
48 | gpa.fit(self.shapes)
49 |
50 | def test_fit_bad_input_size(self):
51 | gpa = prince.GPA()
52 |
53 | with self.assertRaises(ValueError):
54 | gpa.fit(self.shapes[0])
55 |
56 | def test_transform(self):
57 | gpa = prince.GPA(copy=True)
58 | aligned_shapes = gpa.fit(self.shapes).transform(self.shapes)
59 | self.assertIsInstance(aligned_shapes, np.ndarray)
60 | self.assertEqual(self.shapes.shape, aligned_shapes.shape)
61 |
62 | def test_fit_transform_equal(self):
63 | """In our specific case of all-same-shape circles, the shapes should
64 | align perfectly."""
65 | gpa = prince.GPA()
66 | aligned_shapes = gpa.fit_transform(self.shapes)
67 | self.assertIsInstance(aligned_shapes, np.ndarray)
68 | np.testing.assert_array_almost_equal(aligned_shapes[:-1], aligned_shapes[1:])
69 |
70 | def test_fit_transform_single(self):
71 | """Aligning a single shape should return the same shape, just normalized."""
72 | gpa = prince.GPA()
73 | shapes = self.shapes[0:1]
74 | aligned_shapes = gpa.fit_transform(shapes)
75 | np.testing.assert_array_almost_equal(shapes / np.linalg.norm(shapes), aligned_shapes)
76 |
77 | def test_copy(self):
78 | shapes_copy = np.copy(self.shapes)
79 |
80 | gpa = prince.GPA(copy=True)
81 | gpa.fit(shapes_copy)
82 | np.testing.assert_array_equal(self.shapes, shapes_copy)
83 |
84 | gpa = prince.GPA(copy=False)
85 | gpa.fit(shapes_copy)
86 | self.assertRaises(AssertionError, np.testing.assert_array_equal, self.shapes, shapes_copy)
87 |
88 | def test_xarray(self):
89 | points = pd.DataFrame(
90 | data=[
91 | [0, 0, 0, 0],
92 | [0, 2, 0, 1],
93 | [1, 0, 0, 2],
94 | [3, 2, 1, 0],
95 | [1, 2, 1, 1],
96 | [3, 3, 1, 2],
97 | [0, 0, 2, 0],
98 | [0, 4, 2, 1],
99 | [2, 0, 2, 2],
100 | ],
101 | columns=["x", "y", "shape", "point"],
102 | ).astype({"x": float, "y": float})
103 |
104 | ds = points.set_index(["shape", "point"]).to_xarray()
105 | da = ds.to_stacked_array("xy", ["shape", "point"])
106 | shapes = da.values
107 |
108 | gpa = prince.GPA()
109 | aligned_shapes = gpa.fit_transform(shapes)
110 | da.values = aligned_shapes
111 | da.to_unstacked_dataset("xy").to_dataframe().reset_index()
112 |
--------------------------------------------------------------------------------
/tests/test_mca.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import pytest
8 | from rpy2.robjects import r as R
9 |
10 | import prince
11 | from tests import load_df_from_R
12 | from tests.test_ca import TestCA as _TestCA
13 |
14 |
15 | class TestMCA(_TestCA):
16 | _row_name = "ind"
17 | _col_name = "var"
18 |
19 | @pytest.fixture(autouse=True)
20 | def _prepare(self, sup_rows, sup_cols):
21 | self.sup_rows = sup_rows
22 | self.sup_cols = sup_cols
23 |
24 | n_components = 5
25 | n_active_rows = 1_000
26 |
27 | # Fit Prince
28 | self.dataset = prince.datasets.load_hearthstone_cards()
29 | active = self.dataset.copy()
30 | if self.sup_rows:
31 | active = active[:n_active_rows]
32 | if self.sup_cols:
33 | active = active.drop(columns=["type_or_school"])
34 | self.ca = prince.MCA(n_components=n_components, engine="scipy")
35 | self.ca.fit(active)
36 |
37 | # Fit FactoMineR
38 | R("library('FactoMineR')")
39 | with tempfile.NamedTemporaryFile() as fp:
40 | self.dataset.to_csv(fp)
41 | R(f"dataset <- read.csv('{fp.name}')[,-1]")
42 |
43 | args = f"dataset, ncp={n_components}, graph=F"
44 | if self.sup_cols:
45 | if self.sup_rows:
46 | R(
47 | f"ca <- MCA({args}, quali.sup=c(4), ind.sup=c({n_active_rows + 1}:nrow(dataset)))"
48 | )
49 | else:
50 | R(f"ca <- MCA({args}, quali.sup=c(4))")
51 | else:
52 | if self.sup_rows:
53 | R(f"ca <- MCA({args}, ind.sup=c({n_active_rows + 1}:nrow(dataset)))")
54 | else:
55 | R(f"ca <- MCA({args})")
56 |
57 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform"))
58 | def test_row_coords(self, method_name):
59 | super().test_row_coords(method_name=method_name)
60 |
61 | def test_col_coords(self):
62 | if self.sup_cols:
63 | F = load_df_from_R("ca$var$coord")
64 | if self.sup_cols:
65 | F = pd.concat((F, load_df_from_R("ca$quali.sup$coord")))
66 | P = self.ca.column_coordinates(self.dataset)
67 | # Prince adds a prefix to each column. We need to remove it in order to align the rows
68 | # of the two dataframes
69 | P.index = [idx.split("__", 1)[1] for idx in P.index]
70 | np.testing.assert_allclose(F.abs(), P.abs().loc[F.index])
71 | else:
72 | super().test_col_coords()
73 |
74 | def test_col_cos2(self):
75 | if self.sup_cols:
76 | F = load_df_from_R("ca$var$cos2")
77 | if self.sup_cols:
78 | F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2")))
79 | P = self.ca.column_cosine_similarities(self.dataset)
80 | # Prince adds a prefix to each column. We need to remove it in order to align the rows
81 | # of the two dataframes
82 | P.index = [idx.split("__", 1)[1] for idx in P.index]
83 | np.testing.assert_allclose(F, P.loc[F.index])
84 | else:
85 | super().test_col_cos2()
86 |
87 |
88 | def test_with_and_without_one_hot():
89 | """
90 |
91 | >>> df = pd.DataFrame({
92 | ... "foo": [1, 2, 3, 3, 5],
93 | ... "bar": ["a", "b", "c", "b", "e"],
94 | ... })
95 | >>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy")
96 | >>> mca = mca.fit(df)
97 | >>> coords = mca.transform(df)
98 | >>> assert coords.shape == (5, 2)
99 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP
100 | 0 1
101 | 0 0.00 2.0
102 | 1 0.65 0.5
103 | 2 0.65 0.5
104 | 3 0.65 0.5
105 | 4 1.94 0.5
106 |
107 | >>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy")
108 | >>> one_hot = pd.get_dummies(df, columns=['foo', 'bar'])
109 | >>> mca = mca.fit(one_hot)
110 | >>> coords = mca.transform(one_hot)
111 | >>> assert coords.shape == (5, 2)
112 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP
113 | 0 1
114 | 0 0.00 1.0
115 | 1 0.65 0.5
116 | 2 0.65 0.5
117 | 3 0.65 0.5
118 | 4 1.94 0.5
119 |
120 | """
121 |
122 |
123 | def test_issue_131():
124 | """
125 |
126 | https://github.com/MaxHalford/prince/issues/131#issuecomment-1591426031
127 |
128 | >>> df = pd.DataFrame({
129 | ... "foo": [1, 2, 3, 3, 5],
130 | ... "bar": ["a", "b", "c", "b", "e"],
131 | ... })
132 | >>> mca = prince.MCA(engine="scipy")
133 | >>> mca = mca.fit(df)
134 | >>> coords = mca.transform(df)
135 | >>> assert coords.shape == (5, 2)
136 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP
137 | 0 1
138 | 0 0.00 2.0
139 | 1 0.65 0.5
140 | 2 0.65 0.5
141 | 3 0.65 0.5
142 | 4 1.94 0.5
143 |
144 | >>> mca.K_, mca.J_
145 | (2, 8)
146 |
147 | """
148 |
149 |
150 | def test_issue_171():
151 | """
152 |
153 | https://github.com/MaxHalford/prince/issues/171
154 |
155 | >>> from sklearn import impute
156 | >>> from sklearn import pipeline
157 |
158 | >>> rng = np.random.RandomState(0)
159 | >>> test_data = pd.DataFrame(data=rng.random((10, 5)))
160 | >>> test = pipeline.Pipeline(steps=[
161 | ... ('impute', impute.SimpleImputer()), # would break the pipeline since it returns an ndarray
162 | ... ('mca', prince.PCA()),
163 | ... ])
164 | >>> _ = test[0].set_output(transform='pandas')
165 | >>> test.fit_transform(test_data)
166 | component 0 1
167 | 0 -0.392617 0.296831
168 | 1 0.119661 -1.660653
169 | 2 -1.541581 -0.826863
170 | 3 3.105498 -0.538801
171 | 4 -2.439259 -0.343292
172 | 5 1.129341 -0.533576
173 | 6 -1.077436 0.899673
174 | 7 0.020571 -0.941029
175 | 8 1.498005 1.566376
176 | 9 -0.422184 2.081334
177 |
178 | """
179 |
180 |
181 | def test_type_doesnt_matter():
182 | """
183 |
184 | Checks that the type of the columns doesn't affect the result.
185 |
186 | """
187 | outputs = []
188 | dataset = prince.datasets.load_hearthstone_cards().head(100)
189 | for col in dataset.columns:
190 | labels, levels = pd.factorize(dataset[col])
191 | dataset[col] = labels
192 | for typ in ("int", "float", "str", "category"):
193 | dataset = dataset.astype(typ)
194 | mca = prince.MCA(n_components=2, engine="scipy")
195 | mca = mca.fit(dataset)
196 | outputs.append(mca.transform(dataset).abs())
197 |
198 | for i in range(len(outputs) - 1):
199 | np.testing.assert_allclose(outputs[i], outputs[i + 1])
200 |
201 |
202 | issue_161_data = """
203 | ,category,userid,location,applicationname,browser\n
204 | 0,Portal Login,a@b.com,"San Jose, CA, United States",A,Chrome\n
205 | 1,Application Access,b@b.com,"San Jose, CA, United States",B,Other\n
206 | 2,Application Access,a@b.com,"San Jose, CA, United States",C,Other\n
207 | 3,Portal Login,c@b.com,"San Diego, CA, United States",A,Chrome\n
208 | """
209 |
210 |
211 | def test_issue_161():
212 | """
213 |
214 | https://github.com/MaxHalford/prince/issues/161
215 |
216 | >>> import io
217 | >>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0)
218 |
219 | >>> mca = prince.MCA(
220 | ... n_components=10,
221 | ... n_iter=3,
222 | ... copy=True,
223 | ... check_input=True,
224 | ... engine='sklearn',
225 | ... random_state=42
226 | ... )
227 | >>> mca = mca.fit(data[:3])
228 |
229 | >>> mca.eigenvalues_summary
230 | eigenvalue % of variance % of variance (cumulative)
231 | component
232 | 0 0.673 67.32% 67.32%
233 | 1 0.327 32.68% 100.00%
234 |
235 | >>> mca.row_coordinates(data[:3])
236 | 0 1
237 | 0 1.120811 -0.209242
238 | 1 -0.820491 -0.571660
239 | 2 -0.300320 0.780902
240 |
241 | >>> mca.transform(data[3:])
242 | 0 1
243 | 3 1.664888 -0.640285
244 |
245 | """
246 |
247 |
248 | def test_abdi_2007_correction():
249 | """
250 |
251 | >>> wines = prince.datasets.load_burgundy_wines()
252 | >>> wines = wines.drop(columns=["Oak type"], level=0)
253 |
254 | >>> mca = prince.MCA(n_components=4, correction=None)
255 | >>> mca = mca.fit(wines)
256 | >>> mca.eigenvalues_.round(4).tolist()
257 | [0.8532, 0.2, 0.1151, 0.0317]
258 | >>> mca.percentage_of_variance_.round(3).tolist()
259 | [71.101, 16.667, 9.593, 2.64]
260 |
261 | >>> mca = prince.MCA(n_components=4, correction="benzecri")
262 | >>> mca = mca.fit(wines)
263 | >>> mca.eigenvalues_.round(4).tolist()
264 | [0.7004, 0.0123, 0.0003, 0.0]
265 | >>> mca.percentage_of_variance_.round(3).tolist()
266 | [98.229, 1.731, 0.04, 0.0]
267 |
268 | >>> mca = prince.MCA(n_components=4, correction="greenacre")
269 | >>> mca = mca.fit(wines)
270 | >>> mca.eigenvalues_.round(4).tolist()
271 | [0.7004, 0.0123, 0.0003, 0.0]
272 | >>> mca.percentage_of_variance_.round(3).tolist()
273 | [95.189, 1.678, 0.038, 0.0]
274 |
275 | """
276 |
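277 |
278 | # The "benzecri" and "greenacre" corrections exercised above follow
279 | # Abdi & Valentin (2007). A minimal, self-contained sketch of the formulas,
280 | # assuming K nominal variables, J categories in total, and the raw
281 | # indicator-matrix eigenvalues; prince's own implementation may differ in detail.
282 | def _benzecri_greenacre_sketch(eigenvalues, K, J):
283 |     eigenvalues = np.asarray(eigenvalues)
284 |     # Benzécri: keep eigenvalues above the average 1 / K and rescale them
285 |     corrected = np.where(
286 |         eigenvalues > 1 / K, (K / (K - 1) * (eigenvalues - 1 / K)) ** 2, 0.0
287 |     )
288 |     # Benzécri percentages are taken relative to the sum of corrected eigenvalues
289 |     benzecri_pct = 100 * corrected / corrected.sum()
290 |     # Greenacre percentages are taken relative to the average inertia instead
291 |     average_inertia = K / (K - 1) * ((eigenvalues**2).sum() - (J - K) / K**2)
292 |     greenacre_pct = 100 * corrected / average_inertia
293 |     return corrected, benzecri_pct, greenacre_pct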
--------------------------------------------------------------------------------
/tests/test_mfa.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import math
4 | import tempfile
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import pytest
9 | import rpy2.robjects as robjects
10 | import sklearn.utils.estimator_checks
11 | import sklearn.utils.validation
12 | from rpy2.robjects import r as R
13 |
14 | import prince
15 | from tests import load_df_from_R
16 |
17 |
18 | @pytest.mark.parametrize(
19 | "sup_rows, sup_groups",
20 | [
21 | pytest.param(sup_rows, sup_groups, id=f"{sup_rows=}:{sup_groups=}")
22 | for sup_rows in [False, True]
23 | for sup_groups in [False, True]
24 | ],
25 | )
26 | class TestMFA:
27 | _row_name = "row"
28 | _col_name = "col"
29 |
30 | @pytest.fixture(autouse=True)
31 | def _prepare(self, sup_rows, sup_groups):
32 | self.sup_rows = sup_rows
33 | self.sup_groups = sup_groups
34 |
35 | n_components = 3
36 |
37 | # Fit Prince
38 | self.dataset = prince.datasets.load_premier_league()
39 | active = self.dataset.copy()
40 | if self.sup_rows:
41 | active = active.drop(index=["Manchester City", "Manchester United"])
42 | supplementary_groups = ["2023-24"] if self.sup_groups else []
43 | self.groups = self.dataset.columns.levels[0].tolist()
44 | self.mfa = prince.MFA(n_components=n_components)
45 | self.mfa.fit(active, groups=self.groups, supplementary_groups=supplementary_groups)
46 |
47 | # Fit FactoMineR
48 | R("library('FactoMineR')")
49 | with tempfile.NamedTemporaryFile() as fp:
50 | dataset = self.dataset.copy()
51 | dataset.columns = [" ".join(parts) for parts in dataset.columns]
52 | dataset.to_csv(fp, index=False)
53 | R(f"dataset <- read.csv('{fp.name}')")
54 |
55 | args = "dataset, group=c(6, 6, 6), graph=F"
56 | if self.sup_rows:
57 | args += ", ind.sup=c(9:10)"
58 | if self.sup_groups:
59 | args += ", num.group.sup=c(3)"
60 |
61 | R(f"mfa <- MFA({args})")
62 |
63 | def test_check_is_fitted(self):
64 | assert isinstance(self.mfa, prince.MFA)
65 | sklearn.utils.validation.check_is_fitted(self.mfa)
66 |
67 | def test_total_inertia(self):
68 | F = robjects.r("sum(mfa$eig[,1])")[0]
69 | P = self.mfa.total_inertia_
70 | assert math.isclose(F, P)
71 |
72 | def test_eigenvalues(self):
73 | F = load_df_from_R("mfa$eig")[: self.mfa.n_components]
74 | P = self.mfa._eigenvalues_summary
75 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"])
76 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"])
77 | np.testing.assert_allclose(
78 | F["cumulative percentage of variance"], P["% of variance (cumulative)"]
79 | )
80 |
81 | def test_group_eigenvalues(self):
82 | for i, group in enumerate(self.groups, start=1):
83 | F = load_df_from_R(f"mfa$separate.analyses$Gr{i}$eig")[: self.mfa.n_components]
84 | P = self.mfa[group]._eigenvalues_summary
85 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"])
86 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"])
87 | np.testing.assert_allclose(
88 | F["cumulative percentage of variance"], P["% of variance (cumulative)"]
89 | )
90 |
91 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform"))
92 | def test_row_coords(self, method_name):
93 | method = getattr(self.mfa, method_name)
94 | F = load_df_from_R("mfa$ind$coord")
95 | P = method(self.dataset)
96 | if self.sup_rows:
97 | F = pd.concat((F, load_df_from_R("mfa$ind.sup$coord")))
98 | # Move supplementary rows to the end
99 | P = pd.concat(
100 | [
101 | P.loc[P.index.difference(["Manchester City", "Manchester United"])],
102 | P.loc[["Manchester City", "Manchester United"]],
103 | ]
104 | )
105 | F = F.iloc[:, : self.mfa.n_components]
106 | np.testing.assert_allclose(F.abs(), P.abs())
107 |
108 | def test_row_contrib(self):
109 | F = load_df_from_R("mfa$ind$contrib").iloc[:, : self.mfa.n_components]
110 | P = self.mfa.row_contributions_
111 | np.testing.assert_allclose(F, P * 100)
112 |
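113 |
114 | # A minimal usage sketch of the grouped-columns API exercised by TestMFA, on a
115 | # made-up two-group dataset: the group names are read off the first level of the
116 | # column MultiIndex, as with load_premier_league. Illustrative only.
117 | def _mfa_usage_sketch():
118 |     rng = np.random.default_rng(0)
119 |     columns = pd.MultiIndex.from_product([["attack", "defense"], ["x", "y", "z"]])
120 |     toy = pd.DataFrame(rng.random((8, 6)), columns=columns)
121 |     mfa = prince.MFA(n_components=2).fit(toy, groups=toy.columns.levels[0].tolist())
122 |     # Global row coordinates, plus the per-group partial analysis via indexing
123 |     return mfa.row_coordinates(toy), mfa["attack"]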
--------------------------------------------------------------------------------
/tests/test_pca.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import math
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import pytest
8 | import rpy2.robjects as robjects
9 | import sklearn.utils.estimator_checks
10 | import sklearn.utils.validation
11 | from rpy2.robjects import numpy2ri
12 | from sklearn import decomposition, pipeline, preprocessing
13 |
14 | import prince
15 | from tests import load_df_from_R
16 |
17 |
18 | @pytest.mark.parametrize(
19 | "sup_rows, sup_cols, scale, sample_weights, column_weights",
20 | [
21 | pytest.param(
22 | sup_rows,
23 | sup_cols,
24 | scale,
25 | sample_weights,
26 | column_weights,
27 | id=f"{sup_rows=}:{sup_cols=}:{scale=}:{sample_weights=}:{column_weights=}",
28 | )
29 | for sup_rows in [False, True]
30 | for sup_cols in [False, True]
31 | for scale in [False, True]
32 | for sample_weights in [False, True]
33 | for column_weights in [False, True]
34 | ],
35 | )
36 | class TestPCA:
37 | @pytest.fixture(autouse=True)
38 | def _prepare(self, sup_rows, sup_cols, scale, sample_weights, column_weights):
39 | self.sup_rows = sup_rows
40 | self.sup_cols = sup_cols
41 | self.scale = scale
42 |
43 | n_components = 5
44 |
45 | # Fit Prince
46 | self.dataset = prince.datasets.load_decathlon()
47 | self.active = self.dataset.copy()
48 | if self.sup_rows:
49 | self.active = self.active.query('competition == "Decastar"')
50 | self.sample_weights = (
51 | np.random.default_rng().dirichlet([1] * len(self.active)) if sample_weights else None
52 | )
53 | supplementary_columns = ["rank", "points"] if self.sup_cols else []
54 | self.column_weights = (
55 | np.random.default_rng().random(
56 | len(self.active.columns.difference(supplementary_columns))
57 | )
58 | if column_weights
59 | else None
60 | )
61 | self.pca = prince.PCA(n_components=n_components, rescale_with_std=self.scale)
62 | self.pca.fit(
63 | self.active,
64 | sample_weight=self.sample_weights,
65 | column_weight=self.column_weights,
66 | supplementary_columns=supplementary_columns,
67 | )
68 |
69 | # scikit-learn
70 | if self.scale:
71 | self.sk_pca = pipeline.make_pipeline(
72 | preprocessing.StandardScaler(),
73 | decomposition.PCA(n_components=n_components),
74 | )
75 | else:
76 | self.sk_pca = pipeline.make_pipeline(
77 | decomposition.PCA(n_components=n_components),
78 | )
79 | # sklearn's PCA doesn't support sample weights
80 | self.sk_pca.fit(self.active[self.pca.feature_names_in_])
81 |
82 | # Fit FactoMineR
83 | robjects.r(
84 | """
85 | library('FactoMineR')
86 |
87 | data(decathlon)
88 | decathlon <- subset(decathlon, select = -c(Competition))
89 | """
90 | )
91 |
92 | args = f"decathlon, ncp={n_components}, graph=F"
93 | if sample_weights:
94 | robjects.r.assign("row.w", numpy2ri.py2rpy(self.sample_weights))
95 | robjects.r("row.w <- as.vector(row.w)")
96 | args += ", row.w=row.w"
97 | if column_weights:
98 | robjects.r.assign("col.w", numpy2ri.py2rpy(self.column_weights))
99 | robjects.r("col.w <- as.vector(col.w)")
100 | args += ", col.w=col.w"
101 | if not self.scale:
102 | args += ", scale.unit=F"
103 | if self.sup_cols:
104 | if self.sup_rows:
105 | robjects.r(f"pca = PCA({args}, quanti.sup=c(11, 12), ind.sup=c(14:41))")
106 | else:
107 | robjects.r(f"pca = PCA({args}, quanti.sup=c(11, 12))")
108 | else:
109 | if self.sup_rows:
110 | robjects.r(f"pca = PCA({args}, ind.sup=c(14:41))")
111 | else:
112 | robjects.r(f"pca = PCA({args})")
113 |
114 | def test_check_is_fitted(self):
115 | assert isinstance(self.pca, prince.PCA)
116 | sklearn.utils.validation.check_is_fitted(self.pca)
117 |
118 | def test_total_inertia(self):
119 | F = robjects.r("sum(pca$eig[,1])")[0]
120 | P = self.pca.total_inertia_
121 | assert math.isclose(F, P)
122 |
123 | def test_eigenvalues(self):
124 | P = self.pca._eigenvalues_summary
125 | # Test against FactoMineR
126 | F = load_df_from_R("pca$eig")[: self.pca.n_components]
127 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"])
128 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"])
129 | np.testing.assert_allclose(
130 | F["cumulative percentage of variance"], P["% of variance (cumulative)"]
131 | )
132 | # Test against scikit-learn
133 | if self.sample_weights is None and self.column_weights is None:
134 | n = len(self.active)
135 | S = self.sk_pca[-1].explained_variance_ * (n - 1) / n
136 | np.testing.assert_allclose(P["eigenvalue"], S)
137 | np.testing.assert_allclose(
138 | P["% of variance"], self.sk_pca[-1].explained_variance_ratio_ * 100
139 | )
140 |
141 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform"))
142 | def test_row_coords(self, method_name):
143 | method = getattr(self.pca, method_name)
144 | P = method(self.dataset)
145 |         # Test against FactoMineR
146 | F = load_df_from_R("pca$ind$coord")
147 | if self.sup_rows:
148 | F = pd.concat((F, load_df_from_R("pca$ind.sup$coord")))
149 | np.testing.assert_allclose(F.abs(), P.abs())
150 | # Test against scikit-learn
151 | if self.sample_weights is None and self.column_weights is None:
152 | S = self.sk_pca.transform(self.dataset[self.pca.feature_names_in_])
153 | np.testing.assert_allclose(np.abs(S), P.abs())
154 |
155 | def test_row_cosine_similarities(self):
156 | F = load_df_from_R("pca$ind$cos2")
157 | if self.sup_rows:
158 | F = pd.concat((F, load_df_from_R("pca$ind.sup$cos2")))
159 | P = self.pca.row_cosine_similarities(self.dataset)
160 | np.testing.assert_allclose(F, P)
161 |
162 | def test_row_contrib(self):
163 | F = load_df_from_R("pca$ind$contrib")
164 | P = self.pca.row_contributions_
165 | np.testing.assert_allclose(F, P * 100)
166 |
167 | def test_col_coords(self):
168 | F = load_df_from_R("pca$var$coord")
169 | P = self.pca.column_coordinates_
170 | if self.sup_cols:
171 | P = P.drop(["rank", "points"])
172 | np.testing.assert_allclose(F.abs(), P.abs())
173 |
174 | def test_col_cos2(self):
175 | F = load_df_from_R("pca$var$cos2")
176 | P = self.pca.column_cosine_similarities_
177 | if self.sup_cols:
178 | P = P.drop(["rank", "points"])
179 | np.testing.assert_allclose(F, P)
180 |
181 | def test_col_contrib(self):
182 | F = load_df_from_R("pca$var$contrib")
183 | P = self.pca.column_contributions_
184 | np.testing.assert_allclose(F, P * 100)
185 |
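186 |
187 | # A small standalone illustration of the (n - 1) / n factor used in
188 | # test_eigenvalues: prince's eigenvalues correspond to a population covariance
189 | # (divide by n), while scikit-learn's explained_variance_ divides by n - 1.
190 | # Toy data only; assumes prince.PCA exposes eigenvalues_ like prince.MCA does.
191 | def _eigenvalue_scaling_sketch():
192 |     rng = np.random.default_rng(0)
193 |     X = pd.DataFrame(rng.random((20, 4)))
194 |     n = len(X)
195 |     sk = decomposition.PCA(n_components=2).fit(X)
196 |     pr = prince.PCA(n_components=2, rescale_with_std=False).fit(X)
197 |     np.testing.assert_allclose(pr.eigenvalues_, sk.explained_variance_ * (n - 1) / n)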
--------------------------------------------------------------------------------
/tests/test_svd.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pytest
5 | import rpy2.robjects as robjects
6 | from rpy2.robjects import numpy2ri
7 |
8 | from prince import svd
9 | from tests import load_df_from_R
10 |
11 |
12 | @pytest.mark.parametrize(
13 | "n_components, are_rows_weighted, are_columns_weighted",
14 | [
15 | pytest.param(
16 | n_components,
17 | are_rows_weighted,
18 | are_columns_weighted,
19 | id=f"{n_components=}:{are_rows_weighted=}:{are_columns_weighted=}",
20 | )
21 | for n_components in [1, 3, 10]
22 | for are_rows_weighted in [False, True]
23 | for are_columns_weighted in [False, True]
24 | ],
25 | )
26 | class TestSVD:
27 | @pytest.fixture(autouse=True)
28 | def _prepare(self, n_components, are_rows_weighted, are_columns_weighted):
29 | self.n_components = n_components
30 | self.are_rows_weighted = are_rows_weighted
31 | self.are_columns_weighted = are_columns_weighted
32 |
33 | self.dataset = np.random.rand(100, 10)
34 | self.row_weights = np.random.rand(100)
35 | self.row_weights /= self.row_weights.sum()
36 | self.column_weights = np.random.rand(10)
37 |
38 | # Fit Prince
39 | self.svd = svd.compute_svd(
40 | X=self.dataset,
41 | row_weights=self.row_weights if are_rows_weighted else None,
42 | column_weights=self.column_weights if are_columns_weighted else None,
43 | n_components=n_components,
44 | n_iter=3,
45 | random_state=42,
46 | engine="scipy",
47 | )
48 |
49 | # Fit FactoMineR
50 | robjects.r("library('FactoMineR')")
51 | robjects.r.assign("X", numpy2ri.py2rpy(self.dataset))
52 | robjects.r.assign("row.w", numpy2ri.py2rpy(self.row_weights))
53 | robjects.r.assign("col.w", numpy2ri.py2rpy(self.column_weights))
54 | robjects.r("row.w <- as.vector(row.w)")
55 | robjects.r("col.w <- as.vector(col.w)")
56 | args = f"X, ncp={n_components}"
57 | if are_rows_weighted:
58 | args += ", row.w=row.w"
59 | if are_columns_weighted:
60 | args += ", col.w=col.w"
61 | robjects.r(f"svd = svd.triplet({args})")
62 |
63 | def test_U(self):
64 | assert self.svd.U.shape == (100, self.n_components)
65 | if self.are_rows_weighted:
66 | P = self.svd.U
67 | F = load_df_from_R("svd$U")
68 | np.testing.assert_allclose(np.abs(F), np.abs(P))
69 |
70 | def test_s(self):
71 | assert self.svd.s.shape == (self.n_components,)
72 | if self.are_rows_weighted:
73 | P = self.svd.s
74 | F = robjects.r("svd$vs")[: self.n_components]
75 | np.testing.assert_allclose(np.abs(F), np.abs(P))
76 |
77 | def test_V(self):
78 | assert self.svd.V.shape == (self.n_components, 10)
79 | P = self.svd.V
80 | F = load_df_from_R("svd$V").T
81 | np.testing.assert_allclose(np.abs(F), np.abs(P))
82 |
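83 |
84 | # A minimal numpy sketch of the weighted SVD targeted by compute_svd and by
85 | # FactoMineR's svd.triplet: with row weights r and column weights c, X is
86 | # factored into U, s, V (one component per row of V, as above) such that
87 | # U.T @ diag(r) @ U and V @ diag(c) @ V.T are identity matrices. Weighting X,
88 | # taking a plain SVD, then un-weighting the singular vectors yields the same
89 | # triplet up to the sign of each component; prince uses truncated solvers.
90 | def _weighted_svd_sketch(X, row_weights, column_weights, n_components):
91 |     Xw = np.sqrt(row_weights)[:, None] * X * np.sqrt(column_weights)[None, :]
92 |     U, s, Vt = np.linalg.svd(Xw, full_matrices=False)
93 |     U = U[:, :n_components] / np.sqrt(row_weights)[:, None]
94 |     V = Vt[:n_components] / np.sqrt(column_weights)[None, :]
95 |     return U, s[:n_components], V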
--------------------------------------------------------------------------------