├── .gitattributes ├── .github ├── FUNDING.yml ├── actions │ └── install-env │ │ └── action.yml └── workflows │ ├── code-quality.yml │ ├── hugo.yml │ └── unit-tests.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── .pylintrc ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs ├── .hugo_build.lock ├── archetypes │ └── default.md ├── config.toml ├── content │ ├── _index.md │ ├── ca.ipynb │ ├── famd.ipynb │ ├── faq.ipynb │ ├── gpa.ipynb │ ├── mca.ipynb │ ├── mfa.ipynb │ └── pca.ipynb ├── layouts │ └── _default │ │ └── _markup │ │ └── render-codeblock-mermaid.html ├── static │ ├── favicon.ico │ └── images │ │ ├── favicon.png │ │ └── logo.png └── themes │ └── hugo-bearblog │ ├── archetypes │ ├── blog.md │ └── default.md │ ├── layouts │ ├── 404.html │ ├── _default │ │ ├── baseof.html │ │ ├── list.html │ │ └── single.html │ ├── index.html │ ├── partials │ │ ├── custom_body.html │ │ ├── custom_head.html │ │ ├── favicon.html │ │ ├── footer.html │ │ ├── header.html │ │ ├── nav.html │ │ ├── seo_tags.html │ │ └── style.html │ └── robots.txt │ └── theme.toml ├── figures ├── decastar.svg └── decastar_bis.svg ├── poetry.lock ├── prince ├── __init__.py ├── ca.py ├── datasets.py ├── datasets │ ├── 02-resultats-par-region.csv │ ├── beers.csv.zip │ ├── decathlon.csv │ ├── hearthstone_cards.csv │ ├── per-capita-energy-stacked.csv │ ├── premier_league.csv │ ├── punctuation_marks.csv │ └── resultats-par-departement.csv ├── famd.py ├── gpa.py ├── mca.py ├── mfa.py ├── pca.py ├── plot.py ├── svd.py └── utils.py ├── pyproject.toml └── tests ├── DESCRIPTION ├── __init__.py ├── test_ca.py ├── test_famd.py ├── test_gpa.py ├── test_mca.py ├── test_mfa.py ├── test_pca.py └── test_svd.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.R linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: MaxHalford 2 | -------------------------------------------------------------------------------- /.github/actions/install-env/action.yml: -------------------------------------------------------------------------------- 1 | name: Install env 2 | runs: 3 | using: "composite" 4 | steps: 5 | - name: Check out repository 6 | uses: actions/checkout@v3 7 | 8 | - name: Install R 9 | uses: r-lib/actions/setup-r@v2 10 | 11 | - name: Install R packages 12 | uses: r-lib/actions/setup-r-dependencies@v2 13 | with: 14 | cache-version: 1 15 | working-directory: tests 16 | 17 | - name: Set up Python 18 | id: set-up-python 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: "3.11" 22 | 23 | - name: Load cached Poetry installation 24 | uses: actions/cache@v3 25 | with: 26 | path: ~/.local 27 | key: poetry-0 28 | 29 | - name: Install poetry 30 | uses: snok/install-poetry@v1 31 | with: 32 | virtualenvs-create: true 33 | virtualenvs-in-project: true 34 | installer-parallel: true 35 | 36 | - name: Load cached virtual env 37 | uses: actions/cache@v3 38 | with: 39 | path: .venv 40 | key: venv-${{ runner.os }}-${{ steps.set-up-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 41 | 42 | - name: Install dependencies 43 | shell: bash 44 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 45 | run: poetry install --no-interaction --no-root 46 | 47 | - name: Install project 48 | shell: bash 49 | run: poetry install --no-interaction 50 | 
51 | - name: Activate environment 52 | shell: bash 53 | run: source $VENV 54 | -------------------------------------------------------------------------------- /.github/workflows/code-quality.yml: -------------------------------------------------------------------------------- 1 | name: Code quality 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | push: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | run: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: ./.github/actions/install-env 17 | - run: poetry run pre-commit run --all-files 18 | -------------------------------------------------------------------------------- /.github/workflows/hugo.yml: -------------------------------------------------------------------------------- 1 | # Sample workflow for building and deploying a Hugo site to GitHub Pages 2 | name: Deploy Hugo site to Pages 3 | 4 | on: 5 | # Allows you to run this workflow manually from the Actions tab 6 | workflow_dispatch: 7 | 8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | # Allow one concurrent deployment 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: true 18 | 19 | # Default to bash 20 | defaults: 21 | run: 22 | shell: bash 23 | 24 | jobs: 25 | # Build job 26 | build: 27 | runs-on: ubuntu-latest 28 | env: 29 | HUGO_VERSION: 0.144.2 30 | steps: 31 | - name: Install Hugo CLI 32 | run: | 33 | wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \ 34 | && sudo dpkg -i ${{ runner.temp }}/hugo.deb 35 | 36 | - name: Install Dart Sass 37 | run: sudo snap install dart-sass 38 | 39 | - name: Checkout 40 | uses: actions/checkout@v4 41 | with: 42 | submodules: recursive 43 | fetch-depth: 0 44 | 45 | - name: Install environment 46 | uses: ./.github/actions/install-env 47 | 48 | - name: Execute notebooks 49 | run: poetry run jupyter nbconvert --execute --to notebook --inplace docs/content/*.ipynb 50 | 51 | - name: Convert notebooks 52 | run: poetry run jupyter nbconvert --to markdown docs/content/*.ipynb 53 | 54 | - name: Clean MarkDown 55 | run: (for f in docs/content/*.md; do sed -e '/" 208 | ], 209 | "text/plain": [ 210 | "alt.Chart(...)" 211 | ] 212 | }, 213 | "execution_count": 2, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "import altair as alt\n", 220 | "\n", 221 | "alt.Chart(points).mark_line(opacity=0.5).encode(\n", 222 | " x='x',\n", 223 | " y='y',\n", 224 | " detail='shape',\n", 225 | " color='shape:N'\n", 226 | ")" 227 | ] 228 | }, 229 | { 230 | "attachments": {}, 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "The dataframe of points has to converted to a 3D numpy array of shape `(shapes, points, dims)`. There are many ways to do this. Here, we use xarray as a helper package." 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 3, 240 | "metadata": { 241 | "execution": { 242 | "iopub.execute_input": "2024-09-07T18:18:01.756840Z", 243 | "iopub.status.busy": "2024-09-07T18:18:01.756743Z", 244 | "iopub.status.idle": "2024-09-07T18:18:01.807548Z", 245 | "shell.execute_reply": "2024-09-07T18:18:01.807313Z" 246 | } 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "(3, 3, 2)" 253 | ] 254 | }, 255 | "execution_count": 3, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "ds = points.set_index(['shape', 'point']).to_xarray()\n", 262 | "da = ds.to_stacked_array('xy', ['shape', 'point'])\n", 263 | "shapes = da.values\n", 264 | "shapes.shape" 265 | ] 266 | }, 267 | { 268 | "attachments": {}, 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "This can also be done in NumPy:" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 4, 278 | "metadata": { 279 | "execution": { 280 | "iopub.execute_input": "2024-09-07T18:18:01.809002Z", 281 | "iopub.status.busy": "2024-09-07T18:18:01.808906Z", 282 | "iopub.status.idle": "2024-09-07T18:18:01.818337Z", 283 | "shell.execute_reply": "2024-09-07T18:18:01.818121Z" 284 | } 285 | }, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "(3, 3, 2)" 291 | ] 292 | }, 293 | "execution_count": 4, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "import numpy as np\n", 300 | "\n", 301 | "gb = points.groupby('shape')\n", 302 | "np.stack([gb.get_group(g)[['x', 'y']] for g in gb.groups]).shape" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 5, 308 | "metadata": { 309 | "execution": { 310 | "iopub.execute_input": "2024-09-07T18:18:01.819667Z", 311 | "iopub.status.busy": "2024-09-07T18:18:01.819581Z", 312 | "iopub.status.idle": "2024-09-07T18:18:01.826748Z", 313 | "shell.execute_reply": "2024-09-07T18:18:01.826491Z" 314 | } 315 | }, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "array([[[0., 0.],\n", 321 | " [0., 2.],\n", 322 | " [1., 0.]],\n", 323 | "\n", 324 | " [[3., 2.],\n", 325 | " [1., 2.],\n", 326 | " [3., 3.]],\n", 327 | "\n", 328 | " [[0., 0.],\n", 329 | " [0., 4.],\n", 330 | " [2., 0.]]])" 331 | ] 332 | }, 333 | "execution_count": 5, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "shapes" 340 | ] 341 | }, 342 | { 343 | "attachments": {}, 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "The shapes can now be aligned." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 6, 353 | "metadata": { 354 | "execution": { 355 | "iopub.execute_input": "2024-09-07T18:18:01.828533Z", 356 | "iopub.status.busy": "2024-09-07T18:18:01.828396Z", 357 | "iopub.status.idle": "2024-09-07T18:18:02.157698Z", 358 | "shell.execute_reply": "2024-09-07T18:18:02.157289Z" 359 | } 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "import prince\n", 364 | "\n", 365 | "gpa = prince.GPA()\n", 366 | "aligned_shapes = gpa.fit_transform(shapes)" 367 | ] 368 | }, 369 | { 370 | "attachments": {}, 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "We then convert the 3D numpy array to a dataframe (using `xarray`) for plotting." 
375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 7, 380 | "metadata": { 381 | "execution": { 382 | "iopub.execute_input": "2024-09-07T18:18:02.159473Z", 383 | "iopub.status.busy": "2024-09-07T18:18:02.159364Z", 384 | "iopub.status.idle": "2024-09-07T18:18:02.187045Z", 385 | "shell.execute_reply": "2024-09-07T18:18:02.186796Z" 386 | } 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/html": [ 392 | "\n", 393 | "
\n", 394 | "" 447 | ], 448 | "text/plain": [ 449 | "alt.Chart(...)" 450 | ] 451 | }, 452 | "execution_count": 7, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "da.values = aligned_shapes\n", 459 | "aligned_points = da.to_unstacked_dataset('xy').to_dataframe().reset_index()\n", 460 | "\n", 461 | "alt.Chart(aligned_points).mark_line(opacity=0.5).encode(\n", 462 | " x='x',\n", 463 | " y='y',\n", 464 | " detail='shape',\n", 465 | " color='shape:N'\n", 466 | ")" 467 | ] 468 | }, 469 | { 470 | "attachments": {}, 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "The triangles were all the same shape, so they are now perfectly aligned." 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": ".venv", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | "mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.11.4" 495 | }, 496 | "vscode": { 497 | "interpreter": { 498 | "hash": "441c2ec70d9faeb70e7723f55150c6260f4a26a9c828b90915d3399002e14f43" 499 | } 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 2 504 | } 505 | -------------------------------------------------------------------------------- /docs/layouts/_default/_markup/render-codeblock-mermaid.html: -------------------------------------------------------------------------------- 1 |
2 | {{- .Inner | safeHTML }} 3 | </pre>
4 | {{ .Page.Store.Set "hasMermaid" true }} 5 | -------------------------------------------------------------------------------- /docs/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/favicon.ico -------------------------------------------------------------------------------- /docs/static/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/images/favicon.png -------------------------------------------------------------------------------- /docs/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/docs/static/images/logo.png -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/archetypes/blog.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "{{ replace .Name "-" " " | title }}" 3 | date = "{{ .Date }}" 4 | 5 | # 6 | # description is optional 7 | # 8 | # description = "An optional description for SEO. If not provided, an automatically created summary will be used." 9 | 10 | tags = [{{ range $plural, $terms := .Site.Taxonomies }}{{ range $term, $val := $terms }}"{{ printf "%s" $term }}",{{ end }}{{ end }}] 11 | +++ 12 | 13 | This is a page about »{{ replace .Name "-" " " | title }}«. 14 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/archetypes/default.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "{{ replace .Name "-" " " | title }}" 3 | date = "{{ .Date }}" 4 | 5 | # 6 | # Set menu to "main" to add this page to 7 | # the main menu on top of the page 8 | # 9 | menu = "main" 10 | 11 | # 12 | # description is optional 13 | # 14 | # description = "An optional description for SEO. If not provided, an automatically created summary will be used." 15 | 16 | # 17 | # tags are optional 18 | # 19 | # tags = [{{ range $plural, $terms := .Site.Taxonomies }}{{ range $term, $val := $terms }}"{{ printf "%s" $term }}",{{ end }}{{ end }}] 20 | +++ 21 | 22 | This is a page about »{{ replace .Name "-" " " | title }}«. 23 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/404.html: -------------------------------------------------------------------------------- 1 | {{ define "title" }}404{{ end }} 2 | 3 | {{ define "main" }} 4 |

<h1>404</h1>

5 |

<p>ʕノ•ᴥ•ʔノ ︵ ┻━┻</p>

6 | {{ end }} 7 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/_default/baseof.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{- partial "favicon.html" . -}} 8 | 9 | {{- block "title" . }}{{ with .Title }}{{ . }} | {{ end }}{{ .Site.Title 10 | }}{{- end }} 11 | 12 | 13 | {{- partial "seo_tags.html" . -}} 14 | 15 | 16 | {{ with .OutputFormats.Get "rss" -}} {{ printf ` 17 | 18 | ` .Rel .MediaType.Type .Permalink $.Site.Title | safeHTML }} {{ end -}} {{- 19 | partial "style.html" . -}} 20 | 21 | 24 | {{- partial "custom_head.html" . -}} {{- if not (eq hugo.Environment 25 | "development") -}} 26 | 31 | {{- end -}} 32 | 33 | 34 | 35 |
{{- partial "header.html" . -}}
36 |
{{- block "main" . }}{{- end }}
37 | 38 | 39 | 42 | {{- partial "custom_body.html" . -}} 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/_default/list.html: -------------------------------------------------------------------------------- 1 | {{ define "main" }} 2 | 3 | {{ if .Data.Singular }} 4 |

Filtering for "{{ .Title }}"

5 | 6 | Remove filter 7 | 8 | {{ end }} 9 | 27 | {{ if .Data.Singular }} 28 | {{else}} 29 | 30 |
31 | {{ range .Site.Taxonomies.tags }} 32 | #{{ .Page.Title }}  33 | {{ end }} 34 |
35 |
36 | {{ end }} 37 |
38 | {{ end }} 39 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/_default/single.html: -------------------------------------------------------------------------------- 1 | {{ define "main" }} 2 | {{ if eq .Type "blog" }}{{ if not .Params.menu }} 3 |

4 | 5 | 8 | 9 |

10 | {{ end }}{{ end }} 11 | 12 |

{{ .Title }}

13 | {{ if and (gt .WordCount 400 ) (.Params.toc) }} 14 |

Table of contents

15 | {{.TableOfContents}} 16 | {{ end }} 17 | {{ .Content }} 18 |
19 |

20 | {{ range (.GetTerms "tags") }} 21 | #{{ .LinkTitle }} 22 | {{ end }} 23 |

24 | {{ end }} 25 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/index.html: -------------------------------------------------------------------------------- 1 | {{ define "main" }} 2 | {{ .Content }} 3 | 4 | {{ if .Page.Store.Get "hasMermaid" }} 5 | 6 | 9 | {{ end }} 10 | 11 | {{ end }} 12 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/custom_body.html: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/custom_head.html: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/favicon.html: -------------------------------------------------------------------------------- 1 | {{ with .Site.Params.favicon }} 2 | {{ end }} 3 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | {{ if ne .Site.Params.hideMadeWithLine true }}Made with Hugo ʕ•ᴥ•ʔ Bear{{ end }} 2 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 |

{{ .Site.Title }} foo

3 |
4 | 5 |
6 |
7 |
8 | 9 |
10 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/nav.html: -------------------------------------------------------------------------------- 1 | {{ range .Site.Menus.main }} 2 | {{ index .Page.Aliases 0 | upper }} 3 | {{ end }} 4 | GitHub 5 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/seo_tags.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{ template "_internal/opengraph.html" . }} 8 | 9 | 10 | {{ template "_internal/twitter_cards.html" . }} 11 | 12 | 13 | {{ template "_internal/schema.html" . }} 14 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/partials/style.html: -------------------------------------------------------------------------------- 1 | 143 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/layouts/robots.txt: -------------------------------------------------------------------------------- 1 | User-Agent: * 2 | Sitemap: {{ "sitemap.xml" | absURL }} 3 | -------------------------------------------------------------------------------- /docs/themes/hugo-bearblog/theme.toml: -------------------------------------------------------------------------------- 1 | # theme.toml template for a Hugo theme 2 | # See https://github.com/gohugoio/hugoThemes#themetoml for an example 3 | 4 | name = "Hugo Bear Blog" 5 | license = "MIT" 6 | licenselink = "https://github.com/janraasch/hugo-bearblog/blob/master/LICENSE" 7 | description = "A Hugo theme based on »Bear Blog«. Free, no-nonsense, super-fast blogging. »Bear Blog« now includes a dark color scheme to support dark mode!" 8 | homepage = "https://github.com/janraasch/hugo-bearblog" 9 | demosite = "https://janraasch.github.io/hugo-bearblog/" 10 | tags = ["blog", "responsive", "minimal", "seo", "clean", "simple", "light", "minimalist", "mobile", "fast", "white", "minimalistic", "reading", "dark mode"] 11 | features = ["favicon", "seo", "no stylesheets", "no javascript", "rss", "dark mode"] 12 | min_version = "0.73.0" 13 | # https://gohugo.io/content-management/taxonomies#default-taxonomies 14 | # https://gohugo.io/templates/taxonomy-templates/#example-list-tags-in-a-single-page-template 15 | # https://gohugo.io/templates/taxonomy-templates/#example-list-all-site-tags 16 | 17 | [author] 18 | name = "Jan Raasch" 19 | homepage = "https://www.janraasch.com" 20 | 21 | # If porting an existing theme 22 | [original] 23 | name = "ʕ•ᴥ•ʔ Bear Blog" 24 | homepage = "https://bearblog.dev" 25 | repo = "https://github.com/HermanMartinus/bearblog" 26 | -------------------------------------------------------------------------------- /figures/decastar.svg: -------------------------------------------------------------------------------- 1 | −6−5−4−3−2−1012345component 0 — 31.14%−2.5−2.0−1.5−1.0−0.50.00.51.01.52.02.53.0component 1 — 20.27%DecastarOlympicGcompetition 2 | -------------------------------------------------------------------------------- /prince/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.metadata 4 | 5 | from . 
import datasets 6 | from .ca import CA 7 | from .famd import FAMD 8 | from .gpa import GPA 9 | from .mca import MCA 10 | from .mfa import MFA 11 | from .pca import PCA 12 | 13 | __version__ = importlib.metadata.version("prince") 14 | __all__ = ["CA", "FAMD", "MCA", "MFA", "PCA", "GPA", "datasets"] 15 | -------------------------------------------------------------------------------- /prince/ca.py: -------------------------------------------------------------------------------- 1 | """Correspondence Analysis (CA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import functools 6 | 7 | import altair as alt 8 | import numpy as np 9 | import pandas as pd 10 | from scipy import sparse 11 | from sklearn.utils import check_array 12 | 13 | from prince import svd, utils 14 | 15 | 16 | def select_active_columns(method): 17 | @functools.wraps(method) 18 | def _impl(self, X=None, *method_args, **method_kwargs): 19 | if hasattr(self, "active_cols_") and isinstance(X, pd.DataFrame): 20 | return method(self, X[self.active_cols_], *method_args, **method_kwargs) 21 | return method(self, X, *method_args, **method_kwargs) 22 | 23 | return _impl 24 | 25 | 26 | def select_active_rows(method): 27 | @functools.wraps(method) 28 | def _impl(self, X=None, *method_args, **method_kwargs): 29 | if hasattr(self, "active_rows_") and isinstance(X, pd.DataFrame): 30 | return method(self, X.loc[self.active_rows_], *method_args, **method_kwargs) 31 | return method(self, X, *method_args, **method_kwargs) 32 | 33 | return _impl 34 | 35 | 36 | class CA(utils.EigenvaluesMixin): 37 | def __init__( 38 | self, 39 | n_components=2, 40 | n_iter=10, 41 | copy=True, 42 | check_input=True, 43 | random_state=None, 44 | engine="sklearn", 45 | ): 46 | self.n_components = n_components 47 | self.n_iter = n_iter 48 | self.copy = copy 49 | self.check_input = check_input 50 | self.random_state = random_state 51 | self.engine = engine 52 | 53 | @utils.check_is_dataframe_input 54 | def fit(self, X, y=None): 55 | # Check input 56 | if self.check_input: 57 | check_array(X) 58 | 59 | # Check all values are positive 60 | if (X < 0).any().any(): 61 | raise ValueError("All values in X should be positive") 62 | 63 | _, row_names, _, col_names = utils.make_labels_and_names(X) 64 | 65 | if isinstance(X, pd.DataFrame): 66 | X = X.to_numpy() 67 | 68 | if self.copy: 69 | X = np.copy(X) 70 | 71 | # Compute the correspondence matrix which contains the relative frequencies 72 | X = X.astype(float) / np.sum(X) 73 | 74 | # Compute row and column masses 75 | self.row_masses_ = pd.Series(X.sum(axis=1), index=row_names) 76 | self.col_masses_ = pd.Series(X.sum(axis=0), index=col_names) 77 | 78 | self.active_rows_ = self.row_masses_.index.unique() 79 | self.active_cols_ = self.col_masses_.index.unique() 80 | 81 | # Compute standardised residuals 82 | r = self.row_masses_.to_numpy() 83 | c = self.col_masses_.to_numpy() 84 | S = sparse.diags(r**-0.5) @ (X - np.outer(r, c)) @ sparse.diags(c**-0.5) 85 | 86 | # Compute SVD on the standardised residuals 87 | self.svd_ = svd.compute_svd( 88 | X=S, 89 | n_components=min(self.n_components, min(X.shape) - 1), 90 | n_iter=self.n_iter, 91 | random_state=self.random_state, 92 | engine=self.engine, 93 | ) 94 | 95 | # Compute total inertia 96 | self.total_inertia_ = np.einsum("ij,ji->", S, S.T) 97 | 98 | self.row_contributions_ = pd.DataFrame( 99 | sparse.diags(self.row_masses_.values) 100 | @ np.divide( 101 | # Same as row_coordinates(X) 102 | ( 103 | sparse.diags(self.row_masses_.values**-0.5) 104 | @ self.svd_.U 105 | @ 
sparse.diags(self.svd_.s) 106 | ) 107 | ** 2, 108 | self.eigenvalues_, 109 | out=np.zeros((len(self.row_masses_), len(self.eigenvalues_))), 110 | where=self.eigenvalues_ > 0, 111 | ), 112 | index=self.row_masses_.index, 113 | ) 114 | 115 | self.column_contributions_ = pd.DataFrame( 116 | sparse.diags(self.col_masses_.values) 117 | @ np.divide( 118 | # Same as col_coordinates(X) 119 | ( 120 | sparse.diags(self.col_masses_.values**-0.5) 121 | @ self.svd_.V.T 122 | @ sparse.diags(self.svd_.s) 123 | ) 124 | ** 2, 125 | self.eigenvalues_, 126 | out=np.zeros((len(self.col_masses_), len(self.eigenvalues_))), 127 | where=self.eigenvalues_ > 0, 128 | ), 129 | index=self.col_masses_.index, 130 | ) 131 | 132 | return self 133 | 134 | @property 135 | @utils.check_is_fitted 136 | def eigenvalues_(self): 137 | """Returns the eigenvalues associated with each principal component.""" 138 | return np.square(self.svd_.s) 139 | 140 | @utils.check_is_dataframe_input 141 | @select_active_columns 142 | def row_coordinates(self, X): 143 | """The row principal coordinates.""" 144 | 145 | _, row_names, _, _ = utils.make_labels_and_names(X) 146 | index_name = X.index.name 147 | 148 | if isinstance(X, pd.DataFrame): 149 | try: 150 | X = X.sparse.to_coo().astype(float) 151 | except AttributeError: 152 | X = X.to_numpy() 153 | 154 | if self.copy: 155 | X = X.copy() 156 | 157 | # Normalise the rows so that they sum up to 1 158 | if isinstance(X, np.ndarray): 159 | X = X / X.sum(axis=1)[:, None] 160 | else: 161 | X = X / X.sum(axis=1) 162 | 163 | return pd.DataFrame( 164 | data=X @ sparse.diags(self.col_masses_.to_numpy() ** -0.5) @ self.svd_.V.T, 165 | index=pd.Index(row_names, name=index_name), 166 | ) 167 | 168 | @utils.check_is_dataframe_input 169 | @select_active_columns 170 | def row_cosine_similarities(self, X): 171 | """Return the cos2 for each row against the dimensions. 172 | 173 | The cos2 value gives an indicator of the accuracy of the row projection on the dimension. 174 | 175 | Values above 0.5 usually means that the row is relatively accurately well projected onto that dimension. Its often 176 | used to identify which factor/dimension is important for a given element as the cos2 can be interpreted as the proportion 177 | of the variance of the element attributed to a particular factor. 
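        In terms of the quantities computed below, the cos2 of row `i` on component `k` is
        `F[i, k] ** 2 / d2(i)`, where `F` holds the row principal coordinates and `d2(i)` is the
        squared chi-square distance between the profile of row `i` and the average row profile.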
178 | 179 | """ 180 | F = self.row_coordinates(X) 181 | return self._row_cosine_similarities(X, F) 182 | 183 | @select_active_columns 184 | def _row_cosine_similarities(self, X, F): 185 | # Active 186 | X_act = X.loc[self.active_rows_] 187 | X_act = X_act / X_act.sum().sum() 188 | marge_col = X_act.sum(axis=0) 189 | Tc = X_act.div(X_act.sum(axis=1), axis=0).div(marge_col, axis=1) - 1 190 | dist2_row = (Tc**2).mul(marge_col, axis=1).sum(axis=1) 191 | 192 | # Supplementary 193 | X_sup = X.loc[X.index.difference(self.active_rows_, sort=False)] 194 | X_sup = X_sup.div(X_sup.sum(axis=1), axis=0) 195 | dist2_row_sup = ((X_sup - marge_col) ** 2).div(marge_col, axis=1).sum(axis=1) 196 | 197 | dist2_row = pd.concat((dist2_row, dist2_row_sup)) 198 | 199 | # Can't use pandas.div method because it doesn't support duplicate indices 200 | return F**2 / dist2_row.to_numpy()[:, None] 201 | 202 | @utils.check_is_dataframe_input 203 | @select_active_rows 204 | def column_coordinates(self, X): 205 | """The column principal coordinates.""" 206 | 207 | _, _, _, col_names = utils.make_labels_and_names(X) 208 | index_name = X.columns.name 209 | 210 | if isinstance(X, pd.DataFrame): 211 | is_sparse = X.dtypes.apply(lambda dtype: isinstance(dtype, pd.SparseDtype)).all() 212 | if is_sparse: 213 | X = X.sparse.to_coo() 214 | else: 215 | X = X.to_numpy() 216 | 217 | if self.copy: 218 | X = X.copy() 219 | 220 | # Transpose and make sure the rows sum up to 1 221 | if isinstance(X, np.ndarray): 222 | X = X.T / X.T.sum(axis=1)[:, None] 223 | else: 224 | X = X.T / X.T.sum(axis=1) 225 | 226 | return pd.DataFrame( 227 | data=X @ sparse.diags(self.row_masses_.to_numpy() ** -0.5) @ self.svd_.U, 228 | index=pd.Index(col_names, name=index_name), 229 | ) 230 | 231 | @utils.check_is_dataframe_input 232 | @select_active_rows 233 | def column_cosine_similarities(self, X): 234 | """Return the cos2 for each column against the dimensions. 235 | 236 | The cos2 value gives an indicator of the accuracy of the column projection on the dimension. 237 | 238 | Values above 0.5 usually means that the column is relatively accurately well projected onto that dimension. Its often 239 | used to identify which factor/dimension is important for a given element as the cos2 can be interpreted as the proportion 240 | of the variance of the element attributed to a particular factor. 
241 | """ 242 | G = self.column_coordinates(X) 243 | return self._column_cosine_similarities(X, G) 244 | 245 | @select_active_rows 246 | def _column_cosine_similarities(self, X, G): 247 | # Active 248 | X_act = X[self.active_cols_] 249 | X_act = X_act / X_act.sum().sum() 250 | marge_row = X_act.sum(axis=1) 251 | Tc = X_act.div(marge_row, axis=0).div(X_act.sum(axis=0), axis=1) - 1 252 | dist2_col = (Tc**2).mul(marge_row, axis=0).sum(axis=0) 253 | 254 | # Supplementary 255 | X_sup = X[X.columns.difference(self.active_cols_, sort=False)] 256 | X_sup = X_sup.div(X_sup.sum(axis=0), axis=1) 257 | dist2_col_sup = ((X_sup.sub(marge_row, axis=0)) ** 2).div(marge_row, axis=0).sum(axis=0) 258 | 259 | dist2_col = pd.concat((dist2_col, dist2_col_sup)) 260 | return (G**2).div(dist2_col, axis=0) 261 | 262 | @utils.check_is_dataframe_input 263 | @utils.check_is_fitted 264 | def plot( 265 | self, 266 | X, 267 | x_component=0, 268 | y_component=1, 269 | show_row_markers=True, 270 | show_column_markers=True, 271 | show_row_labels=False, 272 | show_column_labels=False, 273 | ): 274 | eig = self._eigenvalues_summary.to_dict(orient="index") 275 | 276 | row_chart_markers = None 277 | row_chart_labels = None 278 | column_chart_markers = None 279 | column_chart_labels = None 280 | 281 | if show_row_markers or show_row_labels: 282 | row_coords = self.row_coordinates(X) 283 | row_coords.columns = [f"component {i}" for i in row_coords.columns] 284 | row_coords = row_coords.assign( 285 | variable=row_coords.index.name or "row", 286 | value=row_coords.index.astype(str), 287 | ) 288 | row_labels = pd.Series(row_coords.index, index=row_coords.index) 289 | row_chart = alt.Chart(row_coords.assign(label=row_labels)).encode( 290 | x=alt.X( 291 | f"component {x_component}", 292 | scale=alt.Scale(zero=False), 293 | axis=alt.Axis( 294 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 295 | ), 296 | ), 297 | y=alt.Y( 298 | f"component {y_component}", 299 | scale=alt.Scale(zero=False), 300 | axis=alt.Axis( 301 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 302 | ), 303 | ), 304 | ) 305 | row_chart_markers = row_chart.mark_circle(size=50 if show_row_markers else 0).encode( 306 | color="variable", 307 | tooltip=[ 308 | "variable", 309 | "value", 310 | f"component {x_component}", 311 | f"component {y_component}", 312 | ], 313 | ) 314 | if show_row_labels: 315 | row_chart_labels = row_chart.mark_text().encode(text="label:N") 316 | 317 | if show_column_markers or show_column_labels: 318 | column_coords = self.column_coordinates(X) 319 | column_coords.columns = [f"component {i}" for i in column_coords.columns] 320 | column_coords = column_coords.assign( 321 | variable=column_coords.index.name or "column", 322 | value=column_coords.index.astype(str), 323 | ) 324 | column_labels = pd.Series(column_coords.index, index=column_coords.index) 325 | column_chart = alt.Chart(column_coords.assign(label=column_labels)).encode( 326 | x=alt.X( 327 | f"component {x_component}", 328 | scale=alt.Scale(zero=False), 329 | axis=alt.Axis( 330 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 331 | ), 332 | ), 333 | y=alt.Y( 334 | f"component {y_component}", 335 | scale=alt.Scale(zero=False), 336 | axis=alt.Axis( 337 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 338 | ), 339 | ), 340 | ) 341 | column_chart_markers = column_chart.mark_circle( 342 | size=50 if show_column_markers else 0 343 | ).encode( 344 | 
color="variable", 345 | tooltip=[ 346 | "variable", 347 | "value", 348 | f"component {x_component}", 349 | f"component {y_component}", 350 | ], 351 | ) 352 | if show_column_labels: 353 | column_chart_labels = column_chart.mark_text().encode(text="label:N") 354 | 355 | charts = filter( 356 | None, 357 | ( 358 | row_chart_markers, 359 | row_chart_labels, 360 | column_chart_markers, 361 | column_chart_labels, 362 | ), 363 | ) 364 | 365 | return alt.layer(*charts).interactive() 366 | -------------------------------------------------------------------------------- /prince/datasets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | 5 | import pandas as pd 6 | 7 | DATASETS_DIR = pathlib.Path(__file__).parent / "datasets" 8 | 9 | 10 | def load_energy_mix(year=2019, normalize=True): 11 | """Per capita energy mix by country in 2019. 12 | 13 | Each row corresponds to a country. There is one column for each energy source. 14 | A value corresponds to the average energy consumption of a source per capita. 15 | For instance, in France, every citizen consumed 15,186 kWh of nuclear energy. 16 | 17 | This data comes from https://ourworldindata.org/energy-mix 18 | 19 | Parameters 20 | ---------- 21 | year 22 | The year the study was made. 23 | normalize 24 | Whether or not to normalize the kWh by country. 25 | 26 | """ 27 | 28 | df = ( 29 | pd.read_csv(DATASETS_DIR / "per-capita-energy-stacked.csv") 30 | .query("Year == @year") 31 | .query("Entity not in ['Africa', 'Europe', 'North America', 'World']") 32 | .drop(columns=["Code", "Year"]) 33 | .rename(columns={"Entity": "Country"}) 34 | .rename(columns=lambda x: x.replace(" per capita (kWh)", "").lower()) 35 | .set_index(["continent", "country"]) 36 | ) 37 | if normalize: 38 | return df.div(df.sum(axis="columns"), axis="rows") 39 | return df 40 | 41 | 42 | def load_decathlon(): 43 | """The Decathlon dataset from FactoMineR.""" 44 | decathlon = pd.read_csv(DATASETS_DIR / "decathlon.csv") 45 | decathlon.columns = ["athlete", *map(str.lower, decathlon.columns[1:])] 46 | decathlon.athlete = decathlon.athlete.apply(str.title) 47 | decathlon = decathlon.set_index(["competition", "athlete"]) 48 | return decathlon 49 | 50 | 51 | def load_french_elections(): 52 | """Voting data for the 2022 French elections, by region. 53 | 54 | The [original dataset](https://www.data.gouv.fr/fr/datasets/resultats-du-premier-tour-de-lelection-presidentielle-2022-par-commune-et-par-departement/#resources) 55 | has been transformed into a contingency matrix. The latter tallies the number of votes for the 56 | 12 candidates across all 18 regions. The number of blank and abstentions are also recorded. 57 | More information about these regions, including a map, can be found 58 | [on Wikipedia](https://www.wikiwand.com/fr/Région_française). 
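    A minimal usage sketch (it only relies on the public API shown elsewhere in this package):

        import prince

        dataset = prince.datasets.load_french_elections()  # 18 regions x (12 candidates + Abstention + Blank)
        ca = prince.CA(n_components=2)
        ca = ca.fit(dataset)
        ca.row_coordinates(dataset)     # one point per region
        ca.column_coordinates(dataset)  # one point per candidate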
59 | 60 | """ 61 | dataset = pd.read_csv(DATASETS_DIR / "02-resultats-par-region.csv") 62 | cont = dataset.pivot(index="reg_name", columns="cand_nom", values="cand_nb_voix") 63 | cont["Abstention"] = dataset.groupby("reg_name")["abstention_nb"].min() 64 | cont["Blank"] = dataset.groupby("reg_name")["blancs_nb"].min() 65 | cont.columns = [c.title() for c in cont.columns] 66 | cont.index.name = "region" 67 | cont.columns.name = "candidate" 68 | return cont 69 | 70 | 71 | def load_punctuation_marks(): 72 | """Punctuation marks of six French writers.""" 73 | return pd.read_csv(DATASETS_DIR / "punctuation_marks.csv", index_col="author") 74 | 75 | 76 | def load_hearthstone_cards(): 77 | """Hearthstone standard cards. 78 | 79 | Source: https://gist.github.com/MaxHalford/32ed2c80672d7391ec5b4e6f291f14c1 80 | 81 | """ 82 | return pd.read_csv(DATASETS_DIR / "hearthstone_cards.csv", index_col="id") 83 | 84 | 85 | def load_burgundy_wines(): 86 | """Burgundy wines dataset. 87 | 88 | Source: https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf 89 | 90 | """ 91 | wines = pd.DataFrame( 92 | data=[ 93 | ["Yes", "No", "No", "Yes", "No", "No", "No", "No", "No", "No"], 94 | ["No", "Maybe", "Yes", "No", "Yes", "Maybe", "Yes", "No", "Yes", "Yes"], 95 | ["No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes"], 96 | ["No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"], 97 | ["Yes", "No", "No", "Yes", "No", "No", "No", "Yes", "No", "No"], 98 | ["Yes", "Maybe", "No", "Yes", "No", "Maybe", "No", "Yes", "No", "No"], 99 | ], 100 | columns=pd.MultiIndex.from_tuples( 101 | [ 102 | ("Expert 1", "Fruity"), 103 | ("Expert 1", "Woody"), 104 | ("Expert 1", "Coffee"), 105 | ("Expert 2", "Red fruit"), 106 | ("Expert 2", "Roasted"), 107 | ("Expert 2", "Vanillin"), 108 | ("Expert 2", "Woody"), 109 | ("Expert 3", "Fruity"), 110 | ("Expert 3", "Butter"), 111 | ("Expert 3", "Woody"), 112 | ], 113 | names=("expert", "aspect"), 114 | ), 115 | index=[f"Wine {i + 1}" for i in range(6)], 116 | ) 117 | wines.insert(0, "Oak type", [1, 2, 2, 2, 1, 1]) 118 | return wines 119 | 120 | 121 | def load_beers(): 122 | """Beers dataset. 123 | 124 | The data is taken from https://github.com/philipperemy/beer-dataset. 125 | 126 | """ 127 | return pd.read_csv(DATASETS_DIR / "beers.csv.zip", index_col="name") 128 | 129 | 130 | def load_premier_league(): 131 | """Premier League dataset. 132 | 133 | The data is taken from Wikipedia, using pd.read_html. 
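    The bundled CSV has two header rows, so the loaded frame carries a (season, statistic)
    column MultiIndex. A small illustrative sketch:

        df = load_premier_league()
        df["2023-24"]                          # one season's W/D/L/GF/GA/Pts table
        df.loc["Arsenal", ("2023-24", "Pts")]  # a single cell of the standings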
134 | 135 | """ 136 | return pd.read_csv(DATASETS_DIR / "premier_league.csv", index_col=0, header=[0, 1]) 137 | -------------------------------------------------------------------------------- /prince/datasets/beers.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaxHalford/prince/37f562def54a204174e2dce37038c85e90df27a8/prince/datasets/beers.csv.zip -------------------------------------------------------------------------------- /prince/datasets/decathlon.csv: -------------------------------------------------------------------------------- 1 | "","100m","Long.jump","Shot.put","High.jump","400m","110m.hurdle","Discus","Pole.vault","Javeline","1500m","Rank","Points","Competition" 2 | "SEBRLE",11.04,7.58,14.83,2.07,49.81,14.69,43.75,5.02,63.19,291.7,1,8217,"Decastar" 3 | "CLAY",10.76,7.4,14.26,1.86,49.37,14.05,50.72,4.92,60.15,301.5,2,8122,"Decastar" 4 | "KARPOV",11.02,7.3,14.77,2.04,48.37,14.09,48.95,4.92,50.31,300.2,3,8099,"Decastar" 5 | "BERNARD",11.02,7.23,14.25,1.92,48.93,14.99,40.87,5.32,62.77,280.1,4,8067,"Decastar" 6 | "YURKOV",11.34,7.09,15.19,2.1,50.42,15.31,46.26,4.72,63.44,276.4,5,8036,"Decastar" 7 | "WARNERS",11.11,7.6,14.31,1.98,48.68,14.23,41.1,4.92,51.77,278.1,6,8030,"Decastar" 8 | "ZSIVOCZKY",11.13,7.3,13.48,2.01,48.62,14.17,45.67,4.42,55.37,268,7,8004,"Decastar" 9 | "McMULLEN",10.83,7.31,13.76,2.13,49.91,14.38,44.41,4.42,56.37,285.1,8,7995,"Decastar" 10 | "MARTINEAU",11.64,6.81,14.57,1.95,50.14,14.93,47.6,4.92,52.33,262.1,9,7802,"Decastar" 11 | "HERNU",11.37,7.56,14.41,1.86,51.1,15.06,44.99,4.82,57.19,285.1,10,7733,"Decastar" 12 | "BARRAS",11.33,6.97,14.09,1.95,49.48,14.48,42.1,4.72,55.4,282,11,7708,"Decastar" 13 | "NOOL",11.33,7.27,12.68,1.98,49.2,15.29,37.92,4.62,57.44,266.6,12,7651,"Decastar" 14 | "BOURGUIGNON",11.36,6.8,13.46,1.86,51.16,15.67,40.49,5.02,54.68,291.7,13,7313,"Decastar" 15 | "Sebrle",10.85,7.84,16.36,2.12,48.36,14.05,48.72,5,70.52,280.01,1,8893,"OlympicG" 16 | "Clay",10.44,7.96,15.23,2.06,49.19,14.13,50.11,4.9,69.71,282,2,8820,"OlympicG" 17 | "Karpov",10.5,7.81,15.93,2.09,46.81,13.97,51.65,4.6,55.54,278.11,3,8725,"OlympicG" 18 | "Macey",10.89,7.47,15.73,2.15,48.97,14.56,48.34,4.4,58.46,265.42,4,8414,"OlympicG" 19 | "Warners",10.62,7.74,14.48,1.97,47.97,14.01,43.73,4.9,55.39,278.05,5,8343,"OlympicG" 20 | "Zsivoczky",10.91,7.14,15.31,2.12,49.4,14.95,45.62,4.7,63.45,269.54,6,8287,"OlympicG" 21 | "Hernu",10.97,7.19,14.65,2.03,48.73,14.25,44.72,4.8,57.76,264.35,7,8237,"OlympicG" 22 | "Nool",10.8,7.53,14.26,1.88,48.81,14.8,42.05,5.4,61.33,276.33,8,8235,"OlympicG" 23 | "Bernard",10.69,7.48,14.8,2.12,49.13,14.17,44.75,4.4,55.27,276.31,9,8225,"OlympicG" 24 | "Schwarzl",10.98,7.49,14.01,1.94,49.76,14.25,42.43,5.1,56.32,273.56,10,8102,"OlympicG" 25 | "Pogorelov",10.95,7.31,15.1,2.06,50.79,14.21,44.6,5,53.45,287.63,11,8084,"OlympicG" 26 | "Schoenbeck",10.9,7.3,14.77,1.88,50.3,14.34,44.41,5,60.89,278.82,12,8077,"OlympicG" 27 | "Barras",11.14,6.99,14.91,1.94,49.41,14.37,44.83,4.6,64.55,267.09,13,8067,"OlympicG" 28 | "Smith",10.85,6.81,15.24,1.91,49.27,14.01,49.02,4.2,61.52,272.74,14,8023,"OlympicG" 29 | "Averyanov",10.55,7.34,14.44,1.94,49.72,14.39,39.88,4.8,54.51,271.02,15,8021,"OlympicG" 30 | "Ojaniemi",10.68,7.5,14.97,1.94,49.12,15.01,40.35,4.6,59.26,275.71,16,8006,"OlympicG" 31 | "Smirnov",10.89,7.07,13.88,1.94,49.11,14.77,42.47,4.7,60.88,263.31,17,7993,"OlympicG" 32 | "Qi",11.06,7.34,13.55,1.97,49.65,14.78,45.13,4.5,60.79,272.63,18,7934,"OlympicG" 33 | 
"Drews",10.87,7.38,13.07,1.88,48.51,14.01,40.11,5,51.53,274.21,19,7926,"OlympicG" 34 | "Parkhomenko",11.14,6.61,15.69,2.03,51.04,14.88,41.9,4.8,65.82,277.94,20,7918,"OlympicG" 35 | "Terek",10.92,6.94,15.15,1.94,49.56,15.12,45.62,5.3,50.62,290.36,21,7893,"OlympicG" 36 | "Gomez",11.08,7.26,14.57,1.85,48.61,14.41,40.95,4.4,60.71,269.7,22,7865,"OlympicG" 37 | "Turi",11.08,6.91,13.62,2.03,51.67,14.26,39.83,4.8,59.34,290.01,23,7708,"OlympicG" 38 | "Lorenzo",11.1,7.03,13.22,1.85,49.34,15.38,40.22,4.5,58.36,263.08,24,7592,"OlympicG" 39 | "Karlivans",11.33,7.26,13.3,1.97,50.54,14.98,43.34,4.5,52.92,278.67,25,7583,"OlympicG" 40 | "Korkizoglou",10.86,7.07,14.81,1.94,51.16,14.96,46.07,4.7,53.05,317,26,7573,"OlympicG" 41 | "Uldal",11.23,6.99,13.53,1.85,50.95,15.09,43.01,4.5,60,281.7,27,7495,"OlympicG" 42 | "Casarsa",11.36,6.68,14.92,1.94,53.2,15.39,48.66,4.4,58.62,296.12,28,7404,"OlympicG" 43 | -------------------------------------------------------------------------------- /prince/datasets/premier_league.csv: -------------------------------------------------------------------------------- 1 | ,2021-22,2021-22,2021-22,2021-22,2021-22,2021-22,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24 2 | ,W,D,L,GF,GA,Pts,W,D,L,GF,GA,Pts,W,D,L,GF,GA,Pts 3 | Team,,,,,,,,,,,,,,,,,, 4 | Arsenal,22,3,13,61,48,69,26,6,6,88,43,84,28,5,5,91,29,89 5 | Aston Villa,13,6,19,52,54,45,18,7,13,51,46,61,20,8,10,76,61,68 6 | Brentford,13,7,18,48,56,46,15,14,9,58,46,59,10,9,19,56,65,39 7 | Brighton & Hove Albion,12,15,11,42,44,51,18,8,12,72,53,62,12,12,14,55,62,48 8 | Chelsea,21,11,6,76,33,74,11,11,16,38,47,44,18,9,11,77,63,63 9 | Crystal Palace,11,15,12,50,46,48,11,12,15,40,49,45,13,10,15,57,58,49 10 | Everton,11,6,21,43,66,39,8,12,18,34,57,36,13,9,16,40,51,40 11 | Liverpool,28,8,2,94,26,92,19,10,9,75,47,67,24,10,4,86,41,82 12 | Manchester City,29,6,3,99,26,93,28,5,5,94,33,89,28,7,3,96,34,91 13 | Manchester United,16,10,12,57,57,58,23,6,9,58,43,75,18,6,14,57,58,60 14 | Newcastle United,13,10,15,44,62,49,19,14,5,68,33,71,18,6,14,85,62,60 15 | Tottenham Hotspur,22,5,11,69,40,71,18,6,14,70,63,60,20,6,12,74,61,66 16 | West Ham United,16,8,14,60,51,56,11,7,20,42,55,40,14,10,14,60,74,52 17 | Wolverhampton Wanderers,15,6,17,38,43,51,11,8,19,31,58,41,13,7,18,50,65,46 18 | -------------------------------------------------------------------------------- /prince/datasets/punctuation_marks.csv: -------------------------------------------------------------------------------- 1 | "author","period","comma","other" 2 | "Rousseau",7836,13112,6026 3 | "Chateaubriand",53655,102383,42413 4 | "Hugo",115615,184541,59226 5 | "Zola",161926,340479,62754 6 | "Proust",38177,105101,12670 7 | "Giraudoux",46371,58367,14299 8 | -------------------------------------------------------------------------------- /prince/famd.py: -------------------------------------------------------------------------------- 1 | """Factor Analysis of Mixed Data (FAMD)""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import sklearn.utils 8 | from sklearn import preprocessing 9 | 10 | from prince import pca, utils 11 | 12 | 13 | class FAMD(pca.PCA): 14 | def __init__( 15 | self, 16 | n_components=2, 17 | n_iter=3, 18 | copy=True, 19 | check_input=True, 20 | random_state=None, 21 | engine="sklearn", 22 | handle_unknown="error", 23 | ): 24 | super().__init__( 25 | rescale_with_mean=True, 26 | rescale_with_std=False, 27 | n_components=n_components, 28 | n_iter=n_iter, 29 | copy=copy, 30 | 
check_input=check_input, 31 | random_state=random_state, 32 | engine=engine, 33 | ) 34 | self.handle_unknown = handle_unknown 35 | 36 | def _check_input(self, X): 37 | if self.check_input: 38 | sklearn.utils.check_array(X, dtype=[str, "numeric"]) 39 | 40 | @utils.check_is_dataframe_input 41 | def fit(self, X, y=None): 42 | # Separate numerical columns from categorical columns 43 | self.num_cols_ = X.select_dtypes(include=["float"]).columns.tolist() 44 | if not self.num_cols_: 45 | raise ValueError("All variables are qualitative: MCA should be used") 46 | self.cat_cols_ = X.columns.difference(self.num_cols_).tolist() 47 | if not self.cat_cols_: 48 | raise ValueError("All variables are quantitative: PCA should be used") 49 | 50 | # Preprocess numerical columns 51 | X_num = X[self.num_cols_].copy() 52 | self.num_scaler_ = preprocessing.StandardScaler().fit(X_num) 53 | X_num[:] = self.num_scaler_.transform(X_num) 54 | 55 | # Preprocess categorical columns 56 | X_cat = X[self.cat_cols_] 57 | self.cat_scaler_ = preprocessing.OneHotEncoder(handle_unknown=self.handle_unknown).fit( 58 | X_cat 59 | ) 60 | X_cat_oh = pd.DataFrame.sparse.from_spmatrix( 61 | self.cat_scaler_.transform(X_cat), 62 | index=X_cat.index, 63 | columns=self.cat_scaler_.get_feature_names_out(self.cat_cols_), 64 | ) 65 | prop = X_cat_oh.sum() / X_cat_oh.sum().sum() * 2 66 | X_cat_oh_norm = X_cat_oh.sub(X_cat_oh.mean(axis="rows")).div(prop**0.5, axis="columns") 67 | 68 | # PCA.fit doesn't work with sparse matrices. Well, it accepts them, but it densifies them. 69 | # We pre-densify them here to avoid a warning. 70 | # TODO: In the future, PCA should be able to handle sparse matrices. 71 | X_cat_oh_norm = X_cat_oh_norm.sparse.to_dense() 72 | 73 | Z = pd.concat([X_num, X_cat_oh_norm], axis=1) 74 | super().fit(Z) 75 | 76 | # Determine column_coordinates_ 77 | # This is based on line 184 in FactoMineR's famd.R file 78 | rc = self.row_coordinates(X) 79 | weights = np.ones(len(X_cat_oh)) / len(X_cat_oh) 80 | norm = (rc**2).multiply(weights, axis=0).sum() 81 | eta2 = pd.DataFrame(index=rc.columns) 82 | for i, col in enumerate(self.cat_cols_): 83 | # TODO: there must be a better way to select a subset of the one-hot encoded matrix 84 | tt = X_cat_oh[[f"{col}_{i}" for i in self.cat_scaler_.categories_[i]]] 85 | ni = (tt / len(tt)).sum() 86 | eta2[col] = ( 87 | rc.apply(lambda x: (tt.multiply(x * weights, axis=0).sum() ** 2 / ni).sum()) / norm 88 | ).values 89 | self.column_coordinates_ = pd.concat( 90 | [self.column_coordinates_.loc[self.num_cols_] ** 2, eta2.T] 91 | ) 92 | self.column_coordinates_.columns.name = "component" 93 | self.column_coordinates_.index.name = "variable" 94 | 95 | return self 96 | 97 | @utils.check_is_dataframe_input 98 | @utils.check_is_fitted 99 | def row_coordinates(self, X): 100 | # Separate numerical columns from categorical columns 101 | X_num = X[self.num_cols_].copy() 102 | X_cat = X[self.cat_cols_] 103 | 104 | # Preprocess numerical columns 105 | X_num[:] = self.num_scaler_.transform(X_num) 106 | 107 | # Preprocess categorical columns 108 | X_cat = pd.DataFrame.sparse.from_spmatrix( 109 | self.cat_scaler_.transform(X_cat), 110 | index=X_cat.index, 111 | columns=self.cat_scaler_.get_feature_names_out(self.cat_cols_), 112 | ) 113 | prop = X_cat.sum() / X_cat.sum().sum() * 2 114 | X_cat = X_cat.sub(X_cat.mean(axis="rows")).div(prop**0.5, axis="columns") 115 | 116 | Z = pd.concat([X_num, X_cat.sparse.to_dense()], axis=1).fillna(0.0) 117 | 118 | return super().row_coordinates(Z) 119 | 120 | 
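    # A minimal usage sketch (illustrative): `df` is any pandas DataFrame that mixes float
    # columns with string/categorical columns, e.g. the beers dataset bundled with this package.
    #
    #     famd = FAMD(n_components=2)
    #     famd = famd.fit(df)
    #     famd.row_coordinates(df)    # row factor coordinates
    #     famd.column_coordinates_    # squared correlations / correlation ratios per variable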
@utils.check_is_dataframe_input 121 | @utils.check_is_fitted 122 | def inverse_transform(self, X): 123 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 124 | 125 | @utils.check_is_dataframe_input 126 | @utils.check_is_fitted 127 | def row_standard_coordinates(self, X): 128 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 129 | 130 | @utils.check_is_dataframe_input 131 | @utils.check_is_fitted 132 | def row_cosine_similarities(self, X): 133 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 134 | 135 | @utils.check_is_dataframe_input 136 | @utils.check_is_fitted 137 | def column_correlations(self, X): 138 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 139 | 140 | @utils.check_is_dataframe_input 141 | @utils.check_is_fitted 142 | def column_cosine_similarities_(self, X): 143 | raise NotImplementedError("FAMD inherits from PCA, but this method is not implemented yet") 144 | 145 | @property 146 | def column_contributions_(self): 147 | return self.column_coordinates_ / self.eigenvalues_ 148 | -------------------------------------------------------------------------------- /prince/gpa.py: -------------------------------------------------------------------------------- 1 | """Generalized Procrustes Analysis (GPA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | from scipy.linalg import orthogonal_procrustes 7 | from scipy.spatial import procrustes 8 | from sklearn import base 9 | from sklearn import utils as sk_utils 10 | 11 | from prince import utils 12 | 13 | 14 | class GPA(base.BaseEstimator, base.TransformerMixin): 15 | """Generalized Procrustes Analysis (GPA). 16 | 17 | Algorithm outline: 18 | 19 | 1. Choose a reference shape. 20 | 2. Apply Procrustes Analysis to superimpose all shapes to the reference shape. 21 | 3. Compute the mean shape of the superimposed shapes. 22 | 4. Repeat steps 2 and 3 until convergence. 23 | 24 | Parameters 25 | ---------- 26 | max_iter 27 | The maximum number of Procrustes analysis iterations. 28 | tol 29 | The tolerance for the optimization; stops if the Procrustes distance decreases by less or 30 | equal to `tol` between iterations. 31 | init 32 | Method for initializing reference shape. 33 | - 'random' : choose reference shape from shape list 34 | - 'mean' : initialize reference shape as mean of shape list 35 | scale 36 | Whether to compute transformations with a scale component. 37 | copy 38 | Whether to copy data or perform the computations inplace. If False, data passed to fit are 39 | overwritten and running fit(X).transform(X) will not yield the expected results, 40 | use fit_transform(X) instead. 41 | check_input 42 | Whether to check the consistency of the inputs. 43 | random_state 44 | Determines random number generation for initialization when `init=='random'`. 
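    Examples
    --------
    A minimal sketch, using the three triangles from the GPA guide in the docs. `shapes` must be
    an array of shape `(n_shapes, n_points, n_dim)`:

        import numpy as np
        import prince

        shapes = np.array([
            [[0.0, 0.0], [0.0, 2.0], [1.0, 0.0]],
            [[3.0, 2.0], [1.0, 2.0], [3.0, 3.0]],
            [[0.0, 0.0], [0.0, 4.0], [2.0, 0.0]],
        ])
        gpa = prince.GPA()
        aligned = gpa.fit_transform(shapes)  # same array shape as the input, shapes superimposed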
45 | 46 | References 47 | ---------- 48 | https://wikipedia.org/wiki/Generalized_Procrustes_analysis 49 | https://medium.com/@olga_kravchenko/generalized-procrustes-analysis-with-python-numpy-c571e8e8a421 50 | 51 | """ 52 | 53 | def __init__( 54 | self, 55 | max_iter=10, 56 | tol=1e-4, 57 | init="random", 58 | scale=True, 59 | copy=True, 60 | check_input=True, 61 | random_state=None, 62 | ): 63 | self.max_iter = max_iter 64 | self.tol = tol 65 | self.init = init 66 | self.scale = scale 67 | self.copy = copy 68 | self.check_input = check_input 69 | self.random_state = random_state 70 | 71 | def fit(self, X, y=None): 72 | """Fit the model with X. 73 | 74 | The algorithm naturally fits and transforms at the same time, so this 75 | simply calls ``.fit_transform`` 76 | 77 | Parameters: 78 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 79 | shapes to match to each other. 80 | y: Ignored 81 | 82 | Returns: 83 | self (object): The instance itself 84 | """ 85 | self.fit_transform(X) 86 | 87 | return self 88 | 89 | @utils.check_is_fitted 90 | def transform(self, X): 91 | """Align X to the reference shape. 92 | 93 | Parameters: 94 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 95 | shapes to align to the refernce shape. 96 | 97 | Returns: 98 | X_new (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 99 | aligned shapes 100 | """ 101 | self._check_is_fitted() 102 | if self.check_input: 103 | self._check_input(X) 104 | 105 | X_new = np.empty(X.shape) 106 | for shape_idx in range(X.shape[0]): 107 | _, X_new[shape_idx], _ = procrustes(self.reference_shape, X[shape_idx]) 108 | 109 | return X_new 110 | 111 | def fit_transform(self, X, y=None): 112 | """Fit the model with X and return the aligned shapes. 113 | 114 | Parameters: 115 | X (array-like of shape (n_shapes, n_points, n_dim)): Matrix of 116 | shapes to match to each other. 
117 | y: Ignored 118 | 119 | Returns: 120 | X_new (array-like of shape (n_shapes, n_points, n_dim)): Matrix X 121 | of aligned shapes 122 | """ 123 | 124 | # Check input 125 | if self.check_input: 126 | self._check_input(X) 127 | 128 | # Copy data 129 | if self.copy: 130 | X = np.array(X, copy=True) 131 | 132 | # scikit-learn SLEP010 133 | n_shapes, n_points, n_dim = X.shape 134 | self.n_features_in_ = n_dim 135 | 136 | # Pick reference shape 137 | if self.init == "random": 138 | random_state = sk_utils.check_random_state(self.random_state) 139 | ref_shape_idx = random_state.randint(X.shape[0]) 140 | reference_shape = X[ref_shape_idx].copy() 141 | elif self.init == "mean": 142 | reference_shape = X.mean(axis=0) 143 | else: 144 | raise ValueError("init method must be one of ('random', 'mean')") 145 | 146 | for iter_idx in range(self.max_iter): 147 | # Align each shape to reference shape 148 | for shape_idx in range(X.shape[0]): 149 | if self.scale: 150 | _, X[shape_idx], _ = procrustes(reference_shape, X[shape_idx]) 151 | else: 152 | _, X[shape_idx] = unscaled_procrustes(reference_shape, X[shape_idx]) 153 | 154 | # Compute diagnostics 155 | mean_shape = X.mean(axis=0) 156 | procrustes_distance = np.linalg.norm(reference_shape - mean_shape) 157 | 158 | # Update reference shape 159 | reference_shape = mean_shape 160 | 161 | # Check for convergence 162 | if procrustes_distance <= self.tol: 163 | break 164 | 165 | # Store properties 166 | self._reference_shape = reference_shape 167 | 168 | # Return the aligned shapes 169 | return X 170 | 171 | def _check_input(self, X): 172 | sk_utils.check_array(X, allow_nd=True) 173 | if X.ndim != 3: 174 | raise ValueError("Expected 3-dimensional input of (n_shapes, n_points, n_dim)") 175 | 176 | def _check_is_fitted(self): 177 | sk_utils.validation.check_is_fitted(self, "_reference_shape") 178 | 179 | @property 180 | def reference_shape(self): 181 | """Returns the final reference shape.""" 182 | self._check_is_fitted() 183 | return self._reference_shape 184 | 185 | 186 | def unscaled_procrustes(reference, data): 187 | """Fit `data` to `reference` using procrustes analysis without scaling. 188 | Uses translation (mean-centering), reflection, and orthogonal rotation. 
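    Unlike `scipy.spatial.procrustes`, which also rescales both inputs to unit norm, this helper
    only centres, rotates and reflects; `GPA.fit_transform` falls back to it when `scale=False`.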
189 | 190 | Parameters: 191 | reference (array-like of shape (n_points, n_dim)): reference shape to 192 | fit `data` to 193 | data (array-like of shape (n_points, n_dim)): shape to align to 194 | `reference` 195 | 196 | Returns: 197 | reference_centered (np.ndarray of shape (n_points, n_dim)): 0-centered 198 | `reference` shape 199 | data_aligned (np.ndarray of shape (n_points, n_dim)): `data` aligned to 200 | the reference shape 201 | """ 202 | # Convert inputs to np.ndarray types 203 | reference = np.array(reference, dtype=np.double) 204 | data = np.array(data, dtype=np.double) 205 | 206 | # Translate data to the origin 207 | reference_centered = reference - reference.mean(axis=0) 208 | data_centered = data - data.mean(axis=0) 209 | 210 | # Rotate / reflect data to match reference 211 | # transform mtx2 to minimize disparity 212 | R, _ = orthogonal_procrustes(data_centered, reference_centered) 213 | data_aligned = data_centered @ R 214 | 215 | return reference_centered, data_aligned 216 | -------------------------------------------------------------------------------- /prince/mca.py: -------------------------------------------------------------------------------- 1 | """Multiple Correspondence Analysis (MCA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import sklearn.base 8 | import sklearn.preprocessing 9 | import sklearn.utils 10 | 11 | from prince import utils 12 | 13 | from . import ca 14 | 15 | 16 | class MCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, ca.CA): 17 | def __init__( 18 | self, 19 | n_components=2, 20 | n_iter=10, 21 | copy=True, 22 | check_input=True, 23 | random_state=None, 24 | engine="sklearn", 25 | one_hot=True, 26 | correction=None, 27 | ): 28 | if correction is not None: 29 | if correction not in {"benzecri", "greenacre"}: 30 | raise ValueError("correction must be either 'benzecri' or 'greenacre' if provided.") 31 | if not one_hot: 32 | raise ValueError( 33 | "correction can only be applied when one_hot is True. This is because the " 34 | "number of original variables is needed to apply the correction." 
35 | ) 36 | 37 | super().__init__( 38 | n_components=n_components, 39 | n_iter=n_iter, 40 | copy=copy, 41 | check_input=check_input, 42 | random_state=random_state, 43 | engine=engine, 44 | ) 45 | self.one_hot = one_hot 46 | self.correction = correction 47 | 48 | def _prepare(self, X): 49 | if self.one_hot: 50 | X = pd.get_dummies(X, columns=X.columns, prefix_sep="__") 51 | if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None: 52 | X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False) 53 | return X 54 | 55 | def get_feature_names_out(self, input_features=None): 56 | return np.arange(self.n_components_) 57 | 58 | @property 59 | def eigenvalues_(self): 60 | """Returns the eigenvalues associated with each principal component.""" 61 | eigenvalues = super().eigenvalues_ 62 | # Benzécri and Greenacre corrections 63 | if self.correction in {"benzecri", "greenacre"}: 64 | K = self.K_ 65 | return np.array( 66 | [(K / (K - 1) * (eig - 1 / K)) ** 2 if eig > 1 / K else 0 for eig in eigenvalues] 67 | ) 68 | return eigenvalues 69 | 70 | @property 71 | @utils.check_is_fitted 72 | def percentage_of_variance_(self): 73 | """Returns the percentage of explained inertia per principal component.""" 74 | # Benzécri correction 75 | if self.correction == "benzecri": 76 | eigenvalues = self.eigenvalues_ 77 | return 100 * eigenvalues / eigenvalues.sum() 78 | # Greenacre correction 79 | if self.correction == "greenacre": 80 | eigenvalues = super().eigenvalues_ 81 | benzecris = self.eigenvalues_ 82 | K, J = (self.K_, self.J_) 83 | average_inertia = (K / (K - 1)) * ((eigenvalues**2).sum() - (J - K) / K**2) 84 | return 100 * benzecris / average_inertia 85 | # No correction 86 | return super().percentage_of_variance_ 87 | 88 | @utils.check_is_dataframe_input 89 | def fit(self, X, y=None): 90 | """Fit the MCA for the dataframe X. 91 | 92 | The MCA is computed on the indicator matrix, obtained with `pd.get_dummies(X)`. If your columns are already 93 | in indicator (one-hot) format, instantiate the class with `one_hot=False`; note that the Benzécri and Greenacre 94 | corrections require `one_hot=True`, because the number of original variables is needed to correct the inertia of each dimension.
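A minimal usage sketch (added for illustration; the column names and values are made up):

>>> df = pd.DataFrame({"colour": ["red", "blue", "red", "green"], "size": ["S", "M", "M", "S"]})
>>> mca = MCA(n_components=2).fit(df)
>>> mca.transform(df).shape
(4, 2)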
95 | 96 | """ 97 | 98 | if self.check_input: 99 | sklearn.utils.check_array(X, dtype=[str, "numeric"]) 100 | 101 | # K is the number of actual variables, to apply the Benzécri correction 102 | self.K_ = X.shape[1] 103 | 104 | # One-hot encode the data 105 | one_hot = self._prepare(X) 106 | self.one_hot_columns_ = one_hot.columns 107 | 108 | # We need the number of columns to apply the Greenacre correction 109 | self.J_ = one_hot.shape[1] 110 | 111 | # Apply CA to the indicator matrix 112 | super().fit(one_hot) 113 | 114 | return self 115 | 116 | @utils.check_is_dataframe_input 117 | @utils.check_is_fitted 118 | def row_coordinates(self, X): 119 | return super().row_coordinates(self._prepare(X)) 120 | 121 | @utils.check_is_dataframe_input 122 | @utils.check_is_fitted 123 | def row_cosine_similarities(self, X): 124 | oh = self._prepare(X) 125 | return super()._row_cosine_similarities(X=oh, F=super().row_coordinates(oh)) 126 | 127 | @utils.check_is_dataframe_input 128 | @utils.check_is_fitted 129 | def column_coordinates(self, X): 130 | return super().column_coordinates(self._prepare(X)) 131 | 132 | @utils.check_is_dataframe_input 133 | @utils.check_is_fitted 134 | def column_cosine_similarities(self, X): 135 | oh = self._prepare(X) 136 | return super()._column_cosine_similarities(X=oh, G=super().column_coordinates(oh)) 137 | 138 | @utils.check_is_dataframe_input 139 | @utils.check_is_fitted 140 | def transform(self, X): 141 | """Computes the row principal coordinates of a dataset.""" 142 | if self.check_input: 143 | sklearn.utils.check_array(X, dtype=[str, "numeric"]) 144 | return self.row_coordinates(X) 145 | -------------------------------------------------------------------------------- /prince/mfa.py: -------------------------------------------------------------------------------- 1 | """Multiple Factor Analysis (MFA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import collections 6 | 7 | import altair as alt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from prince import pca, utils 12 | 13 | 14 | class MFA(pca.PCA, collections.UserDict): 15 | def __init__( 16 | self, 17 | n_components=2, 18 | n_iter=3, 19 | copy=True, 20 | check_input=True, 21 | random_state=None, 22 | engine="sklearn", 23 | ): 24 | super().__init__( 25 | rescale_with_mean=True, 26 | rescale_with_std=True, 27 | n_components=n_components, 28 | n_iter=n_iter, 29 | copy=copy, 30 | check_input=check_input, 31 | random_state=random_state, 32 | engine=engine, 33 | ) 34 | collections.UserDict.__init__(self) 35 | 36 | @utils.check_is_dataframe_input 37 | def fit(self, X, y=None, groups=None, supplementary_groups=None): 38 | # Checks groups are provided 39 | self.groups_ = self._determine_groups(X, groups) 40 | if supplementary_groups is not None: 41 | for group in supplementary_groups: 42 | if group not in self.groups_: 43 | raise ValueError(f"Supplementary group '{group}' is not in the groups") 44 | self.supplementary_groups_ = supplementary_groups 45 | 46 | # Check group types are consistent 47 | self.all_nums_ = {} 48 | for group, cols in sorted(self.groups_.items()): 49 | all_num = all(pd.api.types.is_numeric_dtype(X[c]) for c in cols) 50 | all_cat = all(pd.api.types.is_string_dtype(X[c]) for c in cols) 51 | if not (all_num or all_cat): 52 | raise ValueError(f'Not all columns in "{group}" group are of the same type') 53 | self.all_nums_[group] = all_num 54 | 55 | # Run a factor analysis in each group 56 | for group, cols in sorted(self.groups_.items()): 57 | if self.all_nums_[group]: 58 | fa = pca.PCA( 
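# NOTE (annotation added for clarity; not in the original source): each all-numeric
# group is analysed with its own standardized PCA. Further down, every column of a
# group is weighted by 1 / (first eigenvalue of that group's partial PCA), which is
# what balances the groups' influence in the global PCA.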
59 | rescale_with_mean=True, 60 | rescale_with_std=True, 61 | n_components=self.n_components, 62 | n_iter=self.n_iter, 63 | copy=True, 64 | random_state=self.random_state, 65 | engine=self.engine, 66 | ) 67 | else: 68 | raise NotImplementedError("Groups of non-numerical variables are not supported yet") 69 | self[group] = fa.fit(X.loc[:, cols]) 70 | 71 | # Fit the global PCA 72 | Z = self._build_Z(X) 73 | column_weights = np.array( 74 | [ 75 | 1 / self[group].eigenvalues_[0] 76 | for group, cols in self.groups_.items() 77 | for _ in cols 78 | if group not in getattr(self, "supplementary_groups_", []) 79 | ] 80 | ) 81 | super().fit( 82 | Z, 83 | column_weight=column_weights, 84 | supplementary_columns=[ 85 | column 86 | for group in getattr(self, "supplementary_groups_", []) 87 | for column in self.groups_[group] 88 | ], 89 | ) 90 | 91 | return self 92 | 93 | def _determine_groups(self, X: pd.DataFrame, groups: dict | list | None) -> dict: 94 | if groups is None: 95 | if isinstance(X.columns, pd.MultiIndex): 96 | groups = X.columns.get_level_values(0).unique().tolist() 97 | else: 98 | raise ValueError("Groups have to be specified") 99 | 100 | if isinstance(groups, list): 101 | if not isinstance(X.columns, pd.MultiIndex): 102 | raise ValueError( 103 | "X has to have MultiIndex columns if groups are provided as a list" 104 | ) 105 | groups = { 106 | group: [ 107 | (group, column) 108 | for column in X.columns.get_level_values(1)[ 109 | X.columns.get_level_values(0) == group 110 | ] 111 | ] 112 | for group in groups 113 | } 114 | return groups 115 | 116 | def _build_Z(self, X): 117 | return pd.concat( 118 | (X[cols] for _, cols in self.groups_.items()), 119 | axis="columns", 120 | ) 121 | 122 | @utils.check_is_dataframe_input 123 | @utils.check_is_fitted 124 | def row_coordinates(self, X): 125 | """Returns the row principal coordinates.""" 126 | Z = self._build_Z(X) 127 | return super().row_coordinates(Z) 128 | 129 | @utils.check_is_dataframe_input 130 | @utils.check_is_fitted 131 | def partial_row_coordinates(self, X): 132 | """Returns the partial row principal coordinates.""" 133 | Z = self._build_Z(X) 134 | coords = [] 135 | for _, names in self.groups_.items(): 136 | partial_coords = pd.DataFrame(0.0, index=Z.index, columns=Z.columns) 137 | partial_coords.loc[:, names] = (Z[names] - Z[names].mean()) / Z[names].std(ddof=0) 138 | partial_coords = partial_coords * self.column_weight_ 139 | partial_coords = (len(self.groups_) * partial_coords).dot(self.svd_.V.T) 140 | coords.append(partial_coords) 141 | coords = pd.concat(coords, axis=1, keys=self.groups_.keys()) 142 | coords.columns.name = "component" 143 | return coords 144 | 145 | @utils.check_is_dataframe_input 146 | @utils.check_is_fitted 147 | def column_coordinates(self, X): 148 | Z = self._build_Z(X) 149 | return super().column_coordinates(Z) 150 | 151 | @utils.check_is_dataframe_input 152 | @utils.check_is_fitted 153 | def inverse_transform(self, X): 154 | raise NotImplementedError("MFA inherits from PCA, but this method is not implemented yet") 155 | 156 | @utils.check_is_dataframe_input 157 | @utils.check_is_fitted 158 | def row_standard_coordinates(self, X): 159 | Z = self._build_Z(X) 160 | return super().row_standard_coordinates(Z) 161 | 162 | @utils.check_is_dataframe_input 163 | @utils.check_is_fitted 164 | def row_cosine_similarities(self, X): 165 | Z = self._build_Z(X) 166 | return super().row_cosine_similarities(Z) 167 | 168 | @utils.check_is_dataframe_input 169 | @utils.check_is_fitted 170 | def 
column_cosine_similarities_(self, X): 171 | Z = self._build_Z(X) 172 | return super().column_cosine_similarities_(Z) 173 | 174 | @utils.check_is_dataframe_input 175 | @utils.check_is_fitted 176 | def plot(self, X, x_component=0, y_component=1, show_partial_rows=False, **params): 177 | index_name = X.index.name or "index" 178 | 179 | params["tooltip"] = ( 180 | X.index.names if isinstance(X.index, pd.MultiIndex) else [index_name] 181 | ) + [ 182 | "group", 183 | f"component {x_component}", 184 | f"component {y_component}", 185 | ] 186 | 187 | eig = self._eigenvalues_summary.to_dict(orient="index") 188 | 189 | row_plot = None 190 | partial_row_plot = None 191 | edges_plot = None 192 | 193 | # Barycenters 194 | row_coords = self.row_coordinates(X) 195 | row_coords.columns = [f"component {i}" for i in row_coords.columns] 196 | row_coords = row_coords.reset_index() 197 | row_coords["group"] = "Global" 198 | if show_partial_rows: 199 | params["color"] = "group:N" 200 | row_plot = ( 201 | alt.Chart(row_coords) 202 | .mark_point(filled=True, size=50) 203 | .encode( 204 | alt.X( 205 | f"component {x_component}", 206 | scale=alt.Scale(zero=False), 207 | axis=alt.Axis( 208 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 209 | ), 210 | ), 211 | alt.Y( 212 | f"component {y_component}", 213 | scale=alt.Scale(zero=False), 214 | axis=alt.Axis( 215 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 216 | ), 217 | ), 218 | **params, 219 | ) 220 | ) 221 | 222 | # Partial row coordinates 223 | if show_partial_rows: 224 | partial_row_coords = self.partial_row_coordinates(X).stack(level=0, future_stack=True) 225 | partial_row_coords.columns = [f"component {i}" for i in partial_row_coords.columns] 226 | partial_row_coords = partial_row_coords.reset_index(names=[index_name, "group"]) 227 | 228 | partial_row_plot = ( 229 | alt.Chart(partial_row_coords) 230 | .mark_point(shape="circle") 231 | .encode( 232 | alt.X(f"component {x_component}", scale=alt.Scale(zero=False)), 233 | alt.Y(f"component {y_component}", scale=alt.Scale(zero=False)), 234 | **params, 235 | ) 236 | ) 237 | 238 | # Edges to connect the main markers to the partial markers 239 | if show_partial_rows: 240 | edges = pd.merge( 241 | left=row_coords[ 242 | [index_name, f"component {x_component}", f"component {y_component}"] 243 | ], 244 | right=partial_row_coords[ 245 | [index_name, f"component {x_component}", f"component {y_component}", "group"] 246 | ], 247 | on=index_name, 248 | suffixes=("_global", "_partial"), 249 | ) 250 | edges_plot = ( 251 | alt.Chart(edges) 252 | .mark_line(opacity=0.7) 253 | .encode( 254 | x=f"component {x_component}_global:Q", 255 | y=f"component {y_component}_global:Q", 256 | x2=f"component {x_component}_partial:Q", 257 | y2=f"component {y_component}_partial:Q", 258 | color="group:N", 259 | strokeDash=alt.value([2, 2]), 260 | ) 261 | ) 262 | 263 | charts = filter( 264 | None, 265 | (row_plot, partial_row_plot, edges_plot), 266 | ) 267 | 268 | return alt.layer(*charts).interactive() 269 | -------------------------------------------------------------------------------- /prince/pca.py: -------------------------------------------------------------------------------- 1 | """Principal Component Analysis (PCA)""" 2 | 3 | from __future__ import annotations 4 | 5 | import functools 6 | 7 | import altair as alt 8 | import numpy as np 9 | import pandas as pd 10 | import sklearn.base 11 | import sklearn.utils 12 | from sklearn import preprocessing 13 | 14 | from prince 
import svd, utils 15 | 16 | 17 | def select_active_variables(method): 18 | @functools.wraps(method) 19 | def _impl(self, X=None, *method_args, **method_kwargs): 20 | if hasattr(self, "feature_names_in_") and isinstance(X, pd.DataFrame): 21 | return method(self, X[self.feature_names_in_], *method_args, **method_kwargs) 22 | return method(self, X, *method_args, **method_kwargs) 23 | 24 | return _impl 25 | 26 | 27 | class PCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, utils.EigenvaluesMixin): 28 | """Principal Component Analysis (PCA). 29 | 30 | Parameters 31 | ---------- 32 | rescale_with_mean 33 | Whether or not to subtract each column's mean before performing SVD. 34 | rescale_with_std 35 | Whether or not to standardize each column before performing SVD. 36 | n_components 37 | The number of principal components to compute. 38 | n_iter 39 | The number of iterations used for computing the SVD. 40 | copy 41 | Whether to copy the data; if False, the computations may be done inplace. 42 | check_input 43 | Whether to check the coherence of the inputs or not. 44 | 45 | """ 46 | 47 | def __init__( 48 | self, 49 | rescale_with_mean=True, 50 | rescale_with_std=True, 51 | n_components=2, 52 | n_iter=3, 53 | copy=True, 54 | check_input=True, 55 | random_state=None, 56 | engine="sklearn", 57 | ): 58 | self.n_components = n_components 59 | self.n_iter = n_iter 60 | self.rescale_with_mean = rescale_with_mean 61 | self.rescale_with_std = rescale_with_std 62 | self.copy = copy 63 | self.check_input = check_input 64 | self.random_state = random_state 65 | self.engine = engine 66 | 67 | def _check_input(self, X): 68 | if self.check_input: 69 | sklearn.utils.check_array(X) 70 | 71 | def get_feature_names_out(self, input_features=None): 72 | return np.arange(self.n_components_) 73 | 74 | @utils.check_is_dataframe_input 75 | def fit( 76 | self, 77 | X, 78 | y=None, 79 | sample_weight=None, 80 | column_weight=None, 81 | supplementary_columns=None, 82 | ): 83 | self._check_input(X) 84 | 85 | # Massage input 86 | supplementary_columns = supplementary_columns or [] 87 | active_variables = X.columns.difference(supplementary_columns, sort=False).tolist() 88 | sample_weight = np.ones(len(X)) if sample_weight is None else sample_weight 89 | sample_weight = sample_weight / sample_weight.sum() 90 | column_weight = np.ones(len(active_variables)) if column_weight is None else column_weight 91 | self.column_weight_ = column_weight 92 | 93 | # https://scikit-learn.org/stable/developers/develop.html#universal-attributes 94 | self.feature_names_in_ = active_variables 95 | self.n_features_in_ = len(active_variables) 96 | 97 | X_active = X[active_variables].to_numpy(dtype=np.float64, copy=self.copy) 98 | if supplementary_columns: 99 | X_sup = X[supplementary_columns].to_numpy(dtype=np.float64, copy=self.copy) 100 | 101 | # Scale data 102 | if self.rescale_with_mean or self.rescale_with_std: 103 | self.scaler_ = preprocessing.StandardScaler( 104 | copy=self.copy, 105 | with_mean=self.rescale_with_mean, 106 | with_std=self.rescale_with_std, 107 | ).fit(X_active, sample_weight=sample_weight) 108 | X_active = self.scaler_.transform(X_active) # TODO: maybe fit_transform is faster 109 | if supplementary_columns: 110 | X_sup = preprocessing.StandardScaler( 111 | copy=self.copy, 112 | with_mean=self.rescale_with_mean, 113 | with_std=self.rescale_with_std, 114 | ).fit_transform(X_sup) 115 | 116 | self._column_dist = pd.Series( 117 | (X_active**2 * sample_weight[:, np.newaxis]).sum(axis=0), 118 | index=active_variables, 119 | )
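# NOTE (annotation added for clarity; not in the original source): `_column_dist`
# holds each active column's weighted squared norm after scaling. It is reused by
# `column_correlations` and `column_cosine_similarities_` to turn the column
# coordinates into correlations and squared cosines.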
120 | if supplementary_columns: 121 | self._column_dist = pd.concat( 122 | ( 123 | self._column_dist, 124 | pd.Series( 125 | (X_sup**2 / len(X_sup)).sum(axis=0), 126 | index=supplementary_columns, 127 | ), 128 | ) 129 | ) 130 | 131 | self.svd_ = svd.compute_svd( 132 | X=X_active, 133 | n_components=self.n_components, 134 | n_iter=self.n_iter, 135 | random_state=self.random_state, 136 | engine=self.engine, 137 | row_weights=sample_weight, 138 | column_weights=column_weight, 139 | ) 140 | 141 | self.total_inertia_ = np.sum( 142 | np.square(X_active) * column_weight * sample_weight[:, np.newaxis] 143 | ) 144 | 145 | self.column_coordinates_ = pd.DataFrame( 146 | data=self.svd_.V.T * self.eigenvalues_**0.5, 147 | index=active_variables, 148 | ) 149 | if supplementary_columns: 150 | self.column_coordinates_ = pd.concat( 151 | [ 152 | self.column_coordinates_, 153 | pd.DataFrame( 154 | data=X_sup.T @ (self.svd_.U / len(self.svd_.U) ** 0.5), 155 | index=supplementary_columns, 156 | ), 157 | ] 158 | ) 159 | self.column_coordinates_.columns.name = "component" 160 | self.column_coordinates_.index.name = "variable" 161 | row_coords = pd.DataFrame( 162 | self.svd_.U * self.eigenvalues_**0.5, 163 | # HACK: there's a circular dependency between row_contributions_ 164 | # and active_row_coordinates in self.__init__ 165 | index=self.row_contributions_.index if hasattr(self, "row_contributions_") else None, 166 | ) 167 | row_coords.columns.name = "component" 168 | self.row_contributions_ = (row_coords**2 * sample_weight[:, np.newaxis]).div( 169 | self.eigenvalues_, axis=1 170 | ) 171 | self.row_contributions_.index = X.index 172 | 173 | return self 174 | 175 | @property 176 | @utils.check_is_fitted 177 | def eigenvalues_(self): 178 | """Returns the eigenvalues associated with each principal component.""" 179 | return np.square(self.svd_.s) 180 | 181 | def _scale(self, X): 182 | if not hasattr(self, "scaler_"): 183 | return X 184 | 185 | if sup_variables := X.columns.difference(self.feature_names_in_, sort=False).tolist(): 186 | X = np.concatenate( 187 | ( 188 | self.scaler_.transform(X[self.feature_names_in_].to_numpy()), 189 | preprocessing.StandardScaler( 190 | copy=self.copy, 191 | with_mean=self.rescale_with_mean, 192 | with_std=self.rescale_with_std, 193 | ).fit_transform(X[sup_variables]), 194 | ), 195 | axis=1, 196 | ) 197 | else: 198 | X = self.scaler_.transform(X.to_numpy()) 199 | 200 | return X 201 | 202 | @utils.check_is_dataframe_input 203 | @utils.check_is_fitted 204 | @select_active_variables 205 | def row_coordinates(self, X: pd.DataFrame): 206 | """Returns the row principal coordinates. 207 | 208 | The row principal coordinates are obtained by projecting `X` on the right eigenvectors. 209 | 210 | Synonyms 211 | -------- 212 | Row projections 213 | Factor scores 214 | Loadings 215 | 216 | """ 217 | 218 | index = X.index if isinstance(X, pd.DataFrame) else None 219 | X = self._scale(X) 220 | X = np.array(X, copy=self.copy) 221 | X *= self.column_weight_ 222 | 223 | coord = pd.DataFrame(data=X.dot(self.svd_.V.T), index=index) 224 | coord.columns.name = "component" 225 | return coord 226 | 227 | @utils.check_is_dataframe_input 228 | @utils.check_is_fitted 229 | def transform(self, X, as_array=False): 230 | """Computes the row principal coordinates of a dataset. 231 | 232 | Same as calling `row_coordinates`. This is just for compatibility with 233 | scikit-learn. 
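A minimal sketch (added for illustration; the data is made up):

>>> df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 1.0, 4.0]})
>>> pca = PCA(n_components=1).fit(df)
>>> pca.transform(df).shape
(3, 1)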
234 | 235 | """ 236 | self._check_input(X) 237 | rc = self.row_coordinates(X) 238 | return rc.to_numpy() if as_array else rc 239 | 240 | @utils.check_is_dataframe_input 241 | def fit_transform(self, X, y=None, as_array=False): 242 | """Fit and transform in one go. 243 | 244 | This method produces exactly the same result as calling `fit(X)` followed 245 | by `transform(X)`. It is provided as a convenience and for compatibility with 246 | scikit-learn pipelines; the row coordinates are obtained by projecting the input 247 | onto the right eigenvectors, just as in `transform`. 248 | 249 | """ 250 | self._check_input(X) 251 | self.fit(X) 252 | rc = self.row_coordinates(X) 253 | return rc.to_numpy() if as_array else rc 254 | 255 | @utils.check_is_dataframe_input 256 | @utils.check_is_fitted 257 | def inverse_transform(self, X, as_array=False): 258 | """Transforms row projections back to their original space. 259 | 260 | In other words, this returns a dataset whose transform would be X. 261 | 262 | """ 263 | 264 | X_inv = np.dot(X, self.svd_.V) 265 | 266 | if hasattr(self, "scaler_"): 267 | X_inv = self.scaler_.inverse_transform(X_inv) 268 | 269 | if as_array: 270 | return X_inv 271 | 272 | # Extract index 273 | index = X.index if isinstance(X, pd.DataFrame) else None 274 | return pd.DataFrame(data=X_inv, index=index) 275 | 276 | @utils.check_is_dataframe_input 277 | @utils.check_is_fitted 278 | def row_standard_coordinates(self, X: pd.DataFrame = None): 279 | """Returns the row standard coordinates. 280 | 281 | The row standard coordinates are obtained by dividing each row principal coordinate by its 282 | associated eigenvalue. 283 | 284 | """ 285 | return self.row_coordinates(X).div(self.eigenvalues_, axis="columns") 286 | 287 | @utils.check_is_dataframe_input 288 | @utils.check_is_fitted 289 | @select_active_variables 290 | def row_cosine_similarities(self, X): 291 | """Returns the cosine similarities between the rows and their principal components. 292 | 293 | The row cosine similarities are obtained by calculating the cosine of the angle formed by 294 | the row principal coordinates and the row principal components. This is calculated by 295 | squaring each row projection coordinate and dividing each squared coordinate by the sum of 296 | the squared coordinates, which results in a ratio between 0 and 1 representing 297 | the squared cosine. 298 | 299 | """ 300 | squared_coordinates = (np.square(self._scale(X)) * self.column_weight_).sum(axis=1) 301 | return (self.row_coordinates(X) ** 2).div(squared_coordinates, axis=0) 302 | 303 | @property 304 | @utils.check_is_fitted 305 | def column_correlations(self): 306 | """Calculate correlations between variables and components. 307 | 308 | The correlation between a variable and a component estimates the information they share. In 309 | the PCA framework, this correlation is called a loading. 310 | 311 | Note that the sum of the squared coefficients of correlation between a variable and all the 312 | components is equal to 1. As a consequence, the squared loadings are easier to interpret 313 | than the loadings (because the squared loadings give the proportion of the variance of the 314 | variables explained by the components).
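Concretely (a sketch of the relationship, added for clarity): the value reported for variable `j` and component `k` is `column_coordinates_.loc[j, k] / sqrt(d_j)`, where `d_j` is the weighted squared norm of the scaled variable `j` computed at fit time; for standardized active variables `d_j` equals 1, so coordinates and correlations coincide.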
315 | 316 | """ 317 | return self.column_coordinates_.div(self._column_dist**0.5, axis=0) 318 | 319 | @property 320 | @utils.check_is_fitted 321 | def column_cosine_similarities_(self): 322 | return self.column_correlations**2 323 | 324 | @property 325 | @utils.check_is_fitted 326 | def column_contributions_(self): 327 | return ( 328 | ((self.column_coordinates_.loc[self.feature_names_in_]) ** 2) 329 | * self.column_weight_[:, np.newaxis] 330 | ).div(self.eigenvalues_, axis=1) 331 | 332 | @utils.check_is_dataframe_input 333 | @utils.check_is_fitted 334 | def plot( 335 | self, 336 | X, 337 | x_component=0, 338 | y_component=1, 339 | color_rows_by=None, 340 | show_row_markers=True, 341 | show_column_markers=True, 342 | show_row_labels=False, 343 | show_column_labels=False, 344 | row_labels_column=None, 345 | ): 346 | row_params = { 347 | "tooltip": ( 348 | X.index.names 349 | if isinstance(X.index, pd.MultiIndex) 350 | else [X.index.name or "index"] # index is the default name 351 | ) 352 | + [ 353 | f"component {x_component}", 354 | f"component {y_component}", 355 | ] 356 | } 357 | if color_rows_by: 358 | row_params["color"] = color_rows_by 359 | 360 | eig = self._eigenvalues_summary.to_dict(orient="index") 361 | 362 | row_chart_markers = None 363 | row_chart_labels = None 364 | column_chart_markers = None 365 | column_chart_labels = None 366 | 367 | if show_row_markers or show_row_labels: 368 | row_coords = self.row_coordinates(X) 369 | row_coords.columns = [f"component {i}" for i in row_coords.columns] 370 | row_labels = ( 371 | pd.Series( 372 | row_coords.index.get_level_values( 373 | row_labels_column or row_coords.index.names[0] 374 | ), 375 | index=row_coords.index, 376 | ) 377 | if isinstance(row_coords.index, pd.MultiIndex) 378 | else pd.Series(row_coords.index, index=row_coords.index) 379 | ) 380 | 381 | row_chart = alt.Chart(row_coords.assign(label=row_labels).reset_index()).encode( 382 | alt.X( 383 | f"component {x_component}", 384 | scale=alt.Scale(zero=False), 385 | axis=alt.Axis( 386 | title=f"component {x_component} — {eig[x_component]['% of variance'] / 100:.2%}" 387 | ), 388 | ), 389 | alt.Y( 390 | f"component {y_component}", 391 | scale=alt.Scale(zero=False), 392 | axis=alt.Axis( 393 | title=f"component {y_component} — {eig[y_component]['% of variance'] / 100:.2%}" 394 | ), 395 | ), 396 | **row_params, 397 | ) 398 | row_chart_markers = row_chart.mark_circle(size=50 if show_row_markers else 0) 399 | if show_row_labels: 400 | row_chart_labels = row_chart.mark_text().encode(text="label:N") 401 | 402 | if show_column_markers or show_column_labels: 403 | column_coords = self.column_coordinates_.copy() 404 | column_coords.columns = [f"component {i}" for i in column_coords.columns] 405 | # Scale the column coordinates to the row coordinates 406 | column_coords = column_coords * row_coords.abs().max() 407 | column_labels = pd.Series(column_coords.index, index=column_coords.index) 408 | 409 | column_chart = alt.Chart( 410 | column_coords.assign(label=column_labels).reset_index() 411 | ).encode( 412 | alt.X(f"component {x_component}", scale=alt.Scale(zero=False)), 413 | alt.Y(f"component {y_component}", scale=alt.Scale(zero=False)), 414 | tooltip=["variable"], 415 | ) 416 | column_chart_markers = column_chart.mark_square( 417 | color="green", size=50 if show_column_markers else 0 418 | ) 419 | if show_column_labels: 420 | column_chart_labels = column_chart.mark_text().encode(text="label:N") 421 | 422 | charts = filter( 423 | None, 424 | ( 425 | row_chart_markers, 426 | 
row_chart_labels, 427 | column_chart_markers, 428 | column_chart_labels, 429 | ), 430 | ) 431 | 432 | return alt.layer(*charts).interactive() 433 | -------------------------------------------------------------------------------- /prince/plot.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | from scipy import linalg 7 | 8 | GRAY = OrderedDict([("light", "#bababa"), ("dark", "#404040")]) 9 | 10 | 11 | def stylize_axis(ax, grid=True): 12 | if grid: 13 | ax.grid() 14 | 15 | ax.xaxis.set_ticks_position("none") 16 | ax.yaxis.set_ticks_position("none") 17 | 18 | ax.axhline(y=0, linestyle="-", linewidth=1.2, color=GRAY["dark"], alpha=0.6) 19 | ax.axvline(x=0, linestyle="-", linewidth=1.2, color=GRAY["dark"], alpha=0.6) 20 | 21 | return ax 22 | 23 | 24 | def build_ellipse(X, Y): 25 | """Construct ellipse coordinates from two arrays of numbers. 26 | 27 | Args: 28 | X (1D array_like) 29 | Y (1D array_like) 30 | 31 | Returns: 32 | float: The mean of `X`. 33 | float: The mean of `Y`. 34 | float: The width of the ellipse. 35 | float: The height of the ellipse. 36 | float: The angle of orientation of the ellipse. 37 | 38 | """ 39 | x_mean = np.mean(X) 40 | y_mean = np.mean(Y) 41 | 42 | cov_matrix = np.cov(np.vstack((X, Y))) 43 | U, s, V = linalg.svd(cov_matrix, full_matrices=False) 44 | 45 | chi_95 = np.sqrt(4.61) # 90% quantile of the chi-square distribution 46 | width = np.sqrt(cov_matrix[0][0]) * chi_95 * 2 47 | height = np.sqrt(cov_matrix[1][1]) * chi_95 * 2 48 | 49 | eigenvector = V.T[0] 50 | angle = np.arctan(eigenvector[1] / eigenvector[0]) 51 | 52 | return x_mean, y_mean, width, height, angle 53 | -------------------------------------------------------------------------------- /prince/svd.py: -------------------------------------------------------------------------------- 1 | """Singular Value Decomposition (SVD)""" 2 | 3 | from __future__ import annotations 4 | 5 | import dataclasses 6 | 7 | try: 8 | import fbpca 9 | 10 | FBPCA_INSTALLED = True 11 | except ImportError: 12 | FBPCA_INSTALLED = False 13 | import numpy as np 14 | import scipy 15 | from sklearn.utils import extmath 16 | 17 | 18 | @dataclasses.dataclass 19 | class SVD: 20 | U: np.ndarray 21 | s: np.ndarray 22 | V: np.ndarray 23 | 24 | 25 | def compute_svd( 26 | X: np.ndarray, 27 | n_components: int, 28 | n_iter: int, 29 | engine: str, 30 | random_state: int | None = None, 31 | row_weights: np.ndarray | None = None, 32 | column_weights: np.ndarray | None = None, 33 | ) -> SVD: 34 | """Computes an SVD with k components.""" 35 | 36 | if row_weights is not None: 37 | X = X * np.sqrt(row_weights[:, np.newaxis]) # row-wise scaling 38 | if column_weights is not None: 39 | X = X * np.sqrt(column_weights) 40 | 41 | # Compute the SVD 42 | if engine == "fbpca": 43 | if FBPCA_INSTALLED: 44 | U, s, V = fbpca.pca(X, k=n_components, n_iter=n_iter) 45 | else: 46 | raise ValueError("fbpca is not installed; please install it if you want to use it") 47 | elif engine == "scipy": 48 | U, s, V = scipy.linalg.svd(X) 49 | U = U[:, :n_components] 50 | s = s[:n_components] 51 | V = V[:n_components, :] 52 | elif engine == "sklearn": 53 | U, s, V = extmath.randomized_svd( 54 | X, n_components=n_components, n_iter=n_iter, random_state=random_state 55 | ) 56 | else: 57 | raise ValueError("engine has to be one of ('fbpca', 'scipy', 'sklearn')") 58 | 59 | # U, V = extmath.svd_flip(U, V) 60 | 61 | if row_weights is not 
None: 62 | U = U / np.sqrt(row_weights)[:, np.newaxis] # row-wise scaling 63 | if column_weights is not None: 64 | V = V / np.sqrt(column_weights) 65 | 66 | return SVD(U, s, V) 67 | -------------------------------------------------------------------------------- /prince/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | 5 | import altair as alt 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.utils import validation 9 | 10 | 11 | def check_is_fitted(method): 12 | @functools.wraps(method) 13 | def _impl(self, *method_args, **method_kwargs): 14 | validation.check_is_fitted(self) 15 | return method(self, *method_args, **method_kwargs) 16 | 17 | return _impl 18 | 19 | 20 | def check_is_dataframe_input(func): 21 | @functools.wraps(func) 22 | def wrapper(*args, **kwargs): 23 | X = args[1] # Assuming the first argument is 'self' or an instance 24 | if not isinstance(X, pd.DataFrame): 25 | raise ValueError( 26 | f"The X argument must be a pandas DataFrame, but got {type(X).__name__}" 27 | ) 28 | return func(*args, **kwargs) 29 | 30 | return wrapper 31 | 32 | 33 | def make_labels_and_names(X): 34 | if isinstance(X, pd.DataFrame): 35 | row_label = X.index.name if X.index.name else "Rows" 36 | row_names = X.index.tolist() 37 | col_label = X.columns.name if X.columns.name else "Columns" 38 | col_names = X.columns.tolist() 39 | else: 40 | row_label = "Rows" 41 | row_names = list(range(X.shape[0])) 42 | col_label = "Columns" 43 | col_names = list(range(X.shape[1])) 44 | 45 | return row_label, row_names, col_label, col_names 46 | 47 | 48 | class EigenvaluesMixin: 49 | @property 50 | @check_is_fitted 51 | def percentage_of_variance_(self): 52 | """Returns the percentage of explained inertia per principal component.""" 53 | return 100 * self.eigenvalues_ / self.total_inertia_ 54 | 55 | @property 56 | @check_is_fitted 57 | def cumulative_percentage_of_variance_(self): 58 | """Returns the percentage of explained inertia per principal component.""" 59 | return np.cumsum(self.percentage_of_variance_) 60 | 61 | @property 62 | @check_is_fitted 63 | def _eigenvalues_summary(self): 64 | """Return a summary of the eigenvalues and their importance.""" 65 | return pd.DataFrame( 66 | { 67 | "eigenvalue": self.eigenvalues_, 68 | r"% of variance": self.percentage_of_variance_, 69 | r"% of variance (cumulative)": self.cumulative_percentage_of_variance_, 70 | }, 71 | index=pd.RangeIndex(0, len(self.eigenvalues_), name="component"), 72 | ) 73 | 74 | @property 75 | def eigenvalues_summary(self): 76 | """Return a summary of the eigenvalues and their importance.""" 77 | summary = self._eigenvalues_summary 78 | summary["% of variance"] /= 100 79 | summary["% of variance (cumulative)"] /= 100 80 | summary["eigenvalue"] = summary["eigenvalue"].map("{:,.3f}".format) 81 | summary["% of variance"] = summary["% of variance"].map("{:.2%}".format) 82 | summary["% of variance (cumulative)"] = summary["% of variance (cumulative)"].map( 83 | "{:.2%}".format 84 | ) 85 | summary.index.name = "component" 86 | return summary 87 | 88 | def scree_plot(self): 89 | """Scree plot. 
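For example (an illustrative sketch, not part of the original docstring): `prince.PCA(n_components=3).fit(df).scree_plot()` returns an `altair` bar chart with one bar per component, where `df` is the dataframe the estimator was fitted on.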
90 | 91 | References 92 | ---------- 93 | https://en.wikipedia.org/wiki/Scree_plot 94 | 95 | """ 96 | eig = self._eigenvalues_summary.reset_index() 97 | eig["component"] = eig["component"].astype(str) 98 | return ( 99 | alt.Chart( 100 | self._eigenvalues_summary.reset_index().assign( 101 | component=lambda x: x["component"].astype(str) 102 | ) 103 | ) 104 | .mark_bar(size=10) 105 | .encode(x="component", y="eigenvalue", tooltip=eig.columns.tolist()) 106 | ) 107 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "prince" 3 | version = "0.16.0" 4 | description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA" 5 | authors = ["Max Halford "] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10,<4.0" 10 | scikit-learn = "^1.5.1" 11 | pandas = "^2.2.0" 12 | altair = "^5.0.0" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | nbconvert = "^7.16.5" 16 | fbpca = "^1.0" 17 | pytest = "^8.3.4" 18 | ipykernel = "^6.13.0" 19 | rpy2 = "^3.5.2" 20 | ruff = "^0.8.5" 21 | xarray = "^2025.1.0" 22 | pre-commit = "^4.0.1" 23 | 24 | [tool.ruff] 25 | lint.select = ["E", "F", "I", "UP"] # https://beta.ruff.rs/docs/rules/ 26 | line-length = 100 27 | target-version = 'py310' 28 | lint.ignore = ["E501"] 29 | 30 | [tool.ruff.lint.isort] 31 | required-imports = ["from __future__ import annotations"] 32 | 33 | [build-system] 34 | requires = ["poetry-core>=1.0.0"] 35 | build-backend = "poetry.core.masonry.api" 36 | 37 | [tool.pytest.ini_options] 38 | addopts = [ 39 | "--verbose", 40 | "--doctest-modules", 41 | "--doctest-glob=*.md" 42 | ] 43 | doctest_optionflags = "NORMALIZE_WHITESPACE NUMBER ELLIPSIS" 44 | -------------------------------------------------------------------------------- /tests/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: prince-test 2 | Version: 0.0.0.1 3 | Title: Test dependencies 4 | Imports: 5 | FactoMineR 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import rpy2.rinterface_lib 6 | from rpy2.robjects import r as R 7 | 8 | 9 | def load_df_from_R(code): 10 | df = R(code) 11 | if isinstance(df.names, rpy2.rinterface_lib.sexp.NULLType): 12 | return pd.DataFrame(np.array(df)) 13 | return pd.DataFrame(np.array(df), index=df.names[0], columns=df.names[1]) 14 | -------------------------------------------------------------------------------- /tests/test_ca.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import tempfile 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | import rpy2.robjects as robjects 10 | import sklearn.utils.estimator_checks 11 | import sklearn.utils.validation 12 | from rpy2.robjects import r as R 13 | from scipy import sparse 14 | 15 | import prince 16 | from tests import load_df_from_R 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "sup_rows, sup_cols", 21 | [ 22 | pytest.param( 23 | sup_rows, 24 | sup_cols, 25 | id=":".join(["sup_rows" if sup_rows else "", "sup_cols" if sup_cols else ""]).strip( 26 | ":" 27 | ), 28 | ) 29 | for sup_rows in [False, True] 30 | for sup_cols in [False, 
True] 31 | ], 32 | ) 33 | class TestCA: 34 | _row_name = "row" 35 | _col_name = "col" 36 | 37 | @pytest.fixture(autouse=True) 38 | def _prepare(self, sup_rows, sup_cols): 39 | self.sup_rows = sup_rows 40 | self.sup_cols = sup_cols 41 | 42 | n_components = 5 43 | 44 | # Fit Prince 45 | self.dataset = prince.datasets.load_french_elections() 46 | active = self.dataset.copy() 47 | if sup_rows: 48 | active = active.drop("Île-de-France") 49 | if self.sup_cols: 50 | active = active.drop(columns=["Abstention", "Blank"]) 51 | self.ca = prince.CA(n_components=n_components) 52 | self.ca.fit(active) 53 | 54 | # Fit FactoMineR 55 | R("library('FactoMineR')") 56 | with tempfile.NamedTemporaryFile() as fp: 57 | self.dataset.to_csv(fp) 58 | R(f"dataset <- read.csv('{fp.name}', row.names=1)") 59 | 60 | args = f"dataset, ncp={n_components}, graph=F" 61 | if self.sup_cols: 62 | if sup_rows: 63 | R(f"ca <- CA({args}, col.sup=c(13, 14), row.sup=c(18))") 64 | else: 65 | R(f"ca <- CA({args}, col.sup=c(13, 14))") 66 | else: 67 | if sup_rows: 68 | R(f"ca <- CA({args}, row.sup=c(18))") 69 | else: 70 | R(f"ca <- CA({args})") 71 | 72 | def test_check_is_fitted(self): 73 | assert isinstance(self.ca, prince.CA) 74 | sklearn.utils.validation.check_is_fitted(self.ca) 75 | 76 | def test_svd_U(self): 77 | F = load_df_from_R("ca$svd$U").to_numpy() 78 | P = sparse.diags(self.ca.row_masses_.to_numpy() ** -0.5) @ self.ca.svd_.U 79 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 80 | 81 | def test_svd_V(self): 82 | F = load_df_from_R("ca$svd$V").to_numpy() 83 | P = sparse.diags(self.ca.col_masses_.to_numpy() ** -0.5) @ self.ca.svd_.V.T 84 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 85 | 86 | def test_total_inertia(self): 87 | F = robjects.r("sum(ca$eig[,1])")[0] 88 | P = self.ca.total_inertia_ 89 | assert math.isclose(F, P) 90 | 91 | def test_eigenvalues(self): 92 | F = load_df_from_R("ca$eig")[: self.ca.n_components] 93 | P = self.ca._eigenvalues_summary 94 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 95 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 96 | np.testing.assert_allclose( 97 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 98 | ) 99 | 100 | def test_row_coords(self, method_name="row_coordinates"): 101 | F = load_df_from_R(f"ca${self._row_name}$coord") 102 | if self.sup_rows: 103 | F = pd.concat((F, load_df_from_R(f"ca${self._row_name}.sup$coord"))) 104 | 105 | method = getattr(self.ca, method_name) 106 | P = method(self.dataset) 107 | 108 | np.testing.assert_allclose(F.abs(), P.abs()) 109 | 110 | def test_row_contrib(self): 111 | F = load_df_from_R(f"ca${self._row_name}$contrib") 112 | P = self.ca.row_contributions_ 113 | np.testing.assert_allclose(F, P * 100) 114 | 115 | def test_row_cosine_similarities(self): 116 | F = load_df_from_R(f"ca${self._row_name}$cos2") 117 | if self.sup_rows: 118 | F = pd.concat((F, load_df_from_R(f"ca${self._row_name}.sup$cos2"))) 119 | P = self.ca.row_cosine_similarities(self.dataset) 120 | np.testing.assert_allclose(F, P) 121 | 122 | def test_col_coords(self): 123 | F = load_df_from_R(f"ca${self._col_name}$coord") 124 | if self.sup_cols: 125 | F = pd.concat((F, load_df_from_R(f"ca${self._col_name}.sup$coord"))) 126 | P = self.ca.column_coordinates(self.dataset) 127 | np.testing.assert_allclose(F.abs(), P.abs()) 128 | 129 | def test_col_contrib(self): 130 | F = load_df_from_R(f"ca${self._col_name}$contrib") 131 | P = self.ca.column_contributions_ 132 | np.testing.assert_allclose(F, P * 100) 133 | 134 | 
def test_col_cos2(self): 135 | F = load_df_from_R(f"ca${self._col_name}$cos2") 136 | if self.sup_cols: 137 | F = pd.concat((F, load_df_from_R(f"ca${self._col_name}.sup$cos2"))) 138 | P = self.ca.column_cosine_similarities(self.dataset) 139 | np.testing.assert_allclose(F, P) 140 | -------------------------------------------------------------------------------- /tests/test_famd.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | import numpy as np 6 | import pytest 7 | import sklearn.utils.estimator_checks 8 | import sklearn.utils.validation 9 | from rpy2.robjects import r as R 10 | 11 | import prince 12 | from tests import load_df_from_R 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "sup_rows, sup_cols", 17 | [ 18 | pytest.param( 19 | sup_rows, 20 | sup_cols, 21 | id=":".join(["sup_rows" if sup_rows else "", "sup_cols" if sup_cols else ""]).strip( 22 | ":" 23 | ), 24 | ) 25 | for sup_rows in [False] 26 | for sup_cols in [False] 27 | ], 28 | ) 29 | class TestFAMD: 30 | _row_name = "row" 31 | _col_name = "col" 32 | 33 | @pytest.fixture(autouse=True) 34 | def _prepare(self, sup_rows, sup_cols): 35 | self.sup_rows = sup_rows 36 | self.sup_cols = sup_cols 37 | 38 | n_components = 5 39 | 40 | # Fit Prince 41 | self.dataset = prince.datasets.load_beers().head(200) 42 | active = self.dataset.copy() 43 | self.famd = prince.FAMD(n_components=n_components, engine="scipy") 44 | self.famd.fit(active) 45 | 46 | # Fit FactoMineR 47 | R("library('FactoMineR')") 48 | with tempfile.NamedTemporaryFile() as fp: 49 | self.dataset.to_csv(fp) 50 | R(f"dataset <- read.csv('{fp.name}', row.names=c(1))") 51 | R("famd <- FAMD(dataset, graph=F)") 52 | 53 | def test_check_is_fitted(self): 54 | assert isinstance(self.famd, prince.FAMD) 55 | sklearn.utils.validation.check_is_fitted(self.famd) 56 | 57 | def test_num_cols(self): 58 | assert sorted(self.famd.num_cols_) == [ 59 | "alcohol_by_volume", 60 | "final_gravity", 61 | "international_bitterness_units", 62 | "standard_reference_method", 63 | ] 64 | 65 | def test_cat_cols(self): 66 | assert sorted(self.famd.cat_cols_) == ["is_organic", "style"] 67 | 68 | def test_eigenvalues(self): 69 | F = load_df_from_R("famd$eig")[: self.famd.n_components] 70 | P = self.famd._eigenvalues_summary 71 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 72 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 73 | np.testing.assert_allclose( 74 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 75 | ) 76 | 77 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 78 | def test_row_coords(self, method_name): 79 | method = getattr(self.famd, method_name) 80 | F = load_df_from_R("famd$ind$coord") 81 | P = method(self.dataset) 82 | np.testing.assert_allclose(F.abs(), P.abs()) 83 | 84 | def test_row_contrib(self): 85 | F = load_df_from_R("famd$ind$contrib") 86 | P = self.famd.row_contributions_ 87 | np.testing.assert_allclose(F, P * 100) 88 | 89 | def test_col_coords(self): 90 | F = load_df_from_R("famd$var$coord") 91 | P = self.famd.column_coordinates_ 92 | np.testing.assert_allclose(F.abs(), P.abs()) 93 | 94 | def test_col_contrib(self): 95 | F = load_df_from_R("famd$var$contrib") 96 | P = self.famd.column_contributions_ 97 | np.testing.assert_allclose(F, P * 100) 98 | 99 | 100 | def test_issue_169(): 101 | """ 102 | 103 | https://github.com/MaxHalford/prince/issues/169 104 | 105 | >>> import pandas as pd 106 
| >>> from prince import FAMD 107 | >>> df = pd.DataFrame({'var1':['c', 'a', 'b','c'], 'var2':['x','y','y','z'],'var2': [0.,10.,30.4,0.]}) 108 | 109 | >>> famd = FAMD(n_components=2, random_state=42) 110 | >>> famd = famd.fit(df[:3]) 111 | 112 | >>> famd.transform(df[0:3]) 113 | component 0 1 114 | 0 -1.303760 -0.658334 115 | 1 -0.335621 0.981047 116 | 2 1.639381 -0.322713 117 | 118 | >>> famd.transform(df[0:2]) 119 | component 0 1 120 | 0 -1.000920 -0.669274 121 | 1 -0.092001 0.669274 122 | 123 | >>> famd.transform(df[3:]).round(6) 124 | component 0 1 125 | 3 -0.869173 -0.0 126 | 127 | """ 128 | -------------------------------------------------------------------------------- /tests/test_gpa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import prince 9 | 10 | 11 | class TestGPA(unittest.TestCase): 12 | def setUp(self): 13 | # Create a list of 2-D circles with different locations and rotations 14 | n_shapes = 4 15 | n_points = 12 16 | n_dims = 2 17 | 18 | shape_sizes = np.arange(1, n_shapes + 1) 19 | shape_angle_offsets = 10 * np.arange(n_shapes) 20 | shape_center_offsets = np.tile(np.arange(n_shapes), (n_dims, 1)) 21 | 22 | base_angles = np.linspace(0, 2 * np.pi, num=n_points, endpoint=False) 23 | # Size (n_shapes, n_points) 24 | angles = base_angles[np.newaxis, :] + shape_angle_offsets[:, np.newaxis] 25 | 26 | # Calculate along dimensions 27 | x = np.cos(angles) * shape_sizes[:, np.newaxis] + shape_center_offsets[0][:, np.newaxis] 28 | y = np.sin(angles) * shape_sizes[:, np.newaxis] + shape_center_offsets[1][:, np.newaxis] 29 | 30 | self.shapes = np.stack([x, y], axis=-1) 31 | 32 | def test_fit(self): 33 | gpa = prince.GPA() 34 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA) 35 | 36 | def test_fit_random(self): 37 | gpa = prince.GPA(init="random") 38 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA) 39 | 40 | def test_fit_mean(self): 41 | gpa = prince.GPA(init="mean") 42 | self.assertIsInstance(gpa.fit(self.shapes), prince.GPA) 43 | 44 | def test_fit_bad_init(self): 45 | gpa = prince.GPA(init="bad init type") 46 | 47 | with self.assertRaises(ValueError): 48 | gpa.fit(self.shapes) 49 | 50 | def test_fit_bad_input_size(self): 51 | gpa = prince.GPA() 52 | 53 | with self.assertRaises(ValueError): 54 | gpa.fit(self.shapes[0]) 55 | 56 | def test_transform(self): 57 | gpa = prince.GPA(copy=True) 58 | aligned_shapes = gpa.fit(self.shapes).transform(self.shapes) 59 | self.assertIsInstance(aligned_shapes, np.ndarray) 60 | self.assertEqual(self.shapes.shape, aligned_shapes.shape) 61 | 62 | def test_fit_transform_equal(self): 63 | """In our specific case of all-same-shape circles, the shapes should 64 | align perfectly.""" 65 | gpa = prince.GPA() 66 | aligned_shapes = gpa.fit_transform(self.shapes) 67 | self.assertIsInstance(aligned_shapes, np.ndarray) 68 | np.testing.assert_array_almost_equal(aligned_shapes[:-1], aligned_shapes[1:]) 69 | 70 | def test_fit_transform_single(self): 71 | """Aligning a single shape should return the same shape, just normalized.""" 72 | gpa = prince.GPA() 73 | shapes = self.shapes[0:1] 74 | aligned_shapes = gpa.fit_transform(shapes) 75 | np.testing.assert_array_almost_equal(shapes / np.linalg.norm(shapes), aligned_shapes) 76 | 77 | def test_copy(self): 78 | shapes_copy = np.copy(self.shapes) 79 | 80 | gpa = prince.GPA(copy=True) 81 | gpa.fit(shapes_copy) 82 | 
np.testing.assert_array_equal(self.shapes, shapes_copy) 83 | 84 | gpa = prince.GPA(copy=False) 85 | gpa.fit(shapes_copy) 86 | self.assertRaises(AssertionError, np.testing.assert_array_equal, self.shapes, shapes_copy) 87 | 88 | def test_xarray(self): 89 | points = pd.DataFrame( 90 | data=[ 91 | [0, 0, 0, 0], 92 | [0, 2, 0, 1], 93 | [1, 0, 0, 2], 94 | [3, 2, 1, 0], 95 | [1, 2, 1, 1], 96 | [3, 3, 1, 2], 97 | [0, 0, 2, 0], 98 | [0, 4, 2, 1], 99 | [2, 0, 2, 2], 100 | ], 101 | columns=["x", "y", "shape", "point"], 102 | ).astype({"x": float, "y": float}) 103 | 104 | ds = points.set_index(["shape", "point"]).to_xarray() 105 | da = ds.to_stacked_array("xy", ["shape", "point"]) 106 | shapes = da.values 107 | 108 | gpa = prince.GPA() 109 | aligned_shapes = gpa.fit_transform(shapes) 110 | da.values = aligned_shapes 111 | da.to_unstacked_dataset("xy").to_dataframe().reset_index() 112 | -------------------------------------------------------------------------------- /tests/test_mca.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | from rpy2.robjects import r as R 9 | 10 | import prince 11 | from tests import load_df_from_R 12 | from tests.test_ca import TestCA as _TestCA 13 | 14 | 15 | class TestMCA(_TestCA): 16 | _row_name = "ind" 17 | _col_name = "var" 18 | 19 | @pytest.fixture(autouse=True) 20 | def _prepare(self, sup_rows, sup_cols): 21 | self.sup_rows = sup_rows 22 | self.sup_cols = sup_cols 23 | 24 | n_components = 5 25 | n_active_rows = 1_000 26 | 27 | # Fit Prince 28 | self.dataset = prince.datasets.load_hearthstone_cards() 29 | active = self.dataset.copy() 30 | if self.sup_rows: 31 | active = active[:n_active_rows] 32 | if self.sup_cols: 33 | active = active.drop(columns=["type_or_school"]) 34 | self.ca = prince.MCA(n_components=n_components, engine="scipy") 35 | self.ca.fit(active) 36 | 37 | # Fit FactoMineR 38 | R("library('FactoMineR')") 39 | with tempfile.NamedTemporaryFile() as fp: 40 | self.dataset.to_csv(fp) 41 | R(f"dataset <- read.csv('{fp.name}')[,-1]") 42 | 43 | args = f"dataset, ncp={n_components}, graph=F" 44 | if self.sup_cols: 45 | if self.sup_rows: 46 | R( 47 | f"ca <- MCA({args}, quali.sup=c(4), ind.sup=c({n_active_rows + 1}:nrow(dataset)))" 48 | ) 49 | else: 50 | R(f"ca <- MCA({args}, quali.sup=c(4))") 51 | else: 52 | if self.sup_rows: 53 | R(f"ca <- MCA({args}, ind.sup=c({n_active_rows + 1}:nrow(dataset)))") 54 | else: 55 | R(f"ca <- MCA({args})") 56 | 57 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 58 | def test_row_coords(self, method_name): 59 | super().test_row_coords(method_name=method_name) 60 | 61 | def test_col_coords(self): 62 | if self.sup_cols: 63 | F = load_df_from_R("ca$var$coord") 64 | if self.sup_cols: 65 | F = pd.concat((F, load_df_from_R("ca$quali.sup$coord"))) 66 | P = self.ca.column_coordinates(self.dataset) 67 | # Prince adds a prefix to each column. 
We need to remove it in order to align the rows 68 | # of the two dataframes 69 | P.index = [idx.split("__", 1)[1] for idx in P.index] 70 | np.testing.assert_allclose(F.abs(), P.abs().loc[F.index]) 71 | else: 72 | super().test_col_coords() 73 | 74 | def test_col_cos2(self): 75 | if self.sup_cols: 76 | F = load_df_from_R("ca$var$cos2") 77 | if self.sup_cols: 78 | F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2"))) 79 | P = self.ca.column_cosine_similarities(self.dataset) 80 | # Prince adds a prefix to each column. We need to remove it in order to align the rows 81 | # of the two dataframes 82 | P.index = [idx.split("__", 1)[1] for idx in P.index] 83 | np.testing.assert_allclose(F, P.loc[F.index]) 84 | else: 85 | super().test_col_cos2() 86 | 87 | 88 | def test_with_and_without_one_hot(): 89 | """ 90 | 91 | >>> df = pd.DataFrame({ 92 | ... "foo": [1, 2, 3, 3, 5], 93 | ... "bar": ["a", "b", "c", "b", "e"], 94 | ... }) 95 | >>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy") 96 | >>> mca = mca.fit(df) 97 | >>> coords = mca.transform(df) 98 | >>> assert coords.shape == (5, 2) 99 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP 100 | 0 1 101 | 0 0.00 2.0 102 | 1 0.65 0.5 103 | 2 0.65 0.5 104 | 3 0.65 0.5 105 | 4 1.94 0.5 106 | 107 | >>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy") 108 | >>> one_hot = pd.get_dummies(df, columns=['foo', 'bar']) 109 | >>> mca = mca.fit(one_hot) 110 | >>> coords = mca.transform(one_hot) 111 | >>> assert coords.shape == (5, 2) 112 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP 113 | 0 1 114 | 0 0.00 1.0 115 | 1 0.65 0.5 116 | 2 0.65 0.5 117 | 3 0.65 0.5 118 | 4 1.94 0.5 119 | 120 | """ 121 | 122 | 123 | def test_issue_131(): 124 | """ 125 | 126 | https://github.com/MaxHalford/prince/issues/131#issuecomment-1591426031 127 | 128 | >>> df = pd.DataFrame({ 129 | ... "foo": [1, 2, 3, 3, 5], 130 | ... "bar": ["a", "b", "c", "b", "e"], 131 | ... }) 132 | >>> mca = prince.MCA(engine="scipy") 133 | >>> mca = mca.fit(df) 134 | >>> coords = mca.transform(df) 135 | >>> assert coords.shape == (5, 2) 136 | >>> coords.round(2).abs().sort_index(axis='columns') # doctest: +SKIP 137 | 0 1 138 | 0 0.00 2.0 139 | 1 0.65 0.5 140 | 2 0.65 0.5 141 | 3 0.65 0.5 142 | 4 1.94 0.5 143 | 144 | >>> mca.K_, mca.J_ 145 | (2, 8) 146 | 147 | """ 148 | 149 | 150 | def test_issue_171(): 151 | """ 152 | 153 | https://github.com/MaxHalford/prince/issues/171 154 | 155 | >>> from sklearn import impute 156 | >>> from sklearn import pipeline 157 | 158 | >>> rng = np.random.RandomState(0) 159 | >>> test_data = pd.DataFrame(data=rng.random((10, 5))) 160 | >>> test = pipeline.Pipeline(steps=[ 161 | ... ('impute', impute.SimpleImputer()), # would break the pipeline since it returns an ndarray 162 | ... ('mca', prince.PCA()), 163 | ... ]) 164 | >>> _ = test[0].set_output(transform='pandas') 165 | >>> test.fit_transform(test_data) 166 | component 0 1 167 | 0 -0.392617 0.296831 168 | 1 0.119661 -1.660653 169 | 2 -1.541581 -0.826863 170 | 3 3.105498 -0.538801 171 | 4 -2.439259 -0.343292 172 | 5 1.129341 -0.533576 173 | 6 -1.077436 0.899673 174 | 7 0.020571 -0.941029 175 | 8 1.498005 1.566376 176 | 9 -0.422184 2.081334 177 | 178 | """ 179 | 180 | 181 | def test_type_doesnt_matter(): 182 | """ 183 | 184 | Checks that the type of the columns doesn't affect the result. 
185 | 186 | """ 187 | outputs = [] 188 | dataset = prince.datasets.load_hearthstone_cards().head(100) 189 | for col in dataset.columns: 190 | labels, levels = pd.factorize(dataset[col]) 191 | dataset[col] = labels 192 | for typ in ("int", "float", "str", "category"): 193 | dataset = dataset.astype(typ) 194 | mca = prince.MCA(n_components=2, engine="scipy") 195 | mca = mca.fit(dataset) 196 | outputs.append(mca.transform(dataset).abs()) 197 | 198 | for i in range(len(outputs) - 1): 199 | np.testing.assert_allclose(outputs[i], outputs[i + 1]) 200 | 201 | 202 | issue_161_data = """ 203 | ,category,userid,location,applicationname,browser\n 204 | 0,Portal Login,a@b.com,"San Jose, CA, United States",A,Chrome\n 205 | 1,Application Access,b@b.com,"San Jose, CA, United States",B,Other\n 206 | 2,Application Access,a@b.com,"San Jose, CA, United States",C,Other\n 207 | 3,Portal Login,c@b.com,"San Diego, CA, United States",A,Chrome\n 208 | """ 209 | 210 | 211 | def test_issue_161(): 212 | """ 213 | 214 | https://github.com/MaxHalford/prince/issues/161 215 | 216 | >>> import io 217 | >>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0) 218 | 219 | >>> mca = prince.MCA( 220 | ... n_components=10, 221 | ... n_iter=3, 222 | ... copy=True, 223 | ... check_input=True, 224 | ... engine='sklearn', 225 | ... random_state=42 226 | ... ) 227 | >>> mca = mca.fit(data[:3]) 228 | 229 | >>> mca.eigenvalues_summary 230 | eigenvalue % of variance % of variance (cumulative) 231 | component 232 | 0 0.673 67.32% 67.32% 233 | 1 0.327 32.68% 100.00% 234 | 235 | >>> mca.row_coordinates(data[:3]) 236 | 0 1 237 | 0 1.120811 -0.209242 238 | 1 -0.820491 -0.571660 239 | 2 -0.300320 0.780902 240 | 241 | >>> mca.transform(data[3:]) 242 | 0 1 243 | 3 1.664888 -0.640285 244 | 245 | """ 246 | 247 | 248 | def test_abdi_2007_correction(): 249 | """ 250 | 251 | >>> wines = prince.datasets.load_burgundy_wines() 252 | >>> wines = wines.drop(columns=["Oak type"], level=0) 253 | 254 | >>> mca = prince.MCA(n_components=4, correction=None) 255 | >>> mca = mca.fit(wines) 256 | >>> mca.eigenvalues_.round(4).tolist() 257 | [0.8532, 0.2, 0.1151, 0.0317] 258 | >>> mca.percentage_of_variance_.round(3).tolist() 259 | [71.101, 16.667, 9.593, 2.64] 260 | 261 | >>> mca = prince.MCA(n_components=4, correction="benzecri") 262 | >>> mca = mca.fit(wines) 263 | >>> mca.eigenvalues_.round(4).tolist() 264 | [0.7004, 0.0123, 0.0003, 0.0] 265 | >>> mca.percentage_of_variance_.round(3).tolist() 266 | [98.229, 1.731, 0.04, 0.0] 267 | 268 | >>> mca = prince.MCA(n_components=4, correction="greenacre") 269 | >>> mca = mca.fit(wines) 270 | >>> mca.eigenvalues_.round(4).tolist() 271 | [0.7004, 0.0123, 0.0003, 0.0] 272 | >>> mca.percentage_of_variance_.round(3).tolist() 273 | [95.189, 1.678, 0.038, 0.0] 274 | 275 | """ 276 | -------------------------------------------------------------------------------- /tests/test_mfa.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import tempfile 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | import rpy2.robjects as robjects 10 | import sklearn.utils.estimator_checks 11 | import sklearn.utils.validation 12 | from rpy2.robjects import r as R 13 | 14 | import prince 15 | from tests import load_df_from_R 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "sup_rows, sup_groups", 20 | [ 21 | pytest.param(sup_rows, sup_groups, id=f"{sup_rows=}:{sup_groups=}") 22 | for sup_rows in [False, True] 23 | for 
sup_groups in [False, True] 24 | ], 25 | ) 26 | class TestMFA: 27 | _row_name = "row" 28 | _col_name = "col" 29 | 30 | @pytest.fixture(autouse=True) 31 | def _prepare(self, sup_rows, sup_groups): 32 | self.sup_rows = sup_rows 33 | self.sup_groups = sup_groups 34 | 35 | n_components = 3 36 | 37 | # Fit Prince 38 | self.dataset = prince.datasets.load_premier_league() 39 | active = self.dataset.copy() 40 | if self.sup_rows: 41 | active = active.drop(index=["Manchester City", "Manchester United"]) 42 | supplementary_groups = ["2023-24"] if self.sup_groups else [] 43 | self.groups = self.dataset.columns.levels[0].tolist() 44 | self.mfa = prince.MFA(n_components=n_components) 45 | self.mfa.fit(active, groups=self.groups, supplementary_groups=supplementary_groups) 46 | 47 | # Fit FactoMineR 48 | R("library('FactoMineR')") 49 | with tempfile.NamedTemporaryFile() as fp: 50 | dataset = self.dataset.copy() 51 | dataset.columns = [" ".join(parts) for parts in dataset.columns] 52 | dataset.to_csv(fp, index=False) 53 | R(f"dataset <- read.csv('{fp.name}')") 54 | 55 | args = "dataset, group=c(6, 6, 6), graph=F" 56 | if self.sup_rows: 57 | args += ", ind.sup=c(9:10)" 58 | if self.sup_groups: 59 | args += ", num.group.sup=c(3)" 60 | 61 | R(f"mfa <- MFA({args})") 62 | 63 | def test_check_is_fitted(self): 64 | assert isinstance(self.mfa, prince.MFA) 65 | sklearn.utils.validation.check_is_fitted(self.mfa) 66 | 67 | def test_total_inertia(self): 68 | F = robjects.r("sum(mfa$eig[,1])")[0] 69 | P = self.mfa.total_inertia_ 70 | assert math.isclose(F, P) 71 | 72 | def test_eigenvalues(self): 73 | F = load_df_from_R("mfa$eig")[: self.mfa.n_components] 74 | P = self.mfa._eigenvalues_summary 75 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 76 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 77 | np.testing.assert_allclose( 78 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 79 | ) 80 | 81 | def test_group_eigenvalues(self): 82 | for i, group in enumerate(self.groups, start=1): 83 | F = load_df_from_R(f"mfa$separate.analyses$Gr{i}$eig")[: self.mfa.n_components] 84 | P = self.mfa[group]._eigenvalues_summary 85 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 86 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 87 | np.testing.assert_allclose( 88 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 89 | ) 90 | 91 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 92 | def test_row_coords(self, method_name): 93 | method = getattr(self.mfa, method_name) 94 | F = load_df_from_R("mfa$ind$coord") 95 | P = method(self.dataset) 96 | if self.sup_rows: 97 | F = pd.concat((F, load_df_from_R("mfa$ind.sup$coord"))) 98 | # Move supplementary rows to the end 99 | P = pd.concat( 100 | [ 101 | P.loc[P.index.difference(["Manchester City", "Manchester United"])], 102 | P.loc[["Manchester City", "Manchester United"]], 103 | ] 104 | ) 105 | F = F.iloc[:, : self.mfa.n_components] 106 | np.testing.assert_allclose(F.abs(), P.abs()) 107 | 108 | def test_row_contrib(self): 109 | F = load_df_from_R("mfa$ind$contrib").iloc[:, : self.mfa.n_components] 110 | P = self.mfa.row_contributions_ 111 | np.testing.assert_allclose(F, P * 100) 112 | -------------------------------------------------------------------------------- /tests/test_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | import 
numpy as np 6 | import pandas as pd 7 | import pytest 8 | import rpy2.robjects as robjects 9 | import sklearn.utils.estimator_checks 10 | import sklearn.utils.validation 11 | from rpy2.robjects import numpy2ri 12 | from sklearn import decomposition, pipeline, preprocessing 13 | 14 | import prince 15 | from tests import load_df_from_R 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "sup_rows, sup_cols, scale, sample_weights, column_weights", 20 | [ 21 | pytest.param( 22 | sup_rows, 23 | sup_cols, 24 | scale, 25 | sample_weights, 26 | column_weights, 27 | id=f"{sup_rows=}:{sup_cols=}:{scale=}:{sample_weights=}:{column_weights=}", 28 | ) 29 | for sup_rows in [False, True] 30 | for sup_cols in [False, True] 31 | for scale in [False, True] 32 | for sample_weights in [False, True] 33 | for column_weights in [False, True] 34 | ], 35 | ) 36 | class TestPCA: 37 | @pytest.fixture(autouse=True) 38 | def _prepare(self, sup_rows, sup_cols, scale, sample_weights, column_weights): 39 | self.sup_rows = sup_rows 40 | self.sup_cols = sup_cols 41 | self.scale = scale 42 | 43 | n_components = 5 44 | 45 | # Fit Prince 46 | self.dataset = prince.datasets.load_decathlon() 47 | self.active = self.dataset.copy() 48 | if self.sup_rows: 49 | self.active = self.active.query('competition == "Decastar"') 50 | self.sample_weights = ( 51 | np.random.default_rng().dirichlet([1] * len(self.active)) if sample_weights else None 52 | ) 53 | supplementary_columns = ["rank", "points"] if self.sup_cols else [] 54 | self.column_weights = ( 55 | np.random.default_rng().random( 56 | len(self.active.columns.difference(supplementary_columns)) 57 | ) 58 | if column_weights 59 | else None 60 | ) 61 | self.pca = prince.PCA(n_components=n_components, rescale_with_std=self.scale) 62 | self.pca.fit( 63 | self.active, 64 | sample_weight=self.sample_weights, 65 | column_weight=self.column_weights, 66 | supplementary_columns=supplementary_columns, 67 | ) 68 | 69 | # scikit-learn 70 | if self.scale: 71 | self.sk_pca = pipeline.make_pipeline( 72 | preprocessing.StandardScaler(), 73 | decomposition.PCA(n_components=n_components), 74 | ) 75 | else: 76 | self.sk_pca = pipeline.make_pipeline( 77 | decomposition.PCA(n_components=n_components), 78 | ) 79 | # sklearn's PCA doesn't support sample weights 80 | self.sk_pca.fit(self.active[self.pca.feature_names_in_]) 81 | 82 | # Fit FactoMineR 83 | robjects.r( 84 | """ 85 | library('FactoMineR') 86 | 87 | data(decathlon) 88 | decathlon <- subset(decathlon, select = -c(Competition)) 89 | """ 90 | ) 91 | 92 | args = f"decathlon, ncp={n_components}, graph=F" 93 | if sample_weights: 94 | robjects.r.assign("row.w", numpy2ri.py2rpy(self.sample_weights)) 95 | robjects.r("row.w <- as.vector(row.w)") 96 | args += ", row.w=row.w" 97 | if column_weights: 98 | robjects.r.assign("col.w", numpy2ri.py2rpy(self.column_weights)) 99 | robjects.r("col.w <- as.vector(col.w)") 100 | args += ", col.w=col.w" 101 | if not self.scale: 102 | args += ", scale.unit=F" 103 | if self.sup_cols: 104 | if self.sup_rows: 105 | robjects.r(f"pca = PCA({args}, quanti.sup=c(11, 12), ind.sup=c(14:41))") 106 | else: 107 | robjects.r(f"pca = PCA({args}, quanti.sup=c(11, 12))") 108 | else: 109 | if self.sup_rows: 110 | robjects.r(f"pca = PCA({args}, ind.sup=c(14:41))") 111 | else: 112 | robjects.r(f"pca = PCA({args})") 113 | 114 | def test_check_is_fitted(self): 115 | assert isinstance(self.pca, prince.PCA) 116 | sklearn.utils.validation.check_is_fitted(self.pca) 117 | 118 | def test_total_inertia(self): 119 | F = robjects.r("sum(pca$eig[,1])")[0] 
120 | P = self.pca.total_inertia_ 121 | assert math.isclose(F, P) 122 | 123 | def test_eigenvalues(self): 124 | P = self.pca._eigenvalues_summary 125 | # Test against FactoMineR 126 | F = load_df_from_R("pca$eig")[: self.pca.n_components] 127 | np.testing.assert_allclose(F["eigenvalue"], P["eigenvalue"]) 128 | np.testing.assert_allclose(F["percentage of variance"], P["% of variance"]) 129 | np.testing.assert_allclose( 130 | F["cumulative percentage of variance"], P["% of variance (cumulative)"] 131 | ) 132 | # Test against scikit-learn 133 | if self.sample_weights is None and self.column_weights is None: 134 | n = len(self.active) 135 | S = self.sk_pca[-1].explained_variance_ * (n - 1) / n 136 | np.testing.assert_allclose(P["eigenvalue"], S) 137 | np.testing.assert_allclose( 138 | P["% of variance"], self.sk_pca[-1].explained_variance_ratio_ * 100 139 | ) 140 | 141 | @pytest.mark.parametrize("method_name", ("row_coordinates", "transform")) 142 | def test_row_coords(self, method_name): 143 | method = getattr(self.pca, method_name) 144 | P = method(self.dataset) 145 | # Test against FactoMineR 146 | F = load_df_from_R("pca$ind$coord") 147 | if self.sup_rows: 148 | F = pd.concat((F, load_df_from_R("pca$ind.sup$coord"))) 149 | np.testing.assert_allclose(F.abs(), P.abs()) 150 | # Test against scikit-learn 151 | if self.sample_weights is None and self.column_weights is None: 152 | S = self.sk_pca.transform(self.dataset[self.pca.feature_names_in_]) 153 | np.testing.assert_allclose(np.abs(S), P.abs()) 154 | 155 | def test_row_cosine_similarities(self): 156 | F = load_df_from_R("pca$ind$cos2") 157 | if self.sup_rows: 158 | F = pd.concat((F, load_df_from_R("pca$ind.sup$cos2"))) 159 | P = self.pca.row_cosine_similarities(self.dataset) 160 | np.testing.assert_allclose(F, P) 161 | 162 | def test_row_contrib(self): 163 | F = load_df_from_R("pca$ind$contrib") 164 | P = self.pca.row_contributions_ 165 | np.testing.assert_allclose(F, P * 100) 166 | 167 | def test_col_coords(self): 168 | F = load_df_from_R("pca$var$coord") 169 | P = self.pca.column_coordinates_ 170 | if self.sup_cols: 171 | P = P.drop(["rank", "points"]) 172 | np.testing.assert_allclose(F.abs(), P.abs()) 173 | 174 | def test_col_cos2(self): 175 | F = load_df_from_R("pca$var$cos2") 176 | P = self.pca.column_cosine_similarities_ 177 | if self.sup_cols: 178 | P = P.drop(["rank", "points"]) 179 | np.testing.assert_allclose(F, P) 180 | 181 | def test_col_contrib(self): 182 | F = load_df_from_R("pca$var$contrib") 183 | P = self.pca.column_contributions_ 184 | np.testing.assert_allclose(F, P * 100) 185 | -------------------------------------------------------------------------------- /tests/test_svd.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | import rpy2.robjects as robjects 6 | from rpy2.robjects import numpy2ri 7 | 8 | from prince import svd 9 | from tests import load_df_from_R 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "n_components, are_rows_weighted, are_columns_weighted", 14 | [ 15 | pytest.param( 16 | n_components, 17 | are_rows_weighted, 18 | are_columns_weighted, 19 | id=f"{n_components=}:{are_rows_weighted=}:{are_columns_weighted=}", 20 | ) 21 | for n_components in [1, 3, 10] 22 | for are_rows_weighted in [False, True] 23 | for are_columns_weighted in [False, True] 24 | ], 25 | ) 26 | class TestSVD: 27 | @pytest.fixture(autouse=True) 28 | def _prepare(self, n_components, are_rows_weighted, 
are_columns_weighted): 29 | self.n_components = n_components 30 | self.are_rows_weighted = are_rows_weighted 31 | self.are_columns_weighted = are_columns_weighted 32 | 33 | self.dataset = np.random.rand(100, 10) 34 | self.row_weights = np.random.rand(100) 35 | self.row_weights /= self.row_weights.sum() 36 | self.column_weights = np.random.rand(10) 37 | 38 | # Fit Prince 39 | self.svd = svd.compute_svd( 40 | X=self.dataset, 41 | row_weights=self.row_weights if are_rows_weighted else None, 42 | column_weights=self.column_weights if are_columns_weighted else None, 43 | n_components=n_components, 44 | n_iter=3, 45 | random_state=42, 46 | engine="scipy", 47 | ) 48 | 49 | # Fit FactoMineR 50 | robjects.r("library('FactoMineR')") 51 | robjects.r.assign("X", numpy2ri.py2rpy(self.dataset)) 52 | robjects.r.assign("row.w", numpy2ri.py2rpy(self.row_weights)) 53 | robjects.r.assign("col.w", numpy2ri.py2rpy(self.column_weights)) 54 | robjects.r("row.w <- as.vector(row.w)") 55 | robjects.r("col.w <- as.vector(col.w)") 56 | args = f"X, ncp={n_components}" 57 | if are_rows_weighted: 58 | args += ", row.w=row.w" 59 | if are_columns_weighted: 60 | args += ", col.w=col.w" 61 | robjects.r(f"svd = svd.triplet({args})") 62 | 63 | def test_U(self): 64 | assert self.svd.U.shape == (100, self.n_components) 65 | if self.are_rows_weighted: 66 | P = self.svd.U 67 | F = load_df_from_R("svd$U") 68 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 69 | 70 | def test_s(self): 71 | assert self.svd.s.shape == (self.n_components,) 72 | if self.are_rows_weighted: 73 | P = self.svd.s 74 | F = robjects.r("svd$vs")[: self.n_components] 75 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 76 | 77 | def test_V(self): 78 | assert self.svd.V.shape == (self.n_components, 10) 79 | P = self.svd.V 80 | F = load_df_from_R("svd$V").T 81 | np.testing.assert_allclose(np.abs(F), np.abs(P)) 82 | --------------------------------------------------------------------------------
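For reference, the call pattern exercised by TestSVD above boils down to the following minimal standalone sketch. The 100x10 random matrix, the keyword arguments, and the U/s/V shape assertions all mirror the fixture; either weight argument may be passed as None to disable that weighting.

import numpy as np

from prince import svd

X = np.random.rand(100, 10)
row_weights = np.random.rand(100)
row_weights /= row_weights.sum()  # the fixture normalises row weights to sum to 1
column_weights = np.random.rand(10)

result = svd.compute_svd(
    X=X,
    row_weights=row_weights,        # or None to leave rows unweighted
    column_weights=column_weights,  # or None to leave columns unweighted
    n_components=3,
    n_iter=3,
    random_state=42,
    engine="scipy",
)

assert result.U.shape == (100, 3)  # left singular vectors
assert result.s.shape == (3,)      # singular values
assert result.V.shape == (3, 10)   # right singular vectors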