├── .gitignore ├── .conda ├── conda_build_config.yaml └── meta.yaml ├── pysrc └── xprec │ ├── __init__.py │ └── linalg.py ├── .dev └── container │ ├── Dockerfile │ └── devcontainer.json ├── .github └── workflows │ ├── conda.yml │ ├── wheels.yml │ └── pytest.yml ├── LICENSE.txt ├── test ├── test_dtype.py ├── test_whitespace.py ├── test_linalg.py ├── test_ufunc.py └── test_mpmath.py ├── README.md ├── csrc ├── dd_linalg.h ├── dd_linalg.c ├── dd_arith.h ├── _dd_linalg.c ├── dd_arith.c └── _dd_ufunc.c ├── QD-LICENSE.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | *~ 3 | \#*\# 4 | 5 | *.pyc 6 | __pycache__/ 7 | build/ 8 | dist/ 9 | *.o 10 | *.so 11 | *.egg-info/ 12 | 13 | notebooks/*.ipynb 14 | 15 | !.gitignore 16 | !/.github/ 17 | !/.editorconfig 18 | -------------------------------------------------------------------------------- /.conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.13 3 | - 3.12 4 | - 3.11 5 | 6 | numpy: 7 | # 1.18 does not build with Python 3.9 8 | #- 1.19 9 | #- 1.20 10 | - 2.1 11 | - 2.2 12 | 13 | pin_run_as_build: 14 | numpy: x.x 15 | -------------------------------------------------------------------------------- /.conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %} 2 | {% set name = "xprec" %} 3 | {% set version = data.get("version") %} 4 | 5 | package: 6 | name: "{{ name|lower }}" 7 | version: "{{ version }}" 8 | 9 | source: 10 | path: ../ 11 | 12 | build: 13 | number: 0 14 | script: "{{ PYTHON }} -m pip install . 
-vv" 15 | 16 | requirements: 17 | buid: 18 | - python {{ python }} 19 | - numpy {{ numpy }} 20 | host: 21 | - python {{ python }} 22 | - numpy {{ numpy }} 23 | run: 24 | - python {{ python }} 25 | - numpy {{ numpy }} 26 | 27 | about: 28 | home: "https://github.com/tuwien-cms/xprec" 29 | license: MIT 30 | summary: "xprec precision numpy extension" 31 | 32 | extra: 33 | recipe-maintainers: 34 | - shinaoka 35 | -------------------------------------------------------------------------------- /pysrc/xprec/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | """ 4 | Extension module for numpy providing the `ddouble` data type. 5 | 6 | Loading this module registers an additional scalar data type `ddouble` with 7 | numpy implementing double-double arithmetic. You can use use the data type 8 | by passing `dtype=xprec.ddouble` to numpy functions. 9 | 10 | Example: 11 | 12 | import numpy as np 13 | from xprec import ddouble 14 | 15 | x = np.arange(5, dtype=ddouble) 16 | print(2 * x) 17 | 18 | """ 19 | __version__ = "1.4.7" 20 | 21 | import numpy as _np 22 | 23 | from . import _dd_ufunc 24 | from . 
import _dd_linalg # needed for matmul 25 | 26 | ddouble = _dd_ufunc.dtype 27 | 28 | 29 | def finfo(dtype): 30 | dtype = _np.dtype(dtype) 31 | try: 32 | finfo_dunder = dtype.type.__finfo__ 33 | except AttributeError: 34 | return _np.finfo(dtype) 35 | else: 36 | return finfo_dunder() 37 | -------------------------------------------------------------------------------- /.dev/container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3 2 | #FROM continuumio/anaconda3:2020.02 3 | 4 | ENV PYTHONUNBUFFERED=1 5 | 6 | RUN apt-get update && \ 7 | DEBIAN_FRONTEND=noninteractive apt-get install -y \ 8 | build-essential \ 9 | curl \ 10 | ca-certificates \ 11 | git \ 12 | zip \ 13 | vim \ 14 | cmake pkg-config gfortran \ 15 | sudo \ 16 | && \ 17 | apt-get clean && rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/* # clean up 18 | 19 | #RUN mkdir /opt/conda/pkgs 20 | #RUN chown 1000:1000 /opt/conda 21 | 22 | # Create non-root user 23 | ARG NB_USER=vscode 24 | ARG NB_UID=1000 25 | RUN useradd -u $NB_UID -m $NB_USER -s /bin/bash && \ 26 | echo 'vscode ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 27 | USER $NB_USER 28 | ENV PATH "/home/${NB_USER}/.local/bin:${PATH}" 29 | ENV PYTHONPATH "/home/${NB_USER}/work/src:${PYTHONPATH}" 30 | 31 | # for vscode 32 | RUN mkdir /home/${NB_USER}/work 33 | 34 | RUN conda config --add pkgs_dirs /home/vscode/.conda/pkgs -------------------------------------------------------------------------------- /.github/workflows/conda.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload conda packages 2 | 3 | # Triggered when a new tag starting with "v" is pushed 4 | on: 5 | push: 6 | tags: 7 | - 'v*' 8 | 9 | jobs: 10 | build: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | # https://github.com/s-weigand/setup-conda/issues/432 15 | os: [ubuntu-latest, windows-2019, macos-latest] 16 | 17 | steps: 18 | - uses: 
actions/checkout@v4 19 | - uses: conda-incubator/setup-miniconda@v3 20 | with: 21 | auto-update-conda: true 22 | - name: Conda info 23 | shell: bash -el {0} 24 | run: conda info 25 | - name: Install dependencies 26 | run: | 27 | conda install conda-build anaconda-client -y 28 | 29 | - name: Build and upload 30 | env: 31 | ANACONDA_API_TOKEN: ${{secrets.ANACONDA_TOKEN}} 32 | run: | 33 | python3 --version 34 | conda config --set anaconda_upload yes 35 | conda build .conda --user SpM-lab 36 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | # Triggered when a new tag starting with "v" is pushed 4 | on: 5 | push: 6 | tags: 7 | - 'v*' 8 | 9 | jobs: 10 | build_sdist: 11 | name: Build distribution 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Examine system 17 | run: pip freeze --all 18 | 19 | - name: Build sdist 20 | run: python setup.py sdist 21 | 22 | - uses: actions/upload-artifact@v4 23 | with: 24 | name: dist 25 | path: dist/xprec-*.tar.gz 26 | 27 | upload_pypi: 28 | name: Upload to PyPI 29 | needs: [build_sdist] 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/download-artifact@v4 33 | with: 34 | name: dist 35 | path: dist 36 | 37 | - uses: pypa/gh-action-pypi-publish@v1.4.2 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | skip_existing: true 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Markus Wallerberger and others 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without 
limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /test/test_dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | import pytest 5 | 6 | from xprec import ddouble 7 | 8 | 9 | COMPATIBLE_DTYPES = [ 10 | np.int8, np.int16, np.int32, np.int64, np.bool_, np.float32, np.float64, 11 | np.uint8, np.uint16, np.uint32, np.uint64, 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize('other', COMPATIBLE_DTYPES) 16 | def test_cast_from(other): 17 | assert np.can_cast(other, ddouble, 'unsafe') 18 | assert np.can_cast(other, ddouble, 'safe') 19 | 20 | x = np.eye(3, dtype=other) 21 | y = x.astype(ddouble) 22 | assert (x == y).all() 23 | 24 | 25 | @pytest.mark.parametrize('other', COMPATIBLE_DTYPES) 26 | def test_cast_to(other): 27 | assert np.can_cast(ddouble, other, 'unsafe') 28 | assert not np.can_cast(ddouble, other, 'safe') 29 | 30 | x = np.eye(3, dtype=ddouble) 31 | y = x.astype(other) 32 | assert (x == y).all() 33 | 34 | 35 | def 
test_i64(): 36 | x = np.int64((1 << 62) + 1) 37 | assert x == x.astype(ddouble).astype(x.dtype) 38 | 39 | y = -x 40 | assert (y + 1) == (y.astype(ddouble) + 1).astype(x.dtype) 41 | 42 | x = x.astype(np.uint64) 43 | assert x == x.astype(ddouble).astype(x.dtype) 44 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: xprec python package 2 | 3 | on: 4 | push: 5 | branches: 6 | mainline 7 | pull_request: 8 | branches: 9 | mainline 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | include: 17 | - os: ubuntu-latest 18 | numpy-version: auto 19 | python-version: 3.9 20 | - os: ubuntu-latest 21 | numpy-version: 2.0 22 | python-version: 3.11 23 | - os: windows-latest 24 | numpy-version: auto 25 | python-version: 3.9 26 | - os: macos-latest 27 | numpy-version: auto 28 | python-version: 3.11 29 | steps: 30 | - uses: actions/checkout@v4 31 | 32 | - name: Set up python ${{ matrix.python-version }} 33 | uses: actions/setup-python@v5 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | 37 | - name: Install numpy ${{ matrix.numpy-version }} 38 | if: ${{ matrix.numpy-version != 'auto' }} 39 | run: | 40 | pip install numpy==${{ matrix.numpy-version }} 41 | 42 | - name: Install package with testing dependencies 43 | run: | 44 | pip install -v .[test] 45 | 46 | - name: Test with pytest 47 | run: | 48 | pytest 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Library for double-double arithmetic calculation 2 | ================================================ 3 | 4 | Extension module for numpy providing the `ddouble` data type. 
5 | 6 | Loading this module registers an additional scalar data type `ddouble` with 7 | numpy implementing double-double arithmetic. You can use the data type 8 | by passing `dtype=xprec.ddouble` to numpy functions. 9 | 10 | The `xprec.linalg` module provides some linear algebra subroutines, in 11 | particular QR, RRQR, SVD and truncated SVD. 12 | 13 | Installation 14 | ------------ 15 | 16 | $ pip install xprec 17 | 18 | Quickstart 19 | ---------- 20 | 21 | import numpy as np 22 | x = np.linspace(0, np.pi) 23 | 24 | # import double-double precision data type 25 | from xprec import ddouble 26 | x = x.astype(ddouble) 27 | y = x * x + 1 28 | z = np.sin(x) 29 | 30 | # do some linalg 31 | import xprec.linalg 32 | A = np.vander(np.linspace(-1, 1, 80, dtype=ddouble), 150) 33 | U, s, VT = xprec.linalg.svd(A) 34 | 35 | Troubleshooting 36 | --- 37 | 38 | * icc
39 | You may suffer from a long runtime when xprec is built with icc. If you encounter this problem, please try the following: 40 | 41 | ``` 42 | CFLAGS="-fp-model=precise" pip install xprec 43 | ``` 44 | 45 | Licence 46 | ------- 47 | The xprec library is 48 | Copyright (C) 2021 Markus Wallerberger. 49 | Licensed under the MIT license (see LICENSE.txt). 50 | 51 | Contains code from the QD library, which is 52 | Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey. 53 | Released under a modified BSD license (see QD-LICENSE.txt). 54 | -------------------------------------------------------------------------------- /.dev/container/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or the definition README at 2 | // https://github.com/microsoft/vscode-dev-containers/tree/master/containers/docker-existing-dockerfile 3 | { 4 | "name": "Existing Dockerfile", 5 | // Sets the run context to one level up instead of the .devcontainer folder. 6 | "context": "..", 7 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 8 | "dockerFile": "./Dockerfile", 9 | // The optional 'runArgs' property can be used to specify additional runtime arguments. 10 | "runArgs": [], 11 | // Use 'settings' to set *default* container specific settings.json values on container create. 12 | // You can edit these settings after create using File > Preferences > Settings > Remote. 13 | // Uncomment the next line if you want to publish any ports. 14 | // "appPort": [], 15 | // Uncomment the next line to run commands after the container is created - for example installing git. 16 | // "postCreateCommand": "apt-get update && apt-get install -y git", 17 | // Add the IDs of extensions you want installed when the container is created in the array below. 
18 | "extensions": [ 19 | "ms-azuretools.vscode-docker", 20 | "mutantdino.resourcemonitor", 21 | "shardulm94.trailing-spaces", 22 | "cliffordfajardo.hightlight-selections-vscode", 23 | "wdawson.better-kill-ring", 24 | "oderwat.indent-rainbow", 25 | "github.vscode-pull-request-github", 26 | "mhutchie.git-graph", 27 | "donjayamanne.githistory", 28 | "eamodio.gitlens", 29 | "bungcip.better-toml", 30 | "usernamehw.errorlens", 31 | "ms-vscode.live-server", 32 | "christian-kohler.path-intellisense", 33 | "ms-python.python", 34 | ], 35 | "remoteUser": "vscode", 36 | "workspaceFolder": "/home/vscode/work", 37 | "workspaceMount": "src=${localWorkspaceFolder},dst=/home/vscode/work,type=bind", 38 | } 39 | -------------------------------------------------------------------------------- /test/test_whitespace.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | HEREPATH = os.path.abspath(os.path.dirname(__file__)) 4 | print("HEREPATH", HEREPATH) 5 | ROOTDIR = os.path.abspath(os.path.join(HEREPATH, os.path.pardir)) 6 | PYSRCDIR = os.path.join(ROOTDIR, "pysrc", "xprec") 7 | CSCRDIR = os.path.join(ROOTDIR, "csrc") 8 | 9 | def check_whitespace(files): 10 | errors = [] 11 | blank = 0 12 | lineno = 0 13 | line = "" 14 | def add_error(fmt, *params): 15 | errors.append((fname, lineno, line, fmt.format(*params))) 16 | 17 | for fname in files: 18 | with open(fname, "r") as file: 19 | line = "" 20 | for lineno, line in enumerate(file, start=1): 21 | if line[-1:] != '\n': 22 | add_error("file must end in blank line") 23 | line = line[:-1] 24 | if line: 25 | blank = 0 26 | else: 27 | blank += 1 28 | if line[-1:] == '\r': 29 | add_error("file must only have unix line endings") 30 | if line[-1:] == ' ': 31 | add_error("line ends in whitespace") 32 | if '\t' in line: 33 | add_error("line contains tab characters") 34 | if len(line) > 90: 35 | add_error("line is too long: {:d} chars", len(line)) 36 | # end of file 37 | if blank != 0: 38 | 
add_error("file has {:d} superflouos blank lines", blank) 39 | 40 | msg = "" 41 | for fname, lineno, line, lmsg in errors: 42 | msg += "{}:{}: {}\n".format(fname.name, lineno, lmsg) 43 | if msg: 44 | raise ValueError("Whitespace errors\n" + msg) 45 | 46 | 47 | def all_files(path, ext): 48 | for entry in os.scandir(path): 49 | if entry.is_file() and entry.name.endswith(ext): 50 | yield entry 51 | 52 | 53 | def test_ws_testdir(): 54 | check_whitespace(all_files(HEREPATH, ".py")) 55 | 56 | 57 | def test_ws_setup(): 58 | check_whitespace(all_files(ROOTDIR, ".py")) 59 | 60 | 61 | def test_ws_pysrcdir(): 62 | check_whitespace(all_files(PYSRCDIR, ".py")) 63 | 64 | 65 | def test_ws_csrcdir(): 66 | check_whitespace(all_files(CSCRDIR, ".c")) 67 | check_whitespace(all_files(CSCRDIR, ".h")) 68 | -------------------------------------------------------------------------------- /csrc/dd_linalg.h: -------------------------------------------------------------------------------- 1 | /* Double-double linear algebra library 2 | * 3 | * Implementations were partly inspired by LAPACK, partly from Fredrik 4 | * Johansson's excellent MPMATH library. 
5 | * 6 | * Copyright (C) 2021 Markus Wallerberger and others 7 | * SPDX-License-Identifier: MIT 8 | */ 9 | #pragma once 10 | #include "dd_arith.h" 11 | 12 | /** 13 | * Apply Givens rotation to vector: 14 | * 15 | * [ a ] = [ c s ] [ x ] 16 | * [ b ] [ -s c ] [ y ] 17 | */ 18 | static inline void lmul_givensq( 19 | ddouble *a, ddouble *b, ddouble c, ddouble s, ddouble x, ddouble y) 20 | { 21 | *a = addww(mulww(c, x), mulww(s, y)); 22 | *b = subww(mulww(c, y), mulww(s, x)); 23 | } 24 | 25 | /** Compute 2-norm of a vector */ 26 | ddouble normw(const ddouble *x, long nn, long sxn); 27 | 28 | /** 29 | * Perform a rank-one update of a `ii` times `jj` matrix: 30 | * 31 | * A[i, j] += v[i] * w[j] 32 | */ 33 | void rank1updateq(ddouble *a, long ais, long ajs, const ddouble *v, long vs, 34 | const ddouble *w, long ws, long ii, long jj); 35 | 36 | /** 37 | * Compute Givens rotation `R` matrix that satisfies: 38 | * 39 | * [ c s ] [ f ] [ r ] 40 | * [ -s c ] [ g ] = [ 0 ] 41 | */ 42 | void givensw(ddouble f, ddouble g, ddouble *c, ddouble *s, ddouble *r); 43 | 44 | /** 45 | * Compute Householder reflector `H[tau, v]`, defined as: 46 | * 47 | * H[tau, v] = I - tau * v @ v.T 48 | * 49 | * that, when applied to a given `x`, zeros out all but the first component. 50 | * The scaling factor `tau` is returned, while `v` is written. 
51 | */ 52 | ddouble householderw(const ddouble *x, ddouble *v, long nn, long sx, long sv); 53 | 54 | /** 55 | * Perform the SVD of an arbitrary two-by-two matrix: 56 | * 57 | * [ a11 a12 ] = [ cu -su ] [ smax 0 ] [ cv sv ] 58 | * [ a21 a22 ] [ su cu ] [ 0 smin ] [ -sv cv ] 59 | */ 60 | void svd_2x2(ddouble a11, ddouble a12, ddouble a21, ddouble a22, ddouble *smin, 61 | ddouble *smax, ddouble *cv, ddouble *sv, ddouble *cu, ddouble *su); 62 | 63 | 64 | 65 | ddouble jacobi_sweep(ddouble *u, long sui, long suj, ddouble *vt, long svi, 66 | long svj, long ii, long jj); 67 | 68 | 69 | void golub_kahan_chaseq(ddouble *d, long sd, ddouble *e, long se, long ii, 70 | ddouble *rot); 71 | -------------------------------------------------------------------------------- /QD-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Contains code from the QD library, which is: 2 | Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey. 3 | 4 | This License Agreement is entered into by The Regents of the University of 5 | California, Department of Energy contract-operators of the Lawrence Berkeley 6 | National Laboratory, 1 Cyclotron Road, Berkeley, CA 94720 (“Berkeley Lab”), 7 | and the entity listed below (“you” or "Licensee"). 8 | 9 | 1. Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | (1) Redistributions of source code must retain the copyright notice, this 13 | list of conditions and the following disclaimer. 14 | 15 | (2) Redistributions in binary form must reproduce the copyright notice, 16 | this list of conditions and the following disclaimer in the 17 | documentation and/or other materials provided with the distribution. 18 | 19 | (3) Neither the name of the University of California, Lawrence Berkeley 20 | National Laboratory, U.S. Dept. 
of Energy nor the names of its 21 | contributors may be used to endorse or promote products derived from 22 | this software without specific prior written permission. 23 | 24 | 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 25 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 29 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 30 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 31 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 32 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 33 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 34 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | 36 | 3. You are under no obligation whatsoever to provide any bug fixes, patches, 37 | or upgrades to the features, functionality or performance of the source 38 | code ("Enhancements") to anyone; however, if you choose to make your 39 | Enhancements available either publicly, or directly to Lawrence Berkeley 40 | National Laboratory, without imposing a separate written license agreement 41 | for such Enhancements, then you hereby grant the following license: a 42 | non-exclusive, royalty-free perpetual license to install, use, modify, 43 | prepare derivative works, incorporate into other computer software, 44 | distribute, and sublicense such enhancements or derivative works thereof, 45 | in binary and source code form. 
46 | -------------------------------------------------------------------------------- /test/test_linalg.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | 5 | import xprec 6 | import xprec.linalg 7 | from xprec import ddouble 8 | 9 | 10 | def test_householder_vec(): 11 | rng = np.random.RandomState(4711) 12 | xd = rng.random_sample(20) 13 | xq = np.array(xd, dtype=ddouble) 14 | 15 | betaq, vq = xprec.linalg.householder(xq) 16 | eq = xq - betaq * vq * (vq @ xq) 17 | np.testing.assert_allclose(eq[1:].astype(float), 0, atol=1e-31) 18 | 19 | 20 | def test_bidiag(): 21 | rng = np.random.RandomState(4711) 22 | m, n = 7, 5 23 | A = rng.normal(size=(m,n)).astype(ddouble) 24 | Q, B, RT = xprec.linalg.bidiag(A) 25 | diff = Q @ B @ RT - A 26 | 27 | # FIXME: too large precision goals 28 | np.testing.assert_allclose(diff.astype(float), 0, atol=1e-29) 29 | 30 | 31 | def test_svd(): 32 | rng = np.random.RandomState(4711) 33 | A = rng.randn(100, 84) 34 | 35 | U, s, VT = xprec.linalg.svd(A.astype(xprec.ddouble), full_matrices=False) 36 | R = U * s @ VT - A 37 | np.testing.assert_allclose(R.astype(float), 0, atol=5e-29, rtol=0) 38 | 39 | _, sx, _ = np.linalg.svd(A.astype(float), full_matrices=False) 40 | np.testing.assert_allclose(s, sx, atol=1e-14 * sx[0], rtol=0) 41 | 42 | 43 | def test_givens(): 44 | f, g = np.array([3.0, -2.0], dtype=ddouble) 45 | c, s, r = xprec.linalg.givens_rotation(f, g) 46 | 47 | R = np.reshape([c, s, -s, c], (2,2)) 48 | v = np.hstack([f, g]) 49 | w = np.hstack([r, np.zeros_like(r)]) 50 | res = R @ v - w 51 | np.testing.assert_allclose(res.astype(float), 0, atol=1e-31) 52 | 53 | 54 | def test_givens(): 55 | a = np.array([3.0, -2.0], dtype=ddouble) 56 | r, G = xprec.linalg.givens(a) 57 | diff = r - G @ a 58 | np.testing.assert_allclose(diff.astype(float), 0, atol=1e-31) 59 | 60 | 61 | def test_qr(): 62 | A = 
np.vander(np.linspace(-1, 1, 60), 80).astype(ddouble) 63 | Q, R = xprec.linalg.qr(A) 64 | I_m = np.eye(60) 65 | D = Q @ Q.T - I_m 66 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 67 | D = Q @ R - A 68 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 69 | 70 | 71 | def test_qr_pivot(): 72 | A = np.vander(np.linspace(-1, 1, 60), 80).astype(ddouble) 73 | Q, R, piv = xprec.linalg.rrqr(A) 74 | I_m = np.eye(60) 75 | D = Q @ Q.T - I_m 76 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 77 | 78 | D = Q @ R - A[:,piv] 79 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 80 | 81 | Rdiag = np.abs(R.diagonal()) 82 | assert (Rdiag[1:] <= Rdiag[:-1]).all() 83 | 84 | 85 | def test_jacobi(): 86 | A = np.vander(np.linspace(-1, 1, 60), 80).astype(ddouble) 87 | U, s, VT = xprec.linalg.svd_trunc(A) 88 | np.testing.assert_allclose((U * s) @ VT - A, 0.0, atol=5e-30) 89 | -------------------------------------------------------------------------------- /test/test_ufunc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | import xprec 5 | 6 | 7 | def _compare_ufunc(ufunc, *args, ulps=1): 8 | fx_d = ufunc(*args) 9 | fx_q = ufunc(*(a.astype(xprec.ddouble) for a in args)).astype(float) 10 | 11 | # Ensure relative accuracy of 2 ulps 12 | np.testing.assert_array_almost_equal_nulp(fx_d, fx_q, ulps) 13 | 14 | 15 | def test_log(): 16 | x = np.geomspace(1e-300, 1e300, 1953) 17 | _compare_ufunc(np.log, x) 18 | 19 | zeroq = xprec.ddouble.type(0) 20 | assert np.isinf(np.log(zeroq)) 21 | 22 | 23 | def test_sqrt(): 24 | x = np.geomspace(1e-300, 1e300, 1953) 25 | _compare_ufunc(np.sqrt, x) 26 | 27 | 28 | def test_exp(): 29 | x = np.geomspace(1e-300, 700, 4953) 30 | x = np.hstack([-x[::-1], 0, x]) 31 | _compare_ufunc(np.exp, x) 32 | 33 | # Unfortunately, on Windows expm1 is less precise, so we need to 
increase 34 | # the tolerance slightly 35 | _compare_ufunc(np.expm1, x, ulps=2) 36 | 37 | 38 | def test_cosh(): 39 | x = np.geomspace(1e-300, 700, 4953) 40 | x = np.hstack([-x[::-1], 0, x]) 41 | _compare_ufunc(np.cosh, x) 42 | _compare_ufunc(np.sinh, x) 43 | 44 | thousand = xprec.ddouble.type(1000) 45 | assert np.isinf(np.cosh(thousand)) 46 | assert np.isinf(np.cosh(-thousand)) 47 | 48 | 49 | def test_hypot(): 50 | x = np.geomspace(1e-300, 1e260, 47) 51 | x = np.hstack([-x[::-1], 0, x]) 52 | _compare_ufunc(np.hypot, x[:,None], x[None,:]) 53 | 54 | 55 | def test_modf(): 56 | ulps = 1 57 | x = np.linspace(-100, 100, 100) 58 | x_d = x.astype(xprec.ddouble) 59 | 60 | fx_d = np.modf(x) 61 | fx_q = np.modf(x_d) 62 | 63 | # Ensure relative accuracy of 1 ulp 64 | np.testing.assert_array_almost_equal_nulp(fx_d[0], fx_q[0].astype(float), ulps) 65 | np.testing.assert_array_almost_equal_nulp(fx_d[1], fx_q[1].astype(float), ulps) 66 | 67 | 68 | def test_power(): 69 | x = np.linspace(0, 100, 100) 70 | _compare_ufunc(np.power, x[:,None], x[None,:]) 71 | 72 | 73 | def test_arctan2(): 74 | x = np.linspace(-100, 100, 100) 75 | _compare_ufunc(np.arctan2, x[:,None], x[None,:], ulps=2) 76 | 77 | 78 | def test_arcsin(): 79 | x = np.linspace(-1, 1, 100) 80 | _compare_ufunc(np.arcsin, x, ulps=2) 81 | 82 | 83 | def test_arccos(): 84 | x = np.linspace(-1, 1, 100) 85 | _compare_ufunc(np.arccos, x) 86 | 87 | 88 | def test_arctan(): 89 | x = np.linspace(-100, 100, 100) 90 | _compare_ufunc(np.arctan, x) 91 | 92 | 93 | def test_arccosh(): 94 | x = np.linspace(1, 100, 100) 95 | _compare_ufunc(np.arccosh, x) 96 | 97 | 98 | def test_arcsinh(): 99 | x = np.linspace(-100, 100, 100) 100 | _compare_ufunc(np.arcsinh, x) 101 | 102 | 103 | def test_arctanh(): 104 | x = np.linspace(-0.99, 0.99, 100) 105 | _compare_ufunc(np.arctanh, x) 106 | -------------------------------------------------------------------------------- /test/test_mpmath.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | import pytest 5 | 6 | import xprec 7 | 8 | EPS = xprec.finfo(xprec.ddouble).eps 9 | 10 | try: 11 | import mpmath 12 | except ImportError: 13 | pytest.skip("No mpmath library avaiable", allow_module_level=True) 14 | else: 15 | mpmath.mp.prec = 120 16 | 17 | 18 | def mpf_for_xprec(x): 19 | """Converts xprec.ddouble array to array of mpmath mpf scalars""" 20 | x = np.asarray(x) 21 | if x.dtype != xprec.ddouble: 22 | raise ValueError("dtype shall be ddouble") 23 | 24 | x_flat = x.ravel() 25 | x_hi = x_flat.astype(float) 26 | x_lo = (x_flat - x_hi).astype(float) 27 | x_mpf = np.array(list(map(mpmath.mpf, x_hi))) 28 | x_mpf += x_lo 29 | return x_mpf.reshape(x.shape) 30 | 31 | 32 | def map_mpmath(fn, x): 33 | x = np.asarray(x) 34 | x_flat = x.ravel() 35 | y_flat = np.array(list(map(fn, x_flat)), dtype=object) 36 | y = y_flat.reshape(x.shape) 37 | return y 38 | 39 | 40 | def check_unary(mpmath_fn, numpy_fn, x, rtol): 41 | y_ref = map_mpmath(mpmath_fn, x) 42 | y_our = numpy_fn(x.astype(xprec.ddouble)) 43 | y_float = y_ref.astype(float) 44 | 45 | diff = (y_ref - mpf_for_xprec(y_our)).astype(float) 46 | ok = np.abs(diff) <= rtol * np.abs(y_float) 47 | if not ok.all(): 48 | x = x[~ok] 49 | y_float = y_float[~ok] 50 | y_our = y_our[~ok] 51 | diff = diff[~ok] 52 | reldiff = diff / np.abs(y_float) 53 | 54 | msg = f"{'x':>13s} {'mpmath':>13s} {'xprec':>13s} {'rel diff':>13s}\n" 55 | msg += "\n".join(f"{xi:13g} {y_refi:13g} {y_ouri:13g} {reldiffi:13g}" 56 | for xi, y_refi, y_ouri, reldiffi, _ 57 | in zip(x, y_float, y_our, reldiff, range(10)) 58 | ) 59 | raise ValueError(f"not equal to rtol = {rtol:3g}\n" + msg) 60 | 61 | 62 | 63 | def test_sqrt(): 64 | # Once the low part of the ddouble becomes a denormal number, we 65 | # are in trouble, so we truncate the lower end of the range by 66 
| # another 16 digits 67 | x = np.geomspace(1e-292, 1e307, 1953) 68 | check_unary(mpmath.sqrt, np.sqrt, x, 2*EPS) 69 | 70 | 71 | def test_log(): 72 | x = np.reciprocal(np.geomspace(1e-292, 1e307, 1953)) 73 | check_unary(mpmath.log, np.log, x, 70 * EPS) 74 | 75 | 76 | def test_exp(): 77 | x = np.geomspace(1e-280, 670, 1511) 78 | x = np.hstack([-x[::-1], 0, x]) 79 | check_unary(mpmath.exp, np.exp, x, 60 * EPS) 80 | check_unary(mpmath.expm1, np.expm1, x, 60 * EPS) 81 | 82 | check_unary(mpmath.sinh, np.sinh, x, 60 * EPS) 83 | check_unary(mpmath.cosh, np.cosh, x, 60 * EPS) 84 | check_unary(mpmath.tanh, np.tanh, x, 60 * EPS) 85 | 86 | 87 | def test_sincos(): 88 | x = np.geomspace(1e-280, 4.8 * np.pi, 1511) 89 | x = np.hstack([-x[::-1], 0, x]) 90 | check_unary(mpmath.sin, np.sin, x, 2 * EPS) 91 | check_unary(mpmath.cos, np.cos, x, 2 * EPS) 92 | -------------------------------------------------------------------------------- /csrc/dd_linalg.c: -------------------------------------------------------------------------------- 1 | /* Double-double linear algebra library 2 | * 3 | * Implementations were partly inspired by LAPACK, partly from Fredrik 4 | * Johansson's excellent MPMATH library. 
 *
 * Copyright (C) 2021 Markus Wallerberger and others
 * SPDX-License-Identifier: MIT
 */
#include "dd_linalg.h"

// 2**500 and 2**(-500);
static const double LARGE = 3.273390607896142e+150;
static const double INV_LARGE = 3.054936363499605e-151;

/* Euclidean norm of the strided vector x (nn elements, stride sxn), with
 * each element pre-multiplied by `scaling` (a power of two) so that the
 * intermediate sum of squares stays in range. */
static ddouble normq_scaled(const ddouble *x, long nn, long sxn,
                            double scaling)
{
    ddouble sum = Q_ZERO;
    for (long n = 0; n < nn; ++n, x += sxn) {
        ddouble curr = mul_pwr2(*x, scaling);
        sum = addww(sum, sqrw(curr));
    };
    /* undo the scaling: ||x|| = sqrt(sum) / scaling */
    return mul_pwr2(sqrtw(sum), 1.0/scaling);
}

/* Euclidean norm of a strided vector; retries with a rescaled computation
 * if the unscaled result over- or underflowed. */
ddouble normw(const ddouble *x, long nn, long sxn)
{
    ddouble sum = normq_scaled(x, nn, sxn, 1.0);

    // fall back to other routines in case of over/underflow
    if (sum.hi > LARGE)
        return normq_scaled(x, nn, sxn, INV_LARGE);
    else if (sum.hi < INV_LARGE)
        return normq_scaled(x, nn, sxn, LARGE);
    else
        return sum;
}

/* Compute a Householder reflector for x (nn elements, stride sx): on exit
 * v (stride sv) holds the reflector with the convention v[0] == 1 and the
 * return value is the scalar factor tau.  Returns tau == 0 (identity) when
 * the tail of x is already zero. */
ddouble householderw(const ddouble *x, ddouble *v, long nn, long sx, long sv)
{
    if (nn == 0)
        return Q_ZERO;

    ddouble norm_x = normw(x + sx, nn - 1, sx);
    if (iszerow(norm_x))
        return Q_ZERO;

    ddouble alpha = *x;
    /* NOTE(review): beta carries the *same* sign as alpha, so the
     * subtraction `beta - alpha` below is cancellation-prone; LAPACK's
     * xLARFG flips the sign to avoid this -- confirm this is intended. */
    ddouble beta = copysignww(hypotww(alpha, norm_x), alpha);

    ddouble diff = subww(beta, alpha);
    ddouble tau = divww(diff, beta);
    ddouble scale = reciprocalw(negw(diff));

    v[0] = Q_ONE;
    for (long n = 1; n != nn; ++n)
        v[n * sv] = mulww(scale, x[n * sx]);
    return tau;
}

/* a += v w^T for the (ii x jj) matrix a with strides (ais, ajs); note that
 * despite the name this *accumulates* the rank-1 update into a. */
void rank1updateq(ddouble *a, long ais, long ajs, const ddouble *v, long vs,
                  const ddouble *w, long ws, long ii, long jj)
{
    #pragma omp parallel for collapse(2)
    for (long i = 0; i < ii; ++i) {
        for (long j = 0; j < jj; ++j) {
            ddouble tmp = mulww(v[i * vs], w[j * ws]);
            a[i * ais + j * ajs] = addww(a[i * ais + j * ajs], tmp);
        }
    }
}

/* Construct a Givens rotation (c, s) and radius r such that
 *     [ c  s ] [ f ]   [ r ]
 *     [-s  c ] [ g ] = [ 0 ]
 */
void givensw(ddouble f, ddouble g, ddouble *c, ddouble *s, ddouble *r)
{
    /* ACM Trans. Math. Softw. 28(2), 206, Alg 1 */
    if (iszerow(g)) {
        *c = Q_ONE;
        *s = Q_ZERO;
        *r = f;
    } else if (iszerow(f)) {
        *c = Q_ZERO;
        /* NOTE(review): this stores signbit(g) (0 or 1) in s.hi, i.e.
         * s = 0 for positive g and s = +1 for negative g, whereas the
         * reference algorithm needs s = +/-1 here.  Confirm against the
         * cited paper / the convention used by lmul_givensq. */
        *s = (ddouble) {signbitw(g), 0.0};
        *r = absw(g);
    } else {
        *r = copysignww(hypotww(f, g), f);

        /* This may come at a slight loss of precision, however, we should
         * not really have to care ...
         */
        ddouble inv_r = reciprocalw(*r);
        *c = mulww(f, inv_r);
        *s = mulww(g, inv_r);
    }
}

/* SVD of the upper triangular 2x2 matrix [[f, g], [0, h]] (cf. LAPACK's
 * xLASV2):
 *
 *     [ f  g ]   [ cu  -su ] [ smax  0    ] [ cv   sv ]
 *     [ 0  h ] = [ su   cu ] [ 0     smin ] [ -sv  cv ]
 *
 * Pass cv == NULL to skip computation of the rotations. */
static void svd_tri2x2(
        ddouble f, ddouble g, ddouble h, ddouble *smin, ddouble *smax,
        ddouble *cv, ddouble *sv, ddouble *cu, ddouble *su)
{
    ddouble fa = absw(f);
    ddouble ga = absw(g);
    ddouble ha = absw(h);
    bool compute_uv = cv != NULL;

    if (lessww(fa, ha)) {
        // switch h <-> f, cu <-> sv, cv <-> su
        svd_tri2x2(h, g, f, smin, smax, su, cu, sv, cv);
        return;
    }
    if (iszerow(ga)) {
        // already diagonal
        *smin = ha;
        *smax = fa;
        if (compute_uv) {
            *cu = Q_ONE;
            *su = Q_ZERO;
            *cv = Q_ONE;
            *sv = Q_ZERO;
        }
        return;
    }
    if (fa.hi < Q_EPS.hi * ga.hi) {
        // ga is very large
        *smax = ga;
        /* choose the order of operations so that neither intermediate
         * quotient over/underflows */
        if (ha.hi > 1.0)
            *smin = divww(fa, divww(ga, ha));
        else
            *smin = mulww(divww(fa, ga), ha);
        if (compute_uv) {
            *cu = Q_ONE;
            *su = divww(h, g);
            *cv = Q_ONE;
            *sv = divww(f, g);
        }
        return;
    }
    // normal case
    ddouble fmh = subww(fa, ha);
    ddouble d = divww(fmh, fa);
    ddouble q = divww(g, f);
    ddouble s = subdw(2.0, d);
    ddouble spw = hypotww(q, s);
    ddouble dpw = hypotww(d, q);
    ddouble a = mul_pwr2(addww(spw, dpw), 0.5);
    *smin = absw(divww(ha, a));
    *smax = absw(mulww(fa, a));

    if (compute_uv) {
        ddouble tmp = addww(divww(q, addww(spw, s)),
                            divww(q, addww(dpw, d)));
        tmp = mulww(tmp, adddw(1.0, a));
        ddouble tt = hypotwd(tmp, 2.0);
        *cv = divdw(2.0, tt);
        *sv = divww(tmp, tt);
        *cu = divww(addww(*cv, mulww(*sv, q)), a);
        *su = divww(mulww(divww(h, f), *sv), a);
    }
}

/* SVD of a general 2x2 matrix: first rotate to triangular form, then defer
 * to svd_tri2x2 and fold the extra rotation into the left transform.
 * Pass cv == NULL to skip computation of the rotations. */
void svd_2x2(ddouble a11, ddouble a12, ddouble a21, ddouble a22, ddouble *smin,
             ddouble *smax, ddouble *cv, ddouble *sv, ddouble *cu, ddouble *su)
{
    bool compute_uv = cv != NULL;
    if(iszerow(a21))
        return svd_tri2x2(a11, a12, a22, smin, smax, cv, sv, cu, su);

    /* First, we use a givens rotation  Rx
     *   [  cx   sx ] [ a11  a12 ]   [ rx  a12' ]
     *   [ -sx   cx ] [ a21  a22 ] = [ 0   a22' ]
     */
    ddouble cx, sx, rx;
    givensw(a11, a21, &cx, &sx, &rx);
    a11 = rx;
    a21 = Q_ZERO;
    lmul_givensq(&a12, &a22, cx, sx, a12, a22);

    /* Next, use the triangular routine
     *   [ f  g ]   [  cu -su ] [ smax    0 ] [  cv  sv ]
     *   [ 0  h ] = [  su  cu ] [    0 smin ] [ -sv  cv ]
     */
    svd_tri2x2(a11, a12, a22, smin, smax, cv, sv, cu, su);

    /* Finally, update the LHS (U) transform as follows:
     *   [  cx -sx ] [  cu -su ]   [  cu' -su' ]
     *   [  sx  cx ] [  su  cu ] = [  su'  cu' ]
     */
    if (compute_uv)
        lmul_givensq(cu, su, cx, negw(sx), *cu, *su);
}

/* One full one-sided Jacobi sweep over the leading jj columns of the
 * (ii x jj) matrix u, accumulating the right rotations into vt.  Returns
 * the Frobenius norm of the off-diagonal elements seen during the sweep
 * (a convergence measure), or NaN if the matrix is not tall (ii < jj). */
ddouble jacobi_sweep(ddouble *u, long sui, long suj, ddouble *vt, long svi,
                     long svj, long ii, long jj)
{
    ddouble _cu, _su, cv, sv, _smin, _smax;
    ddouble offd = Q_ZERO;

    if (ii < jj)
        return nanw();

    // Note that the inner loop only runs over the square portion!
    for (long i = 0; i < jj - 1; ++i) {
        for (long j = i + 1; j < jj; ++j) {
            // Construct the matrix to be diagonalized
            ddouble Hii = Q_ZERO, Hij = Q_ZERO, Hjj = Q_ZERO;
            for (long k = 0; k != ii; ++k) {
                ddouble u_ki = u[k * sui + i * suj];
                ddouble u_kj = u[k * sui + j * suj];
                Hii = addww(Hii, mulww(u_ki, u_ki));
                Hij = addww(Hij, mulww(u_ki, u_kj));
                Hjj = addww(Hjj, mulww(u_kj, u_kj));
            }
            offd = addww(offd, sqrw(Hij));

            // diagonalize
            svd_2x2(Hii, Hij, Hij, Hjj, &_smin, &_smax, &cv, &sv, &_cu, &_su);

            // apply rotation to VT
            for (long k = 0; k < jj; ++k) {
                ddouble *vt_ik = &vt[i * svi + k * svj];
                ddouble *vt_jk = &vt[j * svi + k * svj];
                lmul_givensq(vt_ik, vt_jk, cv, sv, *vt_ik, *vt_jk);
            }

            // apply transposed rotation to U
            for (long k = 0; k < ii; ++k) {
                ddouble *u_ki = &u[k * sui + i * suj];
                ddouble *u_kj = &u[k * sui + j * suj];
                lmul_givensq(u_ki, u_kj, cv, sv, *u_ki, *u_kj);
            }
        }
    }
    offd = sqrtw(offd);
    return offd;
}

/* Wilkinson-style shift for the Golub-Kahan step: the singular value of the
 * trailing 2x2 block that is closest to the last diagonal entry d2. */
static ddouble gk_shift(ddouble d1, ddouble e1, ddouble d2)
{
    /* Get singular values of 2x2 triangular matrix formed from the lower
     * right corner in the array:
     *
     *     [ d[ii-2]  e[ii-2] ]
     *     [ 0        d[ii-1] ]
     */
    ddouble smin, smax;
    svd_tri2x2(d1, e1, d2, &smin, &smax, NULL, NULL, NULL, NULL);

    ddouble smin_dist = absw(subww(smin, d2));
    ddouble smax_dist = absw(subww(smax, d2));
    return lessww(smin_dist, smax_dist) ? smin : smax;
}

/* One implicit-shift Golub-Kahan QR chase on the bidiagonal matrix with
 * diagonal d (ii entries, stride sd) and superdiagonal e (stride se).
 * The Givens rotations applied on the right and left are written to rot,
 * four values (cos_r, sin_r, cos_l, sin_l) per chased column, so that the
 * caller can accumulate them into the singular-vector matrices. */
void golub_kahan_chaseq(ddouble *d, long sd, ddouble *e, long se, long ii,
                        ddouble *rot)
{
    if (ii < 2)
        return;

    ddouble shift = gk_shift(d[(ii-2)*sd], e[(ii-2)*se], d[(ii-1)*sd]);
    ddouble g = e[0];
    /* f = (|d[0]| - shift) * (sign(d[0]) + shift/d[0]), the shifted leading
     * entry that seeds the bulge */
    ddouble f = addww(copysigndw(1.0, d[0]), divww(shift, d[0]));
    f = mulww(f, subww(absw(d[0]), shift));

    for (long i = 0; i < (ii - 1); ++i) {
        /* rotation from the right: chases the bulge out of row i */
        ddouble r, cosr, sinr;
        givensw(f, g, &cosr, &sinr, &r);
        if (i != 0)
            e[(i-1)*se] = r;

        lmul_givensq(&f, &e[i*se], cosr, sinr, d[i*sd], e[i*se]);
        lmul_givensq(&g, &d[(i+1)*sd], cosr, sinr, Q_ZERO, d[(i+1)*sd]);
        *(rot++) = cosr;
        *(rot++) = sinr;

        /* rotation from the left: chases the bulge out of column i */
        ddouble cosl, sinl;
        givensw(f, g, &cosl, &sinl, &r);
        d[i*sd] = r;
        lmul_givensq(&f, &d[(i+1)*sd], cosl, sinl, e[i*se], d[(i+1)*sd]);
        if (i < ii - 2) {
            lmul_givensq(&g, &e[(i+1)*se], cosl, sinl, Q_ZERO, e[(i+1)*se]);
        }
        *(rot++) = cosl;
        *(rot++) = sinl;
    }
    e[(ii-2)*se] = f;
}
--------------------------------------------------------------------------------
/pysrc/xprec/linalg.py:
--------------------------------------------------------------------------------
# Copyright (C) 2021 Markus Wallerberger and others
# SPDX-License-Identifier: MIT
#
# Some of the code in this module is adapted from the LAPACK reference
# implementation.
import numpy as np
from warnings import warn

from . import ddouble
from . import _dd_linalg

norm = _dd_linalg.norm
givens = _dd_linalg.givens
householder = _dd_linalg.householder
rank1update = _dd_linalg.rank1update


def qr(A, reflectors=False):
    """QR decomposition without pivoting.
20 | 21 | Decomposes a `(m, n)` matrix `A` into the product: 22 | 23 | A == Q @ R 24 | 25 | where `Q` is an `(m, m)` orthogonal matrix and `R` is a `(m, n)` upper 26 | triangular matrix. No pivoting is used. 27 | """ 28 | R = np.array(A) 29 | m, n = R.shape 30 | k = min(m, n) 31 | 32 | Q = np.zeros((k, m), A.dtype) 33 | for i in range(k): 34 | householder_update(R[i:,i:], Q[i:,i:]) 35 | if not reflectors: 36 | I = np.eye(m, dtype=A.dtype) 37 | Q = householder_apply(Q, I) 38 | return Q, R 39 | 40 | 41 | def rrqr(A, tol=5e-32, reflectors=False): 42 | """Truncated rank-revealing QR decomposition with full column pivoting. 43 | 44 | Decomposes a `(m, n)` matrix `A` into the product: 45 | 46 | A[:,piv] == Q @ R 47 | 48 | where `Q` is an `(m, k)` isometric matrix, `R` is a `(k, n)` upper 49 | triangular matrix, `piv` is a permutation vector, and `k` is chosen such 50 | that the relative tolerance `tol` is met in the equality above. 51 | """ 52 | R = np.array(A) 53 | m, n = R.shape 54 | k = min(m, n) 55 | 56 | Q = np.zeros((m, k), A.dtype) 57 | jpvt = np.arange(n) 58 | norms = norm(A.T) 59 | xnorms = norms.copy() 60 | TOL3Z = np.finfo(float).eps 61 | for i in range(k): 62 | pvt = i + np.argmax(norms[i:]) 63 | if i != pvt: 64 | R[:,[i, pvt]] = R[:,[pvt, i]] 65 | jpvt[[i, pvt]] = jpvt[[pvt, i]] 66 | norms[pvt] = norms[i] 67 | xnorms[pvt] = xnorms[i] 68 | 69 | householder_update(R[i:,i:], Q[i:,i:]) 70 | 71 | js = (i + 1) + norms[i + 1:].nonzero()[0] 72 | temp = np.abs(R[i,js]) / norms[js] 73 | temp = np.fmax(0.0, (1 + temp)*(1 - temp)) 74 | temp2 = temp * np.square(norms[js] / xnorms[js]) 75 | 76 | wheresmall = temp2 < TOL3Z 77 | jsmall = js[wheresmall] 78 | upd_norms = norm(R[i+1:,jsmall].T) 79 | norms[jsmall] = upd_norms 80 | xnorms[jsmall] = upd_norms 81 | jbig = js[~wheresmall] 82 | norms[jbig] *= np.sqrt(temp[~wheresmall]) 83 | 84 | if tol is not None: 85 | acc = np.abs(R[i,i] / R[0,0]) 86 | if acc < tol: 87 | k = i + 1 88 | Q = Q[:,:k] 89 | R = R[:k,:] 90 | break 91 | 
    if not reflectors:
        I = np.eye(m, k, dtype=A.dtype)
        Q = householder_apply(Q, I)
    return Q, R, jpvt


def svd(A, full_matrices=True):
    """Singular value decomposition.

    Decomposes a `(m, n)` matrix `A` into the product:

        A == U @ (s[:,None] * VT)

    where `U` is a `(m, k)` matrix with orthogonal columns, `VT` is a `(k, n)`
    matrix with orthogonal rows and `s` are the singular values, a set of `k`
    nonnegative numbers in non-ascending order and `k = min(m, n)`.
    """
    A = np.asarray(A)
    m, n = A.shape
    if m < n:
        # work on the transpose so that the matrix is tall
        U, s, VT = svd(A.T, full_matrices)
        return VT.T, s, U.T

    Q, B, RT = bidiag(A)
    for _ in range(20 * n):
        if svd_bidiag_step(Q, B, RT):
            break
    else:
        # for/else: reached only if the loop never hit `break`
        warn("Did not converge")

    U, s, VH = svd_normalize(Q, B.diagonal(), RT)
    if not full_matrices:
        U = U[:,:n]
    return U, s, VH


def svd_trunc(A, tol=5e-32, method='jacobi', max_iter=20):
    """Truncated singular value decomposition.

    Decomposes a `(m, n)` matrix `A` into the product:

        A == U @ (s[:,None] * VT)

    where `U` is a `(m, k)` matrix with orthogonal columns, `VT` is a `(k, n)`
    matrix with orthogonal rows and `s` are the singular values, a set of `k`
    nonnegative numbers in non-ascending order. The SVD is truncated in the
    sense that singular values below `tol` are discarded.
    """
    # RRQR is an excellent preconditioner for Jacobi.  One should then perform
    # Jacobi on RT
    Q, R, p = rrqr(A, tol)
    if method == 'jacobi':
        U, s, VT = svd_jacobi(R.T, tol, max_iter)
    elif method == 'golub-kahan':
        U, s, VT = svd(R.T, full_matrices=False)
    else:
        raise ValueError("invalid method")

    # Reconstruct A from QRs; p.argsort() undoes the column pivoting
    U_A = Q @ VT.T
    VT_B = U.T[:, p.argsort()]
    return U_A, s, VT_B


def bidiag(A, reflectors=False, force_structure=False):
    """Bidiagonalizes an arbitrary rectangular matrix.

    Decomposes a `(m, n)` matrix `A` into the product:

        A == Q @ B @ RT

    where `Q` is a `(m, m)` orthogonal matrix, `RT` is a `(n, n)` orthogonal
    matrix, and `B` is a bidiagonal matrix, where the upper diagonal is
    nonzero for `m >= n` and the lower diagonal is nonzero for `m < n`.
    """
    A = np.asarray(A)
    m, n = A.shape
    if m < n:
        # NOTE(review): force_structure is not forwarded to the recursive
        # call -- confirm whether the m < n branch should honour it too.
        Q, B, RT = bidiag(A.T, reflectors)
        return RT.T, B.T, Q.T

    # for a square matrix the very last column needs no reflector
    rq = n - (m == n)
    B = A.copy()
    Q = np.zeros_like(B)
    R = np.zeros_like(B[:n,:n])

    # alternate left reflectors (eliminate below diagonal) and right
    # reflectors (eliminate right of the superdiagonal)
    for j in range(n-2):
        householder_update(B[j:,j:], Q[j:,j:])
        householder_update(B[j:,j+1:].T, R[j+1:,j+1:])
    for j in range(n-2, rq):
        householder_update(B[j:,j:], Q[j:,j:])

    if force_structure:
        # zero out everything except the two stored diagonals, which may
        # carry numerical noise from the reflector updates
        d = B.diagonal().copy()
        e = B.diagonal(1).copy()
        B[...] = 0
        i = np.arange(n)
        B[i, i] = d
        B[i[:-1], i[:-1]+1] = e
    if not reflectors:
        Q = householder_apply(Q, np.eye(m, dtype=B.dtype))
        R = householder_apply(R, np.eye(n, dtype=B.dtype))
    return Q, B, R.T


def svd_jacobi(A, tol=5e-32, max_iter=20):
    """Singular value decomposition using Jacobi rotations."""
    U = A.copy()
    m, n = U.shape
    if m < n:
        raise RuntimeError("expecting tall matrix")

    VT = np.eye(n, dtype=U.dtype)
    # scalar output slot for the off-diagonal norm returned by each sweep
    offd = np.empty((), ddouble)

    limit = tol * np.linalg.norm(U[:n,:n], 'fro')
    for _ in range(max_iter):
        # one full sweep in-place; offd measures remaining off-diagonality
        _dd_linalg.jacobi_sweep(U, VT, out=(U, VT, offd))
        if offd <= limit:
            break
    else:
        warn("Did not converge")

    # columns of U are now orthogonal; their norms are the singular values
    s = norm(U.T)
    U = U / s
    return U, s, VT


def householder_update(A, Q):
    """Reflects the zeroth column onto a multiple of the unit vector"""
    # v is the reflector (v[0] == 1 by convention), beta its scalar factor
    beta, v = householder(A[:,0])
    w = -beta * (A.T @ v)
    rank1update(A, v, w, out=A)
    # store beta on the diagonal and the reflector tail below it
    Q[0,0] = beta
    Q[1:,0] = v[1:]


def householder_apply(H, Q):
    """Applies a set of reflectors to a matrix"""
    H = np.asarray(H)
    Q = Q.copy()
    m, r = H.shape
    if Q.shape[0] != m:
        raise ValueError("invalid shape")
    if Q.shape[1] < r:
        raise ValueError("invalid shape")
    # apply reflectors in reverse order of their construction
    for j in range(r-1, -1, -1):
        beta = H[j,j]
        if np.equal(beta, 0):
            # tau == 0 marks an identity reflector (see householderw)
            continue
        v = np.empty_like(H[j:,0])
        v[0] = 1
        v[1:] = H[j+1:,j]
        Qpart = Q[j:,j:]
        w = -beta * (Qpart.T @ v)
        rank1update(Qpart, v, w, out=Qpart)
    return Q


def svd_normalize(U, d, VH):
    """Given a SVD-like decomposition, normalize"""
    # Flip rows of VH so all quasi-singular values become nonnegative
    n = d.size
    VH[np.signbit(d)] = -VH[np.signbit(d)]
    d = np.abs(d)

    # Sort into non-ascending order, permuting the first n columns of U
    order = np.argsort(d)[::-1]
    d = d[order]
    VH = VH[order]
    U = U.copy()
    U[:,:n] = U[:,order]
    return U, d, VH


def svd_bidiag_step(Q, B, RT):
    """Single SVD step for a bidiagonal matrix"""
    d = B.diagonal().copy()
    e = np.hstack([B.diagonal(1), 0.0])

    # p:q delimits the still-unconverged block (see bidiag_partition)
    p, q = bidiag_partition(d, e)
    if q <= 1:
        return True

    d_part = d[p:q]
    e_part = e[p:q]
    # NOTE(review): golub_kahan_chase writes size-1 rotation rows; the last
    # row of rot stays uninitialized -- verify givens_seq ignores it.
    rot = np.empty((d_part.size, 4), d.dtype)
    _dd_linalg.golub_kahan_chase(d_part, e_part, out=(d_part, e_part, rot))

    # write the chased diagonals back into B
    i = np.arange(p, q)
    B[i, i] = d_part
    B[i[:-1], i[:-1]+1] = e_part[:-1]

    # accumulate the left rotations into Q and the right ones into RT
    rot_Q = rot[:, 2:]
    rot_R = rot[:, :2]
    QT_part = Q[:, p:q].T
    RT_part = RT[p:q, :]
    _dd_linalg.givens_seq(rot_Q, QT_part, out=QT_part)
    _dd_linalg.givens_seq(rot_R, RT_part, out=RT_part)
    return False


def bidiag_partition(d, e, eps=5e-32):
    """Partition bidiagonal matrix into blocks for implicit QR.

    Return `p,q` which partions a bidiagonal `B` matrix into three blocks:

     - B[0:p, 0:p], an arbitrary bidiaonal matrix
     - B[p:q, p:q], a matrix with all off-diagonal elements nonzero
     - B[q:, q:], a diagonal matrix
    """
    abs_e = np.abs(e)
    abs_d = np.abs(d)
    # flush negligible off-diagonal elements to zero (modifies e in place)
    e_zero = abs_e <= eps * (abs_d + abs_e)
    e[e_zero] = 0

    q = _find_last(~e_zero) + 1
    if q <= 0:
        # no off-diagonal element left: the matrix is fully diagonal
        return 0, 0
    p = _find_last(e_zero[:q]) + 1
    return p, q + 1


def _find_last(a, axis=-1):
    """Index of the last True element along `axis`, or -1 if there is none."""
    a = a.astype(bool)
    maxloc = a.shape[axis] - 1 - a[::-1].argmax(axis)
    return np.where(a[maxloc], maxloc, -1)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Setup script - embracing the setuptools madness.
#
# Copyright (C) 2021 Markus Wallerberger and others
# SPDX-License-Identifier: MIT
import io
import os.path
import os
import platform
import re

from setuptools import setup, find_packages
from setuptools.extension import Extension
from setuptools.command.build_ext import build_ext as BuildExt


def readfile(*parts):
    """Return contents of file with path relative to script directory"""
    herepath = os.path.abspath(os.path.dirname(__file__))
    fullpath = os.path.join(herepath, *parts)
    with io.open(fullpath, 'r') as f:
        return f.read()


def extract_version(*parts):
    """Extract value of __version__ variable by parsing python script"""
    initfile = readfile(*parts)
    version_re = re.compile(r"(?m)^__version__\s*=\s*['\"]([^'\"]*)['\"]")
    match = version_re.search(initfile)
    return match.group(1)


def rebase_links(text, base_url):
    """Rebase links to doc/ directory to ensure they work online."""
    doclink_re = re.compile(
        r"(?m)^\s*\[\s*([^\]\n\r]+)\s*\]:\s*(doc/[./\w]+)\s*$")
    result, nsub = doclink_re.subn(r"[\1]: %s/\2" % base_url, text)
    return result


def append_if_absent(list, arg):
    """Append argument to list if absent"""
    # NOTE(review): the parameter `list` shadows the builtin of that name
    if arg not in list:
        list.append(arg)


def get_flags_dict(exec):
    """Split a compiler command line into (binary, {flag: value}) parts."""
    # NOTE(review): the parameter `exec` shadows the builtin of that name
    # First, let us clean up the mess of compiler options a little bit: Move
    # flags out into a dictionary, thereby removing the myriad of duplicates
    cc_so, *cflags_so_list = exec
    cflags_curr = None
    cflags_so = {}
    for arg in cflags_so_list:
        if arg.startswith("-"):
            # a new flag terminates any pending value-less flag
            if cflags_curr is not None:
                cflags_so[cflags_curr] = None
                cflags_curr = None
            arg = arg.split("=", 1)
            if len(arg) == 1:
                # "-flag value" style: remember the flag, value follows
                cflags_curr, = arg
            else:
                # "-flag=value" style
                k, v = arg
                cflags_so[k] = v
        else:
            # bare token: must be the value of the preceding flag
            if cflags_curr is None:
                raise ValueError("expected flag" + str(exec))
            cflags_so[cflags_curr] = arg
            cflags_curr = None
    if cflags_curr is not None:
        cflags_so[cflags_curr] = None

    return cc_so, cflags_so


def make_exec_string(cc_so, cflags_so):
    """Inverse of get_flags_dict: rebuild the command-line list."""
    # Now update the flags
    cflags_so = [k + ("=" + v if v is not None else "")
                 for (k,v) in cflags_so.items()]
    return [cc_so] + cflags_so


class OptionsMixin:
    """Adds the xprec-specific command-line options to a setuptools command."""
    _convert_to_bool = {"true": True, "false": False}
    user_options = [
        ("with-openmp=", None, "use openmp to build (default: false)"),
        ("opt-arch=", None, "optimized for architecture"),
        ("numpy-include-dir=", None, "numpy include directory"),
    ]

    def initialize_options(self):
        super().initialize_options()
        self.with_openmp = None
        self.numpy_include_dir = None
        self.opt_arch = None

    def finalize_options(self):
        # convert "true"/"false" strings from the command line into bools
        if self.with_openmp is not None:
            self.with_openmp = self._convert_to_bool[self.with_openmp.lower()]
        if self.opt_arch is not None:
            self.opt_arch = self._convert_to_bool[self.opt_arch.lower()]
        if self.numpy_include_dir is not None:
            if not os.path.isdir(self.numpy_include_dir):
                raise ValueError("include directory must exist")
        super().finalize_options()


class BuildExtWithNumpy(OptionsMixin, BuildExt):
    """Wrapper class for building numpy extensions"""
    user_options = BuildExt.user_options + OptionsMixin.user_options

    def build_extensions(self):
        """Modify paths according to options"""
        # This must be deferred to build time, because that is when
        # self.compiler starts being a compiler instance (before, it is
        # a flag) *slow-clap*
        # compiler type is either 'unix', 'msvc' or 'mingw'
        compiler_type = self.compiler.compiler_type

        # classify the compiler binary to decide on flag handling below
        compiler_binary = getattr(self.compiler, 'compiler', [''])[0]
        compiler_binary = os.path.basename(compiler_binary)
        compiler_make = ''
        if 'gcc' in compiler_binary or 'g++' in compiler_binary:
            compiler_make = 'gcc'
        elif 'clang' in compiler_binary:
            compiler_make = 'clang'
        elif 'icc' in compiler_binary:
            compiler_make = 'icc'
        elif compiler_type == 'msvc':
            # See msvccompiler.py:206 - a comment worth reading in its
            # entirety. distutils sets up an abstraction which it immediately
            # break with its own derived classes. *slow-clap*
            compiler_make = 'msvc'

        if compiler_type != 'msvc':
            new_flags = {"-Wextra": None, "-std": "c11"}
            # By default, we do not optimize for the architecture by default,
            # because this is harmful when building a binary package.
            if self.opt_arch:
                new_flags["-mtune"] = new_flags["-march"] = "native"

            cc_so, flags_dict = get_flags_dict(self.compiler.compiler_so)

            # Replace arch with march
            if "-arch" in flags_dict:
                flags_dict["-march"] = flags_dict.pop("-arch")

            # Remove any existing -mtune, -march, -arch flags if not self.opt_arch
            if not self.opt_arch:
                for key in ["-mtune", "-march", "-arch"]:
                    if key in flags_dict:
                        del flags_dict[key]

            flags_dict.update(new_flags)
            self.compiler.compiler_so = make_exec_string(cc_so, flags_dict)

        # clang on 14.4.1 fails to include C header files...
        if platform.system() == 'Darwin':
            sdk_path = (
                "/Applications/Xcode.app/Contents/Developer/Platforms/"
                "MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include"
            )
            current_cpath = os.environ.get('CPATH', '')
            os.environ['CPATH'] = f"{sdk_path}:{current_cpath}"

        # This has to be set to false because MacOS does not ship openmp
        # by default.
        if self.with_openmp is None:
            self.with_openmp = platform.system() == 'Linux'

        # Numpy headers: numpy must be imported here rather than
        # globally, because otherwise it may not be available at the time
        # when the setup script is run. *slow-cl ... ah, f*ck it.
        if self.numpy_include_dir is None:
            import numpy
            self.numpy_include_dir = numpy.get_include()

        for ext in self.extensions:
            append_if_absent(ext.include_dirs, self.numpy_include_dir)
            if self.with_openmp:
                append_if_absent(ext.extra_compile_args, '-fopenmp')
                append_if_absent(ext.extra_link_args, '-fopenmp')
                if compiler_make == 'clang':
                    # clang needs the OpenMP runtime linked in explicitly
                    append_if_absent(ext.extra_link_args, '-lomp')

        super().build_extensions()

    def get_source_files(self):
        """Return list of files to include in source dist"""
        # Specifying include_dirs= argument in Extension adds headers from that
        # directory to the sdist ... on some machines.  On others, not.  Note
        # that overriding sdist will not save you, since this is not called
        # from sdist.add_defaults(), as you might expect.  (With setuptools, it
        # is never what you expect.)  Instead, sdist requires egg_info, which
        # hooks into a hidden manifest_maker class derived from sdist, where
        # add_defaults() called, the list passed back to sdist, sidestepping
        # the method in the orginal class.  Kudos.
        #
        # Really, if you have monkeys type out 1000 pages on typewriters, use
        # the result as toilet paper for a month, unfold it, scan it at 20 dpi,
        # and run it through text recognition software, it would still yield
        # better code than setuptools.
        source_files = super().get_source_files()
        header_regex = re.compile(r"\.(?:h|hh|hpp|hxx|H|HH|HPP|HXX)$")

        # add every header found in any extension's include directories
        include_dirs = set()
        for ext in self.extensions:
            include_dirs.update(ext.include_dirs)
        for dir in include_dirs:
            for entry in os.scandir(dir):
                if not entry.is_file():
                    continue
                if not header_regex.search(entry.name):
                    continue
                source_files.append(entry.path)

        return source_files


VERSION = extract_version('pysrc', 'xprec', '__init__.py')
REPO_URL = "https://github.com/tuwien-cms/xprec"
DOCTREE_URL = "%s/tree/v%s" % (REPO_URL, VERSION)
LONG_DESCRIPTION = rebase_links(readfile('README.md'), DOCTREE_URL)

setup(
    name='xprec',
    version=VERSION,

    description='xprec precision numpy extension',
    long_description=LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    keywords=' '.join([
        'double-double'
        ]),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        ],

    url=REPO_URL,
    author=', '.join([
        'Markus Wallerberger'
        ]),
    author_email='markus.wallerberger@tuwien.ac.at',

    python_requires='>=3',
    install_requires=[
        # we need matmul to be an ufunc -> 1.16
        'numpy>=1.16',
        ],
    extras_require={
        'test': ['pytest', 'mpmath'],
        },

    ext_modules=[
        Extension("xprec._dd_ufunc",
                  ["csrc/_dd_ufunc.c", "csrc/dd_arith.c"],
                  include_dirs=["csrc"]),
        Extension("xprec._dd_linalg",
                  ["csrc/_dd_linalg.c", "csrc/dd_arith.c", "csrc/dd_linalg.c"],
                  include_dirs=["csrc"]),
        ],
    setup_requires=[
        'numpy>=1.16'
        ],
    cmdclass={
        'build_ext': BuildExtWithNumpy
        },

    package_dir={'': 'pysrc'},
    packages=find_packages(where='pysrc'),
    )
--------------------------------------------------------------------------------
/csrc/dd_arith.h:
--------------------------------------------------------------------------------
/* Double-double arithmetic library
 *
 * Part of the functions are modified from the QD library for U.C. Berkeley
 * and licensed under a modified BSD license (see QD-LICENSE.txt)
 *
 * Some of the algorithms were updated according to the findings in
 * M. Joldes, et al., ACM Trans. Math. Softw. 44, 1-27 (2018)
 * (Algorithm numbers in the code)
 *
 * Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey
 * Copyright (C) 2021 Markus Wallerberger and others
 * SPDX-License-Identifier: MIT and Modified-BSD
 */
#pragma once
/* NOTE(review): the include targets were stripped from this snapshot
 * (angle-bracket text lost); the functions below require at least these
 * headers -- confirm against the repository. */
#include <math.h>
#include <stdbool.h>
#include <stdlib.h>

/**
 * Type for double-double calculations
 */
typedef struct {
    double hi;
    double lo;
} ddouble;

/* Fast two-sum: exact a + b as (hi, lo); valid when |a| >= |b|. */
static inline ddouble two_sum_quick(double a, double b)
{
    double s = a + b;
    double lo = b - (s - a);
    return (ddouble){.hi = s, .lo = lo};
}

/* Two-sum: exact a + b as (hi, lo) for arbitrary a, b. */
static inline ddouble two_sum(double a, double b)
{
    double s = a + b;
    double v = s - a;
    double lo = (a - (s - v)) + (b - v);
    return (ddouble){.hi = s, .lo = lo};
}

/* Exact a - b as (hi, lo) for arbitrary a, b. */
static inline ddouble two_diff(double a, double b)
{
    double s = a - b;
    double v = s - a;
    double lo = (a - (s - v)) - (b + v);
    return (ddouble){.hi = s, .lo = lo};
}

/* Exact a * b as (hi, lo) using fused multiply-add for the error term. */
static inline ddouble two_prod(double a, double b)
{
    double s = a * b;
    double lo = fma(a, b, -s);
    return (ddouble){.hi = s, .lo = lo};
}

/* -------------------- Combining quad/double ------------------------ */

static inline ddouble addwd(ddouble x, double y)
{
    ddouble s = two_sum(x.hi, y);
    double v = x.lo + s.lo;
    return two_sum_quick(s.hi, v);
}

static inline ddouble subwd(ddouble x, double y)
{
    ddouble s = two_diff(x.hi, y);
    double v = x.lo + s.lo;
    return two_sum_quick(s.hi, v);
}

static inline ddouble mulwd(ddouble x, double y)
{
    ddouble c = two_prod(x.hi, y);
    double v = fma(x.lo, y, c.lo);
    return two_sum_quick(c.hi, v);
}

static inline ddouble divwd(ddouble x, double y)
{
    /* Alg 14 */
    double t_hi = x.hi / y;
    ddouble pi = two_prod(t_hi, y);
    double d_hi = x.hi - pi.hi;
    double d_lo = x.lo - pi.lo;
    double t_lo = (d_hi + d_lo) / y;
    return two_sum_quick(t_hi, t_lo);
}

/* -------------------- Combining double/quad ------------------------- */

static inline ddouble negw(ddouble);
static inline ddouble reciprocalw(ddouble);

static inline ddouble adddw(double x, ddouble y)
{
    return addwd(y, x);
}

static inline ddouble subdw(double x, ddouble y)
{
    /* TODO: Probably not ideal */
    return addwd(negw(y), x);
}

static inline ddouble muldw(double x, ddouble y)
{
    return mulwd(y, x);
}

static inline ddouble divdw(double x, ddouble y)
{
    /* TODO: Probably not ideal */
    return mulwd(reciprocalw(y), x);
}

/* Scale by b; exact only when b is a power of two. */
static inline ddouble mul_pwr2(ddouble a, double b) {
    return (ddouble){a.hi * b, a.lo * b};
}

/* -------------------- Combining quad/quad ------------------------- */

static inline ddouble addww(ddouble x, ddouble y)
{
    ddouble s = two_sum(x.hi, y.hi);
    ddouble t = two_sum(x.lo, y.lo);
    ddouble v = two_sum_quick(s.hi, s.lo + t.hi);
    ddouble z = two_sum_quick(v.hi, t.lo + v.lo);
    return z;
}

static inline ddouble subww(ddouble x, ddouble y)
{
    ddouble s =
two_diff(x.hi, y.hi); 136 | ddouble t = two_diff(x.lo, y.lo); 137 | ddouble v = two_sum_quick(s.hi, s.lo + t.hi); 138 | ddouble z = two_sum_quick(v.hi, t.lo + v.lo); 139 | return z; 140 | } 141 | 142 | static inline ddouble mulww(ddouble a, ddouble b) 143 | { 144 | /* Alg 11 */ 145 | ddouble c = two_prod(a.hi, b.hi); 146 | double t = a.hi * b.lo; 147 | t = fma(a.lo, b.hi, t); 148 | return two_sum_quick(c.hi, c.lo + t); 149 | } 150 | 151 | static inline ddouble divww(ddouble x, ddouble y) 152 | { 153 | /* Alg 17 */ 154 | double t_hi = x.hi / y.hi; 155 | ddouble r = mulwd(y, t_hi); 156 | double pi_hi = x.hi - r.hi; 157 | double d = pi_hi + (x.lo - r.lo); 158 | double t_lo = d / y.hi; 159 | return two_sum_quick(t_hi, t_lo); 160 | } 161 | 162 | /* -------------------- Unary functions ------------------------- */ 163 | 164 | static inline ddouble negw(ddouble a) 165 | { 166 | return (ddouble){-a.hi, -a.lo}; 167 | } 168 | 169 | static inline ddouble posw(ddouble a) 170 | { 171 | return (ddouble){-a.hi, -a.lo}; 172 | } 173 | 174 | static inline ddouble absw(ddouble a) 175 | { 176 | return signbit(a.hi) ? negw(a) : a; 177 | } 178 | 179 | static inline ddouble reciprocalw(ddouble y) 180 | { 181 | /* Alg 17 with x = 1 */ 182 | double t_hi = 1.0 / y.hi; 183 | ddouble r = mulwd(y, t_hi); 184 | double pi_hi = 1.0 - r.hi; 185 | double d = pi_hi - r.lo; 186 | double t_lo = d / y.hi; 187 | return two_sum_quick(t_hi, t_lo); 188 | } 189 | 190 | static inline ddouble sqrw(ddouble a) 191 | { 192 | /* Alg 11 */ 193 | ddouble c = two_prod(a.hi, a.hi); 194 | double t = 2 * a.hi * a.lo; 195 | return two_sum_quick(c.hi, c.lo + t); 196 | } 197 | 198 | static inline ddouble roundw(ddouble a) 199 | { 200 | double hi = round(a.hi); 201 | double lo; 202 | 203 | if (hi == a.hi) { 204 | /* High word is an integer already. Round the low word.*/ 205 | lo = round(a.lo); 206 | 207 | /* Renormalize. 
This is needed if x[0] = some integer, x[1] = 1/2.*/
        return two_sum_quick(hi, lo);
    } else {
        /* High word is not an integer. */
        lo = 0.0;
        if (fabs(hi - a.hi) == 0.5 && a.lo < 0.0) {
            /* There is a tie in the high word, consult the low word
             * to break the tie.
             * NOTE: This does not cause INEXACT.
             */
            hi -= 1.0;
        }
        return (ddouble){hi, lo};
    }
}

/* Largest integral ddouble not greater than a. */
static inline ddouble floorw(ddouble a)
{
    double hi = floor(a.hi);
    double lo = 0.0;

    if (hi == a.hi) {
        /* High word is integer already.  Round the low word. */
        lo = floor(a.lo);
        return two_sum_quick(hi, lo);
    }
    return (ddouble){hi, lo};
}

/* Smallest integral ddouble not less than a. */
static inline ddouble ceilw(ddouble a)
{
    double hi = ceil(a.hi);
    double lo = 0.0;

    if (hi == a.hi) {
        /* High word is integer already.  Round the low word. */
        lo = ceil(a.lo);
        return two_sum_quick(hi, lo);
    }
    return (ddouble){hi, lo};
}

/* Sign bit of a ddouble; only the hi word carries the sign. */
static inline bool signbitw(ddouble x)
{
    return signbit(x.hi);
}

static inline ddouble copysignww(ddouble x, ddouble y)
{
    /* The sign is determined by the hi part, however, the sign of hi and lo
     * need not be the same, so we cannot merely broadcast copysign to both
     * parts.
     */
    return signbitw(x) != signbitw(y) ? negw(x) : x;
}

static inline ddouble copysignwd(ddouble x, double y)
{
    return signbitw(x) != signbit(y) ? negw(x) : x;
}

static inline ddouble copysigndw(double x, ddouble y)
{
    /* It is less surprising to return a ddouble here */
    double res = copysign(x, y.hi);
    return (ddouble) {res, 0.0};
}

/* forward declaration; defined with the other comparisons below */
static inline bool iszerow(ddouble x);

static inline ddouble signw(ddouble x)
{
    /* The numpy sign function does not respect signed zeros.  We do.
     */
    if (iszerow(x))
        return x;
    return copysigndw(1.0, x);
}

/******************************** Constants *********************************/

static inline ddouble nanw()
{
    /* NOTE(review): strtod presumably sidesteps reliance on the NAN
     * macro -- confirm */
    double nan = strtod("NaN", NULL);
    return (ddouble){nan, nan};
}

static inline ddouble infw()
{
    double inf = strtod("Inf", NULL);
    return (ddouble){inf, inf};
}

/* Constants as (hi, lo) pairs; lo carries the residual beyond double
 * precision. */
static const ddouble Q_ZERO = {0.0, 0.0};
static const ddouble Q_ONE = {1.0, 0.0};
static const ddouble Q_2PI = {6.283185307179586232e+00, 2.449293598294706414e-16};
static const ddouble Q_PI = {3.141592653589793116e+00, 1.224646799147353207e-16};
static const ddouble Q_PI_2 = {1.570796326794896558e+00, 6.123233995736766036e-17};
static const ddouble Q_PI_4 = {7.853981633974482790e-01, 3.061616997868383018e-17};
static const ddouble Q_3PI_4 = {2.356194490192344837e+00, 9.1848509936051484375e-17};
static const ddouble Q_PI_16 = {1.963495408493620697e-01, 7.654042494670957545e-18};
static const ddouble Q_E = {2.718281828459045091e+00, 1.445646891729250158e-16};
static const ddouble Q_LOG2 = {6.931471805599452862e-01, 2.319046813846299558e-17};
static const ddouble Q_LOG10 = {2.302585092994045901e+00, -2.170756223382249351e-16};

static const ddouble Q_EPS = {4.93038065763132e-32, 0.0};    /* 2**-104 */
static const ddouble Q_MIN = {2.0041683600089728e-292, 0.0};
static const ddouble Q_MAX = {1.79769313486231570815e+308, 0.0};
static const ddouble Q_TINY = {2.2250738585072014e-308, 0.0};


/* Classification inspects only the hi word. */
static inline bool isfinitew(ddouble x)
{
    return isfinite(x.hi);
}

static inline bool isinfw(ddouble x)
{
    return isinf(x.hi);
}

static inline bool isnanw(ddouble x)
{
    return isnan(x.hi);
}

/*********************** Comparisons q/q ***************************/
/* Lexicographic comparison on (hi, lo). */

static inline bool equalww(ddouble a, ddouble b)
{
    return a.hi == b.hi && a.lo == b.lo;
}

static inline bool notequalww(ddouble a, ddouble b)
{
    return a.hi != b.hi || a.lo != b.lo;
}

static inline bool greaterww(ddouble a, ddouble b)
{
    return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
}

static inline bool lessww(ddouble a, ddouble b)
{
    return a.hi < b.hi || (a.hi == b.hi && a.lo < b.lo);
}

static inline bool greaterequalww(ddouble a, ddouble b)
{
    return a.hi > b.hi || (a.hi == b.hi && a.lo >= b.lo);
}

static inline bool lessequalww(ddouble a, ddouble b)
{
    return a.hi < b.hi || (a.hi == b.hi && a.lo <= b.lo);
}

/*********************** Comparisons q/d ***************************/
/* Doubles are promoted to ddouble with a zero lo word. */

static inline bool equalwd(ddouble a, double b)
{
    return equalww(a, (ddouble){b, 0});
}

static inline bool notequalwd(ddouble a, double b)
{
    return notequalww(a, (ddouble){b, 0});
}

static inline bool greaterwd(ddouble a, double b)
{
    return greaterww(a, (ddouble){b, 0});
}

static inline bool lesswd(ddouble a, double b)
{
    return lessww(a, (ddouble){b, 0});
}

static inline bool greaterequalwd(ddouble a, double b)
{
    return greaterequalww(a, (ddouble){b, 0});
}

static inline bool lessequalwd(ddouble a, double b)
{
    return lessequalww(a, (ddouble){b, 0});
}

/*********************** Comparisons d/q ***************************/

static inline bool equaldw(double a, ddouble b)
{
    return equalww((ddouble){a, 0}, b);
}

static inline bool notequaldw(double a, ddouble b)
{
    return notequalww((ddouble){a, 0}, b);
}

static inline bool greaterdw(double a, ddouble b)
| { 410 | return greaterww((ddouble){a, 0}, b); 411 | } 412 | 413 | static inline bool lessdw(double a, ddouble b) 414 | { 415 | return lessww((ddouble){a, 0}, b); 416 | } 417 | 418 | static inline bool greaterequaldw(double a, ddouble b) 419 | { 420 | return greaterequalww((ddouble){a, 0}, b); 421 | } 422 | 423 | static inline bool lessequaldw(double a, ddouble b) 424 | { 425 | return lessequalww((ddouble){a, 0}, b); 426 | } 427 | 428 | /************************ Minimum/maximum ************************/ 429 | 430 | static inline ddouble fminww(ddouble a, ddouble b) 431 | { 432 | return lessww(a, b) ? a : b; 433 | } 434 | 435 | static inline ddouble fmaxww(ddouble a, ddouble b) 436 | { 437 | return greaterww(a, b) ? a : b; 438 | } 439 | 440 | static inline ddouble fminwd(ddouble a, double b) 441 | { 442 | return lesswd(a, b) ? a : (ddouble) {b, 0}; 443 | } 444 | 445 | static inline ddouble fmaxwd(ddouble a, double b) 446 | { 447 | return greaterwd(a, b) ? a : (ddouble) {b, 0}; 448 | } 449 | 450 | static inline ddouble fmindw(double a, ddouble b) 451 | { 452 | return lessdw(a, b) ? (ddouble) {a, 0} : b; 453 | } 454 | 455 | static inline ddouble fmaxdw(double a, ddouble b) 456 | { 457 | return greaterdw(a, b) ? 
(ddouble) {a, 0} : b; 458 | } 459 | 460 | /************************** Unary tests **************************/ 461 | 462 | static inline bool iszerow(ddouble x) 463 | { 464 | return x.hi == 0.0; 465 | } 466 | 467 | static inline bool isonew(ddouble x) 468 | { 469 | return x.hi == 1.0 && x.lo == 0.0; 470 | } 471 | 472 | static inline bool ispositivew(ddouble x) 473 | { 474 | return x.hi > 0.0; 475 | } 476 | 477 | static inline bool isnegativew(ddouble x) 478 | { 479 | return x.hi < 0.0; 480 | } 481 | 482 | /************************** Advanced math functions ********************/ 483 | 484 | ddouble sqrtw(ddouble a); 485 | 486 | static inline ddouble ldexpw(ddouble a, int exp) 487 | { 488 | return (ddouble) {ldexp(a.hi, exp), ldexp(a.lo, exp)}; 489 | } 490 | 491 | /************************* Binary functions ************************/ 492 | 493 | ddouble _hypotqq_ordered(ddouble x, ddouble y); 494 | 495 | static inline ddouble hypotww(ddouble x, ddouble y) 496 | { 497 | x = absw(x); 498 | y = absw(y); 499 | if (x.hi < y.hi) 500 | return _hypotqq_ordered(y, x); 501 | else 502 | return _hypotqq_ordered(x, y); 503 | } 504 | 505 | static inline ddouble hypotdw(double x, ddouble y) 506 | { 507 | return hypotww((ddouble){x, 0}, y); 508 | } 509 | 510 | static inline ddouble hypotwd(ddouble x, double y) 511 | { 512 | return hypotww(x, (ddouble){y, 0}); 513 | } 514 | 515 | /* Computes the nearest integer to d. 
 */
static inline ddouble nintw(ddouble d) {
    /* Exact integers pass through unchanged; otherwise round via
     * floor(d + 0.5), so halfway cases round up (towards +infinity). */
    if (equalww(d, floorw(d))) {
        return d;
    }
    return floorw(addww(d, (ddouble){0.5, 0}));
}

/* Transcendental and power functions; implemented in dd_arith.c. */
ddouble expw(ddouble a);
ddouble expm1w(ddouble a);
ddouble ldexpwi(ddouble a, int m);
ddouble logw(ddouble a);
ddouble sinw(ddouble a);
ddouble cosw(ddouble a);
ddouble tanw(ddouble a);
ddouble sinhw(ddouble a);
ddouble coshw(ddouble a);
ddouble tanhw(ddouble a);
ddouble atanw(ddouble a);
ddouble acosw(ddouble a);
ddouble asinw(ddouble a);
ddouble atanhw(ddouble a);
ddouble acoshw(ddouble a);
ddouble asinhw(ddouble a);
ddouble atan2wd(ddouble a, double b);
ddouble atan2dw(double a, ddouble b);
ddouble atan2ww(ddouble a, ddouble b);
ddouble powww(ddouble a, ddouble b);
ddouble powwd(ddouble a, double b);
ddouble powdw(double a, ddouble b);
/* Splits a into integer and fractional part (cf. modf). */
ddouble modfww(ddouble a, ddouble *b);
--------------------------------------------------------------------------------
/csrc/_dd_linalg.c:
--------------------------------------------------------------------------------
/* Python extension module for linear algebra functions.
 *
 * Copyright (C) 2021 Markus Wallerberger and others
 * SPDX-License-Identifier: MIT
 */
#include "Python.h"
#include "math.h"
#include "stdio.h"

#include "dd_arith.h"
#include "dd_linalg.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include "numpy/ndarraytypes.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_3kcompat.h"

/**
 * Allows parameter to be marked unused
 */
#define MARK_UNUSED(x) do { (void)(x); } while(false)

/************************ Linear algebra ***************************/

/* Generalized ufunc loop: ddouble matrix multiplication c = a @ b.
 *
 * NumPy passes byte strides in steps[]; the leading three are the strides
 * over the outer (broadcast) dimension n, the rest are per-axis strides
 * inside one operand.  The inner strides are converted to ddouble element
 * strides before indexing.
 */
static void u_matmulw(char **args, const npy_intp *dims, const npy_intp* steps,
                      void *data)
{
    // signature (n;i,j),(n;j,k)->(n;i,k)
    const npy_intp nn = dims[0], ii = dims[1], jj = dims[2], kk = dims[3];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sai = steps[3], _saj = steps[4], _sbj = steps[5],
                   _sbk = steps[6], _sci = steps[7], _sck = steps[8];
    char *_a = args[0], *_b = args[1], *_c = args[2];

    /* element strides (ddouble units) for the inner loops */
    const npy_intp sai = _sai / sizeof(ddouble), saj = _saj / sizeof(ddouble),
                   sbj = _sbj / sizeof(ddouble), sbk = _sbk / sizeof(ddouble),
                   sci = _sci / sizeof(ddouble), sck = _sck / sizeof(ddouble);

    for (npy_intp n = 0; n != nn; ++n, _a += _san, _b += _sbn, _c += _scn) {
        const ddouble *a = (const ddouble *)_a, *b = (const ddouble *)_b;
        ddouble *c = (ddouble *)_c;

        /* all (i, k) output elements are independent, so the two outer
         * loops can be parallelized together */
        #pragma omp parallel for collapse(2)
        for (npy_intp i = 0; i < ii; ++i) {
            for (npy_intp k = 0; k < kk; ++k) {
                ddouble val = Q_ZERO, tmp;
                for (npy_intp j = 0; j < jj; ++j) {
                    tmp = mulww(a[i * sai + j * saj], b[j * sbj + k * sbk]);
                    val = addww(val, tmp);
                }
                c[i * sci + k * sck] = val;
            }
        }
    }
    MARK_UNUSED(data);
}

/****************************** Helper functions *************************/

/* Copy a strided 2-d block of ddoubles from `in` to `out`, element by
 * element.  A no-op when in and out alias exactly, i.e. the gufunc is
 * already operating in place.
 */
static void ensure_inplace_2(
    char *in, char *out, npy_intp n1, npy_intp si1, npy_intp so1,
    npy_intp n2, npy_intp si2, npy_intp so2)
{
    if (in == out)
        return;

    char *in1 = in, *out1 = out;
    for (npy_intp i1 = 0; i1 != n1; ++i1, in1 += si1, out1 += so1) {
        char *in2 = in1, *out2 = out1;
        for (npy_intp i2 = 0; i2 != n2; ++i2, in2 += si2, out2 += so2) {
            char *inx = in2, *outx = out2;
            *(ddouble *)outx = *(ddouble *)inx;
        }
    }
}

/* Same as ensure_inplace_2, but for a strided 3-d block. */
static void ensure_inplace_3(
    char *in, char *out, npy_intp n1, npy_intp si1, npy_intp so1,
    npy_intp n2, npy_intp si2, npy_intp so2, npy_intp n3, npy_intp si3,
    npy_intp so3)
{
    if (in == out)
        return;

    char *in1 = in, *out1 = out;
    for (npy_intp i1 = 0; i1 != n1; ++i1, in1 += si1, out1 += so1) {
        char *in2 = in1, *out2 = out1;
        for (npy_intp i2 = 0; i2 != n2; ++i2, in2 += si2, out2 += so2) {
            char *in3 = in2, *out3 = out2;
            for (npy_intp i3 = 0; i3 != n3; ++i3, in3 += si3, out3 += so3) {
                char *inx = in3, *outx = out3;
                *(ddouble *)outx = *(ddouble *)inx;
            }
        }
    }
}

/*************************** More complicated ***********************/

/* Generalized ufunc loop: vector 2-norm over the last axis. */
static void u_normw(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i)->(n;)
    const npy_intp nn = dims[0], ii = dims[1];
    const npy_intp san = steps[0], sbn = steps[1], _sai = steps[2];
    char *_a = args[0], *_b = args[1];

    for (npy_intp n = 0; n != nn; ++n, _a += san, _b += sbn) {
        *(ddouble *)_b = normw((const ddouble *)_a, ii, _sai / sizeof(ddouble));
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: Householder reflector of a vector; returns the
 * scalar factor and the reflector vector. */
static void u_householderw(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i)->(n;),(n;i)
    const npy_intp nn = dims[0], ii = dims[1];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sai =
steps[3], _sci = steps[4]; 121 | char *_a = args[0], *_b = args[1], *_c = args[2]; 122 | 123 | for (npy_intp n = 0; n != nn; ++n, _a += _san, _b += _sbn, _c += _scn) { 124 | *(ddouble *)_b = householderw( 125 | (const ddouble *)_a, (ddouble *)_c, ii, 126 | _sai / sizeof(ddouble), _sci / sizeof(ddouble)); 127 | } 128 | MARK_UNUSED(data); 129 | } 130 | 131 | static void u_rank1updateq( 132 | char **args, const npy_intp *dims, const npy_intp* steps, void *data) 133 | { 134 | // signature (n;i,j),(n;i),(n;j)->(n;i,j) 135 | const npy_intp nn = dims[0], ii = dims[1], jj = dims[2]; 136 | const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2], 137 | _sdn = steps[3], _sai = steps[4], _saj = steps[5], 138 | _sbi = steps[6], _scj = steps[7], _sdi = steps[8], 139 | _sdj = steps[9]; 140 | char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3]; 141 | 142 | ensure_inplace_3(_a, _d, nn, _san, _sdn, ii, _sai, _sdi, jj, _saj, _sdj); 143 | for (npy_intp n = 0; n != nn; ++n, _a += _san, _b += _sbn, _c += _scn) { 144 | rank1updateq( 145 | (ddouble *)_d, _sai / sizeof(ddouble), _saj / sizeof(ddouble), 146 | (const ddouble *)_b, _sbi / sizeof(ddouble), 147 | (const ddouble *)_c, _scj / sizeof(ddouble), ii, jj); 148 | } 149 | MARK_UNUSED(data); 150 | } 151 | 152 | static void u_jacobisweepw( 153 | char **args, const npy_intp *dims, const npy_intp* steps, void *data) 154 | { 155 | // signature (n;i,j),(n;i=j,j)->(n;i,j),(n;i=j,j);(n,) 156 | const npy_intp nn = dims[0], ii = dims[1], jj = dims[2]; 157 | const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2], 158 | _sdn = steps[3], _sen = steps[4], _sai = steps[5], 159 | _saj = steps[6], _sbi = steps[7], _sbj = steps[8], 160 | _sci = steps[9], _scj = steps[10], _sdi = steps[11], 161 | _sdj = steps[12]; 162 | char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3], 163 | *_e = args[4]; 164 | 165 | ensure_inplace_3(_a, _c, nn, _san, _scn, ii, _sai, _sci, jj, _saj, _scj); 166 | ensure_inplace_3(_b, _d, nn, 
                     _sbn, _sdn, jj, _sbi, _sdi, jj, _sbj, _sdj);
    for (npy_intp n = 0; n != nn; ++n, _c += _scn, _d += _sdn, _e += _sen) {
        ddouble *c = (ddouble *)_c, *d = (ddouble *)_d, *e = (ddouble *)_e;
        const npy_intp
            sci = _sci / sizeof(ddouble), scj = _scj / sizeof(ddouble),
            sdi = _sdi / sizeof(ddouble), sdj = _sdj / sizeof(ddouble);

        *e = jacobi_sweep(c, sci, scj, d, sdi, sdj, ii, jj);
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: Givens rotation for a 2-vector (f, g).
 * Outputs the rotated vector (r, 0) and the 2x2 rotation matrix
 * [[c, s], [-s, c]]. */
static void u_givensw(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;2)->(n;2),(n;2,2)
    const npy_intp nn = dims[0];
    const npy_intp san = steps[0], sbn = steps[1], scn = steps[2],
                   sai = steps[3], sbi = steps[4], sci = steps[5],
                   scj = steps[6];
    char *_a = args[0], *_b = args[1], *_c = args[2];

    for (npy_intp n = 0; n != nn; ++n, _a += san, _b += sbn, _c += scn) {
        ddouble f = *(ddouble *) _a;
        ddouble g = *(ddouble *) (_a + sai);

        ddouble c, s, r;
        givensw(f, g, &c, &s, &r);

        *(ddouble *)_b = r;
        *(ddouble *)(_b + sbi) = Q_ZERO;
        *(ddouble *)_c = c;
        *(ddouble *)(_c + scj) = s;
        *(ddouble *)(_c + sci) = negw(s);
        *(ddouble *)(_c + sci + scj) = c;
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: apply a sequence of Givens rotations, stored as
 * (cos, sin) pairs per row of a, to consecutive row pairs of a matrix. */
static void u_givens_seqq(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i,2),(n;i,j)->(n;i,j)
    const npy_intp nn = dims[0], ii = dims[1], jj = dims[3];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sai = steps[3], _saq = steps[4], _sbi = steps[5],
                   _sbj = steps[6], _sci = steps[7], _scj = steps[8];
    char *_a = args[0], *_b = args[1], *_c = args[2];

    ensure_inplace_3(_b, _c, nn, _sbn, _scn, ii, _sbi, _sci, jj, _sbj, _scj);
    for (npy_intp n = 0; n != nn; ++n, _a += _san, _c += _scn) {
        /* The rotations along i are interdependent (each touches rows i
         * and i+1), so we parallelize over the independent columns j
         * instead.
         */
        #pragma omp parallel for
        for (npy_intp j = 0; j < jj; ++j) {
            for (npy_intp i = 0; i < ii - 1; ++i) {
                ddouble *c_x = (ddouble *)(_c + i *_sci + j * _scj);
                ddouble *c_y = (ddouble *)(_c + (i + 1) *_sci + j * _scj);
                ddouble g_cos = *(ddouble *)(_a + i * _sai);
                ddouble g_sin = *(ddouble *)(_a + i * _sai + _saq);
                lmul_givensq(c_x, c_y, g_cos, g_sin, *c_x, *c_y);
            }
        }
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: Golub-Kahan chase on a bidiagonal matrix given
 * by its diagonal and superdiagonal; also records the applied rotations
 * in a contiguous (i, 4) buffer. */
static void u_golub_kahan_chaseq(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i),(n;i)->(n;i),(n;i),(n;i,4)
    const npy_intp nn = dims[0], ii = dims[1];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sdn = steps[3], _sen = steps[4], _sai = steps[5],
                   _sbi = steps[6], _sci = steps[7], _sdi = steps[8],
                   _sei = steps[9], _se4 = steps[10];
    char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3],
         *_e = args[4];

    ensure_inplace_2(_a, _c, nn, _san, _scn, ii, _sai, _sci);
    ensure_inplace_2(_b, _d, nn, _sbn, _sdn, ii, _sbi, _sdi);
    /* golub_kahan_chaseq writes the rotation buffer assuming C-contiguous
     * (i, 4) layout; bail out (leaving outputs as the copied inputs)
     * rather than scribble over memory with the wrong strides. */
    if (_se4 != sizeof(ddouble) || _sei != 4 * sizeof(ddouble)) {
        fprintf(stderr, "rot is not contiguous, but needs to be");
        return;
    }

    for (npy_intp n = 0; n != nn; ++n, _c += _scn, _d += _sdn, _e += _sen) {
        golub_kahan_chaseq((ddouble *)_c, _sci / sizeof(ddouble),
                           (ddouble *)_d, _sdi / sizeof(ddouble),
                           ii, (ddouble *)_e);
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: full SVD of a 2x2 (upper triangular) problem,
 * A = U diag(smax, smin) V^T; b receives U, c the singular values,
 * d receives V^T. */
static void u_svd_2x2(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;2,2)->(n;2,2),(n;2),(n;2,2)
    const npy_intp nn = dims[0];
    const npy_intp san = steps[0], sbn = steps[1], scn = steps[2],
                   sdn = steps[3], sai = steps[4], saj = steps[5],
                   sbi = steps[6], sbj = steps[7], sci = steps[8],
                   sdi = steps[9], sdj = steps[10];
    char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3];

    for (npy_intp n = 0; n != nn;
                    ++n, _a += san, _b += sbn, _c += scn, _d += sdn) {
        ddouble a11 = *(ddouble *) _a;
        ddouble a12 = *(ddouble *) (_a + saj);
        ddouble a21 = *(ddouble *) (_a + sai);
        ddouble a22 = *(ddouble *) (_a + sai + saj);

        ddouble smin, smax, cu, su, cv, sv;
        svd_2x2(a11, a12, a21, a22, &smin, &smax, &cv, &sv, &cu, &su);

        /* U = [[cu, -su], [su, cu]] */
        *(ddouble *)_b = cu;
        *(ddouble *)(_b + sbj) = negw(su);
        *(ddouble *)(_b + sbi) = su;
        *(ddouble *)(_b + sbi + sbj) = cu;

        /* singular values, largest first */
        *(ddouble *)_c = smax;
        *(ddouble *)(_c + sci) = smin;

        /* V^T = [[cv, sv], [-sv, cv]] */
        *(ddouble *)_d = cv;
        *(ddouble *)(_d + sdj) = sv;
        *(ddouble *)(_d + sdi) = negw(sv);
        *(ddouble *)(_d + sdi + sdj) = cv;
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: singular values only of a 2x2 problem. */
static void u_svvals_2x2(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;2,2)->(n;2)
    const npy_intp nn = dims[0];
    const npy_intp san = steps[0], sbn = steps[1], sai = steps[2],
                   saj = steps[3], sbi = steps[4];
    char *_a = args[0], *_b = args[1];

    for (npy_intp n = 0; n != nn; ++n, _a += san, _b += sbn) {
        ddouble a11 = *(ddouble *) _a;
        ddouble a12 = *(ddouble *) (_a + saj);
        ddouble a21 = *(ddouble *) (_a + sai);
        ddouble a22 = *(ddouble *) (_a + sai + saj);

        ddouble smin, smax;
        svd_2x2(a11, a12, a21, a22, &smin, &smax, NULL, NULL, NULL, NULL);

        *(ddouble *)_b = smax;
        *(ddouble *)(_b + sbi) = smin;
    }
    MARK_UNUSED(data);
}

/* ----------------------- Python stuff -------------------------- */

static PyObject *module;
static PyObject *numpy_module = NULL;
/* NumPy type number of the ddouble dtype, filled by import_ddouble_dtype */
static int type_num;

static PyObject *make_module()
{
    static PyMethodDef
no_methods[] = { 331 | {NULL, NULL, 0, NULL} // No methods defined 332 | }; 333 | static struct PyModuleDef module_def = { 334 | PyModuleDef_HEAD_INIT, 335 | "_dd_linalg", 336 | NULL, 337 | -1, 338 | no_methods, 339 | NULL, 340 | NULL, 341 | NULL, 342 | NULL 343 | }; 344 | module = PyModule_Create(&module_def); 345 | return module; 346 | } 347 | 348 | static int import_ddouble_dtype() 349 | { 350 | PyObject *dd_module = PyImport_ImportModule("xprec._dd_ufunc"); 351 | if (dd_module == NULL) 352 | return -1; 353 | 354 | PyArray_Descr *dtype = 355 | (PyArray_Descr *)PyObject_GetAttrString(dd_module, "dtype"); 356 | if (dtype == NULL) 357 | return -1; 358 | 359 | /* Let's pray at least this stays public */ 360 | type_num = dtype->type_num; 361 | return 0; 362 | } 363 | 364 | static int gufunc( 365 | PyUFuncGenericFunction uloop, int nin, int nout, 366 | const char *signature, const char *name, const char *docstring, 367 | bool in_numpy) 368 | { 369 | PyUFuncObject *ufunc = NULL; 370 | int *arg_types = NULL, retcode = 0; 371 | 372 | if (in_numpy) { 373 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 374 | } else { 375 | ufunc = (PyUFuncObject *)PyUFunc_FromFuncAndDataAndSignature( 376 | NULL, NULL, NULL, 0, nin, nout, PyUFunc_None, name, 377 | docstring, 0, signature); 378 | } 379 | if (ufunc == NULL) goto error; 380 | 381 | int *dtypes = PyMem_New(int, nin + nout); 382 | if (dtypes == NULL) goto error; 383 | 384 | for (int i = 0; i != nin + nout; ++i) 385 | dtypes[i] = type_num; 386 | 387 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 388 | uloop, arg_types, NULL); 389 | if (retcode < 0) goto error; 390 | 391 | return PyModule_AddObject(module, name, (PyObject *)ufunc); 392 | 393 | error: 394 | if (!in_numpy) 395 | Py_XDECREF(ufunc); 396 | PyMem_Free(arg_types); 397 | return -1; 398 | } 399 | 400 | PyMODINIT_FUNC PyInit__dd_linalg(void) 401 | { 402 | if (!make_module()) 403 | return NULL; 404 | 405 | /* Initialize numpy things */ 406 | 
import_array(); 407 | import_umath(); 408 | 409 | numpy_module = PyImport_ImportModule("numpy"); 410 | if (numpy_module == NULL) 411 | return NULL; 412 | 413 | if (import_ddouble_dtype() < 0) 414 | return NULL; 415 | 416 | gufunc(u_normw, 1, 1, "(i)->()", 417 | "norm", "Vector 2-norm", false); 418 | gufunc(u_matmulw, 2, 1, "(i?,j),(j,k?)->(i?,k?)", 419 | "matmul", "Matrix multiplication", true); 420 | gufunc(u_givensw, 1, 2, "(2)->(2),(2,2)", 421 | "givens", "Generate Givens rotation", false); 422 | gufunc(u_givens_seqq, 2, 1, "(i,2),(i,j?)->(i,j?)", 423 | "givens_seq", "apply sequence of givens rotation to matrix", false); 424 | gufunc(u_householderw, 1, 2, "(i)->(),(i)", 425 | "householder", "Generate Householder reflectors", false); 426 | gufunc(u_rank1updateq, 3, 1, "(i,j),(i),(j)->(i,j)", 427 | "rank1update", "Perform rank-1 update of matrix", false); 428 | gufunc(u_svd_2x2, 1, 3, "(2,2)->(2,2),(2),(2,2)", 429 | "svd2x2", "SVD of upper triangular 2x2 problem", false); 430 | gufunc(u_svvals_2x2, 1, 1, "(2,2)->(2)", 431 | "svvals2x2", "singular values of upper triangular 2x2 problem", false); 432 | gufunc(u_jacobisweepw, 2, 3, "(i,j),(j,j)->(i,j),(j,j),()", 433 | "jacobi_sweep", "Perform sweep of one-sided Jacobi rotations", false); 434 | gufunc(u_golub_kahan_chaseq, 2, 3, "(i),(i)->(i),(i),(i,4)", 435 | "golub_kahan_chase", "bidiagonal chase procedure", false); 436 | 437 | /* Make dtype */ 438 | PyArray_Descr *dtype = PyArray_DescrFromType(NPY_CDOUBLE); 439 | PyModule_AddObject(module, "dtype", (PyObject *)dtype); 440 | 441 | /* Module is ready */ 442 | return module; 443 | } 444 | -------------------------------------------------------------------------------- /csrc/dd_arith.c: -------------------------------------------------------------------------------- 1 | /* Double-double arithmetic library 2 | * 3 | * Part of the functions are copied from the QD library for U.C. 
Berkeley 4 | * and licensed modified BSD (see QD-LICENSE.txt) 5 | * 6 | * Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey 7 | * Copyright (C) 2021 Markus Wallerberger and others 8 | * SPDX-License-Identifier: MIT and Modified-BSD 9 | */ 10 | #include "./dd_arith.h" 11 | #include 12 | 13 | // 2**500 and 2**(-500); 14 | static const double LARGE = 3.273390607896142e+150; 15 | static const double INV_LARGE = 3.054936363499605e-151; 16 | 17 | static ddouble hypotqq_compute(ddouble x, ddouble y) 18 | { 19 | return sqrtw(addww(sqrw(x), sqrw(y))); 20 | } 21 | 22 | ddouble _hypotqq_ordered(ddouble x, ddouble y) 23 | { 24 | // assume that x >= y >= 0 25 | // special cases 26 | if (iszerow(y)) 27 | return x; 28 | 29 | // if very large or very small, renormalize 30 | if (x.hi > LARGE) { 31 | x = mul_pwr2(x, INV_LARGE); 32 | y = mul_pwr2(y, INV_LARGE); 33 | return mul_pwr2(hypotqq_compute(x, y), LARGE); 34 | } 35 | if (x.hi < INV_LARGE) { 36 | x = mul_pwr2(x, LARGE); 37 | y = mul_pwr2(y, LARGE); 38 | return mul_pwr2(hypotqq_compute(x, y), INV_LARGE); 39 | } 40 | 41 | // normal case 42 | return hypotqq_compute(x, y); 43 | } 44 | 45 | ddouble sqrtw(ddouble a) 46 | { 47 | /* Given approximation x to 1/sqrt(a), perform a single Newton step: 48 | * 49 | * sqrt(a) = a*x + [a - (a*x)^2] * x / 2 (approx) 50 | * 51 | * The approximation is accurate to twice the accuracy of x. 52 | * Also, the multiplication (a*x) and [-]*x can be done with 53 | * only half the precision. 54 | * From: Karp, High Precision Division and Square Root, 1993 55 | */ 56 | if (a.hi <= 0) 57 | return (ddouble){sqrt(a.hi), 0}; 58 | 59 | double x = 1.0 / sqrt(a.hi); 60 | double ax = a.hi * x; 61 | ddouble ax_sqr = sqrw((ddouble){ax, 0}); 62 | double diff = subww(a, ax_sqr).hi * x * 0.5; 63 | return two_sum(ax, diff); 64 | } 65 | 66 | /* Inverse Factorials from 1/0!, 1/1!, 1/2!, asf. 
 */
static int _n_inv_fact = 18;
static const ddouble _inv_fact[] = {
    {1.00000000000000000e+00, 0.00000000000000000e+00},
    {1.00000000000000000e+00, 0.00000000000000000e+00},
    {5.00000000000000000e-01, 0.00000000000000000e+00},
    {1.66666666666666657e-01, 9.25185853854297066e-18},
    {4.16666666666666644e-02, 2.31296463463574266e-18},
    {8.33333333333333322e-03, 1.15648231731787138e-19},
    {1.38888888888888894e-03, -5.30054395437357706e-20},
    {1.98412698412698413e-04, 1.72095582934207053e-22},
    {2.48015873015873016e-05, 2.15119478667758816e-23},
    {2.75573192239858925e-06, -1.85839327404647208e-22},
    {2.75573192239858883e-07, 2.37677146222502973e-23},
    {2.50521083854417202e-08, -1.44881407093591197e-24},
    {2.08767569878681002e-09, -1.20734505911325997e-25},
    {1.60590438368216133e-10, 1.25852945887520981e-26},
    {1.14707455977297245e-11, 2.06555127528307454e-28},
    {7.64716373181981641e-13, 7.03872877733453001e-30},
    {4.77947733238738525e-14, 4.39920548583408126e-31},
    {2.81145725434552060e-15, 1.65088427308614326e-31}
};

/**
 * For the exponential of `a`, return compute tuple `x, m` such that:
 *
 *      exp(a) = ldexp(1 + x, m),
 *
 * where `m` is chosen such that `abs(x) < 1`.  The value `x` is returned,
 * whereas the value `m` is given as an out parameter.
 */
static ddouble _exp_reduced(ddouble a, int *m)
{
    // Strategy: We first reduce the size of x by noting that
    //
    //     exp(k * r + m * log(2)) = 2^m * exp(r)^k
    //
    // where m and k are integers.  By choosing m appropriately
    // we can make |k * r| <= log(2) / 2 = 0.347.
    const double k = 512.0;
    const double inv_k = 1.0 / k;
    double mm = floor(a.hi / Q_LOG2.hi + 0.5);
    ddouble r = mul_pwr2(subww(a, mulwd(Q_LOG2, mm)), inv_k);
    *m = (int)mm;

    // Now, evaluate exp(r) using the Taylor series, since reducing
    // the argument substantially speeds up the convergence.  We omit
    // order 0 and start at order 1:
    ddouble rpower = r;
    ddouble term = r;
    ddouble sum = term;

    // Order 2
    rpower = sqrw(r);
    term = mul_pwr2(rpower, 0.5);
    sum = addww(sum, term);

    // Order 3 and up, stopping once a term no longer contributes
    for (int i = 3; i < 9; i++) {
        rpower = mulww(rpower, r);
        term = mulww(rpower, _inv_fact[i]);
        sum = addww(sum, term);
        if (fabs(term.hi) <= inv_k * Q_EPS.hi)
            break;
    }

    // We now have that approximately exp(r) == 1 + sum.  Raise that to
    // the k'th (512) power by squaring the binomial nine times; each step
    // maps s -> (1 + s)^2 - 1 = 2 s + s^2.
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    return sum;
}

/* Exponential.  Arguments beyond roughly +-709 (about log(DBL_MAX))
 * saturate to 0 or infinity. */
ddouble expw(ddouble a)
{
    if (a.hi <= -709.0)
        return Q_ZERO;
    if (a.hi >= 709.0)
        return infw();
    if (iszerow(a))
        return Q_ONE;
    if (isonew(a))
        return Q_E;

    int m;
    ddouble sum = _exp_reduced(a, &m);

    /** Add back the one and multiply by 2 to the m */
    sum = addwd(sum, 1.0);
    return ldexpw(sum, (int)m);
}

/* exp(a) - 1, accurate also for small a where exp would cancel. */
ddouble expm1w(ddouble a)
{
    if (a.hi <= -709.0)
        return (ddouble){-1.0, 0.0};
    if (a.hi >= 709.0)
        return infw();
    if (iszerow(a))
        return Q_ZERO;

    int m;
    ddouble sum = _exp_reduced(a, &m);

    /* Truncation case: simply return sum, which already is exp(a) - 1 */
    if (m == 0)
        return sum;

    /* Non-truncation case: compute full exp, then remove the one */
    sum = addwd(sum, 1.0);
    sum = ldexpw(sum, (int)m);
    return subwd(sum, 1.0);
}

/* Thin wrapper around ldexpw -- presumably kept as a separate symbol for
 * ufunc registration; verify against callers. */
ddouble ldexpwi(ddouble a, int exp)
{
    return ldexpw(a, exp);
}

ddouble logw(ddouble a)
{
    /* Strategy.  The Taylor series for log converges much more
     * slowly than that of exp, due to the lack of the factorial
     * term in the denominator.  Hence this routine instead tries
     * to determine the root of the function
     *
     *     f(x) = exp(x) - a
     *
     * using Newton iteration.  The iteration is given by
     *
     *     x' = x - f(x)/f'(x)
     *        = x - (1 - a * exp(-x))
     *        = x + a * exp(-x) - 1.
     *
     * Only one iteration is needed, since Newton's iteration
     * approximately doubles the number of digits per iteration.
     */
    if (isonew(a))
        return Q_ZERO;
    if (iszerow(a))
        return negw(infw());
    if (!ispositivew(a))
        return nanw();

    ddouble x = {log(a.hi), 0.0};   /* Initial approximation */
    x = subwd(addww(x, mulww(a, expw(negw(x)))), 1.0);
    return x;
}

/* Table of sin(k * pi/16) and cos(k * pi/16).
 */
static const ddouble _sin_table[] = {
    {1.950903220161282758e-01, -7.991079068461731263e-18},
    {3.826834323650897818e-01, -1.005077269646158761e-17},
    {5.555702330196021776e-01, 4.709410940561676821e-17},
    {7.071067811865475727e-01, -4.833646656726456726e-17}
};

static const ddouble _cos_table[] = {
    {9.807852804032304306e-01, 1.854693999782500573e-17},
    {9.238795325112867385e-01, 1.764504708433667706e-17},
    {8.314696123025452357e-01, 1.407385698472802389e-18},
    {7.071067811865475727e-01, -4.833646656726456726e-17}
};

/* Taylor series for sin, intended for pre-reduced arguments. */
static ddouble sin_taylor(ddouble a)
{
    // Use the Taylor series a - a^3/3! + a^5/5! + ...
    const double thresh = 0.5 * fabs(a.hi) * Q_EPS.hi;
    const ddouble minus_asquared = negw(sqrw(a));

    // First order:
    ddouble apow = a;
    ddouble term = a;
    ddouble sum = a;

    // Subsequent orders, stopping once a term no longer contributes:
    for (int i = 3; i < _n_inv_fact; i += 2) {
        apow = mulww(apow, minus_asquared);
        term = mulww(apow, _inv_fact[i]);
        sum = addww(sum, term);
        if (fabs(term.hi) <= thresh)
            break;
    }
    return sum;
}

/* Taylor series for cos, intended for pre-reduced arguments. */
static ddouble cos_taylor(ddouble a)
{
    // Use Taylor series 1 - x^2/2! + x^4/4! + ...
    const double thresh = 0.5 * Q_EPS.hi;
    const ddouble minus_asquared = negw(sqrw(a));

    // Zeroth and second order:
    ddouble apow = minus_asquared;
    ddouble term = mul_pwr2(apow, 0.5);
    ddouble sum = adddw(1.0, term);

    // From fourth order:
    for (int i = 4; i < _n_inv_fact; i += 2) {
        apow = mulww(apow, minus_asquared);
        term = mulww(apow, _inv_fact[i]);
        sum = addww(sum, term);
        if (fabs(term.hi) <= thresh)
            break;
    }
    return sum;
}

/* Computes sin and cos of a together; cos is recovered from sin via
 * sqrt(1 - sin^2), which is valid since callers pass a reduced argument
 * (|a| <= pi/32, where cos is positive). */
static void sincos_taylor(ddouble a, ddouble *sin_a, ddouble *cos_a)
{
    if (iszerow(a)) {
        *sin_a = Q_ZERO;
        *cos_a = Q_ONE;
    } else {
        *sin_a = sin_taylor(a);
        *cos_a = sqrtw(subdw(1.0, sqrw(*sin_a)));
    }
}

/**
 * To compute 2pi-periodic function, we reduce the argument `a` by
 * choosing integers z, -2 <= j <= 2 and -4 <= k <= 4 such that:
 *
 *      a == z * (2*pi) + j * (pi/2) + k * (pi/16) + t,
 *
 * where `abs(t) <= pi/32`.
 */
static ddouble mod_pi16(ddouble a, int *j, int *k)
{
    /* local copy of pi/16 (same value as Q_PI_16) */
    static const ddouble pi_16 =
        {1.963495408493620697e-01, 7.654042494670957545e-18};

    // approximately reduce modulo 2*pi
    ddouble z = roundw(divww(a, Q_2PI));
    ddouble r = subww(a, mulww(Q_2PI, z));

    // approximately reduce modulo pi/2
    double q = floor(r.hi / Q_PI_2.hi + 0.5);
    ddouble t = subww(r, mulwd(Q_PI_2, q));
    *j = (int)q;

    // approximately reduce modulo pi/16.
    q = floor(t.hi / pi_16.hi + 0.5);
    t = subww(t, mulwd(pi_16, q));
    *k = (int)q;
    return t;
}

ddouble sinw(ddouble a)
{
    /* Strategy.  To compute sin(x), we choose integers a, b so that
     *
     *     x = s + a * (pi/2) + b * (pi/16)
     *
     * and |s| <= pi/32.  Using the fact that
     *
     *     sin(pi/16) = 0.5 * sqrt(2 - sqrt(2 + sqrt(2)))
     *
     * we can compute sin(x) from sin(s), cos(s).  This greatly
     * increases the convergence of the sine Taylor series.
     */
    if (iszerow(a))
        return Q_ZERO;

    int j, k;
    ddouble t = mod_pi16(a, &j, &k);
    int abs_k = abs(k);

    /* mod_pi16 guarantees these ranges; out-of-range means the reduction
     * failed (e.g. non-finite input) */
    if (j < -2 || j > 2)
        return nanw();

    if (abs_k > 4)
        return nanw();

    if (k == 0) {
        switch (j)
        {
        case 0:
            return sin_taylor(t);
        case 1:
            return cos_taylor(t);
        case -1:
            return negw(cos_taylor(t));
        default:
            return negw(sin_taylor(t));
        }
    }

    /* combine via angle-addition formulas with tabulated
     * sin/cos(k * pi/16) */
    ddouble u = _cos_table[abs_k - 1];
    ddouble v = _sin_table[abs_k - 1];
    ddouble sin_x, cos_x, r;
    sincos_taylor(t, &sin_x, &cos_x);
    if (j == 0) {
        if (k > 0)
            r = addww(mulww(u, sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(u, sin_x), mulww(v, cos_x));
    } else if (j == 1) {
        if (k > 0)
            r = subww(mulww(u, cos_x), mulww(v, sin_x));
        else
            r = addww(mulww(u, cos_x), mulww(v, sin_x));
    } else if (j == -1) {
        if (k > 0)
            r = subww(mulww(v, sin_x), mulww(u, cos_x));
        else if (k < 0)  /* NOTE: k == 0 was handled above, so this
                          * else-if covers every remaining case and r is
                          * always initialized */
            r = subww(mulww(negw(u), cos_x), mulww(v, sin_x));
    } else {
        if (k > 0)
            r = subww(mulww(negw(u), sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(v, cos_x), mulww(u, sin_x));
    }
    return r;
}

/* Cosine; same reduction strategy as sinw above. */
ddouble cosw(ddouble a)
{
    if (iszerow(a))
        return Q_ONE;

    int j, k;
    ddouble t = mod_pi16(a, &j, &k);
    int abs_k = abs(k);

    if (j < -2 || j > 2)
        return nanw();

    if (abs_k > 4)
        return nanw();

    if (k == 0) {
        switch (j) {
        case 0:
            return cos_taylor(t);
        case 1:
            return negw(sin_taylor(t));
        case -1:
            return sin_taylor(t);
        default:
            return negw(cos_taylor(t));
        }
    }

    ddouble sin_x, cos_x, r;
    sincos_taylor(t, &sin_x, &cos_x);
    ddouble u = _cos_table[abs_k - 1];
    ddouble v = _sin_table[abs_k - 1];

    if (j == 0) {
        if (k > 0)
            r = subww(mulww(u, cos_x), mulww(v, sin_x));
        else
            r = addww(mulww(u, cos_x), mulww(v, sin_x));
    } else if (j == 1) {
        if (k > 0)
            r = subww(mulww(negw(u), sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(v, cos_x), mulww(u, sin_x));
    } else if (j == -1) {
        if (k > 0)
            r = addww(mulww(u, sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(u, sin_x), mulww(v, cos_x));
    } else {
        if (k > 0)
            r = subww(mulww(v, sin_x), mulww(u, cos_x));
        else
            r = subww(mulww(negw(u), cos_x), mulww(v, sin_x));
    }
    return r;
}

/* Hyperbolic sine: (exp(a) - exp(-a)) / 2 for moderate arguments. */
ddouble sinhw(ddouble a)
{
    if (iszerow(a))
        return Q_ZERO;

    if (absw(a).hi > 0.05) {
        ddouble ea = expw(a);
        if (isinfw(ea))
            return ea;
        if (iszerow(ea))
            return negw(infw());
        return mul_pwr2(subww(ea, reciprocalw(ea)), 0.5);
    }

    // When a is small, using the above formula gives a lot of cancellation.
461 | // Use Taylor series: x + x^3/3! + x^5/5! + ... 462 | const ddouble asquared = sqrw(a); 463 | const double thresh = fabs(a.hi) * Q_EPS.hi; 464 | 465 | // First order: 466 | ddouble apower = a; 467 | ddouble sum = a; 468 | ddouble term = a; 469 | 470 | // From third order: 471 | for (int i = 3; i < _n_inv_fact; i += 2) { 472 | apower = mulww(apower, asquared); 473 | term = mulww(apower, _inv_fact[i]); 474 | sum = addww(sum, term); 475 | if (fabs(term.hi) <= thresh) 476 | break; 477 | } 478 | return sum; 479 | } 480 | 481 | ddouble coshw(ddouble a) 482 | { 483 | if (iszerow(a)) 484 | return Q_ONE; 485 | 486 | ddouble ea = expw(a); 487 | if (isinfw(ea) || iszerow(ea)) 488 | return infw(); 489 | return mul_pwr2(addww(ea, reciprocalw(ea)), 0.5); 490 | } 491 | 492 | ddouble tanhw(ddouble a) 493 | { 494 | if (iszerow(a)) 495 | return Q_ZERO; 496 | 497 | if (fabs(a.hi) > 0.05) { 498 | ddouble ea = expw(a); 499 | ddouble inv_ea = reciprocalw(ea); 500 | return divww(subww(ea, inv_ea), addww(ea, inv_ea)); 501 | } 502 | 503 | ddouble s, c; 504 | s = sinhw(a); 505 | c = sqrtw(adddw(1.0, sqrw(s))); 506 | return divww(s, c); 507 | } 508 | 509 | ddouble tanw(ddouble a) 510 | { 511 | if (iszerow(a)) 512 | return Q_ZERO; 513 | 514 | ddouble s, c; 515 | s = sinw(a); 516 | c = cosw(a); 517 | return divww(s, c); 518 | } 519 | 520 | void sincosw(const ddouble a, ddouble *sin_a, ddouble *cos_a) 521 | { 522 | if (iszerow(a)) { 523 | *sin_a = Q_ZERO; 524 | *cos_a = Q_ONE; 525 | return; 526 | } 527 | 528 | int j, k; 529 | ddouble t = mod_pi16(a, &j, &k); 530 | int abs_j = abs(j), abs_k = abs(k); 531 | 532 | if (abs_j > 2 || abs_k > 4) { 533 | *cos_a = *sin_a = nanw(); 534 | return; 535 | } 536 | 537 | ddouble sin_t, cos_t; 538 | ddouble s, c; 539 | 540 | sincos_taylor(t, &sin_t, &cos_t); 541 | 542 | if (abs_k == 0) { 543 | s = sin_t; 544 | c = cos_t; 545 | } else { 546 | ddouble u = _cos_table[abs_k - 1]; 547 | ddouble v = _sin_table[abs_k - 1]; 548 | 549 | if (k > 0) { 550 | s = 
addww(mulww(u, sin_t), mulww(v, cos_t)); 551 | c = subww(mulww(u, cos_t), mulww(v, sin_t)); 552 | } else { 553 | s = subww(mulww(u, sin_t), mulww(v, cos_t)); 554 | c = addww(mulww(u, cos_t), mulww(v, sin_t)); 555 | } 556 | } 557 | if (abs_j == 0) { 558 | *sin_a = s; 559 | *cos_a = c; 560 | } else if (j == 1) { 561 | *sin_a = c; 562 | *cos_a = negw(s); 563 | } else if (j == -1) { 564 | *sin_a = negw(c); 565 | *cos_a = s; 566 | } else { 567 | *sin_a = negw(s); 568 | *cos_a = negw(c); 569 | } 570 | 571 | } 572 | 573 | ddouble atan2ww(ddouble y, ddouble x) 574 | { 575 | /* Strategy: Instead of using Taylor series to compute 576 | * arctan, we instead use Newton's iteration to solve 577 | * the equation 578 | * 579 | * sin(z) = y/r or cos(z) = x/r 580 | * 581 | * where r = sqrt(x^2 + y^2). 582 | * The iteration is given by 583 | * 584 | * z' = z + (y - sin(z)) / cos(z) (for equation 1) 585 | * z' = z - (x - cos(z)) / sin(z) (for equation 2) 586 | * 587 | * Here, x and y are normalized so that x^2 + y^2 = 1. 588 | * If |x| > |y|, then first iteration is used since the 589 | * denominator is larger. Otherwise, the second is used. 590 | */ 591 | if (iszerow(x) && iszerow(y)) 592 | return Q_ZERO; 593 | if (iszerow(x)) 594 | return (ispositivew(y)) ? Q_PI_2 : negw(Q_PI_2); 595 | if (iszerow(y)) 596 | return (ispositivew(x)) ? Q_ZERO : Q_PI; 597 | if (equalww(x, y)) 598 | return (ispositivew(y)) ? Q_PI_4: negw(Q_3PI_4); 599 | if (equalww(x, negw(y))) 600 | return (ispositivew(y)) ? Q_3PI_4 : negw(Q_PI_4); 601 | 602 | ddouble r = hypotww(x, y); 603 | x = divww(x, r); 604 | y = divww(y, r); 605 | 606 | /* Compute double precision approximation to atan. */ 607 | ddouble z = (ddouble){atan2(y.hi, x.hi), 0.}; 608 | ddouble sin_z, cos_z; 609 | 610 | sincosw(z, &sin_z, &cos_z); 611 | if (fabs(x.hi) > fabs(y.hi)) { 612 | /* Use Newton iteration 1. z' = z + (y - sin(z)) / cos(z) */ 613 | z = addww(z, divww(subww(y, sin_z), cos_z)); 614 | } else { 615 | /* Use Newton iteration 2. 
z' = z - (x - cos(z)) / sin(z) */ 616 | z = subww(z, divww(subww(x, cos_z), sin_z)); 617 | } 618 | return z; 619 | } 620 | 621 | ddouble atan2dw(const double a, const ddouble b) 622 | { 623 | return atan2ww((ddouble){a, 0.}, b); 624 | } 625 | 626 | ddouble atan2wd(const ddouble a, const double b) 627 | { 628 | return atan2ww(a, (ddouble){b, 0.}); 629 | } 630 | 631 | ddouble atanw(const ddouble a) 632 | { 633 | return atan2ww(a, Q_ONE); 634 | } 635 | 636 | ddouble acosw(const ddouble a) 637 | { 638 | ddouble abs_a = absw(a); 639 | if (greaterww(abs_a, Q_ONE)) 640 | return nanw(); 641 | if (isonew(abs_a)) 642 | return (ispositivew(a)) ? Q_ZERO : Q_PI; 643 | 644 | return atan2ww(sqrtw(subdw(1.0, sqrw(a))), a); 645 | } 646 | 647 | ddouble asinw(const ddouble a) 648 | { 649 | ddouble abs_a = absw(a); 650 | if (greaterwd(abs_a, 1.0)) 651 | return nanw(); 652 | if (isonew(abs_a)) 653 | return (ispositivew(a)) ? Q_PI_2 : negw(Q_PI_2); 654 | 655 | return atan2ww(a, sqrtw(subdw(1.0, sqrw(a)))); 656 | } 657 | 658 | ddouble asinhw(const ddouble a) 659 | { 660 | return logw(addww(a,sqrtw(addwd(sqrw(a),1.0)))); 661 | } 662 | 663 | ddouble acoshw(const ddouble a) 664 | { 665 | if (lesswd(a, 1.0)) 666 | return nanw(); 667 | 668 | return logw(addww(a, sqrtw(subwd(sqrw(a), 1.0)))); 669 | } 670 | 671 | ddouble atanhw(const ddouble a) 672 | { 673 | if (equalwd(a, -1.0)) 674 | return negw(infw()); 675 | if (isonew(a)) 676 | return infw(); 677 | if (greaterwd(absw(a), 1.0)) 678 | return nanw(); 679 | 680 | return mul_pwr2(logw(divww(adddw(1.0, a) , subdw(1.0, a))), 0.5); 681 | } 682 | 683 | ddouble powww(const ddouble a, const ddouble b) 684 | { 685 | if (iszerow(a) && iszerow(b)) 686 | return Q_ONE; 687 | if (iszerow(a) && !iszerow(b)) 688 | return Q_ZERO; 689 | 690 | return expw(mulww(b, logw(a))); 691 | } 692 | 693 | ddouble powwd(const ddouble a, const double b) 694 | { 695 | if (iszerow(a) && b == 0) 696 | return Q_ONE; 697 | if (iszerow(a) && b != 0) 698 | return Q_ZERO; 699 | 700 
| return expw(muldw(b, logw(a))); 701 | } 702 | 703 | ddouble powdw(const double a, const ddouble b) 704 | { 705 | if (a == 0 && iszerow(b)) 706 | return Q_ONE; 707 | if (a == 0 && !iszerow(b)) 708 | return Q_ZERO; 709 | 710 | return expw(mulwd(b, log(a))); 711 | } 712 | 713 | ddouble modfww(const ddouble a, ddouble *b) 714 | { 715 | if (isnegativew(a)) { 716 | *b = ceilw(a); 717 | } else { 718 | *b = floorw(a); 719 | } 720 | return subww(a, *b); 721 | } 722 | -------------------------------------------------------------------------------- /csrc/_dd_ufunc.c: -------------------------------------------------------------------------------- 1 | /* Python extension module for the ddouble data type. 2 | * 3 | * Code is adapted from tensorflow's bfloat16 extension type, found here: 4 | * `tensorflow/python/lib/core/bfloat16.cc` and licensed Apache 2.0. 5 | * 6 | * Copyright (C) 2021 Markus Wallerberger and others 7 | * SPDX-License-Identifier: MIT 8 | */ 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "dd_arith.h" 17 | 18 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 19 | #include "numpy/ndarraytypes.h" 20 | #include "numpy/ufuncobject.h" 21 | #include "numpy/npy_3kcompat.h" 22 | 23 | #if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_TYPE) 24 | static inline void _Py_SET_TYPE(PyObject *ob, PyTypeObject *type) 25 | { ob->ob_type = type; } 26 | #define Py_SET_TYPE(ob, type) _Py_SET_TYPE((PyObject*)(ob), type) 27 | #endif 28 | 29 | /** 30 | * Allows parameter to be marked unused 31 | */ 32 | #define MARK_UNUSED(x) do { (void)(x); } while(false) 33 | 34 | #ifdef _MSC_VER 35 | #define alignof __alignof 36 | #endif 37 | 38 | /* ------------------------ DDouble object ----------------------- */ 39 | 40 | static PyObject *module = NULL; 41 | static PyObject *numpy_module = NULL; 42 | static int type_num = -1; //FIXME 43 | 44 | static PyTypeObject *pyddouble_type = NULL; 45 | static PyObject *pyddouble_finfo = NULL; 46 | 47 | 
typedef struct { 48 | PyObject_HEAD 49 | ddouble x; 50 | } PyDDouble; 51 | 52 | static bool PyDDouble_Check(PyObject* object) 53 | { 54 | return PyObject_IsInstance(object, (PyObject *)pyddouble_type); 55 | } 56 | 57 | static PyObject *PyDDouble_Wrap(ddouble x) 58 | { 59 | PyDDouble *obj = (PyDDouble *) pyddouble_type->tp_alloc(pyddouble_type, 0); 60 | if (obj != NULL) 61 | obj->x = x; 62 | return (PyObject *)obj; 63 | } 64 | 65 | static ddouble PyDDouble_Unwrap(PyObject *arg) 66 | { 67 | return ((PyDDouble *)arg)->x; 68 | } 69 | 70 | static bool PyDDouble_Cast(PyObject *arg, ddouble *out) 71 | { 72 | if (PyDDouble_Check(arg)) { 73 | *out = PyDDouble_Unwrap(arg); 74 | } else if (PyFloat_Check(arg)) { 75 | double val = PyFloat_AsDouble(arg); 76 | *out = (ddouble) {val, 0.0}; 77 | } else if (PyLong_Check(arg)) { 78 | long val = PyLong_AsLong(arg); 79 | *out = (ddouble) {val, 0.0}; 80 | } else if (PyArray_IsScalar(arg, Float)) { 81 | float val; 82 | PyArray_ScalarAsCtype(arg, &val); 83 | *out = (ddouble) {val, 0.0}; 84 | } else if (PyArray_IsScalar(arg, Double)) { 85 | double val; 86 | PyArray_ScalarAsCtype(arg, &val); 87 | *out = (ddouble) {val, 0.0}; 88 | } else if (PyArray_IsZeroDim(arg)) { 89 | PyArrayObject* arr = (PyArrayObject *)arg; 90 | if (PyArray_TYPE(arr) == type_num) { 91 | *out = *(ddouble *)PyArray_DATA(arr); 92 | } else { 93 | arr = (PyArrayObject *)PyArray_Cast(arr, type_num); 94 | if (!PyErr_Occurred()) 95 | *out = *(ddouble *)PyArray_DATA(arr); 96 | else 97 | *out = nanw(); 98 | Py_XDECREF(arr); 99 | } 100 | } else { 101 | *out = nanw(); 102 | PyErr_Format(PyExc_TypeError, 103 | "Cannot cast instance of %s to ddouble scalar", 104 | arg->ob_type->tp_name); 105 | } 106 | return !PyErr_Occurred(); 107 | } 108 | 109 | static PyObject* PyDDouble_New(PyTypeObject *type, PyObject *args, PyObject *kwds) 110 | { 111 | PyObject *arg = NULL; 112 | if (PyArg_ParseTuple(args, "O", &arg) < 0) 113 | return NULL; 114 | 115 | ddouble val; 116 | if 
(PyDDouble_Check(arg)) { 117 | Py_INCREF(arg); 118 | return arg; 119 | } else if (PyDDouble_Cast(arg, &val)) { 120 | return PyDDouble_Wrap(val); 121 | } else { 122 | PyErr_Format(PyExc_TypeError, "expected ddouble, got %s", 123 | arg->ob_type->tp_name); 124 | return NULL; 125 | } 126 | MARK_UNUSED(type); 127 | MARK_UNUSED(kwds); 128 | } 129 | 130 | static PyObject* PyDDouble_Float(PyObject* self) 131 | { 132 | ddouble x = PyDDouble_Unwrap(self); 133 | return PyFloat_FromDouble(x.hi); 134 | } 135 | 136 | static PyObject* PyDDouble_Int(PyObject* self) 137 | { 138 | ddouble x = PyDDouble_Unwrap(self); 139 | return PyFloat_FromDouble((long) x.hi); 140 | } 141 | 142 | #define PYWRAP_UNARY(name, inner) \ 143 | static PyObject* name(PyObject* _x) \ 144 | { \ 145 | ddouble r, x; \ 146 | x = PyDDouble_Unwrap(_x); \ 147 | r = inner(x); \ 148 | return PyDDouble_Wrap(r); \ 149 | } 150 | 151 | #define PYWRAP_BINARY(name, inner, tp_inner_op) \ 152 | static PyObject* name(PyObject* _x, PyObject* _y) \ 153 | { \ 154 | ddouble r, x, y; \ 155 | if (PyArray_Check(_y)) \ 156 | return PyArray_Type.tp_as_number->tp_inner_op(_x, _y); \ 157 | if (PyDDouble_Cast(_x, &x) && PyDDouble_Cast(_y, &y)) { \ 158 | r = inner(x, y); \ 159 | return PyDDouble_Wrap(r); \ 160 | } \ 161 | return NULL; \ 162 | } 163 | 164 | #define PYWRAP_INPLACE(name, inner) \ 165 | static PyObject* name(PyObject* _self, PyObject* _y) \ 166 | { \ 167 | PyDDouble *self = (PyDDouble *)_self; \ 168 | ddouble y; \ 169 | if (PyDDouble_Cast(_y, &y)) { \ 170 | self->x = inner(self->x, y); \ 171 | Py_XINCREF(_self); \ 172 | return _self; \ 173 | } else { \ 174 | return NULL; \ 175 | } \ 176 | } 177 | 178 | PYWRAP_UNARY(PyDDouble_Positive, posw) 179 | PYWRAP_UNARY(PyDDouble_Negative, negw) 180 | PYWRAP_UNARY(PyDDouble_Absolute, absw) 181 | 182 | PYWRAP_BINARY(PyDDouble_Add, addww, nb_add) 183 | PYWRAP_BINARY(PyDDouble_Subtract, subww, nb_subtract) 184 | PYWRAP_BINARY(PyDDouble_Multiply, mulww, nb_multiply) 185 | 
PYWRAP_BINARY(PyDDouble_Divide, divww, nb_true_divide) 186 | 187 | PYWRAP_INPLACE(PyDDouble_InPlaceAdd, addww) 188 | PYWRAP_INPLACE(PyDDouble_InPlaceSubtract, subww) 189 | PYWRAP_INPLACE(PyDDouble_InPlaceMultiply, mulww) 190 | PYWRAP_INPLACE(PyDDouble_InPlaceDivide, divww) 191 | 192 | static int PyDDouble_Nonzero(PyObject* _x) 193 | { 194 | ddouble x = PyDDouble_Unwrap(_x); 195 | return !(x.hi == 0); 196 | } 197 | 198 | static PyObject* PyDDouble_RichCompare(PyObject* _x, PyObject* _y, int op) 199 | { 200 | ddouble x, y; 201 | if (!PyDDouble_Cast(_x, &x) || !PyDDouble_Cast(_y, &y)) 202 | return PyGenericArrType_Type.tp_richcompare(_x, _y, op); 203 | 204 | bool result; 205 | switch (op) { 206 | case Py_LT: 207 | result = lessww(x, y); 208 | break; 209 | case Py_LE: 210 | result = lessequalww(x, y); 211 | break; 212 | case Py_EQ: 213 | result = equalww(x, y); 214 | break; 215 | case Py_NE: 216 | result = notequalww(x, y); 217 | break; 218 | case Py_GT: 219 | result = greaterww(x, y); 220 | break; 221 | case Py_GE: 222 | result = greaterequalww(x, y); 223 | break; 224 | default: 225 | PyErr_SetString(PyExc_RuntimeError, "Invalid op type"); 226 | return NULL; 227 | } 228 | return PyBool_FromLong(result); 229 | } 230 | 231 | static Py_hash_t PyDDouble_Hash(PyObject *_x) 232 | { 233 | ddouble x = PyDDouble_Unwrap(_x); 234 | 235 | int exp; 236 | double mantissa; 237 | mantissa = frexp(x.hi, &exp); 238 | return (Py_hash_t)(LONG_MAX * mantissa) + exp; 239 | } 240 | 241 | static PyObject *PyDDouble_Str(PyObject *self) 242 | { 243 | char out[200]; 244 | ddouble x = PyDDouble_Unwrap(self); 245 | snprintf(out, 200, "%.16g", x.hi); 246 | return PyUnicode_FromString(out); 247 | } 248 | 249 | static PyObject *PyDDouble_Repr(PyObject *self) 250 | { 251 | char out[200]; 252 | ddouble x = PyDDouble_Unwrap(self); 253 | snprintf(out, 200, "ddouble(%.16g+%.16g)", x.hi, x.lo); 254 | return PyUnicode_FromString(out); 255 | } 256 | 257 | static PyObject *PyDDoubleGetFinfo(PyObject *self, 
PyObject *_dummy) 258 | { 259 | Py_INCREF(pyddouble_finfo); 260 | return pyddouble_finfo; 261 | MARK_UNUSED(self); 262 | MARK_UNUSED(_dummy); 263 | } 264 | 265 | static int make_ddouble_type() 266 | { 267 | static PyNumberMethods ddouble_as_number = { 268 | .nb_add = PyDDouble_Add, 269 | .nb_subtract = PyDDouble_Subtract, 270 | .nb_multiply = PyDDouble_Multiply, 271 | .nb_true_divide = PyDDouble_Divide, 272 | .nb_inplace_add = PyDDouble_InPlaceAdd, 273 | .nb_inplace_subtract = PyDDouble_InPlaceSubtract, 274 | .nb_inplace_multiply = PyDDouble_InPlaceMultiply, 275 | .nb_inplace_true_divide = PyDDouble_InPlaceDivide, 276 | .nb_negative = PyDDouble_Negative, 277 | .nb_positive = PyDDouble_Positive, 278 | .nb_absolute = PyDDouble_Absolute, 279 | .nb_bool = PyDDouble_Nonzero, 280 | .nb_int = PyDDouble_Int, 281 | .nb_float = PyDDouble_Float, 282 | }; 283 | static PyMethodDef ddouble_methods[] = { 284 | {"__finfo__", PyDDoubleGetFinfo, METH_NOARGS | METH_CLASS, 285 | "floating point information for type"}, 286 | {NULL, NULL, 0, NULL} 287 | }; 288 | static PyTypeObject ddouble_type = { 289 | PyVarObject_HEAD_INIT(NULL, 0) 290 | .tp_name = "ddouble", 291 | .tp_basicsize = sizeof(PyDDouble), 292 | .tp_repr = PyDDouble_Repr, 293 | .tp_as_number = &ddouble_as_number, 294 | .tp_hash = PyDDouble_Hash, 295 | .tp_str = PyDDouble_Str, 296 | .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, 297 | .tp_doc = "double-double floating point type", 298 | .tp_richcompare = PyDDouble_RichCompare, 299 | .tp_new = PyDDouble_New, 300 | .tp_methods = ddouble_methods 301 | }; 302 | 303 | ddouble_type.tp_base = &PyFloatingArrType_Type; 304 | if (PyType_Ready(&ddouble_type) < 0) 305 | return -1; 306 | 307 | pyddouble_type = &ddouble_type; 308 | return PyModule_AddObject(module, "ddouble", (PyObject *)pyddouble_type); 309 | } 310 | 311 | /* --------------------- Ddouble Finfo object -------------------- */ 312 | 313 | typedef struct { 314 | PyObject_HEAD 315 | PyObject *dtype; // which dtype 316 
| int bits; // number of bits 317 | PyObject *max; // largest positive number 318 | PyObject *min; // largest negative number 319 | PyObject *eps; // machine epsilon (spacing) 320 | int nexp; // number of exponent bits 321 | int nmant; // number of mantissa bits 322 | PyObject *machar; // machar object (unused) 323 | } PyDDoubleFInfo; 324 | 325 | static PyTypeObject *PyDDoubleFinfoType; 326 | 327 | static PyObject *PPyDDoubleFInfo_Make() 328 | { 329 | PyDDoubleFInfo *self = 330 | (PyDDoubleFInfo *) PyDDoubleFinfoType->tp_alloc(PyDDoubleFinfoType, 0); 331 | if (self == NULL) 332 | return NULL; 333 | 334 | Py_INCREF(Py_None); 335 | self->dtype = (PyObject *)PyArray_DescrFromType(type_num); 336 | self->bits = CHAR_BIT * sizeof(ddouble); 337 | self->max = PyDDouble_Wrap(Q_MAX); 338 | self->min = PyDDouble_Wrap(Q_MIN); 339 | self->eps = PyDDouble_Wrap(Q_EPS); 340 | self->nexp = 11; 341 | self->nmant = 104; 342 | self->machar = Py_None; 343 | return (PyObject *)self; 344 | } 345 | 346 | static int make_finfo() 347 | { 348 | static PyMemberDef finfo_members[] = { 349 | {"dtype", T_OBJECT_EX, offsetof(PyDDoubleFInfo, dtype), READONLY, 350 | "underlying dtype object"}, 351 | {"bits", T_INT, offsetof(PyDDoubleFInfo, bits), READONLY, 352 | "storage size of object in bits"}, 353 | {"max", T_OBJECT_EX, offsetof(PyDDoubleFInfo, max), READONLY, 354 | "largest positive number"}, 355 | {"min", T_OBJECT_EX, offsetof(PyDDoubleFInfo, min), READONLY, 356 | "largest negative number"}, 357 | {"eps", T_OBJECT_EX, offsetof(PyDDoubleFInfo, eps), READONLY, 358 | "machine epsilon"}, 359 | {"nexp", T_INT, offsetof(PyDDoubleFInfo, nexp), READONLY, 360 | "number of bits in exponent"}, 361 | {"nmant", T_INT, offsetof(PyDDoubleFInfo, nmant), READONLY, 362 | "number of bits in mantissa"}, 363 | {"machar", T_OBJECT_EX, offsetof(PyDDoubleFInfo, machar), READONLY, 364 | "machar object (unused)"}, 365 | {NULL, 0, 0, 0, NULL} 366 | }; 367 | static PyTypeObject finfo_type = { 368 | 
PyVarObject_HEAD_INIT(NULL, 0) 369 | .tp_name = "ddouble_finfo", 370 | .tp_basicsize = sizeof(PyDDoubleFInfo), 371 | .tp_members = finfo_members, 372 | .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, 373 | .tp_doc = "finfo type" 374 | }; 375 | 376 | if (PyType_Ready(&finfo_type) < 0) 377 | return -1; 378 | 379 | PyDDoubleFinfoType = &finfo_type; 380 | pyddouble_finfo = PPyDDoubleFInfo_Make(); 381 | if (pyddouble_finfo == NULL) 382 | return -1; 383 | 384 | return 0; 385 | } 386 | 387 | /* ------------------------------ Descriptor ----------------------------- */ 388 | 389 | static PyObject *NPyDDouble_GetItem(void *data, void *arr) 390 | { 391 | ddouble x = *(ddouble *)data; 392 | return PyDDouble_Wrap(x); 393 | MARK_UNUSED(arr); 394 | } 395 | 396 | static int NPyDDouble_SetItem(PyObject *item, void *data, void *arr) 397 | { 398 | ddouble x; 399 | if (!PyDDouble_Cast(item, &x)) 400 | return -1; 401 | *(ddouble *)data = x; 402 | return 0; 403 | MARK_UNUSED(arr); 404 | } 405 | 406 | static int NPyDDouble_Compare(const void *_a, const void *_b, void *arr) 407 | { 408 | ddouble a = *(const ddouble *)_a; 409 | ddouble b = *(const ddouble *)_b; 410 | 411 | if (lessww(a, b)) 412 | return -1; 413 | if (greaterww(a, b)) 414 | return 1; 415 | if (isnanw(b)) 416 | return 1; 417 | return 0; 418 | MARK_UNUSED(arr); 419 | } 420 | 421 | static void NPyDDouble_CopySwapN(void *_d, npy_intp sd, void *_s, npy_intp ss, 422 | npy_intp ii, int swap, void* arr) 423 | { 424 | if (_s == NULL) 425 | return; 426 | char *_cd = (char *)_d, *_cs = (char *)_s; 427 | if (swap) { 428 | for (npy_intp i = 0; i != ii; ++i, _cd += sd, _cs += ss) { 429 | ddouble *s = (ddouble *)_cs, *d = (ddouble *)_cd, tmp; 430 | tmp = *d; 431 | *d = *s; 432 | *s = tmp; 433 | } 434 | } else { 435 | for (npy_intp i = 0; i != ii; ++i, _cd += sd, _cs += ss) { 436 | ddouble *s = (ddouble *)_cs, *d = (ddouble *)_cd; 437 | *d = *s; 438 | } 439 | } 440 | MARK_UNUSED(arr); 441 | } 442 | 443 | static void 
NPyDDouble_CopySwap(void *_d, void *_s, int swap, void* arr) 444 | { 445 | ddouble *s = _s, *d = _d, tmp; 446 | if (_s == NULL) 447 | return; 448 | if (swap) { 449 | tmp = *d; 450 | *d = *s; 451 | *s = tmp; 452 | } else { 453 | *d = *s; 454 | } 455 | MARK_UNUSED(arr); 456 | } 457 | 458 | static npy_bool NPyDDouble_NonZero(void *data, void *arr) 459 | { 460 | ddouble x = *(ddouble *)data; 461 | return !iszerow(x); 462 | MARK_UNUSED(arr); 463 | } 464 | 465 | static int NPyDDouble_Fill(void *_buffer, npy_intp ii, void *arr) 466 | { 467 | // Fill with linear array 468 | ddouble *buffer = (ddouble *)_buffer; 469 | if (ii < 2) 470 | return -1; 471 | 472 | ddouble curr = buffer[1]; 473 | ddouble step = subww(curr, buffer[0]); 474 | for (npy_intp i = 2; i != ii; ++i) { 475 | curr = addww(curr, step); 476 | buffer[i] = curr; 477 | } 478 | return 0; 479 | MARK_UNUSED(arr); 480 | } 481 | 482 | static int NPyDDouble_FillWithScalar(void *_buffer, npy_intp ii, void *_value, 483 | void *arr) 484 | { 485 | ddouble *buffer = (ddouble *)_buffer; 486 | ddouble value = *(ddouble *)_value; 487 | for (npy_intp i = 0; i < ii; ++i) 488 | buffer[i] = value; 489 | return 0; 490 | MARK_UNUSED(arr); 491 | } 492 | 493 | static void NPyDDouble_DotFunc(void *_in1, npy_intp is1, void *_in2, 494 | npy_intp is2, void *_out, npy_intp ii, void *arr) 495 | { 496 | ddouble out = Q_ZERO; 497 | char *_cin1 = (char *)_in1, *_cin2 = (char *)_in2; 498 | for (npy_intp i = 0; i < ii; ++i, _cin1 += is1, _cin2 += is2) { 499 | ddouble in1 = *(ddouble *)_cin1, in2 = *(ddouble *)_cin2; 500 | out = addww(out, mulww(in1, in2)); 501 | } 502 | *(ddouble *)_out = out; 503 | MARK_UNUSED(arr); 504 | } 505 | 506 | static int NPyDDouble_ArgMax(void *_data, npy_intp n, npy_intp *max_ind, 507 | void *arr) 508 | { 509 | ddouble *data = (ddouble *)_data; 510 | ddouble max_val = negw(infw()); 511 | for (npy_intp i = 0; i < n; ++i) { 512 | if (greaterww(data[i], max_val)) { 513 | max_val = data[i]; 514 | *max_ind = i; 515 | } 
516 | } 517 | return 0; 518 | MARK_UNUSED(arr); 519 | } 520 | 521 | static int NPyDDouble_ArgMin(void *_data, npy_intp n, npy_intp *min_ind, 522 | void *arr) 523 | { 524 | ddouble *data = (ddouble *)_data; 525 | ddouble min_val = infw(); 526 | for (npy_intp i = 0; i < n; ++i) { 527 | if (lessww(data[i], min_val)) { 528 | min_val = data[i]; 529 | *min_ind = i; 530 | } 531 | } 532 | return 0; 533 | MARK_UNUSED(arr); 534 | } 535 | 536 | /* This is necessary in order to ensure both 1.0 and 2.0 compatibility. 537 | * https://numpy.org/doc/stable/reference/c-api/array.html#c.PyArray_RegisterDataType 538 | */ 539 | #if NPY_ABI_VERSION < 0x02000000 540 | #define PyArray_DescrProto PyArray_Descr 541 | #endif 542 | 543 | static int make_dtype() 544 | { 545 | /* Check if another module has registered a ddouble type. 546 | * 547 | * FIXME: this check is removed, let's see if it is missed ... 548 | */ 549 | //type_num = PyArray_TypeNumFromName("ddouble"); 550 | //if (type_num != NPY_NOTYPE) { 551 | // return type_num; 552 | //} 553 | 554 | static PyArray_ArrFuncs ddouble_arrfuncs; 555 | 556 | static PyArray_DescrProto ddouble_dtype = { 557 | PyObject_HEAD_INIT(NULL) 558 | 559 | /* We must register ddouble with a kind other than "f", because numpy 560 | * considers two types with the same kind and size to be equal, but 561 | * float128 != ddouble. The downside of this is that NumPy scalar 562 | * promotion does not work with ddoubles. 563 | */ 564 | .kind = 'V', 565 | .type = 'E', 566 | .byteorder = '=', 567 | 568 | /* NPY_USE_GETITEM is not needed, since we inherit from numpy scalar, 569 | * which according to the docs means that "standard conversion" is 570 | * used. However, we still need to define and register getitem() 571 | * below, otherwise PyArray_RegisterDataType complains. 
572 | */ 573 | .flags = 0, 574 | .elsize = sizeof(ddouble), 575 | .alignment = alignof(ddouble), 576 | .hash = -1 577 | }; 578 | 579 | ddouble_dtype.typeobj = pyddouble_type; 580 | ddouble_dtype.f = &ddouble_arrfuncs; 581 | Py_SET_TYPE(&ddouble_dtype, &PyArrayDescr_Type); 582 | 583 | PyArray_InitArrFuncs(&ddouble_arrfuncs); 584 | ddouble_arrfuncs.getitem = NPyDDouble_GetItem; 585 | ddouble_arrfuncs.setitem = NPyDDouble_SetItem; 586 | ddouble_arrfuncs.compare = NPyDDouble_Compare; 587 | ddouble_arrfuncs.copyswapn = NPyDDouble_CopySwapN; 588 | ddouble_arrfuncs.copyswap = NPyDDouble_CopySwap; 589 | ddouble_arrfuncs.nonzero = NPyDDouble_NonZero; 590 | ddouble_arrfuncs.fill = NPyDDouble_Fill; 591 | ddouble_arrfuncs.fillwithscalar = NPyDDouble_FillWithScalar; 592 | ddouble_arrfuncs.dotfunc = NPyDDouble_DotFunc; 593 | ddouble_arrfuncs.argmax = NPyDDouble_ArgMax; 594 | ddouble_arrfuncs.argmin = NPyDDouble_ArgMin; 595 | 596 | type_num = PyArray_RegisterDataType(&ddouble_dtype); 597 | return type_num; 598 | } 599 | 600 | /* ------------------------------- Casts ------------------------------ */ 601 | 602 | #define NPY_CAST_FROM(func, from_type) \ 603 | static void func(void *_from, void *_to, npy_intp n, \ 604 | void *_arr_from, void *_arr_to) \ 605 | { \ 606 | ddouble *to = (ddouble *)_to; \ 607 | const from_type *from = (const from_type *)_from; \ 608 | for (npy_intp i = 0; i < n; ++i) \ 609 | to[i] = (ddouble) { from[i], 0.0 }; \ 610 | MARK_UNUSED(_arr_from); \ 611 | MARK_UNUSED(_arr_to); \ 612 | } 613 | 614 | #define NPY_CAST_FROM_I64(func, from_type) \ 615 | static void func(void *_from, void *_to, npy_intp n, \ 616 | void *_arr_from, void *_arr_to) \ 617 | { \ 618 | ddouble *to = (ddouble *)_to; \ 619 | const from_type *from = (const from_type *)_from; \ 620 | for (npy_intp i = 0; i < n; ++i) { \ 621 | double hi = from[i]; \ 622 | double lo = from[i] - (from_type) hi; \ 623 | to[i] = (ddouble){hi, lo}; \ 624 | } \ 625 | MARK_UNUSED(_arr_from); \ 626 | 
MARK_UNUSED(_arr_to); \ 627 | } 628 | 629 | #define NPY_CAST_TO(func, to_type) \ 630 | static void func(void *_from, void *_to, npy_intp n, \ 631 | void *_arr_from, void *_arr_to) \ 632 | { \ 633 | to_type *to = (to_type *)_to; \ 634 | const ddouble *from = (const ddouble *)_from; \ 635 | for (npy_intp i = 0; i < n; ++i) \ 636 | to[i] = (to_type) from[i].hi; \ 637 | MARK_UNUSED(_arr_from); \ 638 | MARK_UNUSED(_arr_to); \ 639 | } 640 | 641 | #define NPY_CAST_TO_I64(func, to_type) \ 642 | static void func(void *_from, void *_to, npy_intp n, \ 643 | void *_arr_from, void *_arr_to) \ 644 | { \ 645 | to_type *to = (to_type *)_to; \ 646 | const ddouble *from = (const ddouble *)_from; \ 647 | for (npy_intp i = 0; i < n; ++i) \ 648 | to[i] = (to_type) from[i].hi + (to_type) from[i].lo; \ 649 | MARK_UNUSED(_arr_from); \ 650 | MARK_UNUSED(_arr_to); \ 651 | } 652 | 653 | // These casts are all loss-less 654 | NPY_CAST_FROM(from_double, double) 655 | NPY_CAST_FROM(from_float, float) 656 | NPY_CAST_FROM(from_bool, bool) 657 | NPY_CAST_FROM(from_int8, int8_t) 658 | NPY_CAST_FROM(from_int16, int16_t) 659 | NPY_CAST_FROM(from_int32, int32_t) 660 | NPY_CAST_FROM(from_uint8, uint8_t) 661 | NPY_CAST_FROM(from_uint16, uint16_t) 662 | NPY_CAST_FROM(from_uint32, uint32_t) 663 | 664 | // These casts are also lossless, because we have now 2*54 bits of mantissa 665 | NPY_CAST_FROM_I64(from_int64, int64_t) 666 | NPY_CAST_FROM_I64(from_uint64, uint64_t) 667 | 668 | // These casts are all lossy 669 | NPY_CAST_TO(to_double, double) 670 | NPY_CAST_TO(to_float, float) 671 | NPY_CAST_TO(to_bool, bool) 672 | NPY_CAST_TO(to_int8, int8_t) 673 | NPY_CAST_TO(to_int16, int16_t) 674 | NPY_CAST_TO(to_int32, int32_t) 675 | NPY_CAST_TO(to_uint8, uint8_t) 676 | NPY_CAST_TO(to_uint16, uint16_t) 677 | NPY_CAST_TO(to_uint32, uint32_t) 678 | 679 | // These casts can be made more accurate 680 | NPY_CAST_TO_I64(to_int64, int64_t) 681 | NPY_CAST_TO_I64(to_uint64, uint64_t) 682 | 683 | 684 | static bool 
register_cast(int other_type, PyArray_VectorUnaryFunc from_other, 685 | PyArray_VectorUnaryFunc to_other) 686 | { 687 | PyArray_Descr *other_descr = NULL, *ddouble_descr = NULL; 688 | int ret; 689 | 690 | other_descr = PyArray_DescrFromType(other_type); 691 | if (other_descr == NULL) goto error; 692 | 693 | ddouble_descr = PyArray_DescrFromType(type_num); 694 | if (ddouble_descr == NULL) goto error; 695 | 696 | ret = PyArray_RegisterCastFunc(other_descr, type_num, from_other); 697 | if (ret < 0) goto error; 698 | 699 | // NPY_NOSCALAR apparently implies that casting is safe? 700 | ret = PyArray_RegisterCanCast(other_descr, type_num, NPY_NOSCALAR); 701 | if (ret < 0) goto error; 702 | 703 | ret = PyArray_RegisterCastFunc(ddouble_descr, other_type, to_other); 704 | if (ret < 0) goto error; 705 | return true; 706 | 707 | error: 708 | return false; 709 | } 710 | 711 | static int register_casts() 712 | { 713 | bool ok = register_cast(NPY_DOUBLE, from_double, to_double) 714 | && register_cast(NPY_FLOAT, from_float, to_float) 715 | && register_cast(NPY_BOOL, from_bool, to_bool) 716 | && register_cast(NPY_INT8, from_int8, to_int8) 717 | && register_cast(NPY_INT16, from_int16, to_int16) 718 | && register_cast(NPY_INT32, from_int32, to_int32) 719 | && register_cast(NPY_INT64, from_int64, to_int64) 720 | && register_cast(NPY_UINT8, from_uint8, to_uint8) 721 | && register_cast(NPY_UINT16, from_uint16, to_uint16) 722 | && register_cast(NPY_UINT32, from_uint32, to_uint32) 723 | && register_cast(NPY_UINT64, from_uint64, to_uint64); 724 | return ok ? 
0 : -1; 725 | } 726 | 727 | /* ------------------------------- Ufuncs ----------------------------- */ 728 | 729 | #define ULOOP_UNARY(func_name, inner_func, type_out, type_in) \ 730 | static void func_name(char **args, const npy_intp *dimensions, \ 731 | const npy_intp *steps, void *data) \ 732 | { \ 733 | const npy_intp n = dimensions[0]; \ 734 | const npy_intp is = steps[0] / sizeof(type_in), \ 735 | os = steps[1] / sizeof(type_out); \ 736 | const type_in *in = (const type_in *)args[0]; \ 737 | type_out *out = (type_out *)args[1]; \ 738 | \ 739 | for (npy_intp i = 0; i < n; ++i) \ 740 | out[i * os] = inner_func(in[i * is]); \ 741 | MARK_UNUSED(data); \ 742 | } 743 | 744 | #define ULOOP_BINARY(func_name, inner_func, type_out, type_a, type_b) \ 745 | static void func_name(char **args, const npy_intp *dimensions, \ 746 | const npy_intp* steps, void *data) \ 747 | { \ 748 | const npy_intp n = dimensions[0]; \ 749 | const npy_intp as = steps[0] / sizeof(type_a), \ 750 | bs = steps[1] / sizeof(type_b), \ 751 | os = steps[2] / sizeof(type_out); \ 752 | const type_a *a = (const type_a *)args[0]; \ 753 | const type_b *b = (const type_b *)args[1]; \ 754 | type_out *out = (type_out *)args[2]; \ 755 | \ 756 | for (npy_intp i = 0; i < n; ++i) { \ 757 | out[i * os] = inner_func(a[i * as], b[i * bs]); \ 758 | } \ 759 | MARK_UNUSED(data); \ 760 | } 761 | 762 | #define ULOOP_MODF(func_name, inner_func, type_out, type_a, type_b) \ 763 | static void func_name(char **args, const npy_intp *dimensions, \ 764 | const npy_intp* steps, void *data) \ 765 | { \ 766 | const npy_intp n = dimensions[0]; \ 767 | const npy_intp as = steps[0] / sizeof(type_a), \ 768 | bs = steps[1] / sizeof(type_b), \ 769 | os = steps[2] / sizeof(type_out); \ 770 | const type_a *a = (const type_a *)args[0]; \ 771 | type_b *b = (type_b *)args[2]; \ 772 | type_out *out = (type_out *)args[1]; \ 773 | \ 774 | for (npy_intp i = 0; i < n; ++i) { \ 775 | out[i * os] = inner_func(a[i * as], &b[i * bs]); \ 776 | } \ 777 
| MARK_UNUSED(data); \ 778 | } 779 | 780 | ULOOP_BINARY(u_addwd, addwd, ddouble, ddouble, double) 781 | ULOOP_BINARY(u_subwd, subwd, ddouble, ddouble, double) 782 | ULOOP_BINARY(u_mulwd, mulwd, ddouble, ddouble, double) 783 | ULOOP_BINARY(u_divwd, divwd, ddouble, ddouble, double) 784 | ULOOP_BINARY(u_adddw, adddw, ddouble, double, ddouble) 785 | ULOOP_BINARY(u_subdw, subdw, ddouble, double, ddouble) 786 | ULOOP_BINARY(u_muldw, muldw, ddouble, double, ddouble) 787 | ULOOP_BINARY(u_divdw, divdw, ddouble, double, ddouble) 788 | ULOOP_BINARY(u_addww, addww, ddouble, ddouble, ddouble) 789 | ULOOP_BINARY(u_subww, subww, ddouble, ddouble, ddouble) 790 | ULOOP_BINARY(u_mulww, mulww, ddouble, ddouble, ddouble) 791 | ULOOP_BINARY(u_divww, divww, ddouble, ddouble, ddouble) 792 | ULOOP_BINARY(u_copysignww, copysignww, ddouble, ddouble, ddouble) 793 | ULOOP_BINARY(u_copysignwd, copysignwd, ddouble, ddouble, double) 794 | ULOOP_BINARY(u_copysigndw, copysigndw, ddouble, double, ddouble) 795 | ULOOP_BINARY(u_equalww, equalww, bool, ddouble, ddouble) 796 | ULOOP_BINARY(u_notequalww, notequalww, bool, ddouble, ddouble) 797 | ULOOP_BINARY(u_greaterww, greaterww, bool, ddouble, ddouble) 798 | ULOOP_BINARY(u_lessww, lessww, bool, ddouble, ddouble) 799 | ULOOP_BINARY(u_greaterequalww, greaterww, bool, ddouble, ddouble) 800 | ULOOP_BINARY(u_lessequalww, lessww, bool, ddouble, ddouble) 801 | ULOOP_BINARY(u_equalwd, equalwd, bool, ddouble, double) 802 | ULOOP_BINARY(u_notequalwd, notequalwd, bool, ddouble, double) 803 | ULOOP_BINARY(u_greaterwd, greaterwd, bool, ddouble, double) 804 | ULOOP_BINARY(u_lesswd, lesswd, bool, ddouble, double) 805 | ULOOP_BINARY(u_greaterequalwd, greaterequalwd, bool, ddouble, double) 806 | ULOOP_BINARY(u_lessequalwd, lessequalwd, bool, ddouble, double) 807 | ULOOP_BINARY(u_equaldw, equaldw, bool, double, ddouble) 808 | ULOOP_BINARY(u_notequaldw, notequaldw, bool, double, ddouble) 809 | ULOOP_BINARY(u_greaterdw, greaterdw, bool, double, ddouble) 810 | 
ULOOP_BINARY(u_lessdw, lessdw, bool, double, ddouble) 811 | ULOOP_BINARY(u_greaterequaldw, greaterequaldw, bool, double, ddouble) 812 | ULOOP_BINARY(u_lessequaldw, lessequaldw, bool, double, ddouble) 813 | ULOOP_BINARY(u_fminww, fminww, ddouble, ddouble, ddouble) 814 | ULOOP_BINARY(u_fmaxww, fmaxww, ddouble, ddouble, ddouble) 815 | ULOOP_BINARY(u_fminwd, fminwd, ddouble, ddouble, double) 816 | ULOOP_BINARY(u_fmaxwd, fmaxwd, ddouble, ddouble, double) 817 | ULOOP_BINARY(u_fmindw, fmindw, ddouble, double, ddouble) 818 | ULOOP_BINARY(u_fmaxdw, fmaxdw, ddouble, double, ddouble) 819 | ULOOP_BINARY(u_atan2wd, atan2wd, ddouble, ddouble, double) 820 | ULOOP_BINARY(u_atan2dw, atan2dw, ddouble, double, ddouble) 821 | ULOOP_BINARY(u_atan2ww, atan2ww, ddouble, ddouble, ddouble) 822 | ULOOP_BINARY(u_powwd, powwd, ddouble, ddouble, double) 823 | ULOOP_BINARY(u_powdw, powdw, ddouble, double, ddouble) 824 | ULOOP_BINARY(u_powww, powww, ddouble, ddouble, ddouble) 825 | ULOOP_BINARY(u_hypotww, hypotww, ddouble, ddouble, ddouble) 826 | ULOOP_BINARY(u_hypotdw, hypotdw, ddouble, double, ddouble) 827 | ULOOP_BINARY(u_hypotwd, hypotwd, ddouble, ddouble, double) 828 | ULOOP_BINARY(u_ldexpwi, ldexpwi, ddouble, ddouble, int) 829 | ULOOP_MODF(u_modfww, modfww, ddouble, ddouble, ddouble) 830 | ULOOP_UNARY(u_signbitw, signbitw, bool, ddouble) 831 | ULOOP_UNARY(u_signw, signw, ddouble, ddouble) 832 | ULOOP_UNARY(u_isfinitew, isfinitew, bool, ddouble) 833 | ULOOP_UNARY(u_isinfw, isinfw, bool, ddouble) 834 | ULOOP_UNARY(u_isnanw, isnanw, bool, ddouble) 835 | ULOOP_UNARY(u_negw, negw, ddouble, ddouble) 836 | ULOOP_UNARY(u_posw, posw, ddouble, ddouble) 837 | ULOOP_UNARY(u_absw, absw, ddouble, ddouble) 838 | ULOOP_UNARY(u_reciprocalw, reciprocalw, ddouble, ddouble) 839 | ULOOP_UNARY(u_sqrw, sqrw, ddouble, ddouble) 840 | ULOOP_UNARY(u_roundw, roundw, ddouble, ddouble) 841 | ULOOP_UNARY(u_floorw, floorw, ddouble, ddouble) 842 | ULOOP_UNARY(u_ceilw, ceilw, ddouble, ddouble) 843 | ULOOP_UNARY(u_sqrtw, 
sqrtw, ddouble, ddouble) 844 | ULOOP_UNARY(u_expw, expw, ddouble, ddouble) 845 | ULOOP_UNARY(u_expm1w, expm1w, ddouble, ddouble) 846 | ULOOP_UNARY(u_logw, logw, ddouble, ddouble) 847 | ULOOP_UNARY(u_sinw, sinw, ddouble, ddouble) 848 | ULOOP_UNARY(u_cosw, cosw, ddouble, ddouble) 849 | ULOOP_UNARY(u_tanw, tanw, ddouble, ddouble) 850 | ULOOP_UNARY(u_atanw, atanw, ddouble, ddouble) 851 | ULOOP_UNARY(u_acosw, acosw, ddouble, ddouble) 852 | ULOOP_UNARY(u_asinw, asinw, ddouble, ddouble) 853 | ULOOP_UNARY(u_atanhw, atanhw, ddouble, ddouble) 854 | ULOOP_UNARY(u_acoshw, acoshw, ddouble, ddouble) 855 | ULOOP_UNARY(u_asinhw, asinhw, ddouble, ddouble) 856 | ULOOP_UNARY(u_sinhw, sinhw, ddouble, ddouble) 857 | ULOOP_UNARY(u_coshw, coshw, ddouble, ddouble) 858 | ULOOP_UNARY(u_tanhw, tanhw, ddouble, ddouble) 859 | 860 | static bool register_binary(PyUFuncGenericFunction dq_func, 861 | PyUFuncGenericFunction qd_func, PyUFuncGenericFunction qq_func, 862 | int ret_dtype, const char *name) 863 | { 864 | PyUFuncObject *ufunc; 865 | int *arg_types = NULL, retcode = 0; 866 | 867 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 868 | if (ufunc == NULL) goto error; 869 | 870 | arg_types = PyMem_New(int, 3 * 3); 871 | if (arg_types == NULL) goto error; 872 | 873 | arg_types[0] = NPY_DOUBLE; 874 | arg_types[1] = type_num; 875 | arg_types[2] = ret_dtype; 876 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 877 | dq_func, arg_types, NULL); 878 | if (retcode < 0) goto error; 879 | 880 | arg_types[3] = type_num; 881 | arg_types[4] = NPY_DOUBLE; 882 | arg_types[5] = ret_dtype; 883 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 884 | qd_func, arg_types + 3, NULL); 885 | if (retcode < 0) goto error; 886 | 887 | arg_types[6] = type_num; 888 | arg_types[7] = type_num; 889 | arg_types[8] = ret_dtype; 890 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 891 | qq_func, arg_types + 6, NULL); 892 | if (retcode < 0) goto error; 893 | return true; 894 | 895 | 
error: 896 | return false; 897 | } 898 | 899 | static int register_unary(PyUFuncGenericFunction func, int ret_dtype, 900 | const char *name) 901 | { 902 | PyUFuncObject *ufunc; 903 | int *arg_types = NULL, retcode = 0; 904 | 905 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 906 | if (ufunc == NULL) goto error; 907 | 908 | arg_types = PyMem_New(int, 2); 909 | if (arg_types == NULL) goto error; 910 | 911 | arg_types[0] = type_num; 912 | arg_types[1] = ret_dtype; 913 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 914 | func, arg_types, NULL); 915 | if (retcode < 0) goto error; 916 | return true; 917 | 918 | error: 919 | return false; 920 | } 921 | 922 | static int register_ldexp(PyUFuncGenericFunction func, int ret_dtype, 923 | const char *name) 924 | { 925 | PyUFuncObject *ufunc; 926 | int *arg_types = NULL, retcode = 0; 927 | 928 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 929 | if (ufunc == NULL) goto error; 930 | 931 | arg_types = PyMem_New(int, 3); 932 | if (arg_types == NULL) goto error; 933 | 934 | arg_types[0] = type_num; 935 | arg_types[1] = NPY_INTP; 936 | arg_types[2] = ret_dtype; 937 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 938 | func, arg_types, NULL); 939 | if (retcode < 0) goto error; 940 | return true; 941 | 942 | error: 943 | return false; 944 | } 945 | 946 | static int register_modf(PyUFuncGenericFunction func, int ret_dtype, 947 | const char *name) 948 | { 949 | PyUFuncObject *ufunc; 950 | int *arg_types = NULL, retcode = 0; 951 | 952 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 953 | if (ufunc == NULL) goto error; 954 | 955 | arg_types = PyMem_New(int, 4); 956 | if (arg_types == NULL) goto error; 957 | 958 | arg_types[0] = type_num; 959 | arg_types[1] = type_num; 960 | arg_types[2] = ret_dtype; 961 | arg_types[3] = ret_dtype; 962 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 963 | func, arg_types, NULL); 964 | if (retcode < 0) goto 
error; 965 | return true; 966 | 967 | error: 968 | return false; 969 | } 970 | 971 | static int register_ufuncs() 972 | { 973 | bool ok = register_unary(u_negw, type_num, "negative") 974 | && register_unary(u_posw, type_num, "positive") 975 | && register_unary(u_absw, type_num, "absolute") 976 | && register_unary(u_reciprocalw, type_num, "reciprocal") 977 | && register_unary(u_sqrw, type_num, "square") 978 | && register_unary(u_sqrtw, type_num, "sqrt") 979 | && register_unary(u_signbitw, NPY_BOOL, "signbit") 980 | && register_unary(u_isfinitew, NPY_BOOL, "isfinite") 981 | && register_unary(u_isinfw, NPY_BOOL, "isinf") 982 | && register_unary(u_isnanw, NPY_BOOL, "isnan") 983 | && register_unary(u_roundw, type_num, "rint") 984 | && register_unary(u_floorw, type_num, "floor") 985 | && register_unary(u_ceilw, type_num, "ceil") 986 | && register_unary(u_expw, type_num, "exp") 987 | && register_unary(u_expm1w, type_num, "expm1") 988 | && register_unary(u_logw, type_num, "log") 989 | && register_unary(u_sinw, type_num, "sin") 990 | && register_unary(u_cosw, type_num, "cos") 991 | && register_unary(u_tanw, type_num, "tan") 992 | && register_unary(u_atanw, type_num, "arctan") 993 | && register_unary(u_acosw, type_num, "arccos") 994 | && register_unary(u_asinw, type_num, "arcsin") 995 | && register_unary(u_atanhw, type_num, "arctanh") 996 | && register_unary(u_acoshw, type_num, "arccosh") 997 | && register_unary(u_asinhw, type_num, "arcsinh") 998 | && register_unary(u_sinhw, type_num, "sinh") 999 | && register_unary(u_coshw, type_num, "cosh") 1000 | && register_unary(u_tanhw, type_num, "tanh") 1001 | && register_unary(u_signw, type_num, "sign") 1002 | && register_ldexp(u_ldexpwi, type_num, "ldexp") 1003 | && register_modf(u_modfww, type_num, "modf") 1004 | && register_binary(u_adddw, u_addwd, u_addww, type_num, "add") 1005 | && register_binary(u_subdw, u_subwd, u_subww, type_num, "subtract") 1006 | && register_binary(u_muldw, u_mulwd, u_mulww, type_num, "multiply") 1007 | && 
register_binary(u_divdw, u_divwd, u_divww, type_num, "true_divide") 1008 | && register_binary(u_powdw, u_powwd, u_powww, type_num, "power") 1009 | && register_binary(u_equaldw, u_equalwd, u_equalww, NPY_BOOL, "equal") 1010 | && register_binary(u_notequaldw, u_notequalwd, u_notequalww, NPY_BOOL, 1011 | "not_equal") 1012 | && register_binary(u_greaterdw, u_greaterwd, u_greaterww, NPY_BOOL, "greater") 1013 | && register_binary(u_lessdw, u_lesswd, u_lessww, NPY_BOOL, "less") 1014 | && register_binary(u_greaterequaldw, u_greaterequalwd, u_greaterequalww, 1015 | NPY_BOOL, "greater_equal") 1016 | && register_binary(u_lessequaldw, u_lessequalwd, u_lessequalww, NPY_BOOL, 1017 | "less_equal") 1018 | && register_binary(u_fmindw, u_fminwd, u_fminww, type_num, "fmin") 1019 | && register_binary(u_fmaxdw, u_fmaxwd, u_fmaxww, type_num, "fmax") 1020 | && register_binary(u_fmindw, u_fminwd, u_fminww, type_num, "minimum") 1021 | && register_binary(u_fmaxdw, u_fmaxwd, u_fmaxww, type_num, "maximum") 1022 | && register_binary(u_atan2dw, u_atan2wd, u_atan2ww, type_num, "arctan2") 1023 | && register_binary(u_copysigndw, u_copysignwd, u_copysignww, type_num, 1024 | "copysign") 1025 | && register_binary(u_hypotdw, u_hypotwd, u_hypotww, type_num, "hypot"); 1026 | return ok ? 
0 : -1; 1027 | } 1028 | 1029 | static int register_dtype_in_dicts() 1030 | { 1031 | PyObject *type_dict = NULL; 1032 | 1033 | type_dict = PyObject_GetAttrString(numpy_module, "sctypeDict"); 1034 | if (type_dict == NULL) goto error; 1035 | 1036 | if (PyDict_SetItemString(type_dict, "ddouble", 1037 | (PyObject *)pyddouble_type) < 0) 1038 | goto error; 1039 | return 0; 1040 | 1041 | error: 1042 | Py_XDECREF(type_dict); 1043 | return -1; 1044 | } 1045 | 1046 | /* ----------------------- Python stuff -------------------------- */ 1047 | 1048 | static PyObject *make_module() 1049 | { 1050 | // Defitions 1051 | static PyMethodDef no_methods[] = { 1052 | {NULL, NULL, 0, NULL} // No methods defined 1053 | }; 1054 | static struct PyModuleDef module_def = { 1055 | PyModuleDef_HEAD_INIT, 1056 | "_dd_ufunc", 1057 | NULL, 1058 | -1, 1059 | no_methods, 1060 | NULL, 1061 | NULL, 1062 | NULL, 1063 | NULL 1064 | }; 1065 | 1066 | /* Module definition */ 1067 | module = PyModule_Create(&module_def); 1068 | return module; 1069 | } 1070 | 1071 | static bool constant(ddouble value, const char *name) 1072 | { 1073 | // Note that data must be allocated using malloc, not python allocators! 
1074 | ddouble *data = malloc(sizeof value); 1075 | *data = value; 1076 | 1077 | PyArrayObject *array = (PyArrayObject *) 1078 | PyArray_SimpleNewFromData(0, NULL, type_num, data); 1079 | if (array == NULL) return false; 1080 | 1081 | PyArray_ENABLEFLAGS(array, NPY_ARRAY_OWNDATA); 1082 | PyArray_CLEARFLAGS(array, NPY_ARRAY_WRITEABLE); 1083 | 1084 | PyModule_AddObject(module, name, (PyObject *)array); 1085 | return true; 1086 | } 1087 | 1088 | static int register_constants() 1089 | { 1090 | bool ok = constant(Q_MAX, "MAX") 1091 | && constant(Q_MIN, "MIN") 1092 | && constant(Q_EPS, "EPS") 1093 | && constant(Q_2PI, "TWOPI") 1094 | && constant(Q_PI, "PI") 1095 | && constant(Q_PI_2, "PI_2") 1096 | && constant(Q_PI_4, "PI_4") 1097 | && constant(Q_E, "E") 1098 | && constant(Q_LOG2, "LOG2") 1099 | && constant(Q_LOG10, "LOG10") 1100 | && constant(nanw(), "NAN") 1101 | && constant(infw(), "INF"); 1102 | return ok ? 0 : -1; 1103 | } 1104 | 1105 | PyMODINIT_FUNC PyInit__dd_ufunc(void) 1106 | { 1107 | /* Initialize module */ 1108 | if (!make_module()) 1109 | return NULL; 1110 | 1111 | /* Initialize numpy things */ 1112 | import_array(); 1113 | import_umath(); 1114 | 1115 | if (make_ddouble_type() < 0) 1116 | return NULL; 1117 | if (make_dtype() < 0) 1118 | return NULL; 1119 | if (make_finfo() < 0) 1120 | return NULL; 1121 | 1122 | numpy_module = PyImport_ImportModule("numpy"); 1123 | if (numpy_module == NULL) 1124 | return NULL; 1125 | 1126 | PyArray_Descr *dtype = PyArray_DescrFromType(type_num); 1127 | PyModule_AddObject(module, "dtype", (PyObject *)dtype); 1128 | 1129 | /* Casts need to be defined before ufuncs, because numpy >= 1.21 caches 1130 | * casts/ufuncs in a way that is non-trivial... one should consider casts 1131 | * to be "more basic". 
1132 | * See: https://github.com/numpy/numpy/issues/20009 1133 | */ 1134 | if (register_casts() < 0) 1135 | return NULL; 1136 | if (register_ufuncs() < 0) 1137 | return NULL; 1138 | if (register_dtype_in_dicts() < 0) 1139 | return NULL; 1140 | if (register_constants() < 0) 1141 | return NULL; 1142 | 1143 | /* Module is ready */ 1144 | return module; 1145 | } 1146 | --------------------------------------------------------------------------------