├── .gitignore ├── .conda ├── conda_build_config.yaml └── meta.yaml ├── pysrc └── xprec │ ├── __init__.py │ └── linalg.py ├── .dev └── container │ ├── Dockerfile │ └── devcontainer.json ├── .github └── workflows │ ├── conda.yml │ ├── wheels.yml │ └── pytest.yml ├── LICENSE.txt ├── test ├── test_dtype.py ├── test_whitespace.py ├── test_linalg.py ├── test_ufunc.py └── test_mpmath.py ├── README.md ├── csrc ├── dd_linalg.h ├── dd_linalg.c ├── dd_arith.h ├── _dd_linalg.c ├── dd_arith.c └── _dd_ufunc.c ├── QD-LICENSE.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | *~ 3 | \#*\# 4 | 5 | *.pyc 6 | __pycache__/ 7 | build/ 8 | dist/ 9 | *.o 10 | *.so 11 | *.egg-info/ 12 | 13 | notebooks/*.ipynb 14 | 15 | !.gitignore 16 | !/.github/ 17 | !/.editorconfig 18 | -------------------------------------------------------------------------------- /.conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.13 3 | - 3.12 4 | - 3.11 5 | 6 | numpy: 7 | # 1.18 does not build with Python 3.9 8 | #- 1.19 9 | #- 1.20 10 | - 2.1 11 | - 2.2 12 | 13 | pin_run_as_build: 14 | numpy: x.x 15 | -------------------------------------------------------------------------------- /.conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %} 2 | {% set name = "xprec" %} 3 | {% set version = data.get("version") %} 4 | 5 | package: 6 | name: "{{ name|lower }}" 7 | version: "{{ version }}" 8 | 9 | source: 10 | path: ../ 11 | 12 | build: 13 | number: 0 14 | script: "{{ PYTHON }} -m pip install . 
-vv" 15 | 16 | requirements: 17 | buid: 18 | - python {{ python }} 19 | - numpy {{ numpy }} 20 | host: 21 | - python {{ python }} 22 | - numpy {{ numpy }} 23 | run: 24 | - python {{ python }} 25 | - numpy {{ numpy }} 26 | 27 | about: 28 | home: "https://github.com/tuwien-cms/xprec" 29 | license: MIT 30 | summary: "xprec precision numpy extension" 31 | 32 | extra: 33 | recipe-maintainers: 34 | - shinaoka 35 | -------------------------------------------------------------------------------- /pysrc/xprec/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | """ 4 | Extension module for numpy providing the `ddouble` data type. 5 | 6 | Loading this module registers an additional scalar data type `ddouble` with 7 | numpy implementing double-double arithmetic. You can use use the data type 8 | by passing `dtype=xprec.ddouble` to numpy functions. 9 | 10 | Example: 11 | 12 | import numpy as np 13 | from xprec import ddouble 14 | 15 | x = np.arange(5, dtype=ddouble) 16 | print(2 * x) 17 | 18 | """ 19 | __version__ = "1.4.7" 20 | 21 | import numpy as _np 22 | 23 | from . import _dd_ufunc 24 | from . 
import _dd_linalg # needed for matmul 25 | 26 | ddouble = _dd_ufunc.dtype 27 | 28 | 29 | def finfo(dtype): 30 | dtype = _np.dtype(dtype) 31 | try: 32 | finfo_dunder = dtype.type.__finfo__ 33 | except AttributeError: 34 | return _np.finfo(dtype) 35 | else: 36 | return finfo_dunder() 37 | -------------------------------------------------------------------------------- /.dev/container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3 2 | #FROM continuumio/anaconda3:2020.02 3 | 4 | ENV PYTHONUNBUFFERED=1 5 | 6 | RUN apt-get update && \ 7 | DEBIAN_FRONTEND=noninteractive apt-get install -y \ 8 | build-essential \ 9 | curl \ 10 | ca-certificates \ 11 | git \ 12 | zip \ 13 | vim \ 14 | cmake pkg-config gfortran \ 15 | sudo \ 16 | && \ 17 | apt-get clean && rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/* # clean up 18 | 19 | #RUN mkdir /opt/conda/pkgs 20 | #RUN chown 1000:1000 /opt/conda 21 | 22 | # Create non-root user 23 | ARG NB_USER=vscode 24 | ARG NB_UID=1000 25 | RUN useradd -u $NB_UID -m $NB_USER -s /bin/bash && \ 26 | echo 'vscode ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 27 | USER $NB_USER 28 | ENV PATH "/home/${NB_USER}/.local/bin:${PATH}" 29 | ENV PYTHONPATH "/home/${NB_USER}/work/src:${PYTHONPATH}" 30 | 31 | # for vscode 32 | RUN mkdir /home/${NB_USER}/work 33 | 34 | RUN conda config --add pkgs_dirs /home/vscode/.conda/pkgs -------------------------------------------------------------------------------- /.github/workflows/conda.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload conda packages 2 | 3 | # Triggered when a new tag starting with "v" is pushed 4 | on: 5 | push: 6 | tags: 7 | - 'v*' 8 | 9 | jobs: 10 | build: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | # https://github.com/s-weigand/setup-conda/issues/432 15 | os: [ubuntu-latest, windows-2019, macos-latest] 16 | 17 | steps: 18 | - uses: 
actions/checkout@v4 19 | - uses: conda-incubator/setup-miniconda@v3 20 | with: 21 | auto-update-conda: true 22 | - name: Conda info 23 | shell: bash -el {0} 24 | run: conda info 25 | - name: Install dependencies 26 | run: | 27 | conda install conda-build anaconda-client -y 28 | 29 | - name: Build and upload 30 | env: 31 | ANACONDA_API_TOKEN: ${{secrets.ANACONDA_TOKEN}} 32 | run: | 33 | python3 --version 34 | conda config --set anaconda_upload yes 35 | conda build .conda --user SpM-lab 36 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | # Triggered when a new tag starting with "v" is pushed 4 | on: 5 | push: 6 | tags: 7 | - 'v*' 8 | 9 | jobs: 10 | build_sdist: 11 | name: Build distribution 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Examine system 17 | run: pip freeze --all 18 | 19 | - name: Build sdist 20 | run: python setup.py sdist 21 | 22 | - uses: actions/upload-artifact@v4 23 | with: 24 | name: dist 25 | path: dist/xprec-*.tar.gz 26 | 27 | upload_pypi: 28 | name: Upload to PyPI 29 | needs: [build_sdist] 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/download-artifact@v4 33 | with: 34 | name: dist 35 | path: dist 36 | 37 | - uses: pypa/gh-action-pypi-publish@v1.4.2 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | skip_existing: true 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Markus Wallerberger and others 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without 
limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /test/test_dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | import pytest 5 | 6 | from xprec import ddouble 7 | 8 | 9 | COMPATIBLE_DTYPES = [ 10 | np.int8, np.int16, np.int32, np.int64, np.bool_, np.float32, np.float64, 11 | np.uint8, np.uint16, np.uint32, np.uint64, 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize('other', COMPATIBLE_DTYPES) 16 | def test_cast_from(other): 17 | assert np.can_cast(other, ddouble, 'unsafe') 18 | assert np.can_cast(other, ddouble, 'safe') 19 | 20 | x = np.eye(3, dtype=other) 21 | y = x.astype(ddouble) 22 | assert (x == y).all() 23 | 24 | 25 | @pytest.mark.parametrize('other', COMPATIBLE_DTYPES) 26 | def test_cast_to(other): 27 | assert np.can_cast(ddouble, other, 'unsafe') 28 | assert not np.can_cast(ddouble, other, 'safe') 29 | 30 | x = np.eye(3, dtype=ddouble) 31 | y = x.astype(other) 32 | assert (x == y).all() 33 | 34 | 35 | def 
test_i64(): 36 | x = np.int64((1 << 62) + 1) 37 | assert x == x.astype(ddouble).astype(x.dtype) 38 | 39 | y = -x 40 | assert (y + 1) == (y.astype(ddouble) + 1).astype(x.dtype) 41 | 42 | x = x.astype(np.uint64) 43 | assert x == x.astype(ddouble).astype(x.dtype) 44 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: xprec python package 2 | 3 | on: 4 | push: 5 | branches: 6 | mainline 7 | pull_request: 8 | branches: 9 | mainline 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | include: 17 | - os: ubuntu-latest 18 | numpy-version: auto 19 | python-version: 3.9 20 | - os: ubuntu-latest 21 | numpy-version: 2.0 22 | python-version: 3.11 23 | - os: windows-latest 24 | numpy-version: auto 25 | python-version: 3.9 26 | - os: macos-latest 27 | numpy-version: auto 28 | python-version: 3.11 29 | steps: 30 | - uses: actions/checkout@v4 31 | 32 | - name: Set up python ${{ matrix.python-version }} 33 | uses: actions/setup-python@v5 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | 37 | - name: Install numpy ${{ matrix.numpy-version }} 38 | if: ${{ matrix.numpy-version != 'auto' }} 39 | run: | 40 | pip install numpy==${{ matrix.numpy-version }} 41 | 42 | - name: Install package with testing dependencies 43 | run: | 44 | pip install -v .[test] 45 | 46 | - name: Test with pytest 47 | run: | 48 | pytest 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Library for double-double arithmetic calculation 2 | ================================================ 3 | 4 | Extension module for numpy providing the `ddouble` data type. 
5 | 6 | Loading this module registers an additional scalar data type `ddouble` with 7 | numpy implementing double-double arithmetic. You can use the data type 8 | by passing `dtype=xprec.ddouble` to numpy functions. 9 | 10 | The `xprec.linalg` module provides some linear algebra subroutines, in 11 | particular QR, RRQR, SVD and truncated SVD. 12 | 13 | Installation 14 | ------------ 15 | 16 | $ pip install xprec 17 | 18 | Quickstart 19 | ---------- 20 | 21 | import numpy as np 22 | x = np.linspace(0, np.pi) 23 | 24 | # import double-double precision data type 25 | from xprec import ddouble 26 | x = x.astype(ddouble) 27 | y = x * x + 1 28 | z = np.sin(x) 29 | 30 | # do some linalg 31 | import xprec.linalg 32 | A = np.vander(np.linspace(-1, 1, 80, dtype=ddouble), 150) 33 | U, s, VT = xprec.linalg.svd(A) 34 | 35 | Troubleshooting 36 | --- 37 | 38 | * icc
39 | You may suffer from a long runtime when xprec is built with icc. If you encounter this problem, please try the following: 40 | 41 | ``` 42 | CFLAGS="-fp-model=precise" pip install xprec 43 | ``` 44 | 45 | Licence 46 | ------- 47 | The xprec library is 48 | Copyright (C) 2021 Markus Wallerberger. 49 | Licensed under the MIT license (see LICENSE.txt). 50 | 51 | Contains code from the QD library, which is 52 | Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey. 53 | Released under a modified BSD license (see QD-LICENSE.txt). 54 | -------------------------------------------------------------------------------- /.dev/container/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or the definition README at 2 | // https://github.com/microsoft/vscode-dev-containers/tree/master/containers/docker-existing-dockerfile 3 | { 4 | "name": "Existing Dockerfile", 5 | // Sets the run context to one level up instead of the .devcontainer folder. 6 | "context": "..", 7 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 8 | "dockerFile": "./Dockerfile", 9 | // The optional 'runArgs' property can be used to specify additional runtime arguments. 10 | "runArgs": [], 11 | // Use 'settings' to set *default* container specific settings.json values on container create. 12 | // You can edit these settings after create using File > Preferences > Settings > Remote. 13 | // Uncomment the next line if you want to publish any ports. 14 | // "appPort": [], 15 | // Uncomment the next line to run commands after the container is created - for example installing git. 16 | // "postCreateCommand": "apt-get update && apt-get install -y git", 17 | // Add the IDs of extensions you want installed when the container is created in the array below. 
18 | "extensions": [ 19 | "ms-azuretools.vscode-docker", 20 | "mutantdino.resourcemonitor", 21 | "shardulm94.trailing-spaces", 22 | "cliffordfajardo.hightlight-selections-vscode", 23 | "wdawson.better-kill-ring", 24 | "oderwat.indent-rainbow", 25 | "github.vscode-pull-request-github", 26 | "mhutchie.git-graph", 27 | "donjayamanne.githistory", 28 | "eamodio.gitlens", 29 | "bungcip.better-toml", 30 | "usernamehw.errorlens", 31 | "ms-vscode.live-server", 32 | "christian-kohler.path-intellisense", 33 | "ms-python.python", 34 | ], 35 | "remoteUser": "vscode", 36 | "workspaceFolder": "/home/vscode/work", 37 | "workspaceMount": "src=${localWorkspaceFolder},dst=/home/vscode/work,type=bind", 38 | } 39 | -------------------------------------------------------------------------------- /test/test_whitespace.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | HEREPATH = os.path.abspath(os.path.dirname(__file__)) 4 | print("HEREPATH", HEREPATH) 5 | ROOTDIR = os.path.abspath(os.path.join(HEREPATH, os.path.pardir)) 6 | PYSRCDIR = os.path.join(ROOTDIR, "pysrc", "xprec") 7 | CSCRDIR = os.path.join(ROOTDIR, "csrc") 8 | 9 | def check_whitespace(files): 10 | errors = [] 11 | blank = 0 12 | lineno = 0 13 | line = "" 14 | def add_error(fmt, *params): 15 | errors.append((fname, lineno, line, fmt.format(*params))) 16 | 17 | for fname in files: 18 | with open(fname, "r") as file: 19 | line = "" 20 | for lineno, line in enumerate(file, start=1): 21 | if line[-1:] != '\n': 22 | add_error("file must end in blank line") 23 | line = line[:-1] 24 | if line: 25 | blank = 0 26 | else: 27 | blank += 1 28 | if line[-1:] == '\r': 29 | add_error("file must only have unix line endings") 30 | if line[-1:] == ' ': 31 | add_error("line ends in whitespace") 32 | if '\t' in line: 33 | add_error("line contains tab characters") 34 | if len(line) > 90: 35 | add_error("line is too long: {:d} chars", len(line)) 36 | # end of file 37 | if blank != 0: 38 | 
add_error("file has {:d} superflouos blank lines", blank) 39 | 40 | msg = "" 41 | for fname, lineno, line, lmsg in errors: 42 | msg += "{}:{}: {}\n".format(fname.name, lineno, lmsg) 43 | if msg: 44 | raise ValueError("Whitespace errors\n" + msg) 45 | 46 | 47 | def all_files(path, ext): 48 | for entry in os.scandir(path): 49 | if entry.is_file() and entry.name.endswith(ext): 50 | yield entry 51 | 52 | 53 | def test_ws_testdir(): 54 | check_whitespace(all_files(HEREPATH, ".py")) 55 | 56 | 57 | def test_ws_setup(): 58 | check_whitespace(all_files(ROOTDIR, ".py")) 59 | 60 | 61 | def test_ws_pysrcdir(): 62 | check_whitespace(all_files(PYSRCDIR, ".py")) 63 | 64 | 65 | def test_ws_csrcdir(): 66 | check_whitespace(all_files(CSCRDIR, ".c")) 67 | check_whitespace(all_files(CSCRDIR, ".h")) 68 | -------------------------------------------------------------------------------- /csrc/dd_linalg.h: -------------------------------------------------------------------------------- 1 | /* Double-double linear algebra library 2 | * 3 | * Implementations were partly inspired by LAPACK, partly from Fredrik 4 | * Johansson's excellent MPMATH library. 
5 | * 6 | * Copyright (C) 2021 Markus Wallerberger and others 7 | * SPDX-License-Identifier: MIT 8 | */ 9 | #pragma once 10 | #include "dd_arith.h" 11 | 12 | /** 13 | * Apply Givens rotation to vector: 14 | * 15 | * [ a ] = [ c s ] [ x ] 16 | * [ b ] [ -s c ] [ y ] 17 | */ 18 | static inline void lmul_givensq( 19 | ddouble *a, ddouble *b, ddouble c, ddouble s, ddouble x, ddouble y) 20 | { 21 | *a = addww(mulww(c, x), mulww(s, y)); 22 | *b = subww(mulww(c, y), mulww(s, x)); 23 | } 24 | 25 | /** Compute 2-norm of a vector */ 26 | ddouble normw(const ddouble *x, long nn, long sxn); 27 | 28 | /** 29 | * Perform a rank-one update of a `ii` times `jj` matrix: 30 | * 31 | * A[i, j] += v[i] * w[j] 32 | */ 33 | void rank1updateq(ddouble *a, long ais, long ajs, const ddouble *v, long vs, 34 | const ddouble *w, long ws, long ii, long jj); 35 | 36 | /** 37 | * Compute Givens rotation `R` matrix that satisfies: 38 | * 39 | * [ c s ] [ f ] [ r ] 40 | * [ -s c ] [ g ] = [ 0 ] 41 | */ 42 | void givensw(ddouble f, ddouble g, ddouble *c, ddouble *s, ddouble *r); 43 | 44 | /** 45 | * Compute Householder reflector `H[tau, v]`, defined as: 46 | * 47 | * H[tau, v] = I - tau * v @ v.T 48 | * 49 | * that, when applied to a given `x`, zeros out all but the first component. 50 | * The scaling factor `tau` is returned, while `v` is written. 
51 | */ 52 | ddouble householderw(const ddouble *x, ddouble *v, long nn, long sx, long sv); 53 | 54 | /** 55 | * Perform the SVD of an arbitrary two-by-two matrix: 56 | * 57 | * [ a11 a12 ] = [ cu -su ] [ smax 0 ] [ cv sv ] 58 | * [ a21 a22 ] [ su cu ] [ 0 smin ] [ -sv cv ] 59 | */ 60 | void svd_2x2(ddouble a11, ddouble a12, ddouble a21, ddouble a22, ddouble *smin, 61 | ddouble *smax, ddouble *cv, ddouble *sv, ddouble *cu, ddouble *su); 62 | 63 | 64 | 65 | ddouble jacobi_sweep(ddouble *u, long sui, long suj, ddouble *vt, long svi, 66 | long svj, long ii, long jj); 67 | 68 | 69 | void golub_kahan_chaseq(ddouble *d, long sd, ddouble *e, long se, long ii, 70 | ddouble *rot); 71 | -------------------------------------------------------------------------------- /QD-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Contains code from the QD library, which is: 2 | Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey. 3 | 4 | This License Agreement is entered into by The Regents of the University of 5 | California, Department of Energy contract-operators of the Lawrence Berkeley 6 | National Laboratory, 1 Cyclotron Road, Berkeley, CA 94720 (“Berkeley Lab”), 7 | and the entity listed below (“you” or "Licensee"). 8 | 9 | 1. Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | (1) Redistributions of source code must retain the copyright notice, this 13 | list of conditions and the following disclaimer. 14 | 15 | (2) Redistributions in binary form must reproduce the copyright notice, 16 | this list of conditions and the following disclaimer in the 17 | documentation and/or other materials provided with the distribution. 18 | 19 | (3) Neither the name of the University of California, Lawrence Berkeley 20 | National Laboratory, U.S. Dept. 
of Energy nor the names of its 21 | contributors may be used to endorse or promote products derived from 22 | this software without specific prior written permission. 23 | 24 | 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 25 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 29 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 30 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 31 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 32 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 33 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 34 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | 36 | 3. You are under no obligation whatsoever to provide any bug fixes, patches, 37 | or upgrades to the features, functionality or performance of the source 38 | code ("Enhancements") to anyone; however, if you choose to make your 39 | Enhancements available either publicly, or directly to Lawrence Berkeley 40 | National Laboratory, without imposing a separate written license agreement 41 | for such Enhancements, then you hereby grant the following license: a 42 | non-exclusive, royalty-free perpetual license to install, use, modify, 43 | prepare derivative works, incorporate into other computer software, 44 | distribute, and sublicense such enhancements or derivative works thereof, 45 | in binary and source code form. 
46 | -------------------------------------------------------------------------------- /test/test_linalg.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | 5 | import xprec 6 | import xprec.linalg 7 | from xprec import ddouble 8 | 9 | 10 | def test_householder_vec(): 11 | rng = np.random.RandomState(4711) 12 | xd = rng.random_sample(20) 13 | xq = np.array(xd, dtype=ddouble) 14 | 15 | betaq, vq = xprec.linalg.householder(xq) 16 | eq = xq - betaq * vq * (vq @ xq) 17 | np.testing.assert_allclose(eq[1:].astype(float), 0, atol=1e-31) 18 | 19 | 20 | def test_bidiag(): 21 | rng = np.random.RandomState(4711) 22 | m, n = 7, 5 23 | A = rng.normal(size=(m,n)).astype(ddouble) 24 | Q, B, RT = xprec.linalg.bidiag(A) 25 | diff = Q @ B @ RT - A 26 | 27 | # FIXME: too large precision goals 28 | np.testing.assert_allclose(diff.astype(float), 0, atol=1e-29) 29 | 30 | 31 | def test_svd(): 32 | rng = np.random.RandomState(4711) 33 | A = rng.randn(100, 84) 34 | 35 | U, s, VT = xprec.linalg.svd(A.astype(xprec.ddouble), full_matrices=False) 36 | R = U * s @ VT - A 37 | np.testing.assert_allclose(R.astype(float), 0, atol=5e-29, rtol=0) 38 | 39 | _, sx, _ = np.linalg.svd(A.astype(float), full_matrices=False) 40 | np.testing.assert_allclose(s, sx, atol=1e-14 * sx[0], rtol=0) 41 | 42 | 43 | def test_givens(): 44 | f, g = np.array([3.0, -2.0], dtype=ddouble) 45 | c, s, r = xprec.linalg.givens_rotation(f, g) 46 | 47 | R = np.reshape([c, s, -s, c], (2,2)) 48 | v = np.hstack([f, g]) 49 | w = np.hstack([r, np.zeros_like(r)]) 50 | res = R @ v - w 51 | np.testing.assert_allclose(res.astype(float), 0, atol=1e-31) 52 | 53 | 54 | def test_givens(): 55 | a = np.array([3.0, -2.0], dtype=ddouble) 56 | r, G = xprec.linalg.givens(a) 57 | diff = r - G @ a 58 | np.testing.assert_allclose(diff.astype(float), 0, atol=1e-31) 59 | 60 | 61 | def test_qr(): 62 | A = 
np.vander(np.linspace(-1, 1, 60), 80).astype(ddouble) 63 | Q, R = xprec.linalg.qr(A) 64 | I_m = np.eye(60) 65 | D = Q @ Q.T - I_m 66 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 67 | D = Q @ R - A 68 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 69 | 70 | 71 | def test_qr_pivot(): 72 | A = np.vander(np.linspace(-1, 1, 60), 80).astype(ddouble) 73 | Q, R, piv = xprec.linalg.rrqr(A) 74 | I_m = np.eye(60) 75 | D = Q @ Q.T - I_m 76 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 77 | 78 | D = Q @ R - A[:,piv] 79 | np.testing.assert_allclose(D.astype(float), 0, atol=4e-30) 80 | 81 | Rdiag = np.abs(R.diagonal()) 82 | assert (Rdiag[1:] <= Rdiag[:-1]).all() 83 | 84 | 85 | def test_jacobi(): 86 | A = np.vander(np.linspace(-1, 1, 60), 80).astype(ddouble) 87 | U, s, VT = xprec.linalg.svd_trunc(A) 88 | np.testing.assert_allclose((U * s) @ VT - A, 0.0, atol=5e-30) 89 | -------------------------------------------------------------------------------- /test/test_ufunc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | import xprec 5 | 6 | 7 | def _compare_ufunc(ufunc, *args, ulps=1): 8 | fx_d = ufunc(*args) 9 | fx_q = ufunc(*(a.astype(xprec.ddouble) for a in args)).astype(float) 10 | 11 | # Ensure relative accuracy of 2 ulps 12 | np.testing.assert_array_almost_equal_nulp(fx_d, fx_q, ulps) 13 | 14 | 15 | def test_log(): 16 | x = np.geomspace(1e-300, 1e300, 1953) 17 | _compare_ufunc(np.log, x) 18 | 19 | zeroq = xprec.ddouble.type(0) 20 | assert np.isinf(np.log(zeroq)) 21 | 22 | 23 | def test_sqrt(): 24 | x = np.geomspace(1e-300, 1e300, 1953) 25 | _compare_ufunc(np.sqrt, x) 26 | 27 | 28 | def test_exp(): 29 | x = np.geomspace(1e-300, 700, 4953) 30 | x = np.hstack([-x[::-1], 0, x]) 31 | _compare_ufunc(np.exp, x) 32 | 33 | # Unfortunately, on Windows expm1 is less precise, so we need to 
increase 34 | # the tolerance slightly 35 | _compare_ufunc(np.expm1, x, ulps=2) 36 | 37 | 38 | def test_cosh(): 39 | x = np.geomspace(1e-300, 700, 4953) 40 | x = np.hstack([-x[::-1], 0, x]) 41 | _compare_ufunc(np.cosh, x) 42 | _compare_ufunc(np.sinh, x) 43 | 44 | thousand = xprec.ddouble.type(1000) 45 | assert np.isinf(np.cosh(thousand)) 46 | assert np.isinf(np.cosh(-thousand)) 47 | 48 | 49 | def test_hypot(): 50 | x = np.geomspace(1e-300, 1e260, 47) 51 | x = np.hstack([-x[::-1], 0, x]) 52 | _compare_ufunc(np.hypot, x[:,None], x[None,:]) 53 | 54 | 55 | def test_modf(): 56 | ulps = 1 57 | x = np.linspace(-100, 100, 100) 58 | x_d = x.astype(xprec.ddouble) 59 | 60 | fx_d = np.modf(x) 61 | fx_q = np.modf(x_d) 62 | 63 | # Ensure relative accuracy of 1 ulp 64 | np.testing.assert_array_almost_equal_nulp(fx_d[0], fx_q[0].astype(float), ulps) 65 | np.testing.assert_array_almost_equal_nulp(fx_d[1], fx_q[1].astype(float), ulps) 66 | 67 | 68 | def test_power(): 69 | x = np.linspace(0, 100, 100) 70 | _compare_ufunc(np.power, x[:,None], x[None,:]) 71 | 72 | 73 | def test_arctan2(): 74 | x = np.linspace(-100, 100, 100) 75 | _compare_ufunc(np.arctan2, x[:,None], x[None,:], ulps=2) 76 | 77 | 78 | def test_arcsin(): 79 | x = np.linspace(-1, 1, 100) 80 | _compare_ufunc(np.arcsin, x, ulps=2) 81 | 82 | 83 | def test_arccos(): 84 | x = np.linspace(-1, 1, 100) 85 | _compare_ufunc(np.arccos, x) 86 | 87 | 88 | def test_arctan(): 89 | x = np.linspace(-100, 100, 100) 90 | _compare_ufunc(np.arctan, x) 91 | 92 | 93 | def test_arccosh(): 94 | x = np.linspace(1, 100, 100) 95 | _compare_ufunc(np.arccosh, x) 96 | 97 | 98 | def test_arcsinh(): 99 | x = np.linspace(-100, 100, 100) 100 | _compare_ufunc(np.arcsinh, x) 101 | 102 | 103 | def test_arctanh(): 104 | x = np.linspace(-0.99, 0.99, 100) 105 | _compare_ufunc(np.arctanh, x) 106 | -------------------------------------------------------------------------------- /test/test_mpmath.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Markus Wallerberger and others 2 | # SPDX-License-Identifier: MIT 3 | import numpy as np 4 | import pytest 5 | 6 | import xprec 7 | 8 | EPS = xprec.finfo(xprec.ddouble).eps 9 | 10 | try: 11 | import mpmath 12 | except ImportError: 13 | pytest.skip("No mpmath library avaiable", allow_module_level=True) 14 | else: 15 | mpmath.mp.prec = 120 16 | 17 | 18 | def mpf_for_xprec(x): 19 | """Converts xprec.ddouble array to array of mpmath mpf scalars""" 20 | x = np.asarray(x) 21 | if x.dtype != xprec.ddouble: 22 | raise ValueError("dtype shall be ddouble") 23 | 24 | x_flat = x.ravel() 25 | x_hi = x_flat.astype(float) 26 | x_lo = (x_flat - x_hi).astype(float) 27 | x_mpf = np.array(list(map(mpmath.mpf, x_hi))) 28 | x_mpf += x_lo 29 | return x_mpf.reshape(x.shape) 30 | 31 | 32 | def map_mpmath(fn, x): 33 | x = np.asarray(x) 34 | x_flat = x.ravel() 35 | y_flat = np.array(list(map(fn, x_flat)), dtype=object) 36 | y = y_flat.reshape(x.shape) 37 | return y 38 | 39 | 40 | def check_unary(mpmath_fn, numpy_fn, x, rtol): 41 | y_ref = map_mpmath(mpmath_fn, x) 42 | y_our = numpy_fn(x.astype(xprec.ddouble)) 43 | y_float = y_ref.astype(float) 44 | 45 | diff = (y_ref - mpf_for_xprec(y_our)).astype(float) 46 | ok = np.abs(diff) <= rtol * np.abs(y_float) 47 | if not ok.all(): 48 | x = x[~ok] 49 | y_float = y_float[~ok] 50 | y_our = y_our[~ok] 51 | diff = diff[~ok] 52 | reldiff = diff / np.abs(y_float) 53 | 54 | msg = f"{'x':>13s} {'mpmath':>13s} {'xprec':>13s} {'rel diff':>13s}\n" 55 | msg += "\n".join(f"{xi:13g} {y_refi:13g} {y_ouri:13g} {reldiffi:13g}" 56 | for xi, y_refi, y_ouri, reldiffi, _ 57 | in zip(x, y_float, y_our, reldiff, range(10)) 58 | ) 59 | raise ValueError(f"not equal to rtol = {rtol:3g}\n" + msg) 60 | 61 | 62 | 63 | def test_sqrt(): 64 | # Once the low part of the ddouble becomes a denormal number, we 65 | # are in trouble, so we truncate the lower end of the range by 66 
| # another 16 digits 67 | x = np.geomspace(1e-292, 1e307, 1953) 68 | check_unary(mpmath.sqrt, np.sqrt, x, 2*EPS) 69 | 70 | 71 | def test_log(): 72 | x = np.reciprocal(np.geomspace(1e-292, 1e307, 1953)) 73 | check_unary(mpmath.log, np.log, x, 70 * EPS) 74 | 75 | 76 | def test_exp(): 77 | x = np.geomspace(1e-280, 670, 1511) 78 | x = np.hstack([-x[::-1], 0, x]) 79 | check_unary(mpmath.exp, np.exp, x, 60 * EPS) 80 | check_unary(mpmath.expm1, np.expm1, x, 60 * EPS) 81 | 82 | check_unary(mpmath.sinh, np.sinh, x, 60 * EPS) 83 | check_unary(mpmath.cosh, np.cosh, x, 60 * EPS) 84 | check_unary(mpmath.tanh, np.tanh, x, 60 * EPS) 85 | 86 | 87 | def test_sincos(): 88 | x = np.geomspace(1e-280, 4.8 * np.pi, 1511) 89 | x = np.hstack([-x[::-1], 0, x]) 90 | check_unary(mpmath.sin, np.sin, x, 2 * EPS) 91 | check_unary(mpmath.cos, np.cos, x, 2 * EPS) 92 | -------------------------------------------------------------------------------- /csrc/dd_linalg.c: -------------------------------------------------------------------------------- 1 | /* Double-double linear algebra library 2 | * 3 | * Implementations were partly inspired by LAPACK, partly from Fredrik 4 | * Johansson's excellent MPMATH library. 
 *
 * Copyright (C) 2021 Markus Wallerberger and others
 * SPDX-License-Identifier: MIT
 */
#include "dd_linalg.h"

// 2**500 and 2**(-500);
static const double LARGE = 3.273390607896142e+150;
static const double INV_LARGE = 3.054936363499605e-151;

/* Euclidean norm of the strided vector x (nn elements, stride sxn), with
 * each element pre-multiplied by `scaling` (a power of two) so that the
 * intermediate sum of squares stays in range. */
static ddouble normq_scaled(const ddouble *x, long nn, long sxn,
                            double scaling)
{
    ddouble sum = Q_ZERO;
    for (long n = 0; n < nn; ++n, x += sxn) {
        ddouble curr = mul_pwr2(*x, scaling);
        sum = addww(sum, sqrw(curr));
    };
    /* undo the scaling: ||x|| = sqrt(sum) / scaling */
    return mul_pwr2(sqrtw(sum), 1.0/scaling);
}

/* Euclidean norm of a strided vector; retries with a rescaled computation
 * if the unscaled result over- or underflowed. */
ddouble normw(const ddouble *x, long nn, long sxn)
{
    ddouble sum = normq_scaled(x, nn, sxn, 1.0);

    // fall back to other routines in case of over/underflow
    if (sum.hi > LARGE)
        return normq_scaled(x, nn, sxn, INV_LARGE);
    else if (sum.hi < INV_LARGE)
        return normq_scaled(x, nn, sxn, LARGE);
    else
        return sum;
}

/* Compute a Householder reflector for x (nn elements, stride sx): on exit
 * v (stride sv) holds the reflector with the convention v[0] == 1 and the
 * return value is the scalar factor tau.  Returns tau == 0 (identity) when
 * the tail of x is already zero. */
ddouble householderw(const ddouble *x, ddouble *v, long nn, long sx, long sv)
{
    if (nn == 0)
        return Q_ZERO;

    ddouble norm_x = normw(x + sx, nn - 1, sx);
    if (iszerow(norm_x))
        return Q_ZERO;

    ddouble alpha = *x;
    /* NOTE(review): beta carries the *same* sign as alpha, so the
     * subtraction `beta - alpha` below is cancellation-prone; LAPACK's
     * xLARFG flips the sign to avoid this -- confirm this is intended. */
    ddouble beta = copysignww(hypotww(alpha, norm_x), alpha);

    ddouble diff = subww(beta, alpha);
    ddouble tau = divww(diff, beta);
    ddouble scale = reciprocalw(negw(diff));

    v[0] = Q_ONE;
    for (long n = 1; n != nn; ++n)
        v[n * sv] = mulww(scale, x[n * sx]);
    return tau;
}

/* a += v w^T for the (ii x jj) matrix a with strides (ais, ajs); note that
 * despite the name this *accumulates* the rank-1 update into a. */
void rank1updateq(ddouble *a, long ais, long ajs, const ddouble *v, long vs,
                  const ddouble *w, long ws, long ii, long jj)
{
    #pragma omp parallel for collapse(2)
    for (long i = 0; i < ii; ++i) {
        for (long j = 0; j < jj; ++j) {
            ddouble tmp = mulww(v[i * vs], w[j * ws]);
            a[i * ais + j * ajs] = addww(a[i * ais + j * ajs], tmp);
        }
    }
}

/* Construct a Givens rotation (c, s) and radius r such that
 *     [ c  s ] [ f ]   [ r ]
 *     [-s  c ] [ g ] = [ 0 ]
 */
void givensw(ddouble f, ddouble g, ddouble *c, ddouble *s, ddouble *r)
{
    /* ACM Trans. Math. Softw. 28(2), 206, Alg 1 */
    if (iszerow(g)) {
        *c = Q_ONE;
        *s = Q_ZERO;
        *r = f;
    } else if (iszerow(f)) {
        *c = Q_ZERO;
        /* NOTE(review): this stores signbit(g) (0 or 1) in s.hi, i.e.
         * s = 0 for positive g and s = +1 for negative g, whereas the
         * reference algorithm needs s = +/-1 here.  Confirm against the
         * cited paper / the convention used by lmul_givensq. */
        *s = (ddouble) {signbitw(g), 0.0};
        *r = absw(g);
    } else {
        *r = copysignww(hypotww(f, g), f);

        /* This may come at a slight loss of precision, however, we should
         * not really have to care ...
         */
        ddouble inv_r = reciprocalw(*r);
        *c = mulww(f, inv_r);
        *s = mulww(g, inv_r);
    }
}

/* SVD of the upper triangular 2x2 matrix [[f, g], [0, h]] (cf. LAPACK's
 * xLASV2):
 *
 *     [ f  g ]   [ cu  -su ] [ smax  0    ] [ cv   sv ]
 *     [ 0  h ] = [ su   cu ] [ 0     smin ] [ -sv  cv ]
 *
 * Pass cv == NULL to skip computation of the rotations. */
static void svd_tri2x2(
        ddouble f, ddouble g, ddouble h, ddouble *smin, ddouble *smax,
        ddouble *cv, ddouble *sv, ddouble *cu, ddouble *su)
{
    ddouble fa = absw(f);
    ddouble ga = absw(g);
    ddouble ha = absw(h);
    bool compute_uv = cv != NULL;

    if (lessww(fa, ha)) {
        // switch h <-> f, cu <-> sv, cv <-> su
        svd_tri2x2(h, g, f, smin, smax, su, cu, sv, cv);
        return;
    }
    if (iszerow(ga)) {
        // already diagonal
        *smin = ha;
        *smax = fa;
        if (compute_uv) {
            *cu = Q_ONE;
            *su = Q_ZERO;
            *cv = Q_ONE;
            *sv = Q_ZERO;
        }
        return;
    }
    if (fa.hi < Q_EPS.hi * ga.hi) {
        // ga is very large
        *smax = ga;
        /* choose the order of operations so that neither intermediate
         * quotient over/underflows */
        if (ha.hi > 1.0)
            *smin = divww(fa, divww(ga, ha));
        else
            *smin = mulww(divww(fa, ga), ha);
        if (compute_uv) {
            *cu = Q_ONE;
            *su = divww(h, g);
            *cv = Q_ONE;
            *sv = divww(f, g);
        }
        return;
    }
    // normal case
    ddouble fmh = subww(fa, ha);
    ddouble d = divww(fmh, fa);
    ddouble q = divww(g, f);
    ddouble s = subdw(2.0, d);
    ddouble spw = hypotww(q, s);
    ddouble dpw = hypotww(d, q);
    ddouble a = mul_pwr2(addww(spw, dpw), 0.5);
    *smin = absw(divww(ha, a));
    *smax = absw(mulww(fa, a));

    if (compute_uv) {
        ddouble tmp = addww(divww(q, addww(spw, s)),
                            divww(q, addww(dpw, d)));
        tmp = mulww(tmp, adddw(1.0, a));
        ddouble tt = hypotwd(tmp, 2.0);
        *cv = divdw(2.0, tt);
        *sv = divww(tmp, tt);
        *cu = divww(addww(*cv, mulww(*sv, q)), a);
        *su = divww(mulww(divww(h, f), *sv), a);
    }
}

/* SVD of a general 2x2 matrix: first rotate to triangular form, then defer
 * to svd_tri2x2 and fold the extra rotation into the left transform.
 * Pass cv == NULL to skip computation of the rotations. */
void svd_2x2(ddouble a11, ddouble a12, ddouble a21, ddouble a22, ddouble *smin,
             ddouble *smax, ddouble *cv, ddouble *sv, ddouble *cu, ddouble *su)
{
    bool compute_uv = cv != NULL;
    if(iszerow(a21))
        return svd_tri2x2(a11, a12, a22, smin, smax, cv, sv, cu, su);

    /* First, we use a givens rotation  Rx
     *   [  cx   sx ] [ a11  a12 ]   [ rx  a12' ]
     *   [ -sx   cx ] [ a21  a22 ] = [ 0   a22' ]
     */
    ddouble cx, sx, rx;
    givensw(a11, a21, &cx, &sx, &rx);
    a11 = rx;
    a21 = Q_ZERO;
    lmul_givensq(&a12, &a22, cx, sx, a12, a22);

    /* Next, use the triangular routine
     *   [ f  g ]   [  cu -su ] [ smax    0 ] [  cv  sv ]
     *   [ 0  h ] = [  su  cu ] [    0 smin ] [ -sv  cv ]
     */
    svd_tri2x2(a11, a12, a22, smin, smax, cv, sv, cu, su);

    /* Finally, update the LHS (U) transform as follows:
     *   [  cx -sx ] [  cu -su ]   [  cu' -su' ]
     *   [  sx  cx ] [  su  cu ] = [  su'  cu' ]
     */
    if (compute_uv)
        lmul_givensq(cu, su, cx, negw(sx), *cu, *su);
}

/* One full one-sided Jacobi sweep over the leading jj columns of the
 * (ii x jj) matrix u, accumulating the right rotations into vt.  Returns
 * the Frobenius norm of the off-diagonal elements seen during the sweep
 * (a convergence measure), or NaN if the matrix is not tall (ii < jj). */
ddouble jacobi_sweep(ddouble *u, long sui, long suj, ddouble *vt, long svi,
                     long svj, long ii, long jj)
{
    ddouble _cu, _su, cv, sv, _smin, _smax;
    ddouble offd = Q_ZERO;

    if (ii < jj)
        return nanw();

    // Note that the inner loop only runs over the square portion!
    for (long i = 0; i < jj - 1; ++i) {
        for (long j = i + 1; j < jj; ++j) {
            // Construct the matrix to be diagonalized
            ddouble Hii = Q_ZERO, Hij = Q_ZERO, Hjj = Q_ZERO;
            for (long k = 0; k != ii; ++k) {
                ddouble u_ki = u[k * sui + i * suj];
                ddouble u_kj = u[k * sui + j * suj];
                Hii = addww(Hii, mulww(u_ki, u_ki));
                Hij = addww(Hij, mulww(u_ki, u_kj));
                Hjj = addww(Hjj, mulww(u_kj, u_kj));
            }
            offd = addww(offd, sqrw(Hij));

            // diagonalize
            svd_2x2(Hii, Hij, Hij, Hjj, &_smin, &_smax, &cv, &sv, &_cu, &_su);

            // apply rotation to VT
            for (long k = 0; k < jj; ++k) {
                ddouble *vt_ik = &vt[i * svi + k * svj];
                ddouble *vt_jk = &vt[j * svi + k * svj];
                lmul_givensq(vt_ik, vt_jk, cv, sv, *vt_ik, *vt_jk);
            }

            // apply transposed rotation to U
            for (long k = 0; k < ii; ++k) {
                ddouble *u_ki = &u[k * sui + i * suj];
                ddouble *u_kj = &u[k * sui + j * suj];
                lmul_givensq(u_ki, u_kj, cv, sv, *u_ki, *u_kj);
            }
        }
    }
    offd = sqrtw(offd);
    return offd;
}

/* Wilkinson-style shift for the Golub-Kahan step: the singular value of the
 * trailing 2x2 block that is closest to the last diagonal entry d2. */
static ddouble gk_shift(ddouble d1, ddouble e1, ddouble d2)
{
    /* Get singular values of 2x2 triangular matrix formed from the lower
     * right corner in the array:
     *
     *     [ d[ii-2]  e[ii-2] ]
     *     [ 0        d[ii-1] ]
     */
    ddouble smin, smax;
    svd_tri2x2(d1, e1, d2, &smin, &smax, NULL, NULL, NULL, NULL);

    ddouble smin_dist = absw(subww(smin, d2));
    ddouble smax_dist = absw(subww(smax, d2));
    return lessww(smin_dist, smax_dist) ? smin : smax;
}

/* One implicit-shift Golub-Kahan QR chase on the bidiagonal matrix with
 * diagonal d (ii entries, stride sd) and superdiagonal e (stride se).
 * The Givens rotations applied on the right and left are written to rot,
 * four values (cos_r, sin_r, cos_l, sin_l) per chased column, so that the
 * caller can accumulate them into the singular-vector matrices. */
void golub_kahan_chaseq(ddouble *d, long sd, ddouble *e, long se, long ii,
                        ddouble *rot)
{
    if (ii < 2)
        return;

    ddouble shift = gk_shift(d[(ii-2)*sd], e[(ii-2)*se], d[(ii-1)*sd]);
    ddouble g = e[0];
    /* f = (|d[0]| - shift) * (sign(d[0]) + shift/d[0]), the shifted leading
     * entry that seeds the bulge */
    ddouble f = addww(copysigndw(1.0, d[0]), divww(shift, d[0]));
    f = mulww(f, subww(absw(d[0]), shift));

    for (long i = 0; i < (ii - 1); ++i) {
        /* rotation from the right: chases the bulge out of row i */
        ddouble r, cosr, sinr;
        givensw(f, g, &cosr, &sinr, &r);
        if (i != 0)
            e[(i-1)*se] = r;

        lmul_givensq(&f, &e[i*se], cosr, sinr, d[i*sd], e[i*se]);
        lmul_givensq(&g, &d[(i+1)*sd], cosr, sinr, Q_ZERO, d[(i+1)*sd]);
        *(rot++) = cosr;
        *(rot++) = sinr;

        /* rotation from the left: chases the bulge out of column i */
        ddouble cosl, sinl;
        givensw(f, g, &cosl, &sinl, &r);
        d[i*sd] = r;
        lmul_givensq(&f, &d[(i+1)*sd], cosl, sinl, e[i*se], d[(i+1)*sd]);
        if (i < ii - 2) {
            lmul_givensq(&g, &e[(i+1)*se], cosl, sinl, Q_ZERO, e[(i+1)*se]);
        }
        *(rot++) = cosl;
        *(rot++) = sinl;
    }
    e[(ii-2)*se] = f;
}
--------------------------------------------------------------------------------
/pysrc/xprec/linalg.py:
--------------------------------------------------------------------------------
# Copyright (C) 2021 Markus Wallerberger and others
# SPDX-License-Identifier: MIT
#
# Some of the code in this module is adapted from the LAPACK reference
# implementation.
import numpy as np
from warnings import warn

from . import ddouble
from . import _dd_linalg

norm = _dd_linalg.norm
givens = _dd_linalg.givens
householder = _dd_linalg.householder
rank1update = _dd_linalg.rank1update


def qr(A, reflectors=False):
    """QR decomposition without pivoting.
20 | 21 | Decomposes a `(m, n)` matrix `A` into the product: 22 | 23 | A == Q @ R 24 | 25 | where `Q` is an `(m, m)` orthogonal matrix and `R` is a `(m, n)` upper 26 | triangular matrix. No pivoting is used. 27 | """ 28 | R = np.array(A) 29 | m, n = R.shape 30 | k = min(m, n) 31 | 32 | Q = np.zeros((k, m), A.dtype) 33 | for i in range(k): 34 | householder_update(R[i:,i:], Q[i:,i:]) 35 | if not reflectors: 36 | I = np.eye(m, dtype=A.dtype) 37 | Q = householder_apply(Q, I) 38 | return Q, R 39 | 40 | 41 | def rrqr(A, tol=5e-32, reflectors=False): 42 | """Truncated rank-revealing QR decomposition with full column pivoting. 43 | 44 | Decomposes a `(m, n)` matrix `A` into the product: 45 | 46 | A[:,piv] == Q @ R 47 | 48 | where `Q` is an `(m, k)` isometric matrix, `R` is a `(k, n)` upper 49 | triangular matrix, `piv` is a permutation vector, and `k` is chosen such 50 | that the relative tolerance `tol` is met in the equality above. 51 | """ 52 | R = np.array(A) 53 | m, n = R.shape 54 | k = min(m, n) 55 | 56 | Q = np.zeros((m, k), A.dtype) 57 | jpvt = np.arange(n) 58 | norms = norm(A.T) 59 | xnorms = norms.copy() 60 | TOL3Z = np.finfo(float).eps 61 | for i in range(k): 62 | pvt = i + np.argmax(norms[i:]) 63 | if i != pvt: 64 | R[:,[i, pvt]] = R[:,[pvt, i]] 65 | jpvt[[i, pvt]] = jpvt[[pvt, i]] 66 | norms[pvt] = norms[i] 67 | xnorms[pvt] = xnorms[i] 68 | 69 | householder_update(R[i:,i:], Q[i:,i:]) 70 | 71 | js = (i + 1) + norms[i + 1:].nonzero()[0] 72 | temp = np.abs(R[i,js]) / norms[js] 73 | temp = np.fmax(0.0, (1 + temp)*(1 - temp)) 74 | temp2 = temp * np.square(norms[js] / xnorms[js]) 75 | 76 | wheresmall = temp2 < TOL3Z 77 | jsmall = js[wheresmall] 78 | upd_norms = norm(R[i+1:,jsmall].T) 79 | norms[jsmall] = upd_norms 80 | xnorms[jsmall] = upd_norms 81 | jbig = js[~wheresmall] 82 | norms[jbig] *= np.sqrt(temp[~wheresmall]) 83 | 84 | if tol is not None: 85 | acc = np.abs(R[i,i] / R[0,0]) 86 | if acc < tol: 87 | k = i + 1 88 | Q = Q[:,:k] 89 | R = R[:k,:] 90 | break 91 | 
    if not reflectors:
        I = np.eye(m, k, dtype=A.dtype)
        Q = householder_apply(Q, I)
    return Q, R, jpvt


def svd(A, full_matrices=True):
    """Singular value decomposition.

    Decomposes a `(m, n)` matrix `A` into the product:

        A == U @ (s[:,None] * VT)

    where `U` is a `(m, k)` matrix with orthogonal columns, `VT` is a `(k, n)`
    matrix with orthogonal rows and `s` are the singular values, a set of `k`
    nonnegative numbers in non-ascending order and `k = min(m, n)`.
    """
    A = np.asarray(A)
    m, n = A.shape
    if m < n:
        # work on the transpose so that the matrix is tall
        U, s, VT = svd(A.T, full_matrices)
        return VT.T, s, U.T

    Q, B, RT = bidiag(A)
    for _ in range(20 * n):
        if svd_bidiag_step(Q, B, RT):
            break
    else:
        # for/else: reached only if the loop never hit `break`
        warn("Did not converge")

    U, s, VH = svd_normalize(Q, B.diagonal(), RT)
    if not full_matrices:
        U = U[:,:n]
    return U, s, VH


def svd_trunc(A, tol=5e-32, method='jacobi', max_iter=20):
    """Truncated singular value decomposition.

    Decomposes a `(m, n)` matrix `A` into the product:

        A == U @ (s[:,None] * VT)

    where `U` is a `(m, k)` matrix with orthogonal columns, `VT` is a `(k, n)`
    matrix with orthogonal rows and `s` are the singular values, a set of `k`
    nonnegative numbers in non-ascending order. The SVD is truncated in the
    sense that singular values below `tol` are discarded.
    """
    # RRQR is an excellent preconditioner for Jacobi.  One should then perform
    # Jacobi on RT
    Q, R, p = rrqr(A, tol)
    if method == 'jacobi':
        U, s, VT = svd_jacobi(R.T, tol, max_iter)
    elif method == 'golub-kahan':
        U, s, VT = svd(R.T, full_matrices=False)
    else:
        raise ValueError("invalid method")

    # Reconstruct A from QRs; p.argsort() undoes the column pivoting
    U_A = Q @ VT.T
    VT_B = U.T[:, p.argsort()]
    return U_A, s, VT_B


def bidiag(A, reflectors=False, force_structure=False):
    """Bidiagonalizes an arbitrary rectangular matrix.

    Decomposes a `(m, n)` matrix `A` into the product:

        A == Q @ B @ RT

    where `Q` is a `(m, m)` orthogonal matrix, `RT` is a `(n, n)` orthogonal
    matrix, and `B` is a bidiagonal matrix, where the upper diagonal is
    nonzero for `m >= n` and the lower diagonal is nonzero for `m < n`.
    """
    A = np.asarray(A)
    m, n = A.shape
    if m < n:
        # NOTE(review): force_structure is not forwarded to the recursive
        # call -- confirm whether the m < n branch should honour it too.
        Q, B, RT = bidiag(A.T, reflectors)
        return RT.T, B.T, Q.T

    # for a square matrix the very last column needs no reflector
    rq = n - (m == n)
    B = A.copy()
    Q = np.zeros_like(B)
    R = np.zeros_like(B[:n,:n])

    # alternate left reflectors (eliminate below diagonal) and right
    # reflectors (eliminate right of the superdiagonal)
    for j in range(n-2):
        householder_update(B[j:,j:], Q[j:,j:])
        householder_update(B[j:,j+1:].T, R[j+1:,j+1:])
    for j in range(n-2, rq):
        householder_update(B[j:,j:], Q[j:,j:])

    if force_structure:
        # zero out everything except the two stored diagonals, which may
        # carry numerical noise from the reflector updates
        d = B.diagonal().copy()
        e = B.diagonal(1).copy()
        B[...] = 0
        i = np.arange(n)
        B[i, i] = d
        B[i[:-1], i[:-1]+1] = e
    if not reflectors:
        Q = householder_apply(Q, np.eye(m, dtype=B.dtype))
        R = householder_apply(R, np.eye(n, dtype=B.dtype))
    return Q, B, R.T


def svd_jacobi(A, tol=5e-32, max_iter=20):
    """Singular value decomposition using Jacobi rotations."""
    U = A.copy()
    m, n = U.shape
    if m < n:
        raise RuntimeError("expecting tall matrix")

    VT = np.eye(n, dtype=U.dtype)
    # scalar output slot for the off-diagonal norm returned by each sweep
    offd = np.empty((), ddouble)

    limit = tol * np.linalg.norm(U[:n,:n], 'fro')
    for _ in range(max_iter):
        # one full sweep in-place; offd measures remaining off-diagonality
        _dd_linalg.jacobi_sweep(U, VT, out=(U, VT, offd))
        if offd <= limit:
            break
    else:
        warn("Did not converge")

    # columns of U are now orthogonal; their norms are the singular values
    s = norm(U.T)
    U = U / s
    return U, s, VT


def householder_update(A, Q):
    """Reflects the zeroth column onto a multiple of the unit vector"""
    # v is the reflector (v[0] == 1 by convention), beta its scalar factor
    beta, v = householder(A[:,0])
    w = -beta * (A.T @ v)
    rank1update(A, v, w, out=A)
    # store beta on the diagonal and the reflector tail below it
    Q[0,0] = beta
    Q[1:,0] = v[1:]


def householder_apply(H, Q):
    """Applies a set of reflectors to a matrix"""
    H = np.asarray(H)
    Q = Q.copy()
    m, r = H.shape
    if Q.shape[0] != m:
        raise ValueError("invalid shape")
    if Q.shape[1] < r:
        raise ValueError("invalid shape")
    # apply reflectors in reverse order of their construction
    for j in range(r-1, -1, -1):
        beta = H[j,j]
        if np.equal(beta, 0):
            # tau == 0 marks an identity reflector (see householderw)
            continue
        v = np.empty_like(H[j:,0])
        v[0] = 1
        v[1:] = H[j+1:,j]
        Qpart = Q[j:,j:]
        w = -beta * (Qpart.T @ v)
        rank1update(Qpart, v, w, out=Qpart)
    return Q


def svd_normalize(U, d, VH):
    """Given a SVD-like decomposition, normalize"""
    # Flip rows of VH so all quasi-singular values become nonnegative
    n = d.size
    VH[np.signbit(d)] = -VH[np.signbit(d)]
    d = np.abs(d)

    # Sort into non-ascending order, permuting the first n columns of U
    order = np.argsort(d)[::-1]
    d = d[order]
    VH = VH[order]
    U = U.copy()
    U[:,:n] = U[:,order]
    return U, d, VH


def svd_bidiag_step(Q, B, RT):
    """Single SVD step for a bidiagonal matrix"""
    d = B.diagonal().copy()
    e = np.hstack([B.diagonal(1), 0.0])

    # p:q delimits the still-unconverged block (see bidiag_partition)
    p, q = bidiag_partition(d, e)
    if q <= 1:
        return True

    d_part = d[p:q]
    e_part = e[p:q]
    # NOTE(review): golub_kahan_chase writes size-1 rotation rows; the last
    # row of rot stays uninitialized -- verify givens_seq ignores it.
    rot = np.empty((d_part.size, 4), d.dtype)
    _dd_linalg.golub_kahan_chase(d_part, e_part, out=(d_part, e_part, rot))

    # write the chased diagonals back into B
    i = np.arange(p, q)
    B[i, i] = d_part
    B[i[:-1], i[:-1]+1] = e_part[:-1]

    # accumulate the left rotations into Q and the right ones into RT
    rot_Q = rot[:, 2:]
    rot_R = rot[:, :2]
    QT_part = Q[:, p:q].T
    RT_part = RT[p:q, :]
    _dd_linalg.givens_seq(rot_Q, QT_part, out=QT_part)
    _dd_linalg.givens_seq(rot_R, RT_part, out=RT_part)
    return False


def bidiag_partition(d, e, eps=5e-32):
    """Partition bidiagonal matrix into blocks for implicit QR.

    Return `p,q` which partions a bidiagonal `B` matrix into three blocks:

     - B[0:p, 0:p], an arbitrary bidiaonal matrix
     - B[p:q, p:q], a matrix with all off-diagonal elements nonzero
     - B[q:, q:], a diagonal matrix
    """
    abs_e = np.abs(e)
    abs_d = np.abs(d)
    # flush negligible off-diagonal elements to zero (modifies e in place)
    e_zero = abs_e <= eps * (abs_d + abs_e)
    e[e_zero] = 0

    q = _find_last(~e_zero) + 1
    if q <= 0:
        # no off-diagonal element left: the matrix is fully diagonal
        return 0, 0
    p = _find_last(e_zero[:q]) + 1
    return p, q + 1


def _find_last(a, axis=-1):
    """Index of the last True element along `axis`, or -1 if there is none."""
    a = a.astype(bool)
    maxloc = a.shape[axis] - 1 - a[::-1].argmax(axis)
    return np.where(a[maxloc], maxloc, -1)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Setup script - embracing the setuptools madness.
#
# Copyright (C) 2021 Markus Wallerberger and others
# SPDX-License-Identifier: MIT
import io
import os.path
import os
import platform
import re

from setuptools import setup, find_packages
from setuptools.extension import Extension
from setuptools.command.build_ext import build_ext as BuildExt


def readfile(*parts):
    """Return contents of file with path relative to script directory"""
    herepath = os.path.abspath(os.path.dirname(__file__))
    fullpath = os.path.join(herepath, *parts)
    with io.open(fullpath, 'r') as f:
        return f.read()


def extract_version(*parts):
    """Extract value of __version__ variable by parsing python script"""
    initfile = readfile(*parts)
    version_re = re.compile(r"(?m)^__version__\s*=\s*['\"]([^'\"]*)['\"]")
    match = version_re.search(initfile)
    return match.group(1)


def rebase_links(text, base_url):
    """Rebase links to doc/ directory to ensure they work online."""
    doclink_re = re.compile(
        r"(?m)^\s*\[\s*([^\]\n\r]+)\s*\]:\s*(doc/[./\w]+)\s*$")
    result, nsub = doclink_re.subn(r"[\1]: %s/\2" % base_url, text)
    return result


def append_if_absent(list, arg):
    """Append argument to list if absent"""
    # NOTE(review): the parameter `list` shadows the builtin of that name
    if arg not in list:
        list.append(arg)


def get_flags_dict(exec):
    """Split a compiler command line into (binary, {flag: value}) parts."""
    # NOTE(review): the parameter `exec` shadows the builtin of that name
    # First, let us clean up the mess of compiler options a little bit: Move
    # flags out into a dictionary, thereby removing the myriad of duplicates
    cc_so, *cflags_so_list = exec
    cflags_curr = None
    cflags_so = {}
    for arg in cflags_so_list:
        if arg.startswith("-"):
            # a new flag terminates any pending value-less flag
            if cflags_curr is not None:
                cflags_so[cflags_curr] = None
                cflags_curr = None
            arg = arg.split("=", 1)
            if len(arg) == 1:
                # "-flag value" style: remember the flag, value follows
                cflags_curr, = arg
            else:
                # "-flag=value" style
                k, v = arg
                cflags_so[k] = v
        else:
            # bare token: must be the value of the preceding flag
            if cflags_curr is None:
                raise ValueError("expected flag" + str(exec))
            cflags_so[cflags_curr] = arg
            cflags_curr = None
    if cflags_curr is not None:
        cflags_so[cflags_curr] = None

    return cc_so, cflags_so


def make_exec_string(cc_so, cflags_so):
    """Inverse of get_flags_dict: rebuild the command-line list."""
    # Now update the flags
    cflags_so = [k + ("=" + v if v is not None else "")
                 for (k,v) in cflags_so.items()]
    return [cc_so] + cflags_so


class OptionsMixin:
    """Adds the xprec-specific command-line options to a setuptools command."""
    _convert_to_bool = {"true": True, "false": False}
    user_options = [
        ("with-openmp=", None, "use openmp to build (default: false)"),
        ("opt-arch=", None, "optimized for architecture"),
        ("numpy-include-dir=", None, "numpy include directory"),
    ]

    def initialize_options(self):
        super().initialize_options()
        self.with_openmp = None
        self.numpy_include_dir = None
        self.opt_arch = None

    def finalize_options(self):
        # convert "true"/"false" strings from the command line into bools
        if self.with_openmp is not None:
            self.with_openmp = self._convert_to_bool[self.with_openmp.lower()]
        if self.opt_arch is not None:
            self.opt_arch = self._convert_to_bool[self.opt_arch.lower()]
        if self.numpy_include_dir is not None:
            if not os.path.isdir(self.numpy_include_dir):
                raise ValueError("include directory must exist")
        super().finalize_options()


class BuildExtWithNumpy(OptionsMixin, BuildExt):
    """Wrapper class for building numpy extensions"""
    user_options = BuildExt.user_options + OptionsMixin.user_options

    def build_extensions(self):
        """Modify paths according to options"""
        # This must be deferred to build time, because that is when
        # self.compiler starts being a compiler instance (before, it is
        # a flag) *slow-clap*
        # compiler type is either 'unix', 'msvc' or 'mingw'
        compiler_type = self.compiler.compiler_type

        # classify the compiler binary to decide on flag handling below
        compiler_binary = getattr(self.compiler, 'compiler', [''])[0]
        compiler_binary = os.path.basename(compiler_binary)
        compiler_make = ''
        if 'gcc' in compiler_binary or 'g++' in compiler_binary:
            compiler_make = 'gcc'
        elif 'clang' in compiler_binary:
            compiler_make = 'clang'
        elif 'icc' in compiler_binary:
            compiler_make = 'icc'
        elif compiler_type == 'msvc':
            # See msvccompiler.py:206 - a comment worth reading in its
            # entirety. distutils sets up an abstraction which it immediately
            # break with its own derived classes. *slow-clap*
            compiler_make = 'msvc'

        if compiler_type != 'msvc':
            new_flags = {"-Wextra": None, "-std": "c11"}
            # By default, we do not optimize for the architecture by default,
            # because this is harmful when building a binary package.
            if self.opt_arch:
                new_flags["-mtune"] = new_flags["-march"] = "native"

            cc_so, flags_dict = get_flags_dict(self.compiler.compiler_so)

            # Replace arch with march
            if "-arch" in flags_dict:
                flags_dict["-march"] = flags_dict.pop("-arch")

            # Remove any existing -mtune, -march, -arch flags if not self.opt_arch
            if not self.opt_arch:
                for key in ["-mtune", "-march", "-arch"]:
                    if key in flags_dict:
                        del flags_dict[key]

            flags_dict.update(new_flags)
            self.compiler.compiler_so = make_exec_string(cc_so, flags_dict)

        # clang on 14.4.1 fails to include C header files...
        if platform.system() == 'Darwin':
            sdk_path = (
                "/Applications/Xcode.app/Contents/Developer/Platforms/"
                "MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include"
            )
            current_cpath = os.environ.get('CPATH', '')
            os.environ['CPATH'] = f"{sdk_path}:{current_cpath}"

        # This has to be set to false because MacOS does not ship openmp
        # by default.
        if self.with_openmp is None:
            self.with_openmp = platform.system() == 'Linux'

        # Numpy headers: numpy must be imported here rather than
        # globally, because otherwise it may not be available at the time
        # when the setup script is run. *slow-cl ... ah, f*ck it.
        if self.numpy_include_dir is None:
            import numpy
            self.numpy_include_dir = numpy.get_include()

        for ext in self.extensions:
            append_if_absent(ext.include_dirs, self.numpy_include_dir)
            if self.with_openmp:
                append_if_absent(ext.extra_compile_args, '-fopenmp')
                append_if_absent(ext.extra_link_args, '-fopenmp')
                if compiler_make == 'clang':
                    # clang needs the OpenMP runtime linked in explicitly
                    append_if_absent(ext.extra_link_args, '-lomp')

        super().build_extensions()

    def get_source_files(self):
        """Return list of files to include in source dist"""
        # Specifying include_dirs= argument in Extension adds headers from that
        # directory to the sdist ... on some machines.  On others, not.  Note
        # that overriding sdist will not save you, since this is not called
        # from sdist.add_defaults(), as you might expect.  (With setuptools, it
        # is never what you expect.)  Instead, sdist requires egg_info, which
        # hooks into a hidden manifest_maker class derived from sdist, where
        # add_defaults() called, the list passed back to sdist, sidestepping
        # the method in the orginal class.  Kudos.
        #
        # Really, if you have monkeys type out 1000 pages on typewriters, use
        # the result as toilet paper for a month, unfold it, scan it at 20 dpi,
        # and run it through text recognition software, it would still yield
        # better code than setuptools.
        source_files = super().get_source_files()
        header_regex = re.compile(r"\.(?:h|hh|hpp|hxx|H|HH|HPP|HXX)$")

        # add every header found in any extension's include directories
        include_dirs = set()
        for ext in self.extensions:
            include_dirs.update(ext.include_dirs)
        for dir in include_dirs:
            for entry in os.scandir(dir):
                if not entry.is_file():
                    continue
                if not header_regex.search(entry.name):
                    continue
                source_files.append(entry.path)

        return source_files


VERSION = extract_version('pysrc', 'xprec', '__init__.py')
REPO_URL = "https://github.com/tuwien-cms/xprec"
DOCTREE_URL = "%s/tree/v%s" % (REPO_URL, VERSION)
LONG_DESCRIPTION = rebase_links(readfile('README.md'), DOCTREE_URL)

setup(
    name='xprec',
    version=VERSION,

    description='xprec precision numpy extension',
    long_description=LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    keywords=' '.join([
        'double-double'
        ]),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        ],

    url=REPO_URL,
    author=', '.join([
        'Markus Wallerberger'
        ]),
    author_email='markus.wallerberger@tuwien.ac.at',

    python_requires='>=3',
    install_requires=[
        # we need matmul to be an ufunc -> 1.16
        'numpy>=1.16',
        ],
    extras_require={
        'test': ['pytest', 'mpmath'],
        },

    ext_modules=[
        Extension("xprec._dd_ufunc",
                  ["csrc/_dd_ufunc.c", "csrc/dd_arith.c"],
                  include_dirs=["csrc"]),
        Extension("xprec._dd_linalg",
                  ["csrc/_dd_linalg.c", "csrc/dd_arith.c", "csrc/dd_linalg.c"],
                  include_dirs=["csrc"]),
        ],
    setup_requires=[
        'numpy>=1.16'
        ],
    cmdclass={
        'build_ext': BuildExtWithNumpy
        },

    package_dir={'': 'pysrc'},
    packages=find_packages(where='pysrc'),
    )
--------------------------------------------------------------------------------
/csrc/dd_arith.h:
--------------------------------------------------------------------------------
/* Double-double arithmetic library
 *
 * Part of the functions are modified from the QD library for U.C. Berkeley
 * and licensed under a modified BSD license (see QD-LICENSE.txt)
 *
 * Some of the algorithms were updated according to the findings in
 * M. Joldes, et al., ACM Trans. Math. Softw. 44, 1-27 (2018)
 * (Algorithm numbers in the code)
 *
 * Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey
 * Copyright (C) 2021 Markus Wallerberger and others
 * SPDX-License-Identifier: MIT and Modified-BSD
 */
#pragma once
/* NOTE(review): the include targets were stripped from this snapshot
 * (angle-bracket text lost); the functions below require at least these
 * headers -- confirm against the repository. */
#include <math.h>
#include <stdbool.h>
#include <stdlib.h>

/**
 * Type for double-double calculations
 */
typedef struct {
    double hi;
    double lo;
} ddouble;

/* Fast two-sum: exact a + b as (hi, lo); valid when |a| >= |b|. */
static inline ddouble two_sum_quick(double a, double b)
{
    double s = a + b;
    double lo = b - (s - a);
    return (ddouble){.hi = s, .lo = lo};
}

/* Two-sum: exact a + b as (hi, lo) for arbitrary a, b. */
static inline ddouble two_sum(double a, double b)
{
    double s = a + b;
    double v = s - a;
    double lo = (a - (s - v)) + (b - v);
    return (ddouble){.hi = s, .lo = lo};
}

/* Exact a - b as (hi, lo) for arbitrary a, b. */
static inline ddouble two_diff(double a, double b)
{
    double s = a - b;
    double v = s - a;
    double lo = (a - (s - v)) - (b + v);
    return (ddouble){.hi = s, .lo = lo};
}

/* Exact a * b as (hi, lo) using fused multiply-add for the error term. */
static inline ddouble two_prod(double a, double b)
{
    double s = a * b;
    double lo = fma(a, b, -s);
    return (ddouble){.hi = s, .lo = lo};
}

/* -------------------- Combining quad/double ------------------------ */

static inline ddouble addwd(ddouble x, double y)
{
    ddouble s = two_sum(x.hi, y);
    double v = x.lo + s.lo;
    return two_sum_quick(s.hi, v);
}

static inline ddouble subwd(ddouble x, double y)
{
    ddouble s = two_diff(x.hi, y);
    double v = x.lo + s.lo;
    return two_sum_quick(s.hi, v);
}

static inline ddouble mulwd(ddouble x, double y)
{
    ddouble c = two_prod(x.hi, y);
    double v = fma(x.lo, y, c.lo);
    return two_sum_quick(c.hi, v);
}

static inline ddouble divwd(ddouble x, double y)
{
    /* Alg 14 */
    double t_hi = x.hi / y;
    ddouble pi = two_prod(t_hi, y);
    double d_hi = x.hi - pi.hi;
    double d_lo = x.lo - pi.lo;
    double t_lo = (d_hi + d_lo) / y;
    return two_sum_quick(t_hi, t_lo);
}

/* -------------------- Combining double/quad ------------------------- */

static inline ddouble negw(ddouble);
static inline ddouble reciprocalw(ddouble);

static inline ddouble adddw(double x, ddouble y)
{
    return addwd(y, x);
}

static inline ddouble subdw(double x, ddouble y)
{
    /* TODO: Probably not ideal */
    return addwd(negw(y), x);
}

static inline ddouble muldw(double x, ddouble y)
{
    return mulwd(y, x);
}

static inline ddouble divdw(double x, ddouble y)
{
    /* TODO: Probably not ideal */
    return mulwd(reciprocalw(y), x);
}

/* Scale by b; exact only when b is a power of two. */
static inline ddouble mul_pwr2(ddouble a, double b) {
    return (ddouble){a.hi * b, a.lo * b};
}

/* -------------------- Combining quad/quad ------------------------- */

static inline ddouble addww(ddouble x, ddouble y)
{
    ddouble s = two_sum(x.hi, y.hi);
    ddouble t = two_sum(x.lo, y.lo);
    ddouble v = two_sum_quick(s.hi, s.lo + t.hi);
    ddouble z = two_sum_quick(v.hi, t.lo + v.lo);
    return z;
}

static inline ddouble subww(ddouble x, ddouble y)
{
    ddouble s =
two_diff(x.hi, y.hi); 136 | ddouble t = two_diff(x.lo, y.lo); 137 | ddouble v = two_sum_quick(s.hi, s.lo + t.hi); 138 | ddouble z = two_sum_quick(v.hi, t.lo + v.lo); 139 | return z; 140 | } 141 | 142 | static inline ddouble mulww(ddouble a, ddouble b) 143 | { 144 | /* Alg 11 */ 145 | ddouble c = two_prod(a.hi, b.hi); 146 | double t = a.hi * b.lo; 147 | t = fma(a.lo, b.hi, t); 148 | return two_sum_quick(c.hi, c.lo + t); 149 | } 150 | 151 | static inline ddouble divww(ddouble x, ddouble y) 152 | { 153 | /* Alg 17 */ 154 | double t_hi = x.hi / y.hi; 155 | ddouble r = mulwd(y, t_hi); 156 | double pi_hi = x.hi - r.hi; 157 | double d = pi_hi + (x.lo - r.lo); 158 | double t_lo = d / y.hi; 159 | return two_sum_quick(t_hi, t_lo); 160 | } 161 | 162 | /* -------------------- Unary functions ------------------------- */ 163 | 164 | static inline ddouble negw(ddouble a) 165 | { 166 | return (ddouble){-a.hi, -a.lo}; 167 | } 168 | 169 | static inline ddouble posw(ddouble a) 170 | { 171 | return (ddouble){-a.hi, -a.lo}; 172 | } 173 | 174 | static inline ddouble absw(ddouble a) 175 | { 176 | return signbit(a.hi) ? negw(a) : a; 177 | } 178 | 179 | static inline ddouble reciprocalw(ddouble y) 180 | { 181 | /* Alg 17 with x = 1 */ 182 | double t_hi = 1.0 / y.hi; 183 | ddouble r = mulwd(y, t_hi); 184 | double pi_hi = 1.0 - r.hi; 185 | double d = pi_hi - r.lo; 186 | double t_lo = d / y.hi; 187 | return two_sum_quick(t_hi, t_lo); 188 | } 189 | 190 | static inline ddouble sqrw(ddouble a) 191 | { 192 | /* Alg 11 */ 193 | ddouble c = two_prod(a.hi, a.hi); 194 | double t = 2 * a.hi * a.lo; 195 | return two_sum_quick(c.hi, c.lo + t); 196 | } 197 | 198 | static inline ddouble roundw(ddouble a) 199 | { 200 | double hi = round(a.hi); 201 | double lo; 202 | 203 | if (hi == a.hi) { 204 | /* High word is an integer already. Round the low word.*/ 205 | lo = round(a.lo); 206 | 207 | /* Renormalize. 
This is needed if x[0] = some integer, x[1] = 1/2.*/
        return two_sum_quick(hi, lo);
    } else {
        /* High word is not an integer. */
        lo = 0.0;
        if (fabs(hi - a.hi) == 0.5 && a.lo < 0.0) {
            /* There is a tie in the high word, consult the low word
             * to break the tie.
             * NOTE: This does not cause INEXACT.
             */
            hi -= 1.0;
        }
        return (ddouble){hi, lo};
    }
}

/* Largest integral ddouble not greater than a. */
static inline ddouble floorw(ddouble a)
{
    double hi = floor(a.hi);
    double lo = 0.0;

    if (hi == a.hi) {
        /* High word is integer already.  Round the low word. */
        lo = floor(a.lo);
        return two_sum_quick(hi, lo);
    }
    return (ddouble){hi, lo};
}

/* Smallest integral ddouble not less than a. */
static inline ddouble ceilw(ddouble a)
{
    double hi = ceil(a.hi);
    double lo = 0.0;

    if (hi == a.hi) {
        /* High word is integer already.  Round the low word. */
        lo = ceil(a.lo);
        return two_sum_quick(hi, lo);
    }
    return (ddouble){hi, lo};
}

/* Sign bit of a ddouble; only the hi word carries the sign. */
static inline bool signbitw(ddouble x)
{
    return signbit(x.hi);
}

static inline ddouble copysignww(ddouble x, ddouble y)
{
    /* The sign is determined by the hi part, however, the sign of hi and lo
     * need not be the same, so we cannot merely broadcast copysign to both
     * parts.
     */
    return signbitw(x) != signbitw(y) ? negw(x) : x;
}

static inline ddouble copysignwd(ddouble x, double y)
{
    return signbitw(x) != signbit(y) ? negw(x) : x;
}

static inline ddouble copysigndw(double x, ddouble y)
{
    /* It is less surprising to return a ddouble here */
    double res = copysign(x, y.hi);
    return (ddouble) {res, 0.0};
}

/* forward declaration; defined with the other comparisons below */
static inline bool iszerow(ddouble x);

static inline ddouble signw(ddouble x)
{
    /* The numpy sign function does not respect signed zeros.  We do.
     */
    if (iszerow(x))
        return x;
    return copysigndw(1.0, x);
}

/******************************** Constants *********************************/

static inline ddouble nanw()
{
    /* NOTE(review): strtod presumably sidesteps reliance on the NAN
     * macro -- confirm */
    double nan = strtod("NaN", NULL);
    return (ddouble){nan, nan};
}

static inline ddouble infw()
{
    double inf = strtod("Inf", NULL);
    return (ddouble){inf, inf};
}

/* Constants as (hi, lo) pairs; lo carries the residual beyond double
 * precision. */
static const ddouble Q_ZERO = {0.0, 0.0};
static const ddouble Q_ONE = {1.0, 0.0};
static const ddouble Q_2PI = {6.283185307179586232e+00, 2.449293598294706414e-16};
static const ddouble Q_PI = {3.141592653589793116e+00, 1.224646799147353207e-16};
static const ddouble Q_PI_2 = {1.570796326794896558e+00, 6.123233995736766036e-17};
static const ddouble Q_PI_4 = {7.853981633974482790e-01, 3.061616997868383018e-17};
static const ddouble Q_3PI_4 = {2.356194490192344837e+00, 9.1848509936051484375e-17};
static const ddouble Q_PI_16 = {1.963495408493620697e-01, 7.654042494670957545e-18};
static const ddouble Q_E = {2.718281828459045091e+00, 1.445646891729250158e-16};
static const ddouble Q_LOG2 = {6.931471805599452862e-01, 2.319046813846299558e-17};
static const ddouble Q_LOG10 = {2.302585092994045901e+00, -2.170756223382249351e-16};

static const ddouble Q_EPS = {4.93038065763132e-32, 0.0};    /* 2**-104 */
static const ddouble Q_MIN = {2.0041683600089728e-292, 0.0};
static const ddouble Q_MAX = {1.79769313486231570815e+308, 0.0};
static const ddouble Q_TINY = {2.2250738585072014e-308, 0.0};


/* Classification inspects only the hi word. */
static inline bool isfinitew(ddouble x)
{
    return isfinite(x.hi);
}

static inline bool isinfw(ddouble x)
{
    return isinf(x.hi);
}

static inline bool isnanw(ddouble x)
{
    return isnan(x.hi);
}

/*********************** Comparisons q/q ***************************/
/* Lexicographic comparison on (hi, lo). */

static inline bool equalww(ddouble a, ddouble b)
{
    return a.hi == b.hi && a.lo == b.lo;
}

static inline bool notequalww(ddouble a, ddouble b)
{
    return a.hi != b.hi || a.lo != b.lo;
}

static inline bool greaterww(ddouble a, ddouble b)
{
    return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
}

static inline bool lessww(ddouble a, ddouble b)
{
    return a.hi < b.hi || (a.hi == b.hi && a.lo < b.lo);
}

static inline bool greaterequalww(ddouble a, ddouble b)
{
    return a.hi > b.hi || (a.hi == b.hi && a.lo >= b.lo);
}

static inline bool lessequalww(ddouble a, ddouble b)
{
    return a.hi < b.hi || (a.hi == b.hi && a.lo <= b.lo);
}

/*********************** Comparisons q/d ***************************/
/* Doubles are promoted to ddouble with a zero lo word. */

static inline bool equalwd(ddouble a, double b)
{
    return equalww(a, (ddouble){b, 0});
}

static inline bool notequalwd(ddouble a, double b)
{
    return notequalww(a, (ddouble){b, 0});
}

static inline bool greaterwd(ddouble a, double b)
{
    return greaterww(a, (ddouble){b, 0});
}

static inline bool lesswd(ddouble a, double b)
{
    return lessww(a, (ddouble){b, 0});
}

static inline bool greaterequalwd(ddouble a, double b)
{
    return greaterequalww(a, (ddouble){b, 0});
}

static inline bool lessequalwd(ddouble a, double b)
{
    return lessequalww(a, (ddouble){b, 0});
}

/*********************** Comparisons d/q ***************************/

static inline bool equaldw(double a, ddouble b)
{
    return equalww((ddouble){a, 0}, b);
}

static inline bool notequaldw(double a, ddouble b)
{
    return notequalww((ddouble){a, 0}, b);
}

static inline bool greaterdw(double a, ddouble b)
| { 410 | return greaterww((ddouble){a, 0}, b); 411 | } 412 | 413 | static inline bool lessdw(double a, ddouble b) 414 | { 415 | return lessww((ddouble){a, 0}, b); 416 | } 417 | 418 | static inline bool greaterequaldw(double a, ddouble b) 419 | { 420 | return greaterequalww((ddouble){a, 0}, b); 421 | } 422 | 423 | static inline bool lessequaldw(double a, ddouble b) 424 | { 425 | return lessequalww((ddouble){a, 0}, b); 426 | } 427 | 428 | /************************ Minimum/maximum ************************/ 429 | 430 | static inline ddouble fminww(ddouble a, ddouble b) 431 | { 432 | return lessww(a, b) ? a : b; 433 | } 434 | 435 | static inline ddouble fmaxww(ddouble a, ddouble b) 436 | { 437 | return greaterww(a, b) ? a : b; 438 | } 439 | 440 | static inline ddouble fminwd(ddouble a, double b) 441 | { 442 | return lesswd(a, b) ? a : (ddouble) {b, 0}; 443 | } 444 | 445 | static inline ddouble fmaxwd(ddouble a, double b) 446 | { 447 | return greaterwd(a, b) ? a : (ddouble) {b, 0}; 448 | } 449 | 450 | static inline ddouble fmindw(double a, ddouble b) 451 | { 452 | return lessdw(a, b) ? (ddouble) {a, 0} : b; 453 | } 454 | 455 | static inline ddouble fmaxdw(double a, ddouble b) 456 | { 457 | return greaterdw(a, b) ? 
(ddouble) {a, 0} : b; 458 | } 459 | 460 | /************************** Unary tests **************************/ 461 | 462 | static inline bool iszerow(ddouble x) 463 | { 464 | return x.hi == 0.0; 465 | } 466 | 467 | static inline bool isonew(ddouble x) 468 | { 469 | return x.hi == 1.0 && x.lo == 0.0; 470 | } 471 | 472 | static inline bool ispositivew(ddouble x) 473 | { 474 | return x.hi > 0.0; 475 | } 476 | 477 | static inline bool isnegativew(ddouble x) 478 | { 479 | return x.hi < 0.0; 480 | } 481 | 482 | /************************** Advanced math functions ********************/ 483 | 484 | ddouble sqrtw(ddouble a); 485 | 486 | static inline ddouble ldexpw(ddouble a, int exp) 487 | { 488 | return (ddouble) {ldexp(a.hi, exp), ldexp(a.lo, exp)}; 489 | } 490 | 491 | /************************* Binary functions ************************/ 492 | 493 | ddouble _hypotqq_ordered(ddouble x, ddouble y); 494 | 495 | static inline ddouble hypotww(ddouble x, ddouble y) 496 | { 497 | x = absw(x); 498 | y = absw(y); 499 | if (x.hi < y.hi) 500 | return _hypotqq_ordered(y, x); 501 | else 502 | return _hypotqq_ordered(x, y); 503 | } 504 | 505 | static inline ddouble hypotdw(double x, ddouble y) 506 | { 507 | return hypotww((ddouble){x, 0}, y); 508 | } 509 | 510 | static inline ddouble hypotwd(ddouble x, double y) 511 | { 512 | return hypotww(x, (ddouble){y, 0}); 513 | } 514 | 515 | /* Computes the nearest integer to d. 
 */
static inline ddouble nintw(ddouble d) {
    /* Exact integers pass through unchanged; otherwise round via
     * floor(d + 0.5), so halfway cases round up (towards +infinity). */
    if (equalww(d, floorw(d))) {
        return d;
    }
    return floorw(addww(d, (ddouble){0.5, 0}));
}

/* Transcendental and power functions; implemented in dd_arith.c. */
ddouble expw(ddouble a);
ddouble expm1w(ddouble a);
ddouble ldexpwi(ddouble a, int m);
ddouble logw(ddouble a);
ddouble sinw(ddouble a);
ddouble cosw(ddouble a);
ddouble tanw(ddouble a);
ddouble sinhw(ddouble a);
ddouble coshw(ddouble a);
ddouble tanhw(ddouble a);
ddouble atanw(ddouble a);
ddouble acosw(ddouble a);
ddouble asinw(ddouble a);
ddouble atanhw(ddouble a);
ddouble acoshw(ddouble a);
ddouble asinhw(ddouble a);
ddouble atan2wd(ddouble a, double b);
ddouble atan2dw(double a, ddouble b);
ddouble atan2ww(ddouble a, ddouble b);
ddouble powww(ddouble a, ddouble b);
ddouble powwd(ddouble a, double b);
ddouble powdw(double a, ddouble b);
/* Splits a into integer and fractional part (cf. modf). */
ddouble modfww(ddouble a, ddouble *b);
--------------------------------------------------------------------------------
/csrc/_dd_linalg.c:
--------------------------------------------------------------------------------
/* Python extension module for linear algebra functions.
 *
 * Copyright (C) 2021 Markus Wallerberger and others
 * SPDX-License-Identifier: MIT
 */
#include "Python.h"
#include "math.h"
#include "stdio.h"

#include "dd_arith.h"
#include "dd_linalg.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include "numpy/ndarraytypes.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_3kcompat.h"

/**
 * Allows parameter to be marked unused
 */
#define MARK_UNUSED(x) do { (void)(x); } while(false)

/************************ Linear algebra ***************************/

/* Generalized ufunc loop: ddouble matrix multiplication c = a @ b.
 *
 * NumPy passes byte strides in steps[]; the leading three are the strides
 * over the outer (broadcast) dimension n, the rest are per-axis strides
 * inside one operand.  The inner strides are converted to ddouble element
 * strides before indexing.
 */
static void u_matmulw(char **args, const npy_intp *dims, const npy_intp* steps,
                      void *data)
{
    // signature (n;i,j),(n;j,k)->(n;i,k)
    const npy_intp nn = dims[0], ii = dims[1], jj = dims[2], kk = dims[3];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sai = steps[3], _saj = steps[4], _sbj = steps[5],
                   _sbk = steps[6], _sci = steps[7], _sck = steps[8];
    char *_a = args[0], *_b = args[1], *_c = args[2];

    /* element strides (ddouble units) for the inner loops */
    const npy_intp sai = _sai / sizeof(ddouble), saj = _saj / sizeof(ddouble),
                   sbj = _sbj / sizeof(ddouble), sbk = _sbk / sizeof(ddouble),
                   sci = _sci / sizeof(ddouble), sck = _sck / sizeof(ddouble);

    for (npy_intp n = 0; n != nn; ++n, _a += _san, _b += _sbn, _c += _scn) {
        const ddouble *a = (const ddouble *)_a, *b = (const ddouble *)_b;
        ddouble *c = (ddouble *)_c;

        /* all (i, k) output elements are independent, so the two outer
         * loops can be parallelized together */
        #pragma omp parallel for collapse(2)
        for (npy_intp i = 0; i < ii; ++i) {
            for (npy_intp k = 0; k < kk; ++k) {
                ddouble val = Q_ZERO, tmp;
                for (npy_intp j = 0; j < jj; ++j) {
                    tmp = mulww(a[i * sai + j * saj], b[j * sbj + k * sbk]);
                    val = addww(val, tmp);
                }
                c[i * sci + k * sck] = val;
            }
        }
    }
    MARK_UNUSED(data);
}

/****************************** Helper functions *************************/

/* Copy a strided 2-d block of ddoubles from `in` to `out`, element by
 * element.  A no-op when in and out alias exactly, i.e. the gufunc is
 * already operating in place.
 */
static void ensure_inplace_2(
    char *in, char *out, npy_intp n1, npy_intp si1, npy_intp so1,
    npy_intp n2, npy_intp si2, npy_intp so2)
{
    if (in == out)
        return;

    char *in1 = in, *out1 = out;
    for (npy_intp i1 = 0; i1 != n1; ++i1, in1 += si1, out1 += so1) {
        char *in2 = in1, *out2 = out1;
        for (npy_intp i2 = 0; i2 != n2; ++i2, in2 += si2, out2 += so2) {
            char *inx = in2, *outx = out2;
            *(ddouble *)outx = *(ddouble *)inx;
        }
    }
}

/* Same as ensure_inplace_2, but for a strided 3-d block. */
static void ensure_inplace_3(
    char *in, char *out, npy_intp n1, npy_intp si1, npy_intp so1,
    npy_intp n2, npy_intp si2, npy_intp so2, npy_intp n3, npy_intp si3,
    npy_intp so3)
{
    if (in == out)
        return;

    char *in1 = in, *out1 = out;
    for (npy_intp i1 = 0; i1 != n1; ++i1, in1 += si1, out1 += so1) {
        char *in2 = in1, *out2 = out1;
        for (npy_intp i2 = 0; i2 != n2; ++i2, in2 += si2, out2 += so2) {
            char *in3 = in2, *out3 = out2;
            for (npy_intp i3 = 0; i3 != n3; ++i3, in3 += si3, out3 += so3) {
                char *inx = in3, *outx = out3;
                *(ddouble *)outx = *(ddouble *)inx;
            }
        }
    }
}

/*************************** More complicated ***********************/

/* Generalized ufunc loop: vector 2-norm over the last axis. */
static void u_normw(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i)->(n;)
    const npy_intp nn = dims[0], ii = dims[1];
    const npy_intp san = steps[0], sbn = steps[1], _sai = steps[2];
    char *_a = args[0], *_b = args[1];

    for (npy_intp n = 0; n != nn; ++n, _a += san, _b += sbn) {
        *(ddouble *)_b = normw((const ddouble *)_a, ii, _sai / sizeof(ddouble));
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: Householder reflector of a vector; returns the
 * scalar factor and the reflector vector. */
static void u_householderw(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i)->(n;),(n;i)
    const npy_intp nn = dims[0], ii = dims[1];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sai =
steps[3], _sci = steps[4]; 121 | char *_a = args[0], *_b = args[1], *_c = args[2]; 122 | 123 | for (npy_intp n = 0; n != nn; ++n, _a += _san, _b += _sbn, _c += _scn) { 124 | *(ddouble *)_b = householderw( 125 | (const ddouble *)_a, (ddouble *)_c, ii, 126 | _sai / sizeof(ddouble), _sci / sizeof(ddouble)); 127 | } 128 | MARK_UNUSED(data); 129 | } 130 | 131 | static void u_rank1updateq( 132 | char **args, const npy_intp *dims, const npy_intp* steps, void *data) 133 | { 134 | // signature (n;i,j),(n;i),(n;j)->(n;i,j) 135 | const npy_intp nn = dims[0], ii = dims[1], jj = dims[2]; 136 | const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2], 137 | _sdn = steps[3], _sai = steps[4], _saj = steps[5], 138 | _sbi = steps[6], _scj = steps[7], _sdi = steps[8], 139 | _sdj = steps[9]; 140 | char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3]; 141 | 142 | ensure_inplace_3(_a, _d, nn, _san, _sdn, ii, _sai, _sdi, jj, _saj, _sdj); 143 | for (npy_intp n = 0; n != nn; ++n, _a += _san, _b += _sbn, _c += _scn) { 144 | rank1updateq( 145 | (ddouble *)_d, _sai / sizeof(ddouble), _saj / sizeof(ddouble), 146 | (const ddouble *)_b, _sbi / sizeof(ddouble), 147 | (const ddouble *)_c, _scj / sizeof(ddouble), ii, jj); 148 | } 149 | MARK_UNUSED(data); 150 | } 151 | 152 | static void u_jacobisweepw( 153 | char **args, const npy_intp *dims, const npy_intp* steps, void *data) 154 | { 155 | // signature (n;i,j),(n;i=j,j)->(n;i,j),(n;i=j,j);(n,) 156 | const npy_intp nn = dims[0], ii = dims[1], jj = dims[2]; 157 | const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2], 158 | _sdn = steps[3], _sen = steps[4], _sai = steps[5], 159 | _saj = steps[6], _sbi = steps[7], _sbj = steps[8], 160 | _sci = steps[9], _scj = steps[10], _sdi = steps[11], 161 | _sdj = steps[12]; 162 | char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3], 163 | *_e = args[4]; 164 | 165 | ensure_inplace_3(_a, _c, nn, _san, _scn, ii, _sai, _sci, jj, _saj, _scj); 166 | ensure_inplace_3(_b, _d, nn, 
                     _sbn, _sdn, jj, _sbi, _sdi, jj, _sbj, _sdj);
    for (npy_intp n = 0; n != nn; ++n, _c += _scn, _d += _sdn, _e += _sen) {
        ddouble *c = (ddouble *)_c, *d = (ddouble *)_d, *e = (ddouble *)_e;
        const npy_intp
            sci = _sci / sizeof(ddouble), scj = _scj / sizeof(ddouble),
            sdi = _sdi / sizeof(ddouble), sdj = _sdj / sizeof(ddouble);

        *e = jacobi_sweep(c, sci, scj, d, sdi, sdj, ii, jj);
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: Givens rotation for a 2-vector (f, g).
 * Outputs the rotated vector (r, 0) and the 2x2 rotation matrix
 * [[c, s], [-s, c]]. */
static void u_givensw(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;2)->(n;2),(n;2,2)
    const npy_intp nn = dims[0];
    const npy_intp san = steps[0], sbn = steps[1], scn = steps[2],
                   sai = steps[3], sbi = steps[4], sci = steps[5],
                   scj = steps[6];
    char *_a = args[0], *_b = args[1], *_c = args[2];

    for (npy_intp n = 0; n != nn; ++n, _a += san, _b += sbn, _c += scn) {
        ddouble f = *(ddouble *) _a;
        ddouble g = *(ddouble *) (_a + sai);

        ddouble c, s, r;
        givensw(f, g, &c, &s, &r);

        *(ddouble *)_b = r;
        *(ddouble *)(_b + sbi) = Q_ZERO;
        *(ddouble *)_c = c;
        *(ddouble *)(_c + scj) = s;
        *(ddouble *)(_c + sci) = negw(s);
        *(ddouble *)(_c + sci + scj) = c;
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: apply a sequence of Givens rotations, stored as
 * (cos, sin) pairs per row of a, to consecutive row pairs of a matrix. */
static void u_givens_seqq(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i,2),(n;i,j)->(n;i,j)
    const npy_intp nn = dims[0], ii = dims[1], jj = dims[3];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sai = steps[3], _saq = steps[4], _sbi = steps[5],
                   _sbj = steps[6], _sci = steps[7], _scj = steps[8];
    char *_a = args[0], *_b = args[1], *_c = args[2];

    ensure_inplace_3(_b, _c, nn, _sbn, _scn, ii, _sbi, _sci, jj, _sbj, _scj);
    for (npy_intp n = 0; n != nn; ++n, _a += _san, _c += _scn) {
        /* The rotations along i are interdependent (each touches rows i
         * and i+1), so we parallelize over the independent columns j
         * instead.
         */
        #pragma omp parallel for
        for (npy_intp j = 0; j < jj; ++j) {
            for (npy_intp i = 0; i < ii - 1; ++i) {
                ddouble *c_x = (ddouble *)(_c + i *_sci + j * _scj);
                ddouble *c_y = (ddouble *)(_c + (i + 1) *_sci + j * _scj);
                ddouble g_cos = *(ddouble *)(_a + i * _sai);
                ddouble g_sin = *(ddouble *)(_a + i * _sai + _saq);
                lmul_givensq(c_x, c_y, g_cos, g_sin, *c_x, *c_y);
            }
        }
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: Golub-Kahan chase on a bidiagonal matrix given
 * by its diagonal and superdiagonal; also records the applied rotations
 * in a contiguous (i, 4) buffer. */
static void u_golub_kahan_chaseq(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;i),(n;i)->(n;i),(n;i),(n;i,4)
    const npy_intp nn = dims[0], ii = dims[1];
    const npy_intp _san = steps[0], _sbn = steps[1], _scn = steps[2],
                   _sdn = steps[3], _sen = steps[4], _sai = steps[5],
                   _sbi = steps[6], _sci = steps[7], _sdi = steps[8],
                   _sei = steps[9], _se4 = steps[10];
    char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3],
         *_e = args[4];

    ensure_inplace_2(_a, _c, nn, _san, _scn, ii, _sai, _sci);
    ensure_inplace_2(_b, _d, nn, _sbn, _sdn, ii, _sbi, _sdi);
    /* golub_kahan_chaseq writes the rotation buffer assuming C-contiguous
     * (i, 4) layout; bail out (leaving outputs as the copied inputs)
     * rather than scribble over memory with the wrong strides. */
    if (_se4 != sizeof(ddouble) || _sei != 4 * sizeof(ddouble)) {
        fprintf(stderr, "rot is not contiguous, but needs to be");
        return;
    }

    for (npy_intp n = 0; n != nn; ++n, _c += _scn, _d += _sdn, _e += _sen) {
        golub_kahan_chaseq((ddouble *)_c, _sci / sizeof(ddouble),
                           (ddouble *)_d, _sdi / sizeof(ddouble),
                           ii, (ddouble *)_e);
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: full SVD of a 2x2 (upper triangular) problem,
 * A = U diag(smax, smin) V^T; b receives U, c the singular values,
 * d receives V^T. */
static void u_svd_2x2(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;2,2)->(n;2,2),(n;2),(n;2,2)
    const npy_intp nn = dims[0];
    const npy_intp san = steps[0], sbn = steps[1], scn = steps[2],
                   sdn = steps[3], sai = steps[4], saj = steps[5],
                   sbi = steps[6], sbj = steps[7], sci = steps[8],
                   sdi = steps[9], sdj = steps[10];
    char *_a = args[0], *_b = args[1], *_c = args[2], *_d = args[3];

    for (npy_intp n = 0; n != nn;
                    ++n, _a += san, _b += sbn, _c += scn, _d += sdn) {
        ddouble a11 = *(ddouble *) _a;
        ddouble a12 = *(ddouble *) (_a + saj);
        ddouble a21 = *(ddouble *) (_a + sai);
        ddouble a22 = *(ddouble *) (_a + sai + saj);

        ddouble smin, smax, cu, su, cv, sv;
        svd_2x2(a11, a12, a21, a22, &smin, &smax, &cv, &sv, &cu, &su);

        /* U = [[cu, -su], [su, cu]] */
        *(ddouble *)_b = cu;
        *(ddouble *)(_b + sbj) = negw(su);
        *(ddouble *)(_b + sbi) = su;
        *(ddouble *)(_b + sbi + sbj) = cu;

        /* singular values, largest first */
        *(ddouble *)_c = smax;
        *(ddouble *)(_c + sci) = smin;

        /* V^T = [[cv, sv], [-sv, cv]] */
        *(ddouble *)_d = cv;
        *(ddouble *)(_d + sdj) = sv;
        *(ddouble *)(_d + sdi) = negw(sv);
        *(ddouble *)(_d + sdi + sdj) = cv;
    }
    MARK_UNUSED(data);
}

/* Generalized ufunc loop: singular values only of a 2x2 problem. */
static void u_svvals_2x2(
    char **args, const npy_intp *dims, const npy_intp* steps, void *data)
{
    // signature (n;2,2)->(n;2)
    const npy_intp nn = dims[0];
    const npy_intp san = steps[0], sbn = steps[1], sai = steps[2],
                   saj = steps[3], sbi = steps[4];
    char *_a = args[0], *_b = args[1];

    for (npy_intp n = 0; n != nn; ++n, _a += san, _b += sbn) {
        ddouble a11 = *(ddouble *) _a;
        ddouble a12 = *(ddouble *) (_a + saj);
        ddouble a21 = *(ddouble *) (_a + sai);
        ddouble a22 = *(ddouble *) (_a + sai + saj);

        ddouble smin, smax;
        svd_2x2(a11, a12, a21, a22, &smin, &smax, NULL, NULL, NULL, NULL);

        *(ddouble *)_b = smax;
        *(ddouble *)(_b + sbi) = smin;
    }
    MARK_UNUSED(data);
}

/* ----------------------- Python stuff -------------------------- */

static PyObject *module;
static PyObject *numpy_module = NULL;
/* NumPy type number of the ddouble dtype, filled by import_ddouble_dtype */
static int type_num;

static PyObject *make_module()
{
    static PyMethodDef
no_methods[] = { 331 | {NULL, NULL, 0, NULL} // No methods defined 332 | }; 333 | static struct PyModuleDef module_def = { 334 | PyModuleDef_HEAD_INIT, 335 | "_dd_linalg", 336 | NULL, 337 | -1, 338 | no_methods, 339 | NULL, 340 | NULL, 341 | NULL, 342 | NULL 343 | }; 344 | module = PyModule_Create(&module_def); 345 | return module; 346 | } 347 | 348 | static int import_ddouble_dtype() 349 | { 350 | PyObject *dd_module = PyImport_ImportModule("xprec._dd_ufunc"); 351 | if (dd_module == NULL) 352 | return -1; 353 | 354 | PyArray_Descr *dtype = 355 | (PyArray_Descr *)PyObject_GetAttrString(dd_module, "dtype"); 356 | if (dtype == NULL) 357 | return -1; 358 | 359 | /* Let's pray at least this stays public */ 360 | type_num = dtype->type_num; 361 | return 0; 362 | } 363 | 364 | static int gufunc( 365 | PyUFuncGenericFunction uloop, int nin, int nout, 366 | const char *signature, const char *name, const char *docstring, 367 | bool in_numpy) 368 | { 369 | PyUFuncObject *ufunc = NULL; 370 | int *arg_types = NULL, retcode = 0; 371 | 372 | if (in_numpy) { 373 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 374 | } else { 375 | ufunc = (PyUFuncObject *)PyUFunc_FromFuncAndDataAndSignature( 376 | NULL, NULL, NULL, 0, nin, nout, PyUFunc_None, name, 377 | docstring, 0, signature); 378 | } 379 | if (ufunc == NULL) goto error; 380 | 381 | int *dtypes = PyMem_New(int, nin + nout); 382 | if (dtypes == NULL) goto error; 383 | 384 | for (int i = 0; i != nin + nout; ++i) 385 | dtypes[i] = type_num; 386 | 387 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 388 | uloop, arg_types, NULL); 389 | if (retcode < 0) goto error; 390 | 391 | return PyModule_AddObject(module, name, (PyObject *)ufunc); 392 | 393 | error: 394 | if (!in_numpy) 395 | Py_XDECREF(ufunc); 396 | PyMem_Free(arg_types); 397 | return -1; 398 | } 399 | 400 | PyMODINIT_FUNC PyInit__dd_linalg(void) 401 | { 402 | if (!make_module()) 403 | return NULL; 404 | 405 | /* Initialize numpy things */ 406 | 
import_array(); 407 | import_umath(); 408 | 409 | numpy_module = PyImport_ImportModule("numpy"); 410 | if (numpy_module == NULL) 411 | return NULL; 412 | 413 | if (import_ddouble_dtype() < 0) 414 | return NULL; 415 | 416 | gufunc(u_normw, 1, 1, "(i)->()", 417 | "norm", "Vector 2-norm", false); 418 | gufunc(u_matmulw, 2, 1, "(i?,j),(j,k?)->(i?,k?)", 419 | "matmul", "Matrix multiplication", true); 420 | gufunc(u_givensw, 1, 2, "(2)->(2),(2,2)", 421 | "givens", "Generate Givens rotation", false); 422 | gufunc(u_givens_seqq, 2, 1, "(i,2),(i,j?)->(i,j?)", 423 | "givens_seq", "apply sequence of givens rotation to matrix", false); 424 | gufunc(u_householderw, 1, 2, "(i)->(),(i)", 425 | "householder", "Generate Householder reflectors", false); 426 | gufunc(u_rank1updateq, 3, 1, "(i,j),(i),(j)->(i,j)", 427 | "rank1update", "Perform rank-1 update of matrix", false); 428 | gufunc(u_svd_2x2, 1, 3, "(2,2)->(2,2),(2),(2,2)", 429 | "svd2x2", "SVD of upper triangular 2x2 problem", false); 430 | gufunc(u_svvals_2x2, 1, 1, "(2,2)->(2)", 431 | "svvals2x2", "singular values of upper triangular 2x2 problem", false); 432 | gufunc(u_jacobisweepw, 2, 3, "(i,j),(j,j)->(i,j),(j,j),()", 433 | "jacobi_sweep", "Perform sweep of one-sided Jacobi rotations", false); 434 | gufunc(u_golub_kahan_chaseq, 2, 3, "(i),(i)->(i),(i),(i,4)", 435 | "golub_kahan_chase", "bidiagonal chase procedure", false); 436 | 437 | /* Make dtype */ 438 | PyArray_Descr *dtype = PyArray_DescrFromType(NPY_CDOUBLE); 439 | PyModule_AddObject(module, "dtype", (PyObject *)dtype); 440 | 441 | /* Module is ready */ 442 | return module; 443 | } 444 | -------------------------------------------------------------------------------- /csrc/dd_arith.c: -------------------------------------------------------------------------------- 1 | /* Double-double arithmetic library 2 | * 3 | * Part of the functions are copied from the QD library for U.C. 
Berkeley 4 | * and licensed modified BSD (see QD-LICENSE.txt) 5 | * 6 | * Copyright (C) 2012 Yozo Hida, Xiaoye S. Li, David H. Bailey 7 | * Copyright (C) 2021 Markus Wallerberger and others 8 | * SPDX-License-Identifier: MIT and Modified-BSD 9 | */ 10 | #include "./dd_arith.h" 11 | #include 12 | 13 | // 2**500 and 2**(-500); 14 | static const double LARGE = 3.273390607896142e+150; 15 | static const double INV_LARGE = 3.054936363499605e-151; 16 | 17 | static ddouble hypotqq_compute(ddouble x, ddouble y) 18 | { 19 | return sqrtw(addww(sqrw(x), sqrw(y))); 20 | } 21 | 22 | ddouble _hypotqq_ordered(ddouble x, ddouble y) 23 | { 24 | // assume that x >= y >= 0 25 | // special cases 26 | if (iszerow(y)) 27 | return x; 28 | 29 | // if very large or very small, renormalize 30 | if (x.hi > LARGE) { 31 | x = mul_pwr2(x, INV_LARGE); 32 | y = mul_pwr2(y, INV_LARGE); 33 | return mul_pwr2(hypotqq_compute(x, y), LARGE); 34 | } 35 | if (x.hi < INV_LARGE) { 36 | x = mul_pwr2(x, LARGE); 37 | y = mul_pwr2(y, LARGE); 38 | return mul_pwr2(hypotqq_compute(x, y), INV_LARGE); 39 | } 40 | 41 | // normal case 42 | return hypotqq_compute(x, y); 43 | } 44 | 45 | ddouble sqrtw(ddouble a) 46 | { 47 | /* Given approximation x to 1/sqrt(a), perform a single Newton step: 48 | * 49 | * sqrt(a) = a*x + [a - (a*x)^2] * x / 2 (approx) 50 | * 51 | * The approximation is accurate to twice the accuracy of x. 52 | * Also, the multiplication (a*x) and [-]*x can be done with 53 | * only half the precision. 54 | * From: Karp, High Precision Division and Square Root, 1993 55 | */ 56 | if (a.hi <= 0) 57 | return (ddouble){sqrt(a.hi), 0}; 58 | 59 | double x = 1.0 / sqrt(a.hi); 60 | double ax = a.hi * x; 61 | ddouble ax_sqr = sqrw((ddouble){ax, 0}); 62 | double diff = subww(a, ax_sqr).hi * x * 0.5; 63 | return two_sum(ax, diff); 64 | } 65 | 66 | /* Inverse Factorials from 1/0!, 1/1!, 1/2!, asf. 
 */
static int _n_inv_fact = 18;
static const ddouble _inv_fact[] = {
    {1.00000000000000000e+00, 0.00000000000000000e+00},
    {1.00000000000000000e+00, 0.00000000000000000e+00},
    {5.00000000000000000e-01, 0.00000000000000000e+00},
    {1.66666666666666657e-01, 9.25185853854297066e-18},
    {4.16666666666666644e-02, 2.31296463463574266e-18},
    {8.33333333333333322e-03, 1.15648231731787138e-19},
    {1.38888888888888894e-03, -5.30054395437357706e-20},
    {1.98412698412698413e-04, 1.72095582934207053e-22},
    {2.48015873015873016e-05, 2.15119478667758816e-23},
    {2.75573192239858925e-06, -1.85839327404647208e-22},
    {2.75573192239858883e-07, 2.37677146222502973e-23},
    {2.50521083854417202e-08, -1.44881407093591197e-24},
    {2.08767569878681002e-09, -1.20734505911325997e-25},
    {1.60590438368216133e-10, 1.25852945887520981e-26},
    {1.14707455977297245e-11, 2.06555127528307454e-28},
    {7.64716373181981641e-13, 7.03872877733453001e-30},
    {4.77947733238738525e-14, 4.39920548583408126e-31},
    {2.81145725434552060e-15, 1.65088427308614326e-31}
};

/**
 * For the exponential of `a`, return compute tuple `x, m` such that:
 *
 *      exp(a) = ldexp(1 + x, m),
 *
 * where `m` is chosen such that `abs(x) < 1`.  The value `x` is returned,
 * whereas the value `m` is given as an out parameter.
 */
static ddouble _exp_reduced(ddouble a, int *m)
{
    // Strategy: We first reduce the size of x by noting that
    //
    //     exp(k * r + m * log(2)) = 2^m * exp(r)^k
    //
    // where m and k are integers.  By choosing m appropriately
    // we can make |k * r| <= log(2) / 2 = 0.347.
    const double k = 512.0;
    const double inv_k = 1.0 / k;
    double mm = floor(a.hi / Q_LOG2.hi + 0.5);
    ddouble r = mul_pwr2(subww(a, mulwd(Q_LOG2, mm)), inv_k);
    *m = (int)mm;

    // Now, evaluate exp(r) using the Taylor series, since reducing
    // the argument substantially speeds up the convergence.  We omit
    // order 0 and start at order 1:
    ddouble rpower = r;
    ddouble term = r;
    ddouble sum = term;

    // Order 2
    rpower = sqrw(r);
    term = mul_pwr2(rpower, 0.5);
    sum = addww(sum, term);

    // Order 3 and up, stopping once a term no longer contributes
    for (int i = 3; i < 9; i++) {
        rpower = mulww(rpower, r);
        term = mulww(rpower, _inv_fact[i]);
        sum = addww(sum, term);
        if (fabs(term.hi) <= inv_k * Q_EPS.hi)
            break;
    }

    // We now have that approximately exp(r) == 1 + sum.  Raise that to
    // the k'th (512) power by squaring the binomial nine times; each step
    // maps s -> (1 + s)^2 - 1 = 2 s + s^2.
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    sum = addww(mul_pwr2(sum, 2.0), sqrw(sum));
    return sum;
}

/* Exponential.  Arguments beyond roughly +-709 (about log(DBL_MAX))
 * saturate to 0 or infinity. */
ddouble expw(ddouble a)
{
    if (a.hi <= -709.0)
        return Q_ZERO;
    if (a.hi >= 709.0)
        return infw();
    if (iszerow(a))
        return Q_ONE;
    if (isonew(a))
        return Q_E;

    int m;
    ddouble sum = _exp_reduced(a, &m);

    /** Add back the one and multiply by 2 to the m */
    sum = addwd(sum, 1.0);
    return ldexpw(sum, (int)m);
}

/* exp(a) - 1, accurate also for small a where exp would cancel. */
ddouble expm1w(ddouble a)
{
    if (a.hi <= -709.0)
        return (ddouble){-1.0, 0.0};
    if (a.hi >= 709.0)
        return infw();
    if (iszerow(a))
        return Q_ZERO;

    int m;
    ddouble sum = _exp_reduced(a, &m);

    /* Truncation case: simply return sum, which already is exp(a) - 1 */
    if (m == 0)
        return sum;

    /* Non-truncation case: compute full exp, then remove the one */
    sum = addwd(sum, 1.0);
    sum = ldexpw(sum, (int)m);
    return subwd(sum, 1.0);
}

/* Thin wrapper around ldexpw -- presumably kept as a separate symbol for
 * ufunc registration; verify against callers. */
ddouble ldexpwi(ddouble a, int exp)
{
    return ldexpw(a, exp);
}

ddouble logw(ddouble a)
{
    /* Strategy.  The Taylor series for log converges much more
     * slowly than that of exp, due to the lack of the factorial
     * term in the denominator.  Hence this routine instead tries
     * to determine the root of the function
     *
     *     f(x) = exp(x) - a
     *
     * using Newton iteration.  The iteration is given by
     *
     *     x' = x - f(x)/f'(x)
     *        = x - (1 - a * exp(-x))
     *        = x + a * exp(-x) - 1.
     *
     * Only one iteration is needed, since Newton's iteration
     * approximately doubles the number of digits per iteration.
     */
    if (isonew(a))
        return Q_ZERO;
    if (iszerow(a))
        return negw(infw());
    if (!ispositivew(a))
        return nanw();

    ddouble x = {log(a.hi), 0.0};   /* Initial approximation */
    x = subwd(addww(x, mulww(a, expw(negw(x)))), 1.0);
    return x;
}

/* Table of sin(k * pi/16) and cos(k * pi/16).
 */
static const ddouble _sin_table[] = {
    {1.950903220161282758e-01, -7.991079068461731263e-18},
    {3.826834323650897818e-01, -1.005077269646158761e-17},
    {5.555702330196021776e-01, 4.709410940561676821e-17},
    {7.071067811865475727e-01, -4.833646656726456726e-17}
};

static const ddouble _cos_table[] = {
    {9.807852804032304306e-01, 1.854693999782500573e-17},
    {9.238795325112867385e-01, 1.764504708433667706e-17},
    {8.314696123025452357e-01, 1.407385698472802389e-18},
    {7.071067811865475727e-01, -4.833646656726456726e-17}
};

/* Taylor series for sin, intended for pre-reduced arguments. */
static ddouble sin_taylor(ddouble a)
{
    // Use the Taylor series a - a^3/3! + a^5/5! + ...
    const double thresh = 0.5 * fabs(a.hi) * Q_EPS.hi;
    const ddouble minus_asquared = negw(sqrw(a));

    // First order:
    ddouble apow = a;
    ddouble term = a;
    ddouble sum = a;

    // Subsequent orders, stopping once a term no longer contributes:
    for (int i = 3; i < _n_inv_fact; i += 2) {
        apow = mulww(apow, minus_asquared);
        term = mulww(apow, _inv_fact[i]);
        sum = addww(sum, term);
        if (fabs(term.hi) <= thresh)
            break;
    }
    return sum;
}

/* Taylor series for cos, intended for pre-reduced arguments. */
static ddouble cos_taylor(ddouble a)
{
    // Use Taylor series 1 - x^2/2! + x^4/4! + ...
    const double thresh = 0.5 * Q_EPS.hi;
    const ddouble minus_asquared = negw(sqrw(a));

    // Zeroth and second order:
    ddouble apow = minus_asquared;
    ddouble term = mul_pwr2(apow, 0.5);
    ddouble sum = adddw(1.0, term);

    // From fourth order:
    for (int i = 4; i < _n_inv_fact; i += 2) {
        apow = mulww(apow, minus_asquared);
        term = mulww(apow, _inv_fact[i]);
        sum = addww(sum, term);
        if (fabs(term.hi) <= thresh)
            break;
    }
    return sum;
}

/* Computes sin and cos of a together; cos is recovered from sin via
 * sqrt(1 - sin^2), which is valid since callers pass a reduced argument
 * (|a| <= pi/32, where cos is positive). */
static void sincos_taylor(ddouble a, ddouble *sin_a, ddouble *cos_a)
{
    if (iszerow(a)) {
        *sin_a = Q_ZERO;
        *cos_a = Q_ONE;
    } else {
        *sin_a = sin_taylor(a);
        *cos_a = sqrtw(subdw(1.0, sqrw(*sin_a)));
    }
}

/**
 * To compute 2pi-periodic function, we reduce the argument `a` by
 * choosing integers z, -2 <= j <= 2 and -4 <= k <= 4 such that:
 *
 *      a == z * (2*pi) + j * (pi/2) + k * (pi/16) + t,
 *
 * where `abs(t) <= pi/32`.
 */
static ddouble mod_pi16(ddouble a, int *j, int *k)
{
    /* local copy of pi/16 (same value as Q_PI_16) */
    static const ddouble pi_16 =
        {1.963495408493620697e-01, 7.654042494670957545e-18};

    // approximately reduce modulo 2*pi
    ddouble z = roundw(divww(a, Q_2PI));
    ddouble r = subww(a, mulww(Q_2PI, z));

    // approximately reduce modulo pi/2
    double q = floor(r.hi / Q_PI_2.hi + 0.5);
    ddouble t = subww(r, mulwd(Q_PI_2, q));
    *j = (int)q;

    // approximately reduce modulo pi/16.
    q = floor(t.hi / pi_16.hi + 0.5);
    t = subww(t, mulwd(pi_16, q));
    *k = (int)q;
    return t;
}

ddouble sinw(ddouble a)
{
    /* Strategy.  To compute sin(x), we choose integers a, b so that
     *
     *     x = s + a * (pi/2) + b * (pi/16)
     *
     * and |s| <= pi/32.  Using the fact that
     *
     *     sin(pi/16) = 0.5 * sqrt(2 - sqrt(2 + sqrt(2)))
     *
     * we can compute sin(x) from sin(s), cos(s).  This greatly
     * increases the convergence of the sine Taylor series.
     */
    if (iszerow(a))
        return Q_ZERO;

    int j, k;
    ddouble t = mod_pi16(a, &j, &k);
    int abs_k = abs(k);

    /* mod_pi16 guarantees these ranges; out-of-range means the reduction
     * failed (e.g. non-finite input) */
    if (j < -2 || j > 2)
        return nanw();

    if (abs_k > 4)
        return nanw();

    if (k == 0) {
        switch (j)
        {
        case 0:
            return sin_taylor(t);
        case 1:
            return cos_taylor(t);
        case -1:
            return negw(cos_taylor(t));
        default:
            return negw(sin_taylor(t));
        }
    }

    /* combine via angle-addition formulas with tabulated
     * sin/cos(k * pi/16) */
    ddouble u = _cos_table[abs_k - 1];
    ddouble v = _sin_table[abs_k - 1];
    ddouble sin_x, cos_x, r;
    sincos_taylor(t, &sin_x, &cos_x);
    if (j == 0) {
        if (k > 0)
            r = addww(mulww(u, sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(u, sin_x), mulww(v, cos_x));
    } else if (j == 1) {
        if (k > 0)
            r = subww(mulww(u, cos_x), mulww(v, sin_x));
        else
            r = addww(mulww(u, cos_x), mulww(v, sin_x));
    } else if (j == -1) {
        if (k > 0)
            r = subww(mulww(v, sin_x), mulww(u, cos_x));
        else if (k < 0)  /* NOTE: k == 0 was handled above, so this
                          * else-if covers every remaining case and r is
                          * always initialized */
            r = subww(mulww(negw(u), cos_x), mulww(v, sin_x));
    } else {
        if (k > 0)
            r = subww(mulww(negw(u), sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(v, cos_x), mulww(u, sin_x));
    }
    return r;
}

/* Cosine; same reduction strategy as sinw above. */
ddouble cosw(ddouble a)
{
    if (iszerow(a))
        return Q_ONE;

    int j, k;
    ddouble t = mod_pi16(a, &j, &k);
    int abs_k = abs(k);

    if (j < -2 || j > 2)
        return nanw();

    if (abs_k > 4)
        return nanw();

    if (k == 0) {
        switch (j) {
        case 0:
            return cos_taylor(t);
        case 1:
            return negw(sin_taylor(t));
        case -1:
            return sin_taylor(t);
        default:
            return negw(cos_taylor(t));
        }
    }

    ddouble sin_x, cos_x, r;
    sincos_taylor(t, &sin_x, &cos_x);
    ddouble u = _cos_table[abs_k - 1];
    ddouble v = _sin_table[abs_k - 1];

    if (j == 0) {
        if (k > 0)
            r = subww(mulww(u, cos_x), mulww(v, sin_x));
        else
            r = addww(mulww(u, cos_x), mulww(v, sin_x));
    } else if (j == 1) {
        if (k > 0)
            r = subww(mulww(negw(u), sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(v, cos_x), mulww(u, sin_x));
    } else if (j == -1) {
        if (k > 0)
            r = addww(mulww(u, sin_x), mulww(v, cos_x));
        else
            r = subww(mulww(u, sin_x), mulww(v, cos_x));
    } else {
        if (k > 0)
            r = subww(mulww(v, sin_x), mulww(u, cos_x));
        else
            r = subww(mulww(negw(u), cos_x), mulww(v, sin_x));
    }
    return r;
}

/* Hyperbolic sine: (exp(a) - exp(-a)) / 2 for moderate arguments. */
ddouble sinhw(ddouble a)
{
    if (iszerow(a))
        return Q_ZERO;

    if (absw(a).hi > 0.05) {
        ddouble ea = expw(a);
        if (isinfw(ea))
            return ea;
        if (iszerow(ea))
            return negw(infw());
        return mul_pwr2(subww(ea, reciprocalw(ea)), 0.5);
    }

    // When a is small, using the above formula gives a lot of cancellation.
461 | // Use Taylor series: x + x^3/3! + x^5/5! + ... 462 | const ddouble asquared = sqrw(a); 463 | const double thresh = fabs(a.hi) * Q_EPS.hi; 464 | 465 | // First order: 466 | ddouble apower = a; 467 | ddouble sum = a; 468 | ddouble term = a; 469 | 470 | // From third order: 471 | for (int i = 3; i < _n_inv_fact; i += 2) { 472 | apower = mulww(apower, asquared); 473 | term = mulww(apower, _inv_fact[i]); 474 | sum = addww(sum, term); 475 | if (fabs(term.hi) <= thresh) 476 | break; 477 | } 478 | return sum; 479 | } 480 | 481 | ddouble coshw(ddouble a) 482 | { 483 | if (iszerow(a)) 484 | return Q_ONE; 485 | 486 | ddouble ea = expw(a); 487 | if (isinfw(ea) || iszerow(ea)) 488 | return infw(); 489 | return mul_pwr2(addww(ea, reciprocalw(ea)), 0.5); 490 | } 491 | 492 | ddouble tanhw(ddouble a) 493 | { 494 | if (iszerow(a)) 495 | return Q_ZERO; 496 | 497 | if (fabs(a.hi) > 0.05) { 498 | ddouble ea = expw(a); 499 | ddouble inv_ea = reciprocalw(ea); 500 | return divww(subww(ea, inv_ea), addww(ea, inv_ea)); 501 | } 502 | 503 | ddouble s, c; 504 | s = sinhw(a); 505 | c = sqrtw(adddw(1.0, sqrw(s))); 506 | return divww(s, c); 507 | } 508 | 509 | ddouble tanw(ddouble a) 510 | { 511 | if (iszerow(a)) 512 | return Q_ZERO; 513 | 514 | ddouble s, c; 515 | s = sinw(a); 516 | c = cosw(a); 517 | return divww(s, c); 518 | } 519 | 520 | void sincosw(const ddouble a, ddouble *sin_a, ddouble *cos_a) 521 | { 522 | if (iszerow(a)) { 523 | *sin_a = Q_ZERO; 524 | *cos_a = Q_ONE; 525 | return; 526 | } 527 | 528 | int j, k; 529 | ddouble t = mod_pi16(a, &j, &k); 530 | int abs_j = abs(j), abs_k = abs(k); 531 | 532 | if (abs_j > 2 || abs_k > 4) { 533 | *cos_a = *sin_a = nanw(); 534 | return; 535 | } 536 | 537 | ddouble sin_t, cos_t; 538 | ddouble s, c; 539 | 540 | sincos_taylor(t, &sin_t, &cos_t); 541 | 542 | if (abs_k == 0) { 543 | s = sin_t; 544 | c = cos_t; 545 | } else { 546 | ddouble u = _cos_table[abs_k - 1]; 547 | ddouble v = _sin_table[abs_k - 1]; 548 | 549 | if (k > 0) { 550 | s = 
addww(mulww(u, sin_t), mulww(v, cos_t)); 551 | c = subww(mulww(u, cos_t), mulww(v, sin_t)); 552 | } else { 553 | s = subww(mulww(u, sin_t), mulww(v, cos_t)); 554 | c = addww(mulww(u, cos_t), mulww(v, sin_t)); 555 | } 556 | } 557 | if (abs_j == 0) { 558 | *sin_a = s; 559 | *cos_a = c; 560 | } else if (j == 1) { 561 | *sin_a = c; 562 | *cos_a = negw(s); 563 | } else if (j == -1) { 564 | *sin_a = negw(c); 565 | *cos_a = s; 566 | } else { 567 | *sin_a = negw(s); 568 | *cos_a = negw(c); 569 | } 570 | 571 | } 572 | 573 | ddouble atan2ww(ddouble y, ddouble x) 574 | { 575 | /* Strategy: Instead of using Taylor series to compute 576 | * arctan, we instead use Newton's iteration to solve 577 | * the equation 578 | * 579 | * sin(z) = y/r or cos(z) = x/r 580 | * 581 | * where r = sqrt(x^2 + y^2). 582 | * The iteration is given by 583 | * 584 | * z' = z + (y - sin(z)) / cos(z) (for equation 1) 585 | * z' = z - (x - cos(z)) / sin(z) (for equation 2) 586 | * 587 | * Here, x and y are normalized so that x^2 + y^2 = 1. 588 | * If |x| > |y|, then first iteration is used since the 589 | * denominator is larger. Otherwise, the second is used. 590 | */ 591 | if (iszerow(x) && iszerow(y)) 592 | return Q_ZERO; 593 | if (iszerow(x)) 594 | return (ispositivew(y)) ? Q_PI_2 : negw(Q_PI_2); 595 | if (iszerow(y)) 596 | return (ispositivew(x)) ? Q_ZERO : Q_PI; 597 | if (equalww(x, y)) 598 | return (ispositivew(y)) ? Q_PI_4: negw(Q_3PI_4); 599 | if (equalww(x, negw(y))) 600 | return (ispositivew(y)) ? Q_3PI_4 : negw(Q_PI_4); 601 | 602 | ddouble r = hypotww(x, y); 603 | x = divww(x, r); 604 | y = divww(y, r); 605 | 606 | /* Compute double precision approximation to atan. */ 607 | ddouble z = (ddouble){atan2(y.hi, x.hi), 0.}; 608 | ddouble sin_z, cos_z; 609 | 610 | sincosw(z, &sin_z, &cos_z); 611 | if (fabs(x.hi) > fabs(y.hi)) { 612 | /* Use Newton iteration 1. z' = z + (y - sin(z)) / cos(z) */ 613 | z = addww(z, divww(subww(y, sin_z), cos_z)); 614 | } else { 615 | /* Use Newton iteration 2. 
z' = z - (x - cos(z)) / sin(z) */ 616 | z = subww(z, divww(subww(x, cos_z), sin_z)); 617 | } 618 | return z; 619 | } 620 | 621 | ddouble atan2dw(const double a, const ddouble b) 622 | { 623 | return atan2ww((ddouble){a, 0.}, b); 624 | } 625 | 626 | ddouble atan2wd(const ddouble a, const double b) 627 | { 628 | return atan2ww(a, (ddouble){b, 0.}); 629 | } 630 | 631 | ddouble atanw(const ddouble a) 632 | { 633 | return atan2ww(a, Q_ONE); 634 | } 635 | 636 | ddouble acosw(const ddouble a) 637 | { 638 | ddouble abs_a = absw(a); 639 | if (greaterww(abs_a, Q_ONE)) 640 | return nanw(); 641 | if (isonew(abs_a)) 642 | return (ispositivew(a)) ? Q_ZERO : Q_PI; 643 | 644 | return atan2ww(sqrtw(subdw(1.0, sqrw(a))), a); 645 | } 646 | 647 | ddouble asinw(const ddouble a) 648 | { 649 | ddouble abs_a = absw(a); 650 | if (greaterwd(abs_a, 1.0)) 651 | return nanw(); 652 | if (isonew(abs_a)) 653 | return (ispositivew(a)) ? Q_PI_2 : negw(Q_PI_2); 654 | 655 | return atan2ww(a, sqrtw(subdw(1.0, sqrw(a)))); 656 | } 657 | 658 | ddouble asinhw(const ddouble a) 659 | { 660 | return logw(addww(a,sqrtw(addwd(sqrw(a),1.0)))); 661 | } 662 | 663 | ddouble acoshw(const ddouble a) 664 | { 665 | if (lesswd(a, 1.0)) 666 | return nanw(); 667 | 668 | return logw(addww(a, sqrtw(subwd(sqrw(a), 1.0)))); 669 | } 670 | 671 | ddouble atanhw(const ddouble a) 672 | { 673 | if (equalwd(a, -1.0)) 674 | return negw(infw()); 675 | if (isonew(a)) 676 | return infw(); 677 | if (greaterwd(absw(a), 1.0)) 678 | return nanw(); 679 | 680 | return mul_pwr2(logw(divww(adddw(1.0, a) , subdw(1.0, a))), 0.5); 681 | } 682 | 683 | ddouble powww(const ddouble a, const ddouble b) 684 | { 685 | if (iszerow(a) && iszerow(b)) 686 | return Q_ONE; 687 | if (iszerow(a) && !iszerow(b)) 688 | return Q_ZERO; 689 | 690 | return expw(mulww(b, logw(a))); 691 | } 692 | 693 | ddouble powwd(const ddouble a, const double b) 694 | { 695 | if (iszerow(a) && b == 0) 696 | return Q_ONE; 697 | if (iszerow(a) && b != 0) 698 | return Q_ZERO; 699 | 700 
| return expw(muldw(b, logw(a))); 701 | } 702 | 703 | ddouble powdw(const double a, const ddouble b) 704 | { 705 | if (a == 0 && iszerow(b)) 706 | return Q_ONE; 707 | if (a == 0 && !iszerow(b)) 708 | return Q_ZERO; 709 | 710 | return expw(mulwd(b, log(a))); 711 | } 712 | 713 | ddouble modfww(const ddouble a, ddouble *b) 714 | { 715 | if (isnegativew(a)) { 716 | *b = ceilw(a); 717 | } else { 718 | *b = floorw(a); 719 | } 720 | return subww(a, *b); 721 | } 722 | -------------------------------------------------------------------------------- /csrc/_dd_ufunc.c: -------------------------------------------------------------------------------- 1 | /* Python extension module for the ddouble data type. 2 | * 3 | * Code is adapted from tensorflow's bfloat16 extension type, found here: 4 | * `tensorflow/python/lib/core/bfloat16.cc` and licensed Apache 2.0. 5 | * 6 | * Copyright (C) 2021 Markus Wallerberger and others 7 | * SPDX-License-Identifier: MIT 8 | */ 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "dd_arith.h" 17 | 18 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 19 | #include "numpy/ndarraytypes.h" 20 | #include "numpy/ufuncobject.h" 21 | #include "numpy/npy_3kcompat.h" 22 | 23 | #if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_TYPE) 24 | static inline void _Py_SET_TYPE(PyObject *ob, PyTypeObject *type) 25 | { ob->ob_type = type; } 26 | #define Py_SET_TYPE(ob, type) _Py_SET_TYPE((PyObject*)(ob), type) 27 | #endif 28 | 29 | /** 30 | * Allows parameter to be marked unused 31 | */ 32 | #define MARK_UNUSED(x) do { (void)(x); } while(false) 33 | 34 | #ifdef _MSC_VER 35 | #define alignof __alignof 36 | #endif 37 | 38 | /* ------------------------ DDouble object ----------------------- */ 39 | 40 | static PyObject *module = NULL; 41 | static PyObject *numpy_module = NULL; 42 | static int type_num = -1; //FIXME 43 | 44 | static PyTypeObject *pyddouble_type = NULL; 45 | static PyObject *pyddouble_finfo = NULL; 46 | 47 | 
typedef struct { 48 | PyObject_HEAD 49 | ddouble x; 50 | } PyDDouble; 51 | 52 | static bool PyDDouble_Check(PyObject* object) 53 | { 54 | return PyObject_IsInstance(object, (PyObject *)pyddouble_type); 55 | } 56 | 57 | static PyObject *PyDDouble_Wrap(ddouble x) 58 | { 59 | PyDDouble *obj = (PyDDouble *) pyddouble_type->tp_alloc(pyddouble_type, 0); 60 | if (obj != NULL) 61 | obj->x = x; 62 | return (PyObject *)obj; 63 | } 64 | 65 | static ddouble PyDDouble_Unwrap(PyObject *arg) 66 | { 67 | return ((PyDDouble *)arg)->x; 68 | } 69 | 70 | static bool PyDDouble_Cast(PyObject *arg, ddouble *out) 71 | { 72 | if (PyDDouble_Check(arg)) { 73 | *out = PyDDouble_Unwrap(arg); 74 | } else if (PyFloat_Check(arg)) { 75 | double val = PyFloat_AsDouble(arg); 76 | *out = (ddouble) {val, 0.0}; 77 | } else if (PyLong_Check(arg)) { 78 | long val = PyLong_AsLong(arg); 79 | *out = (ddouble) {val, 0.0}; 80 | } else if (PyArray_IsScalar(arg, Float)) { 81 | float val; 82 | PyArray_ScalarAsCtype(arg, &val); 83 | *out = (ddouble) {val, 0.0}; 84 | } else if (PyArray_IsScalar(arg, Double)) { 85 | double val; 86 | PyArray_ScalarAsCtype(arg, &val); 87 | *out = (ddouble) {val, 0.0}; 88 | } else if (PyArray_IsZeroDim(arg)) { 89 | PyArrayObject* arr = (PyArrayObject *)arg; 90 | if (PyArray_TYPE(arr) == type_num) { 91 | *out = *(ddouble *)PyArray_DATA(arr); 92 | } else { 93 | arr = (PyArrayObject *)PyArray_Cast(arr, type_num); 94 | if (!PyErr_Occurred()) 95 | *out = *(ddouble *)PyArray_DATA(arr); 96 | else 97 | *out = nanw(); 98 | Py_XDECREF(arr); 99 | } 100 | } else { 101 | *out = nanw(); 102 | PyErr_Format(PyExc_TypeError, 103 | "Cannot cast instance of %s to ddouble scalar", 104 | arg->ob_type->tp_name); 105 | } 106 | return !PyErr_Occurred(); 107 | } 108 | 109 | static PyObject* PyDDouble_New(PyTypeObject *type, PyObject *args, PyObject *kwds) 110 | { 111 | PyObject *arg = NULL; 112 | if (PyArg_ParseTuple(args, "O", &arg) < 0) 113 | return NULL; 114 | 115 | ddouble val; 116 | if 
(PyDDouble_Check(arg)) { 117 | Py_INCREF(arg); 118 | return arg; 119 | } else if (PyDDouble_Cast(arg, &val)) { 120 | return PyDDouble_Wrap(val); 121 | } else { 122 | PyErr_Format(PyExc_TypeError, "expected ddouble, got %s", 123 | arg->ob_type->tp_name); 124 | return NULL; 125 | } 126 | MARK_UNUSED(type); 127 | MARK_UNUSED(kwds); 128 | } 129 | 130 | static PyObject* PyDDouble_Float(PyObject* self) 131 | { 132 | ddouble x = PyDDouble_Unwrap(self); 133 | return PyFloat_FromDouble(x.hi); 134 | } 135 | 136 | static PyObject* PyDDouble_Int(PyObject* self) 137 | { 138 | ddouble x = PyDDouble_Unwrap(self); 139 | return PyFloat_FromDouble((long) x.hi); 140 | } 141 | 142 | #define PYWRAP_UNARY(name, inner) \ 143 | static PyObject* name(PyObject* _x) \ 144 | { \ 145 | ddouble r, x; \ 146 | x = PyDDouble_Unwrap(_x); \ 147 | r = inner(x); \ 148 | return PyDDouble_Wrap(r); \ 149 | } 150 | 151 | #define PYWRAP_BINARY(name, inner, tp_inner_op) \ 152 | static PyObject* name(PyObject* _x, PyObject* _y) \ 153 | { \ 154 | ddouble r, x, y; \ 155 | if (PyArray_Check(_y)) \ 156 | return PyArray_Type.tp_as_number->tp_inner_op(_x, _y); \ 157 | if (PyDDouble_Cast(_x, &x) && PyDDouble_Cast(_y, &y)) { \ 158 | r = inner(x, y); \ 159 | return PyDDouble_Wrap(r); \ 160 | } \ 161 | return NULL; \ 162 | } 163 | 164 | #define PYWRAP_INPLACE(name, inner) \ 165 | static PyObject* name(PyObject* _self, PyObject* _y) \ 166 | { \ 167 | PyDDouble *self = (PyDDouble *)_self; \ 168 | ddouble y; \ 169 | if (PyDDouble_Cast(_y, &y)) { \ 170 | self->x = inner(self->x, y); \ 171 | Py_XINCREF(_self); \ 172 | return _self; \ 173 | } else { \ 174 | return NULL; \ 175 | } \ 176 | } 177 | 178 | PYWRAP_UNARY(PyDDouble_Positive, posw) 179 | PYWRAP_UNARY(PyDDouble_Negative, negw) 180 | PYWRAP_UNARY(PyDDouble_Absolute, absw) 181 | 182 | PYWRAP_BINARY(PyDDouble_Add, addww, nb_add) 183 | PYWRAP_BINARY(PyDDouble_Subtract, subww, nb_subtract) 184 | PYWRAP_BINARY(PyDDouble_Multiply, mulww, nb_multiply) 185 | 
PYWRAP_BINARY(PyDDouble_Divide, divww, nb_true_divide) 186 | 187 | PYWRAP_INPLACE(PyDDouble_InPlaceAdd, addww) 188 | PYWRAP_INPLACE(PyDDouble_InPlaceSubtract, subww) 189 | PYWRAP_INPLACE(PyDDouble_InPlaceMultiply, mulww) 190 | PYWRAP_INPLACE(PyDDouble_InPlaceDivide, divww) 191 | 192 | static int PyDDouble_Nonzero(PyObject* _x) 193 | { 194 | ddouble x = PyDDouble_Unwrap(_x); 195 | return !(x.hi == 0); 196 | } 197 | 198 | static PyObject* PyDDouble_RichCompare(PyObject* _x, PyObject* _y, int op) 199 | { 200 | ddouble x, y; 201 | if (!PyDDouble_Cast(_x, &x) || !PyDDouble_Cast(_y, &y)) 202 | return PyGenericArrType_Type.tp_richcompare(_x, _y, op); 203 | 204 | bool result; 205 | switch (op) { 206 | case Py_LT: 207 | result = lessww(x, y); 208 | break; 209 | case Py_LE: 210 | result = lessequalww(x, y); 211 | break; 212 | case Py_EQ: 213 | result = equalww(x, y); 214 | break; 215 | case Py_NE: 216 | result = notequalww(x, y); 217 | break; 218 | case Py_GT: 219 | result = greaterww(x, y); 220 | break; 221 | case Py_GE: 222 | result = greaterequalww(x, y); 223 | break; 224 | default: 225 | PyErr_SetString(PyExc_RuntimeError, "Invalid op type"); 226 | return NULL; 227 | } 228 | return PyBool_FromLong(result); 229 | } 230 | 231 | static Py_hash_t PyDDouble_Hash(PyObject *_x) 232 | { 233 | ddouble x = PyDDouble_Unwrap(_x); 234 | 235 | int exp; 236 | double mantissa; 237 | mantissa = frexp(x.hi, &exp); 238 | return (Py_hash_t)(LONG_MAX * mantissa) + exp; 239 | } 240 | 241 | static PyObject *PyDDouble_Str(PyObject *self) 242 | { 243 | char out[200]; 244 | ddouble x = PyDDouble_Unwrap(self); 245 | snprintf(out, 200, "%.16g", x.hi); 246 | return PyUnicode_FromString(out); 247 | } 248 | 249 | static PyObject *PyDDouble_Repr(PyObject *self) 250 | { 251 | char out[200]; 252 | ddouble x = PyDDouble_Unwrap(self); 253 | snprintf(out, 200, "ddouble(%.16g+%.16g)", x.hi, x.lo); 254 | return PyUnicode_FromString(out); 255 | } 256 | 257 | static PyObject *PyDDoubleGetFinfo(PyObject *self, 
PyObject *_dummy) 258 | { 259 | Py_INCREF(pyddouble_finfo); 260 | return pyddouble_finfo; 261 | MARK_UNUSED(self); 262 | MARK_UNUSED(_dummy); 263 | } 264 | 265 | static int make_ddouble_type() 266 | { 267 | static PyNumberMethods ddouble_as_number = { 268 | .nb_add = PyDDouble_Add, 269 | .nb_subtract = PyDDouble_Subtract, 270 | .nb_multiply = PyDDouble_Multiply, 271 | .nb_true_divide = PyDDouble_Divide, 272 | .nb_inplace_add = PyDDouble_InPlaceAdd, 273 | .nb_inplace_subtract = PyDDouble_InPlaceSubtract, 274 | .nb_inplace_multiply = PyDDouble_InPlaceMultiply, 275 | .nb_inplace_true_divide = PyDDouble_InPlaceDivide, 276 | .nb_negative = PyDDouble_Negative, 277 | .nb_positive = PyDDouble_Positive, 278 | .nb_absolute = PyDDouble_Absolute, 279 | .nb_bool = PyDDouble_Nonzero, 280 | .nb_int = PyDDouble_Int, 281 | .nb_float = PyDDouble_Float, 282 | }; 283 | static PyMethodDef ddouble_methods[] = { 284 | {"__finfo__", PyDDoubleGetFinfo, METH_NOARGS | METH_CLASS, 285 | "floating point information for type"}, 286 | {NULL, NULL, 0, NULL} 287 | }; 288 | static PyTypeObject ddouble_type = { 289 | PyVarObject_HEAD_INIT(NULL, 0) 290 | .tp_name = "ddouble", 291 | .tp_basicsize = sizeof(PyDDouble), 292 | .tp_repr = PyDDouble_Repr, 293 | .tp_as_number = &ddouble_as_number, 294 | .tp_hash = PyDDouble_Hash, 295 | .tp_str = PyDDouble_Str, 296 | .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, 297 | .tp_doc = "double-double floating point type", 298 | .tp_richcompare = PyDDouble_RichCompare, 299 | .tp_new = PyDDouble_New, 300 | .tp_methods = ddouble_methods 301 | }; 302 | 303 | ddouble_type.tp_base = &PyFloatingArrType_Type; 304 | if (PyType_Ready(&ddouble_type) < 0) 305 | return -1; 306 | 307 | pyddouble_type = &ddouble_type; 308 | return PyModule_AddObject(module, "ddouble", (PyObject *)pyddouble_type); 309 | } 310 | 311 | /* --------------------- Ddouble Finfo object -------------------- */ 312 | 313 | typedef struct { 314 | PyObject_HEAD 315 | PyObject *dtype; // which dtype 316 
| int bits; // number of bits 317 | PyObject *max; // largest positive number 318 | PyObject *min; // largest negative number 319 | PyObject *eps; // machine epsilon (spacing) 320 | int nexp; // number of exponent bits 321 | int nmant; // number of mantissa bits 322 | PyObject *machar; // machar object (unused) 323 | } PyDDoubleFInfo; 324 | 325 | static PyTypeObject *PyDDoubleFinfoType; 326 | 327 | static PyObject *PPyDDoubleFInfo_Make() 328 | { 329 | PyDDoubleFInfo *self = 330 | (PyDDoubleFInfo *) PyDDoubleFinfoType->tp_alloc(PyDDoubleFinfoType, 0); 331 | if (self == NULL) 332 | return NULL; 333 | 334 | Py_INCREF(Py_None); 335 | self->dtype = (PyObject *)PyArray_DescrFromType(type_num); 336 | self->bits = CHAR_BIT * sizeof(ddouble); 337 | self->max = PyDDouble_Wrap(Q_MAX); 338 | self->min = PyDDouble_Wrap(Q_MIN); 339 | self->eps = PyDDouble_Wrap(Q_EPS); 340 | self->nexp = 11; 341 | self->nmant = 104; 342 | self->machar = Py_None; 343 | return (PyObject *)self; 344 | } 345 | 346 | static int make_finfo() 347 | { 348 | static PyMemberDef finfo_members[] = { 349 | {"dtype", T_OBJECT_EX, offsetof(PyDDoubleFInfo, dtype), READONLY, 350 | "underlying dtype object"}, 351 | {"bits", T_INT, offsetof(PyDDoubleFInfo, bits), READONLY, 352 | "storage size of object in bits"}, 353 | {"max", T_OBJECT_EX, offsetof(PyDDoubleFInfo, max), READONLY, 354 | "largest positive number"}, 355 | {"min", T_OBJECT_EX, offsetof(PyDDoubleFInfo, min), READONLY, 356 | "largest negative number"}, 357 | {"eps", T_OBJECT_EX, offsetof(PyDDoubleFInfo, eps), READONLY, 358 | "machine epsilon"}, 359 | {"nexp", T_INT, offsetof(PyDDoubleFInfo, nexp), READONLY, 360 | "number of bits in exponent"}, 361 | {"nmant", T_INT, offsetof(PyDDoubleFInfo, nmant), READONLY, 362 | "number of bits in mantissa"}, 363 | {"machar", T_OBJECT_EX, offsetof(PyDDoubleFInfo, machar), READONLY, 364 | "machar object (unused)"}, 365 | {NULL, 0, 0, 0, NULL} 366 | }; 367 | static PyTypeObject finfo_type = { 368 | 
PyVarObject_HEAD_INIT(NULL, 0) 369 | .tp_name = "ddouble_finfo", 370 | .tp_basicsize = sizeof(PyDDoubleFInfo), 371 | .tp_members = finfo_members, 372 | .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, 373 | .tp_doc = "finfo type" 374 | }; 375 | 376 | if (PyType_Ready(&finfo_type) < 0) 377 | return -1; 378 | 379 | PyDDoubleFinfoType = &finfo_type; 380 | pyddouble_finfo = PPyDDoubleFInfo_Make(); 381 | if (pyddouble_finfo == NULL) 382 | return -1; 383 | 384 | return 0; 385 | } 386 | 387 | /* ------------------------------ Descriptor ----------------------------- */ 388 | 389 | static PyObject *NPyDDouble_GetItem(void *data, void *arr) 390 | { 391 | ddouble x = *(ddouble *)data; 392 | return PyDDouble_Wrap(x); 393 | MARK_UNUSED(arr); 394 | } 395 | 396 | static int NPyDDouble_SetItem(PyObject *item, void *data, void *arr) 397 | { 398 | ddouble x; 399 | if (!PyDDouble_Cast(item, &x)) 400 | return -1; 401 | *(ddouble *)data = x; 402 | return 0; 403 | MARK_UNUSED(arr); 404 | } 405 | 406 | static int NPyDDouble_Compare(const void *_a, const void *_b, void *arr) 407 | { 408 | ddouble a = *(const ddouble *)_a; 409 | ddouble b = *(const ddouble *)_b; 410 | 411 | if (lessww(a, b)) 412 | return -1; 413 | if (greaterww(a, b)) 414 | return 1; 415 | if (isnanw(b)) 416 | return 1; 417 | return 0; 418 | MARK_UNUSED(arr); 419 | } 420 | 421 | static void NPyDDouble_CopySwapN(void *_d, npy_intp sd, void *_s, npy_intp ss, 422 | npy_intp ii, int swap, void* arr) 423 | { 424 | if (_s == NULL) 425 | return; 426 | char *_cd = (char *)_d, *_cs = (char *)_s; 427 | if (swap) { 428 | for (npy_intp i = 0; i != ii; ++i, _cd += sd, _cs += ss) { 429 | ddouble *s = (ddouble *)_cs, *d = (ddouble *)_cd, tmp; 430 | tmp = *d; 431 | *d = *s; 432 | *s = tmp; 433 | } 434 | } else { 435 | for (npy_intp i = 0; i != ii; ++i, _cd += sd, _cs += ss) { 436 | ddouble *s = (ddouble *)_cs, *d = (ddouble *)_cd; 437 | *d = *s; 438 | } 439 | } 440 | MARK_UNUSED(arr); 441 | } 442 | 443 | static void 
NPyDDouble_CopySwap(void *_d, void *_s, int swap, void* arr) 444 | { 445 | ddouble *s = _s, *d = _d, tmp; 446 | if (_s == NULL) 447 | return; 448 | if (swap) { 449 | tmp = *d; 450 | *d = *s; 451 | *s = tmp; 452 | } else { 453 | *d = *s; 454 | } 455 | MARK_UNUSED(arr); 456 | } 457 | 458 | static npy_bool NPyDDouble_NonZero(void *data, void *arr) 459 | { 460 | ddouble x = *(ddouble *)data; 461 | return !iszerow(x); 462 | MARK_UNUSED(arr); 463 | } 464 | 465 | static int NPyDDouble_Fill(void *_buffer, npy_intp ii, void *arr) 466 | { 467 | // Fill with linear array 468 | ddouble *buffer = (ddouble *)_buffer; 469 | if (ii < 2) 470 | return -1; 471 | 472 | ddouble curr = buffer[1]; 473 | ddouble step = subww(curr, buffer[0]); 474 | for (npy_intp i = 2; i != ii; ++i) { 475 | curr = addww(curr, step); 476 | buffer[i] = curr; 477 | } 478 | return 0; 479 | MARK_UNUSED(arr); 480 | } 481 | 482 | static int NPyDDouble_FillWithScalar(void *_buffer, npy_intp ii, void *_value, 483 | void *arr) 484 | { 485 | ddouble *buffer = (ddouble *)_buffer; 486 | ddouble value = *(ddouble *)_value; 487 | for (npy_intp i = 0; i < ii; ++i) 488 | buffer[i] = value; 489 | return 0; 490 | MARK_UNUSED(arr); 491 | } 492 | 493 | static void NPyDDouble_DotFunc(void *_in1, npy_intp is1, void *_in2, 494 | npy_intp is2, void *_out, npy_intp ii, void *arr) 495 | { 496 | ddouble out = Q_ZERO; 497 | char *_cin1 = (char *)_in1, *_cin2 = (char *)_in2; 498 | for (npy_intp i = 0; i < ii; ++i, _cin1 += is1, _cin2 += is2) { 499 | ddouble in1 = *(ddouble *)_cin1, in2 = *(ddouble *)_cin2; 500 | out = addww(out, mulww(in1, in2)); 501 | } 502 | *(ddouble *)_out = out; 503 | MARK_UNUSED(arr); 504 | } 505 | 506 | static int NPyDDouble_ArgMax(void *_data, npy_intp n, npy_intp *max_ind, 507 | void *arr) 508 | { 509 | ddouble *data = (ddouble *)_data; 510 | ddouble max_val = negw(infw()); 511 | for (npy_intp i = 0; i < n; ++i) { 512 | if (greaterww(data[i], max_val)) { 513 | max_val = data[i]; 514 | *max_ind = i; 515 | } 
516 | } 517 | return 0; 518 | MARK_UNUSED(arr); 519 | } 520 | 521 | static int NPyDDouble_ArgMin(void *_data, npy_intp n, npy_intp *min_ind, 522 | void *arr) 523 | { 524 | ddouble *data = (ddouble *)_data; 525 | ddouble min_val = infw(); 526 | for (npy_intp i = 0; i < n; ++i) { 527 | if (lessww(data[i], min_val)) { 528 | min_val = data[i]; 529 | *min_ind = i; 530 | } 531 | } 532 | return 0; 533 | MARK_UNUSED(arr); 534 | } 535 | 536 | /* This is necessary in order to ensure both 1.0 and 2.0 compatibility. 537 | * https://numpy.org/doc/stable/reference/c-api/array.html#c.PyArray_RegisterDataType 538 | */ 539 | #if NPY_ABI_VERSION < 0x02000000 540 | #define PyArray_DescrProto PyArray_Descr 541 | #endif 542 | 543 | static int make_dtype() 544 | { 545 | /* Check if another module has registered a ddouble type. 546 | * 547 | * FIXME: this check is removed, let's see if it is missed ... 548 | */ 549 | //type_num = PyArray_TypeNumFromName("ddouble"); 550 | //if (type_num != NPY_NOTYPE) { 551 | // return type_num; 552 | //} 553 | 554 | static PyArray_ArrFuncs ddouble_arrfuncs; 555 | 556 | static PyArray_DescrProto ddouble_dtype = { 557 | PyObject_HEAD_INIT(NULL) 558 | 559 | /* We must register ddouble with a kind other than "f", because numpy 560 | * considers two types with the same kind and size to be equal, but 561 | * float128 != ddouble. The downside of this is that NumPy scalar 562 | * promotion does not work with ddoubles. 563 | */ 564 | .kind = 'V', 565 | .type = 'E', 566 | .byteorder = '=', 567 | 568 | /* NPY_USE_GETITEM is not needed, since we inherit from numpy scalar, 569 | * which according to the docs means that "standard conversion" is 570 | * used. However, we still need to define and register getitem() 571 | * below, otherwise PyArray_RegisterDataType complains. 
572 | */ 573 | .flags = 0, 574 | .elsize = sizeof(ddouble), 575 | .alignment = alignof(ddouble), 576 | .hash = -1 577 | }; 578 | 579 | ddouble_dtype.typeobj = pyddouble_type; 580 | ddouble_dtype.f = &ddouble_arrfuncs; 581 | Py_SET_TYPE(&ddouble_dtype, &PyArrayDescr_Type); 582 | 583 | PyArray_InitArrFuncs(&ddouble_arrfuncs); 584 | ddouble_arrfuncs.getitem = NPyDDouble_GetItem; 585 | ddouble_arrfuncs.setitem = NPyDDouble_SetItem; 586 | ddouble_arrfuncs.compare = NPyDDouble_Compare; 587 | ddouble_arrfuncs.copyswapn = NPyDDouble_CopySwapN; 588 | ddouble_arrfuncs.copyswap = NPyDDouble_CopySwap; 589 | ddouble_arrfuncs.nonzero = NPyDDouble_NonZero; 590 | ddouble_arrfuncs.fill = NPyDDouble_Fill; 591 | ddouble_arrfuncs.fillwithscalar = NPyDDouble_FillWithScalar; 592 | ddouble_arrfuncs.dotfunc = NPyDDouble_DotFunc; 593 | ddouble_arrfuncs.argmax = NPyDDouble_ArgMax; 594 | ddouble_arrfuncs.argmin = NPyDDouble_ArgMin; 595 | 596 | type_num = PyArray_RegisterDataType(&ddouble_dtype); 597 | return type_num; 598 | } 599 | 600 | /* ------------------------------- Casts ------------------------------ */ 601 | 602 | #define NPY_CAST_FROM(func, from_type) \ 603 | static void func(void *_from, void *_to, npy_intp n, \ 604 | void *_arr_from, void *_arr_to) \ 605 | { \ 606 | ddouble *to = (ddouble *)_to; \ 607 | const from_type *from = (const from_type *)_from; \ 608 | for (npy_intp i = 0; i < n; ++i) \ 609 | to[i] = (ddouble) { from[i], 0.0 }; \ 610 | MARK_UNUSED(_arr_from); \ 611 | MARK_UNUSED(_arr_to); \ 612 | } 613 | 614 | #define NPY_CAST_FROM_I64(func, from_type) \ 615 | static void func(void *_from, void *_to, npy_intp n, \ 616 | void *_arr_from, void *_arr_to) \ 617 | { \ 618 | ddouble *to = (ddouble *)_to; \ 619 | const from_type *from = (const from_type *)_from; \ 620 | for (npy_intp i = 0; i < n; ++i) { \ 621 | double hi = from[i]; \ 622 | double lo = from[i] - (from_type) hi; \ 623 | to[i] = (ddouble){hi, lo}; \ 624 | } \ 625 | MARK_UNUSED(_arr_from); \ 626 | 
MARK_UNUSED(_arr_to); \ 627 | } 628 | 629 | #define NPY_CAST_TO(func, to_type) \ 630 | static void func(void *_from, void *_to, npy_intp n, \ 631 | void *_arr_from, void *_arr_to) \ 632 | { \ 633 | to_type *to = (to_type *)_to; \ 634 | const ddouble *from = (const ddouble *)_from; \ 635 | for (npy_intp i = 0; i < n; ++i) \ 636 | to[i] = (to_type) from[i].hi; \ 637 | MARK_UNUSED(_arr_from); \ 638 | MARK_UNUSED(_arr_to); \ 639 | } 640 | 641 | #define NPY_CAST_TO_I64(func, to_type) \ 642 | static void func(void *_from, void *_to, npy_intp n, \ 643 | void *_arr_from, void *_arr_to) \ 644 | { \ 645 | to_type *to = (to_type *)_to; \ 646 | const ddouble *from = (const ddouble *)_from; \ 647 | for (npy_intp i = 0; i < n; ++i) \ 648 | to[i] = (to_type) from[i].hi + (to_type) from[i].lo; \ 649 | MARK_UNUSED(_arr_from); \ 650 | MARK_UNUSED(_arr_to); \ 651 | } 652 | 653 | // These casts are all loss-less 654 | NPY_CAST_FROM(from_double, double) 655 | NPY_CAST_FROM(from_float, float) 656 | NPY_CAST_FROM(from_bool, bool) 657 | NPY_CAST_FROM(from_int8, int8_t) 658 | NPY_CAST_FROM(from_int16, int16_t) 659 | NPY_CAST_FROM(from_int32, int32_t) 660 | NPY_CAST_FROM(from_uint8, uint8_t) 661 | NPY_CAST_FROM(from_uint16, uint16_t) 662 | NPY_CAST_FROM(from_uint32, uint32_t) 663 | 664 | // These casts are also lossless, because we have now 2*54 bits of mantissa 665 | NPY_CAST_FROM_I64(from_int64, int64_t) 666 | NPY_CAST_FROM_I64(from_uint64, uint64_t) 667 | 668 | // These casts are all lossy 669 | NPY_CAST_TO(to_double, double) 670 | NPY_CAST_TO(to_float, float) 671 | NPY_CAST_TO(to_bool, bool) 672 | NPY_CAST_TO(to_int8, int8_t) 673 | NPY_CAST_TO(to_int16, int16_t) 674 | NPY_CAST_TO(to_int32, int32_t) 675 | NPY_CAST_TO(to_uint8, uint8_t) 676 | NPY_CAST_TO(to_uint16, uint16_t) 677 | NPY_CAST_TO(to_uint32, uint32_t) 678 | 679 | // These casts can be made more accurate 680 | NPY_CAST_TO_I64(to_int64, int64_t) 681 | NPY_CAST_TO_I64(to_uint64, uint64_t) 682 | 683 | 684 | static bool 
register_cast(int other_type, PyArray_VectorUnaryFunc from_other, 685 | PyArray_VectorUnaryFunc to_other) 686 | { 687 | PyArray_Descr *other_descr = NULL, *ddouble_descr = NULL; 688 | int ret; 689 | 690 | other_descr = PyArray_DescrFromType(other_type); 691 | if (other_descr == NULL) goto error; 692 | 693 | ddouble_descr = PyArray_DescrFromType(type_num); 694 | if (ddouble_descr == NULL) goto error; 695 | 696 | ret = PyArray_RegisterCastFunc(other_descr, type_num, from_other); 697 | if (ret < 0) goto error; 698 | 699 | // NPY_NOSCALAR apparently implies that casting is safe? 700 | ret = PyArray_RegisterCanCast(other_descr, type_num, NPY_NOSCALAR); 701 | if (ret < 0) goto error; 702 | 703 | ret = PyArray_RegisterCastFunc(ddouble_descr, other_type, to_other); 704 | if (ret < 0) goto error; 705 | return true; 706 | 707 | error: 708 | return false; 709 | } 710 | 711 | static int register_casts() 712 | { 713 | bool ok = register_cast(NPY_DOUBLE, from_double, to_double) 714 | && register_cast(NPY_FLOAT, from_float, to_float) 715 | && register_cast(NPY_BOOL, from_bool, to_bool) 716 | && register_cast(NPY_INT8, from_int8, to_int8) 717 | && register_cast(NPY_INT16, from_int16, to_int16) 718 | && register_cast(NPY_INT32, from_int32, to_int32) 719 | && register_cast(NPY_INT64, from_int64, to_int64) 720 | && register_cast(NPY_UINT8, from_uint8, to_uint8) 721 | && register_cast(NPY_UINT16, from_uint16, to_uint16) 722 | && register_cast(NPY_UINT32, from_uint32, to_uint32) 723 | && register_cast(NPY_UINT64, from_uint64, to_uint64); 724 | return ok ? 
0 : -1; 725 | } 726 | 727 | /* ------------------------------- Ufuncs ----------------------------- */ 728 | 729 | #define ULOOP_UNARY(func_name, inner_func, type_out, type_in) \ 730 | static void func_name(char **args, const npy_intp *dimensions, \ 731 | const npy_intp *steps, void *data) \ 732 | { \ 733 | const npy_intp n = dimensions[0]; \ 734 | const npy_intp is = steps[0] / sizeof(type_in), \ 735 | os = steps[1] / sizeof(type_out); \ 736 | const type_in *in = (const type_in *)args[0]; \ 737 | type_out *out = (type_out *)args[1]; \ 738 | \ 739 | for (npy_intp i = 0; i < n; ++i) \ 740 | out[i * os] = inner_func(in[i * is]); \ 741 | MARK_UNUSED(data); \ 742 | } 743 | 744 | #define ULOOP_BINARY(func_name, inner_func, type_out, type_a, type_b) \ 745 | static void func_name(char **args, const npy_intp *dimensions, \ 746 | const npy_intp* steps, void *data) \ 747 | { \ 748 | const npy_intp n = dimensions[0]; \ 749 | const npy_intp as = steps[0] / sizeof(type_a), \ 750 | bs = steps[1] / sizeof(type_b), \ 751 | os = steps[2] / sizeof(type_out); \ 752 | const type_a *a = (const type_a *)args[0]; \ 753 | const type_b *b = (const type_b *)args[1]; \ 754 | type_out *out = (type_out *)args[2]; \ 755 | \ 756 | for (npy_intp i = 0; i < n; ++i) { \ 757 | out[i * os] = inner_func(a[i * as], b[i * bs]); \ 758 | } \ 759 | MARK_UNUSED(data); \ 760 | } 761 | 762 | #define ULOOP_MODF(func_name, inner_func, type_out, type_a, type_b) \ 763 | static void func_name(char **args, const npy_intp *dimensions, \ 764 | const npy_intp* steps, void *data) \ 765 | { \ 766 | const npy_intp n = dimensions[0]; \ 767 | const npy_intp as = steps[0] / sizeof(type_a), \ 768 | bs = steps[1] / sizeof(type_b), \ 769 | os = steps[2] / sizeof(type_out); \ 770 | const type_a *a = (const type_a *)args[0]; \ 771 | type_b *b = (type_b *)args[2]; \ 772 | type_out *out = (type_out *)args[1]; \ 773 | \ 774 | for (npy_intp i = 0; i < n; ++i) { \ 775 | out[i * os] = inner_func(a[i * as], &b[i * bs]); \ 776 | } \ 777 
| MARK_UNUSED(data); \ 778 | } 779 | 780 | ULOOP_BINARY(u_addwd, addwd, ddouble, ddouble, double) 781 | ULOOP_BINARY(u_subwd, subwd, ddouble, ddouble, double) 782 | ULOOP_BINARY(u_mulwd, mulwd, ddouble, ddouble, double) 783 | ULOOP_BINARY(u_divwd, divwd, ddouble, ddouble, double) 784 | ULOOP_BINARY(u_adddw, adddw, ddouble, double, ddouble) 785 | ULOOP_BINARY(u_subdw, subdw, ddouble, double, ddouble) 786 | ULOOP_BINARY(u_muldw, muldw, ddouble, double, ddouble) 787 | ULOOP_BINARY(u_divdw, divdw, ddouble, double, ddouble) 788 | ULOOP_BINARY(u_addww, addww, ddouble, ddouble, ddouble) 789 | ULOOP_BINARY(u_subww, subww, ddouble, ddouble, ddouble) 790 | ULOOP_BINARY(u_mulww, mulww, ddouble, ddouble, ddouble) 791 | ULOOP_BINARY(u_divww, divww, ddouble, ddouble, ddouble) 792 | ULOOP_BINARY(u_copysignww, copysignww, ddouble, ddouble, ddouble) 793 | ULOOP_BINARY(u_copysignwd, copysignwd, ddouble, ddouble, double) 794 | ULOOP_BINARY(u_copysigndw, copysigndw, ddouble, double, ddouble) 795 | ULOOP_BINARY(u_equalww, equalww, bool, ddouble, ddouble) 796 | ULOOP_BINARY(u_notequalww, notequalww, bool, ddouble, ddouble) 797 | ULOOP_BINARY(u_greaterww, greaterww, bool, ddouble, ddouble) 798 | ULOOP_BINARY(u_lessww, lessww, bool, ddouble, ddouble) 799 | ULOOP_BINARY(u_greaterequalww, greaterww, bool, ddouble, ddouble) 800 | ULOOP_BINARY(u_lessequalww, lessww, bool, ddouble, ddouble) 801 | ULOOP_BINARY(u_equalwd, equalwd, bool, ddouble, double) 802 | ULOOP_BINARY(u_notequalwd, notequalwd, bool, ddouble, double) 803 | ULOOP_BINARY(u_greaterwd, greaterwd, bool, ddouble, double) 804 | ULOOP_BINARY(u_lesswd, lesswd, bool, ddouble, double) 805 | ULOOP_BINARY(u_greaterequalwd, greaterequalwd, bool, ddouble, double) 806 | ULOOP_BINARY(u_lessequalwd, lessequalwd, bool, ddouble, double) 807 | ULOOP_BINARY(u_equaldw, equaldw, bool, double, ddouble) 808 | ULOOP_BINARY(u_notequaldw, notequaldw, bool, double, ddouble) 809 | ULOOP_BINARY(u_greaterdw, greaterdw, bool, double, ddouble) 810 | 
ULOOP_BINARY(u_lessdw, lessdw, bool, double, ddouble) 811 | ULOOP_BINARY(u_greaterequaldw, greaterequaldw, bool, double, ddouble) 812 | ULOOP_BINARY(u_lessequaldw, lessequaldw, bool, double, ddouble) 813 | ULOOP_BINARY(u_fminww, fminww, ddouble, ddouble, ddouble) 814 | ULOOP_BINARY(u_fmaxww, fmaxww, ddouble, ddouble, ddouble) 815 | ULOOP_BINARY(u_fminwd, fminwd, ddouble, ddouble, double) 816 | ULOOP_BINARY(u_fmaxwd, fmaxwd, ddouble, ddouble, double) 817 | ULOOP_BINARY(u_fmindw, fmindw, ddouble, double, ddouble) 818 | ULOOP_BINARY(u_fmaxdw, fmaxdw, ddouble, double, ddouble) 819 | ULOOP_BINARY(u_atan2wd, atan2wd, ddouble, ddouble, double) 820 | ULOOP_BINARY(u_atan2dw, atan2dw, ddouble, double, ddouble) 821 | ULOOP_BINARY(u_atan2ww, atan2ww, ddouble, ddouble, ddouble) 822 | ULOOP_BINARY(u_powwd, powwd, ddouble, ddouble, double) 823 | ULOOP_BINARY(u_powdw, powdw, ddouble, double, ddouble) 824 | ULOOP_BINARY(u_powww, powww, ddouble, ddouble, ddouble) 825 | ULOOP_BINARY(u_hypotww, hypotww, ddouble, ddouble, ddouble) 826 | ULOOP_BINARY(u_hypotdw, hypotdw, ddouble, double, ddouble) 827 | ULOOP_BINARY(u_hypotwd, hypotwd, ddouble, ddouble, double) 828 | ULOOP_BINARY(u_ldexpwi, ldexpwi, ddouble, ddouble, int) 829 | ULOOP_MODF(u_modfww, modfww, ddouble, ddouble, ddouble) 830 | ULOOP_UNARY(u_signbitw, signbitw, bool, ddouble) 831 | ULOOP_UNARY(u_signw, signw, ddouble, ddouble) 832 | ULOOP_UNARY(u_isfinitew, isfinitew, bool, ddouble) 833 | ULOOP_UNARY(u_isinfw, isinfw, bool, ddouble) 834 | ULOOP_UNARY(u_isnanw, isnanw, bool, ddouble) 835 | ULOOP_UNARY(u_negw, negw, ddouble, ddouble) 836 | ULOOP_UNARY(u_posw, posw, ddouble, ddouble) 837 | ULOOP_UNARY(u_absw, absw, ddouble, ddouble) 838 | ULOOP_UNARY(u_reciprocalw, reciprocalw, ddouble, ddouble) 839 | ULOOP_UNARY(u_sqrw, sqrw, ddouble, ddouble) 840 | ULOOP_UNARY(u_roundw, roundw, ddouble, ddouble) 841 | ULOOP_UNARY(u_floorw, floorw, ddouble, ddouble) 842 | ULOOP_UNARY(u_ceilw, ceilw, ddouble, ddouble) 843 | ULOOP_UNARY(u_sqrtw, 
sqrtw, ddouble, ddouble) 844 | ULOOP_UNARY(u_expw, expw, ddouble, ddouble) 845 | ULOOP_UNARY(u_expm1w, expm1w, ddouble, ddouble) 846 | ULOOP_UNARY(u_logw, logw, ddouble, ddouble) 847 | ULOOP_UNARY(u_sinw, sinw, ddouble, ddouble) 848 | ULOOP_UNARY(u_cosw, cosw, ddouble, ddouble) 849 | ULOOP_UNARY(u_tanw, tanw, ddouble, ddouble) 850 | ULOOP_UNARY(u_atanw, atanw, ddouble, ddouble) 851 | ULOOP_UNARY(u_acosw, acosw, ddouble, ddouble) 852 | ULOOP_UNARY(u_asinw, asinw, ddouble, ddouble) 853 | ULOOP_UNARY(u_atanhw, atanhw, ddouble, ddouble) 854 | ULOOP_UNARY(u_acoshw, acoshw, ddouble, ddouble) 855 | ULOOP_UNARY(u_asinhw, asinhw, ddouble, ddouble) 856 | ULOOP_UNARY(u_sinhw, sinhw, ddouble, ddouble) 857 | ULOOP_UNARY(u_coshw, coshw, ddouble, ddouble) 858 | ULOOP_UNARY(u_tanhw, tanhw, ddouble, ddouble) 859 | 860 | static bool register_binary(PyUFuncGenericFunction dq_func, 861 | PyUFuncGenericFunction qd_func, PyUFuncGenericFunction qq_func, 862 | int ret_dtype, const char *name) 863 | { 864 | PyUFuncObject *ufunc; 865 | int *arg_types = NULL, retcode = 0; 866 | 867 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 868 | if (ufunc == NULL) goto error; 869 | 870 | arg_types = PyMem_New(int, 3 * 3); 871 | if (arg_types == NULL) goto error; 872 | 873 | arg_types[0] = NPY_DOUBLE; 874 | arg_types[1] = type_num; 875 | arg_types[2] = ret_dtype; 876 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 877 | dq_func, arg_types, NULL); 878 | if (retcode < 0) goto error; 879 | 880 | arg_types[3] = type_num; 881 | arg_types[4] = NPY_DOUBLE; 882 | arg_types[5] = ret_dtype; 883 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 884 | qd_func, arg_types + 3, NULL); 885 | if (retcode < 0) goto error; 886 | 887 | arg_types[6] = type_num; 888 | arg_types[7] = type_num; 889 | arg_types[8] = ret_dtype; 890 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 891 | qq_func, arg_types + 6, NULL); 892 | if (retcode < 0) goto error; 893 | return true; 894 | 895 | 
error: 896 | return false; 897 | } 898 | 899 | static int register_unary(PyUFuncGenericFunction func, int ret_dtype, 900 | const char *name) 901 | { 902 | PyUFuncObject *ufunc; 903 | int *arg_types = NULL, retcode = 0; 904 | 905 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 906 | if (ufunc == NULL) goto error; 907 | 908 | arg_types = PyMem_New(int, 2); 909 | if (arg_types == NULL) goto error; 910 | 911 | arg_types[0] = type_num; 912 | arg_types[1] = ret_dtype; 913 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 914 | func, arg_types, NULL); 915 | if (retcode < 0) goto error; 916 | return true; 917 | 918 | error: 919 | return false; 920 | } 921 | 922 | static int register_ldexp(PyUFuncGenericFunction func, int ret_dtype, 923 | const char *name) 924 | { 925 | PyUFuncObject *ufunc; 926 | int *arg_types = NULL, retcode = 0; 927 | 928 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 929 | if (ufunc == NULL) goto error; 930 | 931 | arg_types = PyMem_New(int, 3); 932 | if (arg_types == NULL) goto error; 933 | 934 | arg_types[0] = type_num; 935 | arg_types[1] = NPY_INTP; 936 | arg_types[2] = ret_dtype; 937 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 938 | func, arg_types, NULL); 939 | if (retcode < 0) goto error; 940 | return true; 941 | 942 | error: 943 | return false; 944 | } 945 | 946 | static int register_modf(PyUFuncGenericFunction func, int ret_dtype, 947 | const char *name) 948 | { 949 | PyUFuncObject *ufunc; 950 | int *arg_types = NULL, retcode = 0; 951 | 952 | ufunc = (PyUFuncObject *)PyObject_GetAttrString(numpy_module, name); 953 | if (ufunc == NULL) goto error; 954 | 955 | arg_types = PyMem_New(int, 4); 956 | if (arg_types == NULL) goto error; 957 | 958 | arg_types[0] = type_num; 959 | arg_types[1] = type_num; 960 | arg_types[2] = ret_dtype; 961 | arg_types[3] = ret_dtype; 962 | retcode = PyUFunc_RegisterLoopForType(ufunc, type_num, 963 | func, arg_types, NULL); 964 | if (retcode < 0) goto 
error; 965 | return true; 966 | 967 | error: 968 | return false; 969 | } 970 | 971 | static int register_ufuncs() 972 | { 973 | bool ok = register_unary(u_negw, type_num, "negative") 974 | && register_unary(u_posw, type_num, "positive") 975 | && register_unary(u_absw, type_num, "absolute") 976 | && register_unary(u_reciprocalw, type_num, "reciprocal") 977 | && register_unary(u_sqrw, type_num, "square") 978 | && register_unary(u_sqrtw, type_num, "sqrt") 979 | && register_unary(u_signbitw, NPY_BOOL, "signbit") 980 | && register_unary(u_isfinitew, NPY_BOOL, "isfinite") 981 | && register_unary(u_isinfw, NPY_BOOL, "isinf") 982 | && register_unary(u_isnanw, NPY_BOOL, "isnan") 983 | && register_unary(u_roundw, type_num, "rint") 984 | && register_unary(u_floorw, type_num, "floor") 985 | && register_unary(u_ceilw, type_num, "ceil") 986 | && register_unary(u_expw, type_num, "exp") 987 | && register_unary(u_expm1w, type_num, "expm1") 988 | && register_unary(u_logw, type_num, "log") 989 | && register_unary(u_sinw, type_num, "sin") 990 | && register_unary(u_cosw, type_num, "cos") 991 | && register_unary(u_tanw, type_num, "tan") 992 | && register_unary(u_atanw, type_num, "arctan") 993 | && register_unary(u_acosw, type_num, "arccos") 994 | && register_unary(u_asinw, type_num, "arcsin") 995 | && register_unary(u_atanhw, type_num, "arctanh") 996 | && register_unary(u_acoshw, type_num, "arccosh") 997 | && register_unary(u_asinhw, type_num, "arcsinh") 998 | && register_unary(u_sinhw, type_num, "sinh") 999 | && register_unary(u_coshw, type_num, "cosh") 1000 | && register_unary(u_tanhw, type_num, "tanh") 1001 | && register_unary(u_signw, type_num, "sign") 1002 | && register_ldexp(u_ldexpwi, type_num, "ldexp") 1003 | && register_modf(u_modfww, type_num, "modf") 1004 | && register_binary(u_adddw, u_addwd, u_addww, type_num, "add") 1005 | && register_binary(u_subdw, u_subwd, u_subww, type_num, "subtract") 1006 | && register_binary(u_muldw, u_mulwd, u_mulww, type_num, "multiply") 1007 | && 
register_binary(u_divdw, u_divwd, u_divww, type_num, "true_divide") 1008 | && register_binary(u_powdw, u_powwd, u_powww, type_num, "power") 1009 | && register_binary(u_equaldw, u_equalwd, u_equalww, NPY_BOOL, "equal") 1010 | && register_binary(u_notequaldw, u_notequalwd, u_notequalww, NPY_BOOL, 1011 | "not_equal") 1012 | && register_binary(u_greaterdw, u_greaterwd, u_greaterww, NPY_BOOL, "greater") 1013 | && register_binary(u_lessdw, u_lesswd, u_lessww, NPY_BOOL, "less") 1014 | && register_binary(u_greaterequaldw, u_greaterequalwd, u_greaterequalww, 1015 | NPY_BOOL, "greater_equal") 1016 | && register_binary(u_lessequaldw, u_lessequalwd, u_lessequalww, NPY_BOOL, 1017 | "less_equal") 1018 | && register_binary(u_fmindw, u_fminwd, u_fminww, type_num, "fmin") 1019 | && register_binary(u_fmaxdw, u_fmaxwd, u_fmaxww, type_num, "fmax") 1020 | && register_binary(u_fmindw, u_fminwd, u_fminww, type_num, "minimum") 1021 | && register_binary(u_fmaxdw, u_fmaxwd, u_fmaxww, type_num, "maximum") 1022 | && register_binary(u_atan2dw, u_atan2wd, u_atan2ww, type_num, "arctan2") 1023 | && register_binary(u_copysigndw, u_copysignwd, u_copysignww, type_num, 1024 | "copysign") 1025 | && register_binary(u_hypotdw, u_hypotwd, u_hypotww, type_num, "hypot"); 1026 | return ok ? 
0 : -1; 1027 | } 1028 | 1029 | static int register_dtype_in_dicts() 1030 | { 1031 | PyObject *type_dict = NULL; 1032 | 1033 | type_dict = PyObject_GetAttrString(numpy_module, "sctypeDict"); 1034 | if (type_dict == NULL) goto error; 1035 | 1036 | if (PyDict_SetItemString(type_dict, "ddouble", 1037 | (PyObject *)pyddouble_type) < 0) 1038 | goto error; 1039 | return 0; 1040 | 1041 | error: 1042 | Py_XDECREF(type_dict); 1043 | return -1; 1044 | } 1045 | 1046 | /* ----------------------- Python stuff -------------------------- */ 1047 | 1048 | static PyObject *make_module() 1049 | { 1050 | // Defitions 1051 | static PyMethodDef no_methods[] = { 1052 | {NULL, NULL, 0, NULL} // No methods defined 1053 | }; 1054 | static struct PyModuleDef module_def = { 1055 | PyModuleDef_HEAD_INIT, 1056 | "_dd_ufunc", 1057 | NULL, 1058 | -1, 1059 | no_methods, 1060 | NULL, 1061 | NULL, 1062 | NULL, 1063 | NULL 1064 | }; 1065 | 1066 | /* Module definition */ 1067 | module = PyModule_Create(&module_def); 1068 | return module; 1069 | } 1070 | 1071 | static bool constant(ddouble value, const char *name) 1072 | { 1073 | // Note that data must be allocated using malloc, not python allocators! 
1074 | ddouble *data = malloc(sizeof value); 1075 | *data = value; 1076 | 1077 | PyArrayObject *array = (PyArrayObject *) 1078 | PyArray_SimpleNewFromData(0, NULL, type_num, data); 1079 | if (array == NULL) return false; 1080 | 1081 | PyArray_ENABLEFLAGS(array, NPY_ARRAY_OWNDATA); 1082 | PyArray_CLEARFLAGS(array, NPY_ARRAY_WRITEABLE); 1083 | 1084 | PyModule_AddObject(module, name, (PyObject *)array); 1085 | return true; 1086 | } 1087 | 1088 | static int register_constants() 1089 | { 1090 | bool ok = constant(Q_MAX, "MAX") 1091 | && constant(Q_MIN, "MIN") 1092 | && constant(Q_EPS, "EPS") 1093 | && constant(Q_2PI, "TWOPI") 1094 | && constant(Q_PI, "PI") 1095 | && constant(Q_PI_2, "PI_2") 1096 | && constant(Q_PI_4, "PI_4") 1097 | && constant(Q_E, "E") 1098 | && constant(Q_LOG2, "LOG2") 1099 | && constant(Q_LOG10, "LOG10") 1100 | && constant(nanw(), "NAN") 1101 | && constant(infw(), "INF"); 1102 | return ok ? 0 : -1; 1103 | } 1104 | 1105 | PyMODINIT_FUNC PyInit__dd_ufunc(void) 1106 | { 1107 | /* Initialize module */ 1108 | if (!make_module()) 1109 | return NULL; 1110 | 1111 | /* Initialize numpy things */ 1112 | import_array(); 1113 | import_umath(); 1114 | 1115 | if (make_ddouble_type() < 0) 1116 | return NULL; 1117 | if (make_dtype() < 0) 1118 | return NULL; 1119 | if (make_finfo() < 0) 1120 | return NULL; 1121 | 1122 | numpy_module = PyImport_ImportModule("numpy"); 1123 | if (numpy_module == NULL) 1124 | return NULL; 1125 | 1126 | PyArray_Descr *dtype = PyArray_DescrFromType(type_num); 1127 | PyModule_AddObject(module, "dtype", (PyObject *)dtype); 1128 | 1129 | /* Casts need to be defined before ufuncs, because numpy >= 1.21 caches 1130 | * casts/ufuncs in a way that is non-trivial... one should consider casts 1131 | * to be "more basic". 
1132 | * See: https://github.com/numpy/numpy/issues/20009 1133 | */ 1134 | if (register_casts() < 0) 1135 | return NULL; 1136 | if (register_ufuncs() < 0) 1137 | return NULL; 1138 | if (register_dtype_in_dicts() < 0) 1139 | return NULL; 1140 | if (register_constants() < 0) 1141 | return NULL; 1142 | 1143 | /* Module is ready */ 1144 | return module; 1145 | } 1146 | --------------------------------------------------------------------------------