├── benchmarks ├── __init__.py └── README.rst ├── src ├── pnumpy │ ├── tests │ │ └── __init__.py │ ├── recycler.cpp │ ├── recarray.py │ ├── __init__.py │ ├── arange.cpp │ ├── conversions.cpp │ ├── cpu.py │ ├── common.h │ ├── benchmark.py │ ├── module_init.cpp │ ├── ledger.cpp │ └── sort.py ├── nte │ └── x64 │ │ └── Release │ │ └── nte.vcxproj.FileListAbsolute.txt └── atop │ ├── readme.txt │ ├── invalids.h │ ├── atop.h │ ├── halffloat.h │ ├── atop.cpp │ ├── fill.cpp │ ├── recarray.cpp │ ├── common_inc.h │ ├── threads.cpp │ └── ops_log.cpp ├── doc_src ├── doc_requirements.txt ├── images │ ├── bench4graph.PNG │ ├── bench4graph2.PNG │ ├── bench4graph3.PNG │ └── threading_npadd.PNG ├── source │ ├── installation.rst │ ├── benchmarking_asv.rst │ ├── use.rst │ ├── roadmap.rst │ ├── conf.py │ └── index.rst ├── Makefile └── make.bat ├── ci └── requirements.txt ├── AUTHORS.rst ├── pyproject.toml ├── CHANGELOG.rst ├── .coveragerc ├── .editorconfig ├── MANIFEST.in ├── .bumpversion.cfg ├── test_requirements.txt ├── tests ├── conftest.py ├── test_pnumpy.py └── test_ufuncs.py ├── LICENSE ├── .gitignore ├── .github └── workflows │ ├── push_docs.yml │ ├── build.yml │ ├── pypi.yml │ └── build_uploadpypi.yml ├── setup.cfg ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── asv.conf.json ├── README.md ├── _add_newdocs.py └── setup.py /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pnumpy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc_src/doc_requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.2.0 2 | -------------------------------------------------------------------------------- /src/nte/x64/Release/nte.vcxproj.FileListAbsolute.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ci/requirements.txt: -------------------------------------------------------------------------------- 1 | virtualenv>=16.6.0 2 | six>=1.14.0 3 | twine 4 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | 2 | Authors 3 | ======= 4 | 5 | * Matti Picus - https://labs.quansight.org/ 6 | -------------------------------------------------------------------------------- /doc_src/images/bench4graph.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/bench4graph.PNG -------------------------------------------------------------------------------- /doc_src/images/bench4graph2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/bench4graph2.PNG -------------------------------------------------------------------------------- /doc_src/images/bench4graph3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/bench4graph3.PNG -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "wheel", 4 | "setuptools", 5 | "numpy<1.20", 6 | ] 7 | -------------------------------------------------------------------------------- /doc_src/images/threading_npadd.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/threading_npadd.PNG -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | 2 | Changelog 3 | ========= 4 | 5 | 0.0.0 (2020-09-09) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /src/atop/readme.txt: -------------------------------------------------------------------------------- 1 | atop : array threaded operation 2 | A library containing vector intrinsic loops and threading to speed up calculations 3 | -------------------------------------------------------------------------------- /src/pnumpy/recycler.cpp: -------------------------------------------------------------------------------- 1 | #include "Python.h" 2 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 3 | #include 4 | #include 5 | #include "../atop/atop.h" 6 | 7 | 8 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [paths] 2 | source = src 3 | 4 | [run] 5 | branch = true 6 | source = 7 | src 8 | tests 9 | parallel = true 10 | 11 | [report] 12 | show_missing = true 13 | precision = 2 14 | omit = *migrations* 15 | -------------------------------------------------------------------------------- /doc_src/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | This is a binary package and requires compilation. 
We recommend using pip or 5 | conda to obtain a pre-built version:: 6 | 7 | $ pip install pnumpy 8 | 9 | To use the package once it is installed:: 10 | 11 | >>> import pnumpy as pn 12 | 13 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # see https://editorconfig.org/ 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | indent_style = space 9 | indent_size = 4 10 | charset = utf-8 11 | 12 | [*.{bat,cmd,ps1}] 13 | end_of_line = crlf 14 | 15 | [*.{yml,yaml}] 16 | indent_size = 2 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | graft src 3 | graft ci 4 | graft tests 5 | 6 | include .bumpversion.cfg 7 | include .coveragerc 8 | include .editorconfig 9 | 10 | include AUTHORS.rst 11 | include CHANGELOG.rst 12 | include CONTRIBUTING.md 13 | include LICENSE 14 | include README.md 15 | 16 | global-exclude *.py[cod] __pycache__/* *.so *.dylib 17 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.0.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:README.rst] 11 | search = v{current_version}. 12 | replace = v{new_version}. 13 | 14 | [bumpversion:file:src/pnumpy/__init__.py] 15 | search = __version__ = '{current_version}' 16 | replace = __version__ = '{new_version}' 17 | -------------------------------------------------------------------------------- /doc_src/source/benchmarking_asv.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | 4 | .. _benchmarking_asv: 5 | 6 | Benchmarking via ASV 7 | -------------------- 8 | 9 | ASV_ is a tool that discovers and runs benchmarks. It saves the state of the 10 | run and the results, and is useful for comparing performance across versions. 11 | To run the benchmarks, do: 12 | 13 | .. code-block:: shell 14 | 15 | $ cd benchmarks 16 | $ asv run 17 | 18 | .. _ASV: https://asv.readthedocs.io/en/stable/using.html 19 | -------------------------------------------------------------------------------- /doc_src/source/use.rst: -------------------------------------------------------------------------------- 1 | How to use pnumpy 2 | ================= 3 | .. 4 | The rest of this is taken from the __init__.py and the functions' docstrings 5 | 6 | pnumpy functions 7 | ---------------- 8 | 9 | .. automodule:: pnumpy 10 | :members: enable, disable, atop_disable, atop_enable, atop_info, atop_isenabled, atop_setworkers, getitem, recarray_to_colmajor, sort, lexsort 11 | 12 | .. _benchmarking: 13 | 14 | Benchmarking 15 | ------------ 16 | 17 | ..
automodule:: pnumpy.benchmark 18 | :members: benchmark 19 | 20 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | cython==0.29.21 2 | wheel<=0.35.1 3 | setuptools<49.2.0 4 | hypothesis==5.41.5 5 | pytest==6.2.0 6 | pytz==2020.4 7 | pytest-cov==2.10.1 8 | pickle5; python_version == '3.7' and platform_python_implementation != 'PyPy' 9 | # for numpy.random.test.test_extending 10 | cffi 11 | # For testing types. Notes on the restrictions: 12 | # - Mypy relies on C API features not present in PyPy 13 | # - Mypy doesn't currently work on Python 3.9 14 | mypy==0.790; platform_python_implementation != "PyPy" 15 | typing_extensions 16 | -------------------------------------------------------------------------------- /doc_src/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -WT --keep-going 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from packaging.utils import Version 4 | HAVE_PNUMPY = True 5 | try: 6 | import pnumpy 7 | except Exception: 8 | HAVE_PNUMPY = False 9 | 10 | old_numpy = Version(np.__version__) < Version('1.18') 11 | 12 | @pytest.fixture(scope='session') 13 | def initialize_pnumpy(): 14 | if HAVE_PNUMPY: 15 | from numpy.core._multiarray_umath import __cpu_features__ as cpu 16 | if not cpu['AVX2']: 17 | pytest.skip('pnumpy.initialize requires AVX2') 18 | pnumpy.initialize() 19 | 20 | @pytest.fixture(scope='function') 21 | def rng(): 22 | if old_numpy: 23 | class OldRNG(np.random.RandomState): 24 | pass 25 | rng = OldRNG(1234) 26 | rng.random = rng.random_sample 27 | rng.integers = rng.randint 28 | return rng 29 | else: 30 | return np.random.default_rng(1234) 31 | -------------------------------------------------------------------------------- /doc_src/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Matti Picus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # generated files 2 | src/pnumpy/PNUMPY.h 3 | 4 | # benchmark environments 5 | .asv 6 | 7 | *.py[cod] 8 | __pycache__ 9 | 10 | # msvc 11 | *.sln 12 | *.vcxproj 13 | *.tlog 14 | *.log 15 | *.db 16 | *.opendb 17 | *.idb 18 | *.pdb 19 | *.ipch 20 | *.sqlite 21 | *.suo 22 | *.obj 23 | *.filters 24 | *.user 25 | *.cod 26 | *.lastbuildstate 27 | 28 | # C extensions 29 | *.so 30 | 31 | # version 32 | src/pnumpy/_version.py 33 | 34 | # Packages 35 | *.egg 36 | *.egg-info 37 | dist 38 | build 39 | eggs 40 | .eggs 41 | parts 42 | bin 43 | var 44 | sdist 45 | wheelhouse 46 | develop-eggs 47 | .installed.cfg 48 | lib 49 | lib64 50 | venv*/ 51 | pyvenv*/ 52 | pip-wheel-metadata/ 53 | 54 | # Installer logs 55 | pip-log.txt 56 | 57 | # Unit test / coverage reports 58 | .coverage 59 | .coverage.* 60 | .pytest_cache/ 61 | nosetests.xml 62 | coverage.xml 63 | htmlcov 64 | 65 | # Translations 66 | *.mo 67 | 68 | # Complexity 69 | output/*.html 70 | output/*/index.html 71 | 72 | # Sphinx 73 | docs/_build 74 | 75 | # Mypy Cache 76 | .mypy_cache/ 77 | 78 | # Editor tmp files 79 | .*.swp 80 | /.vs/VSWorkspaceState.json 81 | /.vs/ProjectSettings.json 82 | -------------------------------------------------------------------------------- /src/atop/invalids.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "common_inc.h" 3 | 4 | // For when integers have an invalid predefined 5 | static const int8_t GetInvalid(int8_t x) { return (int8_t)(0x80); }; 6 | static const int16_t GetInvalid(int16_t x) { return (int16_t)(0x8000); }; 7 | static const int32_t GetInvalid(int32_t x) { return (int32_t)(0x800000000000); }; 8 | static const int64_t GetInvalid(int64_t x) { return (int64_t)(0x8000000000000000); 
}; 9 | 10 | static const uint8_t GetInvalid(uint8_t x) { return (uint8_t)(0xFF); }; 11 | static const uint16_t GetInvalid(uint16_t x) { return (uint16_t)(0xFFFF); }; 12 | static const uint32_t GetInvalid(uint32_t x) { return (uint32_t)(0xFFFFFFFF); }; 13 | static const uint64_t GetInvalid(uint64_t x) { return (uint64_t)(0xFFFFFFFFFFFFFFFF); }; 14 | 15 | static const float GetInvalid(float x) { return std::numeric_limits<float>::quiet_NaN(); }; 16 | static const double GetInvalid(double x) { return std::numeric_limits<double>::quiet_NaN(); }; 17 | static const long double GetInvalid(long double x) { return std::numeric_limits<long double>::quiet_NaN(); }; 18 | 19 | 20 | //------------------------------------------------------------------------- 21 | -------------------------------------------------------------------------------- /doc_src/source/roadmap.rst: -------------------------------------------------------------------------------- 1 | Roadmap 2 | ======= 3 | 4 | Version 2.0 of the package uses multithreaded ufunc loops and parallel sorts. 5 | 6 | Future versions of the package will extend these capabilities to cover more of 7 | the NumPy functionality. Some of these proposed enhancements will require new 8 | APIs from NumPy. 9 | 10 | Conversions 11 | ----------- 12 | 13 | Currently NumPy does not expose a hook for dtype conversions. When one becomes available, 14 | PNumPy will parallelize those conversions. 15 | 16 | Vectorized loops 17 | ---------------- 18 | 19 | NumPy is only now beginning to use `SIMD `_ 20 | instructions to speed up loops. We have a few further enhancements beyond the 21 | current NumPy implementations. Check out the code in the `atop` directory. 22 | 23 | Using a better memory allocator 24 | ------------------------------- 25 | 26 | NumPy uses a small cache for data memory but does not have one for larger 27 | arrays. When the new API is available, we will provide a better cache. 28 | 29 | Ledger 30 | ------ 31 | Everything PNumPy hooks into can be recorded and timed. This built-in profiler will help 32 | you tweak and speed up your code.
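A minimal sketch of how this could look from Python, assuming the ``ledger_enable``, ``ledger_info`` and ``ledger_disable`` functions already exported by the package keep their current names (the exact contents of the report are not settled yet)::

    >>> import numpy as np
    >>> import pnumpy as pn
    >>> pn.ledger_enable()          # start recording hooked loops
    >>> a = np.arange(1_000_000, dtype=np.float64)
    >>> b = np.add(a, a)            # a call that goes through the hooks
    >>> pn.ledger_info()            # report what was recorded
    >>> pn.ledger_disable()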
33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/atop/atop.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "common_inc.h" 3 | 4 | // Export DLL section 5 | #if defined(_WIN32) && !defined(__GNUC__) 6 | 7 | #define DllExport __declspec(dllexport) 8 | 9 | #else 10 | 11 | #define DllExport 12 | 13 | #endif 14 | 15 | 16 | extern "C" { 17 | 18 | // defined in atop.cpp 19 | DllExport BOOL atop_init(); 20 | 21 | // defined in ops_binary.cpp 22 | DllExport ANY_TWO_FUNC GetSimpleMathOpFast(int func, int atopInType1, int atopInType2, int* wantedOutType); 23 | DllExport REDUCE_FUNC GetReduceMathOpFast(int func, int atopInType1); 24 | DllExport ANY_TWO_FUNC GetComparisonOpFast(int func, int atopInType1, int atopInType2, int* wantedOutType); 25 | DllExport UNARY_FUNC GetUnaryOpFast(int func, int atopInType1, int* wantedOutType); 26 | DllExport UNARY_FUNC GetUnaryOpSlow(int func, int atopInType1, int* wantedOutType); 27 | DllExport UNARY_FUNC GetTrigOpFast(int func, int atopInType1, int* wantedOutType); 28 | DllExport UNARY_FUNC GetTrigOpSlow(int func, int atopInType1, int* wantedOutType); 29 | DllExport UNARY_FUNC GetLogOpFast(int func, int atopInType1, int* wantedOutType); 30 | 31 | // CPUID capabilities 32 | extern DllExport int g_bmi2; 33 | extern DllExport int g_avx2; 34 | extern DllExport ATOP_cpuid_t g_cpuid; 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /.github/workflows/push_docs.yml: -------------------------------------------------------------------------------- 1 | name: Push_docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | update_docs: 9 | runs-on: ubuntu-latest 10 | defaults: 11 | run: 12 | shell: bash 13 | 14 | steps: 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | 20 | - uses: actions/checkout@v2 21 | with: 22 | submodules: recursive 23 | fetch-depth: 0 24 | 25 | - name: build docs 26 | run: | 27 | set -ex 28 | python -m pip install . 
29 | pushd doc_src 30 | python -m pip install --progress-bar=off -r doc_requirements.txt 31 | make html 32 | popd 33 | git fetch origin site 34 | git checkout site 35 | target=main # fixme: use $GIT_BRANCH or so 36 | rm -rf docs/$target/* 37 | cp -r doc_src/build/html/* docs/$target 38 | if [ $target == "main" ]; then 39 | rm -rf docs/_static/* 40 | cp -r doc_src/build/html/_static/* docs/_static 41 | fi 42 | git add docs || true 43 | git config user.email "mattigit@picus.org.il" 44 | git config user.name "mattibot" 45 | # If there aren't changes, doesn't make a commit; push will be a no-op 46 | git commit -m "auto-generating sphinx docs" || true 47 | 48 | - name: Push 49 | uses: ad-m/github-push-action@master 50 | with: 51 | github_token: ${{ github.token }} 52 | branch: site 53 | -------------------------------------------------------------------------------- /tests/test_pnumpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | HAVE_PNUMPY = True 4 | try: 5 | import pnumpy as pn 6 | except Exception: 7 | HAVE_PNUMPY = False 8 | 9 | 10 | def test_enable(initialize_pnumpy): 11 | # enable/disable return the previous value 12 | if HAVE_PNUMPY: 13 | old = pn.atop_isenabled() 14 | pn.atop_enable() 15 | assert pn.atop_isenabled() == True 16 | pn.atop_disable() 17 | assert pn.atop_isenabled() == False 18 | 19 | # restore prior state 20 | if old: 21 | pn.atop_enable() 22 | else: 23 | pn.atop_disable() 24 | assert pn.atop_isenabled() == old 25 | 26 | 27 | def test_result(rng): 28 | """ test that the basic idea of rng and ufunc result testing works. 29 | """ 30 | 31 | # this is currently the only test that does not require initialize_pnumpy 32 | # which is useful for CI runs without AVX2. Otherwise all the tests will be 33 | # skipped, and pytest will notice that all the tests are skipped and will 34 | # complain. 35 | 36 | if HAVE_PNUMPY: 37 | print('numpy version', np.__version__) 38 | print(pn.cpustring()) 39 | 40 | m = rng.integers(100, size=(10, 10), dtype=np.int32) 41 | o = np.empty_like(m) 42 | for i in range(m.shape[0]): 43 | for j in range(m.shape[1]): 44 | o[i, j] = m[i, j] + m[i, j] 45 | assert np.all(np.add(m, m) == o) 46 | 47 | 48 | def test_numpy_off(initialize_pnumpy): 49 | if HAVE_PNUMPY: 50 | np.test() 51 | 52 | 53 | def test_numpy_on(initialize_pnumpy): 54 | if HAVE_PNUMPY: 55 | pn.atop_enable() 56 | np.test() 57 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 140 3 | exclude = .eggs,build,dist 4 | 5 | [tool:pytest] 6 | # If a pytest section is found in one of the possible config files 7 | # (pytest.ini, setup.cfg), then pytest will not look for any others, 8 | # so if you add a pytest config section elsewhere, 9 | # you will need to delete this section from setup.cfg. 10 | norecursedirs = 11 | .git 12 | .env 13 | dist 14 | build 15 | migrations 16 | 17 | python_files = 18 | test_*.py 19 | *_test.py 20 | tests.py 21 | addopts = 22 | -ra 23 | --strict-markers 24 | --ignore=docs/conf.py 25 | --ignore=setup.py 26 | --ignore=ci 27 | --ignore=.eggs 28 | --doctest-modules 29 | --doctest-glob=\*.rst 30 | --tb=short 31 | --pyargs 32 | # The order of these options matters. testpaths comes after addopts so that 33 | # pnumpy in testpaths is interpreted as 34 | # --pyargs pnumpy. 
35 | # Any tests in the src/ directory (that is, tests installed with the package) 36 | # can be run by any user with pytest --pyargs pnumpy. 37 | # Packages that are sensitive to the host machine, most famously NumPy, 38 | # include tests with the installed package so that any user can check 39 | # at any time that everything is working properly. 40 | # If you do choose to make installable tests, this will run the installed 41 | # tests as they are actually installed (same principle as when we ensure that 42 | # we always test the installed version of the package). 43 | # If you have no need for this (and your src/ directory is very large), 44 | # you can save a few milliseconds on testing by telling pytest not to search 45 | # the src/ directory by removing 46 | # --pyargs and pnumpy from the options here. 47 | testpaths = 48 | pnumpy 49 | tests/ 50 | filterwarnings = 51 | ignore:.*AVX2 52 | 53 | [tool:isort] 54 | force_single_line = True 55 | line_length = 120 56 | known_first_party = pnumpy 57 | default_section = THIRDPARTY 58 | forced_separate = test_pnumpy 59 | skip = .eggs,build,dist 60 | 61 | [bdist_wheel] 62 | py-limited-api = cp36 63 | -------------------------------------------------------------------------------- /doc_src/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'PNumPy' 21 | copyright = '2020-2021, tdimitri, mattip' 22 | author = 'Quansight' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = 'v2.0.20' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.napoleon', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = [] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = 'alabaster' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 
58 | #html_static_path = ['_static'] 59 | -------------------------------------------------------------------------------- /src/atop/halffloat.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | /* 6 | * Half-precision routines 7 | */ 8 | /* half/float16 isn't a floating-point type in C */ 9 | 10 | typedef uint16_t atop_half; 11 | 12 | ///* Conversions */ 13 | //float atop_half_to_float(atop_half h); 14 | //double atop_half_to_double(atop_half h); 15 | //atop_half atop_float_to_half(float f); 16 | //atop_half atop_double_to_half(double d); 17 | ///* Comparisons */ 18 | //int atop_half_eq(atop_half h1, atop_half h2); 19 | //int atop_half_ne(atop_half h1, atop_half h2); 20 | //int atop_half_le(atop_half h1, atop_half h2); 21 | //int atop_half_lt(atop_half h1, atop_half h2); 22 | //int atop_half_ge(atop_half h1, atop_half h2); 23 | //int atop_half_gt(atop_half h1, atop_half h2); 24 | ///* faster *_nonan variants for when you know h1 and h2 are not NaN */ 25 | //int atop_half_eq_nonan(atop_half h1, atop_half h2); 26 | //int atop_half_lt_nonan(atop_half h1, atop_half h2); 27 | //int atop_half_le_nonan(atop_half h1, atop_half h2); 28 | ///* Miscellaneous functions */ 29 | //int atop_half_iszero(atop_half h); 30 | //int atop_half_isnan(atop_half h); 31 | //int atop_half_isinf(atop_half h); 32 | //int atop_half_isfinite(atop_half h); 33 | //int atop_half_signbit(atop_half h); 34 | //atop_half atop_half_copysign(atop_half x, atop_half y); 35 | //atop_half atop_half_spacing(atop_half h); 36 | //atop_half atop_half_nextafter(atop_half x, atop_half y); 37 | //atop_half atop_half_divmod(atop_half x, atop_half y, atop_half *modulus); 38 | // 39 | ///* 40 | // * Half-precision constants 41 | // */ 42 | // 43 | //#define ATOP_HALF_ZERO (0x0000u) 44 | //#define ATOP_HALF_PZERO (0x0000u) 45 | //#define ATOP_HALF_NZERO (0x8000u) 46 | //#define ATOP_HALF_ONE (0x3c00u) 47 | //#define ATOP_HALF_NEGONE (0xbc00u) 48 | //#define ATOP_HALF_PINF (0x7c00u) 49 | //#define ATOP_HALF_NINF (0xfc00u) 50 | //#define ATOP_HALF_NAN (0x7e00u) 51 | // 52 | //#define ATOP_MAX_HALF (0x7bffu) 53 | // 54 | ///* 55 | // * Bit-level conversions 56 | // */ 57 | 58 | //uint16_t atop_floatbits_to_halfbits(uint32_t f); 59 | //uint16_t atop_doublebits_to_halfbits(uint64_t d); 60 | //uint32_t atop_halfbits_to_floatbits(uint16_t h); 61 | //uint64_t atop_halfbits_to_doublebits(uint16_t h); 62 | 63 | #ifdef __cplusplus 64 | } 65 | #endif 66 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build,test 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | #schedule: 9 | # - cron: '0 0 * * 0' # weekly 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ${{ matrix.os }} 15 | defaults: 16 | run: 17 | shell: bash 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.6, 3.7, 3.8, 3.9] 22 | os: [ubuntu-latest, macos-latest, windows-latest] 23 | platform: [x64] 24 | include: 25 | - python-version: 3.8 26 | os: windows-latest 27 | platform: x86 28 | steps: 29 | - uses: actions/checkout@v2 30 | with: 31 | submodules: recursive 32 | - name: Set up Python ${{ matrix.python-version }} 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | architecture: ${{ matrix.platform }} 37 | - name: Install prerequisits 38 | run: | 
39 | set -ex 40 | python -m pip install --upgrade pip 41 | python -m pip install --progress-bar=off -r ci/requirements.txt 42 | echo $(python -c"import sys; print(sys.version)") 43 | - name: Build and install 44 | run: | 45 | set -ex 46 | python -m pip install . 47 | - name: Test 48 | run: | 49 | python -m pip install -r test_requirements.txt 50 | python -m pytest tests -vv --durations 10 51 | 52 | build_docs: 53 | runs-on: ubuntu-latest 54 | defaults: 55 | run: 56 | shell: bash 57 | 58 | steps: 59 | - name: Set up Python ${{ matrix.python-version }} 60 | uses: actions/setup-python@v2 61 | with: 62 | python-version: 3.8 63 | 64 | - uses: actions/checkout@v2 65 | with: 66 | submodules: recursive 67 | fetch-depth: 0 68 | 69 | - name: build docs 70 | run: | 71 | set -ex 72 | python -m pip install . 73 | pushd doc_src 74 | python -m pip install --progress-bar=off -r doc_requirements.txt 75 | make html 76 | popd 77 | - name: store docs 78 | uses: actions/upload-artifact@v2 79 | with: 80 | name: docs 81 | path: doc_src/build/html 82 | -------------------------------------------------------------------------------- /src/pnumpy/recarray.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | 4 | __all__ = [ 5 | 'recarray_to_colmajor'] 6 | 7 | from pnumpy._pnumpy import recarray_to_colmajor as _recarray_to_colmajor 8 | 9 | 10 | #----------------------------------------------------------------------------------------- 11 | def recarray_to_colmajor(item, parallel=True): 12 | """ 13 | Converts a numpy record array (void type) to a dictionary of numpy arrays, column major 14 | 15 | Returns 16 | ------- 17 | A dictionary of numpy arrays corresponding to the original numpy record array. 18 | 19 | Examples 20 | -------- 21 | >>> x=np.array([(1.0, 2, 3, 4, 5, 'this is a long test'), (3.0, 4, 5, 6, 7, 'short'), (30.0, 40, 50, 60, 70, '')], 22 | dtype=[('x', '>> item=np.tile(x,100_000) 24 | >>> mydict = recarray_to_colmajor(item) 25 | """ 26 | if item.dtype.char == 'V': 27 | # warnings.warn(f"Converting numpy record array. Performance may suffer.") 28 | # flip row-major to column-major 29 | list_types = [*item.dtype.fields.values()] 30 | success = True 31 | for t in list_types: 32 | val = t[0].char 33 | # if the record type has an object or another record type, we cannot handle it 34 | if val == 'O' or val =='V': 35 | success = False 36 | break 37 | 38 | d={} 39 | if success and parallel: 40 | offsets=[] 41 | arrays=np.empty(len(item.dtype.fields), dtype='O') 42 | arrlen = len(item) 43 | count = 0 44 | for name, v in item.dtype.fields.items(): 45 | offsets.append(v[1]) 46 | arr= np.empty(arrlen, dtype=v[0]) 47 | arrays[count] = arr 48 | count += 1 49 | # build dict of names and new arrays 50 | d[name] = arr 51 | 52 | # Call parallel routine to convert 53 | _recarray_to_colmajor(item, np.asarray(offsets, dtype=np.int64), arrays) 54 | 55 | else: 56 | # single thread way 57 | for name in item.dtype.names: 58 | d[name] = item[:][name].copy() 59 | return d 60 | 61 | warnings.warn("The array passed was not a numpy record array.") 62 | return item 63 | 64 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and we will try to give credit appropriately.
5 | 6 | # Bug reports 7 | 8 | Use the [issue tracker](https://github.com/Quansight/pnumpy/issues). 9 | Please include: 10 | 11 | * Your operating system name and version. 12 | * Any details about your local setup that might be helpful in troubleshooting. 13 | * Detailed steps to reproduce the bug. 14 | 15 | # Documentation improvements 16 | 17 | PNumPy could always use more documentation, whether as part of the 18 | official PNumPy docs, in docstrings, or even on the web in blog posts, 19 | articles, and such. 20 | 21 | # Feature requests and feedback 22 | 23 | The best way to send feedback is to file an issue on the issue tracker. 24 | 25 | If you are proposing a feature: 26 | 27 | * Explain in detail how it would work. 28 | * Keep the scope as narrow as possible, to make it easier to implement. 29 | * Remember that this is a volunteer-driven project, and that code contributions are welcome :) 30 | 31 | # Development 32 | 33 | To set up `pnumpy` for local development: 34 | 35 | 1. Fork [pnumpy](https://github.com/Quansight/pnumpy) 36 | (look for the "Fork" button). 37 | 2. Clone your fork locally: 38 | ``` 39 | git clone git@github.com:YOURGITHUBNAME/pnumpy.git 40 | ``` 41 | 42 | 3. Create a branch for local development: 43 | ``` 44 | git checkout -b name-of-your-bugfix-or-feature 45 | ``` 46 | 47 | Now you can make your changes locally. 48 | 49 | 4. When you're done making changes, run all the tests with: 50 | ``` 51 | python setup.py build_ext --inplace 52 | python -m pip install pytest 53 | python -m pytest tests 54 | ``` 55 | 56 | 5. Commit your changes and push your branch to GitHub: 57 | ``` 58 | git add . 59 | git commit -m "Your detailed description of your changes." 60 | git push origin name-of-your-bugfix-or-feature 61 | ``` 62 | 63 | 6. Submit a pull request through the GitHub website. 64 | 65 | ### Pull Request Guidelines 66 | 67 | If you need some code review or feedback while you're developing the code, just make the pull request. 68 | 69 | For merging, you should: 70 | 71 | 1. Update documentation when there's new API, functionality, etc. 72 | 2. Add a note to `CHANGELOG.rst` about the changes. 73 | 3. Add yourself to `AUTHORS.rst`. 74 | 75 | If you don't have all the necessary Python versions available 76 | locally, you can rely on CI - it will [run the 77 | tests](https://travis-ci.org/Quansight/pnumpy/pull_requests) 78 | for each change you add in the pull request. 79 | 80 | It will be slower though ... 81 | 82 | ### Tips 83 | 84 | To run a subset of tests: 85 | ``` 86 | python -m pytest -k test_myfeature 87 | ``` 88 | 89 | 90 | -------------------------------------------------------------------------------- /src/pnumpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PNumPy now calls ``init`` at startup to set up the package. This imports NumPy, 3 | replacing all the inner loops of UFuncs with wrapped versions. Then you can 4 | enable/disable any of the subsystems: 5 | 6 | - threading 7 | 8 | Threading will kick in when the number of elements to be processed is more 9 | than 50,000. It will break the operation into chunks. Each chunk will be 10 | executed in its own thread. 11 | 12 | - ledger 13 | 14 | The ledger records data on each loop execution to 15 | enable more accurate heuristics on memory allocation, threading behavior 16 | and reporting for logging and benchmarking.
17 | 18 | - recycler 19 | 20 | Once we can change the NumPy memory allocation strategy, we can use the 21 | data from the ledger to create more performant memory caches. 22 | 23 | - atop 24 | 25 | Provide faster implementations of NumPy inner loops. 26 | """ 27 | from ._version import __version__ 28 | __all__ = [ 29 | 'initialize', 'atop_enable', 'atop_disable', 'atop_isenabled', 'atop_info', 'atop_setworkers','cpustring', 30 | 'thread_enable', 'thread_disable', 'thread_isenabled', 'thread_getworkers', 'thread_setworkers', 'thread_zigzag', 31 | 'ledger_enable', 'ledger_disable', 'ledger_isenabled', 'ledger_info', 32 | 'recycler_enable', 'recycler_disable', 'recycler_isenabled', 'recycler_info', 33 | 'timer_gettsc','timer_getutc', 'benchmark', 'recarray_to_colmajor', 'init', 'enable', 'disable', 'cpu_count_linux', 34 | 'sort', 'lexsort' 35 | ] 36 | 37 | import numpy as np 38 | import numpy.core._multiarray_umath as umath 39 | 40 | # TODO check for Apple M1 chip (where AVX2 makes no sense) 41 | # TODO check for numpy version 42 | 43 | try: 44 | # Numpy 1.18 does not have __cpu_features 45 | # If we cannot find it, we load anyway because 95% have AVX2 46 | # and we can hook numpy 1.18 ufuncs 47 | # TODO: check for Apple M1 chip 48 | __hasavx2 = umath.__cpu_features__['AVX2'] 49 | except Exception: 50 | __hasavx2 = True 51 | 52 | if not __hasavx2: 53 | raise ValueError(f"PNumPy requires a CPU with AVX2 capability to work") 54 | 55 | del __hasavx2 56 | 57 | import pnumpy._pnumpy as _pnumpy 58 | from pnumpy._pnumpy import atop_enable, atop_disable, atop_isenabled, atop_info, atop_setworkers, cpustring 59 | from pnumpy._pnumpy import thread_enable, thread_disable, thread_isenabled, thread_getworkers, thread_setworkers, thread_zigzag 60 | from pnumpy._pnumpy import timer_gettsc, timer_getutc 61 | from pnumpy._pnumpy import ledger_enable, ledger_disable, ledger_isenabled, ledger_info 62 | from pnumpy._pnumpy import recycler_enable, recycler_disable, recycler_isenabled, recycler_info 63 | from pnumpy._pnumpy import getitem, lexsort32, lexsort64 64 | 65 | from .cpu import cpu_count_linux, init, enable, disable 66 | from .sort import sort, lexsort, argsort, argmin, argmax, searchsorted 67 | from .benchmark import benchmark, benchmark_func 68 | from .recarray import recarray_to_colmajor 69 | 70 | # to be removed 71 | def initialize(): 72 | init() 73 | 74 | # start the engine by default 75 | # TODO: check environment variable 76 | init() 77 | -------------------------------------------------------------------------------- /benchmarks/README.rst: -------------------------------------------------------------------------------- 1 | .. -*- rst -*- 2 | 3 | ========== 4 | Benchmarks 5 | ========== 6 | 7 | This package uses `Airspeed Velocity`_ for benchmarks. The benchmarks are adapted 8 | from the ones in the `NumPy github repo`_ 9 | 10 | 11 | Usage 12 | ----- 13 | 14 | Airspeed Velocity manages building and Python virtualenvs by itself. 15 | 16 | Before beginning, ensure that *airspeed velocity* is installed. 17 | By default, `asv` ships with support for anaconda and virtualenv:: 18 | 19 | pip install asv 20 | pip install virtualenv 21 | 22 | After contributing new benchmarks, you should test them locally 23 | before submitting a pull request. 24 | 25 | To run all benchmarks, navigate to the top-level repo directory via the command 26 | line and execute:: 27 | 28 | asv run 29 | 30 | The first time this is run, it will build a profile of the machine. 
The 31 | information is stored in a top-level `.asv` directory that will be ignored by 32 | git. (Note: running benchmarks could take a while. Each benchmark is run 33 | multiple times to measure the distribution in execution times.) 34 | 35 | To run benchmarks across a series of git commits, `asv` supports git-like 36 | syntax. For example to run benchmarks on all commits on a branch off `main`, 37 | do:: 38 | asv run main..mybranch 39 | 40 | To view benchmarks once run, use ``asv show ``:: 41 | 42 | asv show main 43 | 44 | This will display the results in plain text in the console. For a graphical 45 | view, you can create html via ``asv publish`` and then view the result with 46 | ``asv preview``. 47 | 48 | More on how to use ``asv`` can be found in `ASV documentation`_ 49 | Command-line help is available as usual via ``asv --help`` and 50 | ``asv run --help``. 51 | 52 | .. _ASV documentation: https://asv.readthedocs.io/ 53 | 54 | 55 | Writing benchmarks 56 | ------------------ 57 | 58 | See `Airspeed Velocity`_ documentation for basics on how to write benchmarks. 59 | 60 | Some things to consider: 61 | 62 | - The benchmark suite should be importable with any version of the project. 63 | 64 | - The benchmark parameters etc. should not depend on which version is 65 | installed. 66 | 67 | - Try to keep the runtime of the benchmark reasonable. 68 | 69 | - Prefer ASV's ``time_`` methods for benchmarking times rather than cooking up 70 | time measurements via ``time.clock``, even if it requires some juggling when 71 | writing the benchmark. 72 | 73 | - Preparing arrays etc. should generally be put in the ``setup`` method rather 74 | than the ``time_`` methods, to avoid counting preparation time together with 75 | the time of the benchmarked operation. 76 | 77 | - Be mindful that large arrays created with ``np.empty`` or ``np.zeros`` might 78 | not be allocated in physical memory until the memory is accessed. If this is 79 | desired behaviour, make sure to comment it in your setup function. If 80 | you are benchmarking an algorithm, it is unlikely that a user will be 81 | executing said algorithm on a newly created empty/zero array. One can force 82 | pagefaults to occur in the setup phase either by calling ``np.ones`` or 83 | ``arr.fill(value)`` after creating the array, 84 | 85 | .. _`Airspeed Velocity`: https://asv.readthedocs.io/ 86 | .. _`NumPy github repo`: https://github.com/numpy/numpy 87 | -------------------------------------------------------------------------------- /tests/test_ufuncs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | HAVE_PNUMPY = True 3 | try: 4 | import pnumpy 5 | except Exception: 6 | HAVE_PNUMPY = False 7 | 8 | import pytest 9 | 10 | # pnumpy.initialize() is called from conftest.py 11 | 12 | 13 | def type2dtype(types): 14 | """ 15 | Maps the ufunc.type to the input, output dtypes. 16 | type2dtype('ii->?') -> (np.int32, np.int32), (np.bool_,) 17 | """ 18 | inp, out = types.split('->') 19 | return tuple(np.dtype(c) for c in inp), tuple(np.dtype(c) for c in out) 20 | 21 | 22 | def get_ufuncs_and_types(): 23 | """Create a dictionary with keys of ufunc names and values of all the 24 | supported type signatures 25 | """ 26 | ufuncs = [x for x in dir(np) if isinstance(getattr(np, x), np.ufunc)] 27 | if 'matmul' in ufuncs: 28 | ufuncs.remove('matmul') 29 | # Maybe use a collections.defaultdict instead? 
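    # One possible alternative, sketched only as a comment and not used here:
    # a collections.defaultdict(list) would avoid pre-seeding every key, e.g.
    #
    #     from collections import defaultdict
    #     ret = defaultdict(list)
    #     for name in ufuncs:
    #         ret[name] = [type2dtype(t) for t in getattr(np, name).types]
    #
    # The explicit dict construction below keeps the key set fixed to the
    # discovered ufunc names and never creates entries on a stray lookup.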
30 | ret = dict([[x, []] for x in ufuncs]) 31 | for s in ret: 32 | ret[s] = [type2dtype(t) for t in getattr(np, s).types] 33 | return ret 34 | 35 | 36 | def fill_random(a, rng): 37 | """Fill an ndarray with random values. This will uniformly cover the 38 | bit-valued number space, which in the case of floats is differnt from 39 | rng.uniform() 40 | """ 41 | if a.dtype == 'object': 42 | v = a.reshape(-1) 43 | # Slow !!! 44 | for i in range(v.size): 45 | v[i] = float(rng._bit_generator.random_raw(1)) 46 | else: 47 | v = a.view(np.uint64) 48 | v[:] = rng._bit_generator.random_raw(v.size).reshape(v.shape) 49 | return a 50 | 51 | typemap = get_ufuncs_and_types() 52 | 53 | def data(in_dtypes, out_dtypes, shape, rng): 54 | """ Return two tuples: input and output, with random data dtypes and 55 | shape 56 | """ 57 | ret_in = [fill_random(np.empty(shape, dtype=d), rng) for d in in_dtypes] 58 | ret_out = tuple([fill_random(np.empty(shape, dtype=d), rng) for d in out_dtypes]) 59 | return ret_in, ret_out 60 | 61 | @pytest.mark.filterwarnings("ignore::RuntimeWarning") 62 | @pytest.mark.parametrize(['name', 'types'], ([k, v] for k,v in typemap.items())) 63 | def test_threads(name, types, initialize_pnumpy, rng): 64 | """ Test that enabling the threading does not change the results 65 | """ 66 | if HAVE_PNUMPY: 67 | ufunc = getattr(np, name) 68 | for in_dtypes, out_dtypes in types: 69 | # Skip object dtypes 70 | if any([o == 'object' for o in out_dtypes]): 71 | continue 72 | if any([o == 'object' for o in in_dtypes]): 73 | continue 74 | in_data, out_data = data(in_dtypes, out_dtypes, [1024, 1024], rng) 75 | if (name in ('power',) and 76 | issubclass(in_data[1].dtype.type, np.integer)): 77 | in_data[1] = np.abs(in_data[1]) 78 | in_data[1][in_data[1] < 0] = 0 79 | if len(out_data) == 1: 80 | out_data = out_data[0] 81 | out1 = ufunc(*in_data, out=out_data) 82 | pnumpy.thread_enable() 83 | assert pnumpy.thread_isenabled() 84 | out2 = ufunc(*in_data, out=out_data) 85 | pnumpy.thread_disable() 86 | # may not work on datetime 87 | if not any([o == 'datetime64' for o in out_dtypes]) and not any([o == 'timedelta64' for o in out_dtypes]): 88 | np.testing.assert_allclose(out1, out2, equal_nan=True) 89 | -------------------------------------------------------------------------------- /src/atop/atop.cpp: -------------------------------------------------------------------------------- 1 | #include "atop.h" 2 | #include "threads.h" 3 | 4 | #define LOGGING(...) 
5 | #define LOGERROR printf 6 | 7 | //---------------------------------------------------------------------------------- 8 | // Lookup to go from 1 byte to 8 byte boolean values 9 | int64_t gBooleanLUT64[256]; 10 | int32_t gBooleanLUT32[16]; 11 | 12 | int64_t gBooleanLUT64Inverse[256]; 13 | int32_t gBooleanLUT32Inverse[16]; 14 | 15 | void* g_cMathWorker = NULL; 16 | 17 | // Keep track of stats 18 | static int64_t g_TotalAllocs = 0; 19 | static int64_t g_TotalFree = 0; 20 | static int64_t g_TotalMemoryAllocated = 0; 21 | static int64_t g_TotalMemoryFreed = 0; 22 | 23 | #define MAGIC_PAGE_GUARD 0xDEADBEEFDEADBEEF 24 | //----------------------------------------------- 25 | void* FmAlloc(size_t _Size) { 26 | // make thread safe 27 | uint64_t* pageGuard = (uint64_t*)malloc(_Size + 16); 28 | if (pageGuard) { 29 | InterlockedIncrement64(&g_TotalAllocs); 30 | InterlockedAdd64(&g_TotalMemoryAllocated, _Size); 31 | pageGuard[0] = _Size; 32 | pageGuard[1] = MAGIC_PAGE_GUARD; 33 | 34 | // Skip past guard 35 | return &pageGuard[2]; 36 | } 37 | return NULL; 38 | } 39 | 40 | void FmFree(void* _Block) { 41 | // The C standard requires that free() be a no-op when called with nullptr. 42 | // FmAlloc can return a nullptr, and since we want this function to behave 43 | // like free() we also need to handle the nullptr case here. 44 | if (!_Block) { return; } 45 | 46 | //LOGRECYCLE("Freeing %p\n", _Block); 47 | InterlockedIncrement64(&g_TotalFree); 48 | uint64_t* pageGuard = (uint64_t*)_Block; 49 | pageGuard--; 50 | pageGuard--; 51 | if (pageGuard[1] != MAGIC_PAGE_GUARD) { 52 | LOGERROR("!! User freed bad memory, no page guard %p\n", pageGuard); 53 | } 54 | else { 55 | InterlockedAdd64(&g_TotalMemoryFreed, pageGuard[0]); 56 | // mark so cannot free again 57 | pageGuard[1] = 0; 58 | } 59 | 60 | free(pageGuard); 61 | } 62 | 63 | //==================================================== 64 | // Must be called to initialize atop 65 | // Will start threads and detect the CPU 66 | // Will build runtime lookup tables 67 | // NOTE: return FALSE if ALREADY initialized 68 | BOOL atop_init() { 69 | 70 | // Check if init already called 71 | if (g_cMathWorker) return FALSE; 72 | 73 | // Build LUTs used in comarisons after mask generated 74 | for (int i = 0; i < 256; i++) { 75 | BYTE* pDest = (BYTE*)&gBooleanLUT64[i]; 76 | for (int j = 0; j < 8; j++) { 77 | *pDest++ = ((i >> j) & 1); 78 | } 79 | } 80 | // Build LUTs 81 | for (int i = 0; i < 16; i++) { 82 | BYTE* pDest = (BYTE*)&gBooleanLUT32[i]; 83 | for (int j = 0; j < 4; j++) { 84 | *pDest++ = ((i >> j) & 1); 85 | } 86 | } 87 | 88 | // Build LUTs 89 | for (int i = 0; i < 256; i++) { 90 | gBooleanLUT64Inverse[i] = gBooleanLUT64[i] ^ 0x0101010101010101LL; 91 | } 92 | // Build LUTs 93 | for (int i = 0; i < 16; i++) { 94 | gBooleanLUT32Inverse[i] = gBooleanLUT32[i] ^ 0x01010101; 95 | } 96 | 97 | g_cMathWorker = new CMathWorker(); 98 | 99 | // start up the worker threads now in case we use them 100 | THREADER->StartWorkerThreads(0); 101 | 102 | LOGGING("ATOP loaded\n"); 103 | return TRUE; 104 | } 105 | 106 | -------------------------------------------------------------------------------- /src/pnumpy/arange.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include "../atop/threads.h" 4 | 5 | //------------------------------------------------------ 6 | // The arange routine is largely copied from numpy 7 | // 8 | #define error_converting(x) (((x) == -1) && PyErr_Occurred()) 9 | 10 | /* 11 | * Like 
ceil(value), but check for overflow. 12 | * 13 | * Return 0 on success, -1 on failure. In case of failure, set a PyExc_Overflow 14 | * exception 15 | */ 16 | static npy_intp 17 | _arange_safe_ceil_to_intp(double value) 18 | { 19 | double ivalue; 20 | 21 | ivalue = ceil(value); 22 | /* condition inverted to handle NaN */ 23 | if (isnan(ivalue)) { 24 | PyErr_SetString(PyExc_ValueError, 25 | "arange: cannot compute length"); 26 | return -1; 27 | } 28 | if (!(NPY_MIN_INTP <= ivalue && ivalue <= NPY_MAX_INTP)) { 29 | PyErr_SetString(PyExc_OverflowError, 30 | "arange: overflow while computing length"); 31 | return -1; 32 | } 33 | 34 | return (npy_intp)ivalue; 35 | } 36 | 37 | /*NUMPY_API 38 | Arange, 39 | */ 40 | PyObject* 41 | PArange(double start, double stop, double step, int type_num) 42 | { 43 | npy_intp length; 44 | PyArrayObject* range; 45 | PyArray_ArrFuncs* funcs; 46 | PyObject* obj; 47 | int ret; 48 | double delta, tmp_len; 49 | NPY_BEGIN_THREADS_DEF; 50 | 51 | delta = stop - start; 52 | tmp_len = delta / step; 53 | 54 | /* Underflow and divide-by-inf check */ 55 | if (tmp_len == 0.0 && delta != 0.0) { 56 | if (signbit(tmp_len)) { 57 | length = 0; 58 | } 59 | else { 60 | length = 1; 61 | } 62 | } 63 | else { 64 | length = _arange_safe_ceil_to_intp(tmp_len); 65 | if (error_converting(length)) { 66 | return NULL; 67 | } 68 | } 69 | 70 | if (length <= 0) { 71 | length = 0; 72 | return PyArray_New(&PyArray_Type, 1, &length, type_num, 73 | NULL, NULL, 0, 0, NULL); 74 | } 75 | range = (PyArrayObject*)PyArray_New(&PyArray_Type, 1, &length, type_num, 76 | NULL, NULL, 0, 0, NULL); 77 | if (range == NULL) { 78 | return NULL; 79 | } 80 | funcs = PyArray_DESCR(range)->f; 81 | 82 | /* 83 | * place start in the buffer and the next value in the second position 84 | * if length > 2, then call the inner loop, otherwise stop 85 | */ 86 | obj = PyFloat_FromDouble(start); 87 | ret = funcs->setitem(obj, PyArray_DATA(range), range); 88 | Py_DECREF(obj); 89 | if (ret < 0) { 90 | goto fail; 91 | } 92 | if (length == 1) { 93 | return (PyObject*)range; 94 | } 95 | obj = PyFloat_FromDouble(start + step); 96 | ret = funcs->setitem(obj, PyArray_BYTES(range) + PyArray_ITEMSIZE(range), 97 | range); 98 | Py_DECREF(obj); 99 | if (ret < 0) { 100 | goto fail; 101 | } 102 | if (length == 2) { 103 | return (PyObject*)range; 104 | } 105 | if (!funcs->fill) { 106 | PyErr_SetString(PyExc_ValueError, 107 | "no fill-function for data-type."); 108 | Py_DECREF(range); 109 | return NULL; 110 | } 111 | NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(range)); 112 | funcs->fill(PyArray_DATA(range), length, range); 113 | NPY_END_THREADS; 114 | if (PyErr_Occurred()) { 115 | goto fail; 116 | } 117 | return (PyObject*)range; 118 | 119 | fail: 120 | Py_DECREF(range); 121 | return NULL; 122 | } 123 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at info@quansight.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /src/pnumpy/conversions.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #define LOGGING(...) 
4 | 5 | // TODO: look at casting in ndarraytypes.h, convert_datatype.c 6 | //NPY_NO_EXPORT PyArray_VectorUnaryFunc* 7 | //PyArray_GetCastFunc(PyArray_Descr* descr, int type_num) 8 | //{ 9 | // PyArray_VectorUnaryFunc* castfunc = NULL; 10 | // 11 | // if (type_num < NPY_NTYPES_ABI_COMPATIBLE) { 12 | // castfunc = descr->f->cast[type_num]; 13 | 14 | //----------------------------------- 15 | // Converts (in parallel) a numpy recarray (void type) 16 | // Caller must have PREALLOCATE the colmajor arrays to copy data into 17 | // Caller must also pass the struct offsets (within the recarray) 18 | // 19 | // Input1: the recordarray to convert 20 | // Input2: int64 array of offsets 21 | // Input3: object array of numpy arrays pre allocated that match in order the recarray 22 | extern "C" 23 | PyObject* 24 | recarray_to_colmajor(PyObject* self, PyObject* args) { 25 | 26 | PyArrayObject* inArr = NULL; 27 | PyArrayObject* offsetArr = NULL; 28 | PyArrayObject* arrArr = NULL; 29 | 30 | //if (!PyArg_ParseTuple(args, "O!O!O!:recarray_to_colmajor", 31 | // &PyArray_Type, &inArr, 32 | // &PyArray_Type, &offsetArr, 33 | // &PyArray_Type, &arrArr)) { 34 | // return NULL; 35 | //} 36 | 37 | if (PyTuple_Size(args) == 3) { 38 | inArr = (PyArrayObject*)PyTuple_GetItem(args, 0); 39 | offsetArr = (PyArrayObject*)PyTuple_GetItem(args, 1); 40 | arrArr = (PyArrayObject*)PyTuple_GetItem(args, 2); 41 | } 42 | else { 43 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor must input 3 numpy arrays"); 44 | return NULL; 45 | } 46 | 47 | int64_t itemSize = PyArray_ITEMSIZE(inArr); 48 | 49 | if (itemSize != PyArray_STRIDE(inArr, 0)) { 50 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor cannot yet handle strides"); 51 | return NULL; 52 | } 53 | 54 | if (NPY_VOID != PyArray_TYPE(inArr)) { 55 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor must be void type"); 56 | return NULL; 57 | } 58 | 59 | if (NPY_OBJECT != PyArray_TYPE(arrArr)) { 60 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor third param must be object array"); 61 | return NULL; 62 | } 63 | 64 | int64_t length = ArrayLength(inArr); 65 | int64_t numArrays = ArrayLength(arrArr); 66 | 67 | if (numArrays != ArrayLength(offsetArr)) { 68 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor inputs do not match"); 69 | return NULL; 70 | } 71 | 72 | int64_t totalRows = length; 73 | int64_t* pOffsets = (int64_t*)PyArray_BYTES(offsetArr); 74 | PyArrayObject** ppArrays = (PyArrayObject**)PyArray_BYTES(arrArr); 75 | 76 | stRecarrayOffsets* pstOffset; 77 | 78 | // TODO allocate this on the stack 79 | pstOffset = (stRecarrayOffsets*)WORKSPACE_ALLOC(sizeof(stRecarrayOffsets) * numArrays); 80 | 81 | for (int64_t i = 0; i < numArrays; i++) { 82 | // Consider adding pOffsets here 83 | pstOffset[i].pData = PyArray_BYTES(ppArrays[i]); 84 | pstOffset[i].readoffset = pOffsets[i]; 85 | pstOffset[i].itemsize = PyArray_ITEMSIZE(ppArrays[i]); 86 | } 87 | 88 | char* pStartOffset = PyArray_BYTES(inArr); 89 | 90 | // Call atop to finish the work 91 | RecArrayToColMajor( 92 | pstOffset, 93 | pStartOffset, 94 | totalRows, 95 | numArrays, 96 | itemSize); 97 | 98 | WORKSPACE_FREE(pstOffset); 99 | 100 | RETURN_NONE; 101 | } 102 | 103 | -------------------------------------------------------------------------------- /asv.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // The version of the config file format. Do not change, unless 3 | // you know what you are doing. 
4 | "version": 1, 5 | 6 | // The name of the project being benchmarked 7 | "project": "pnumpy", 8 | 9 | // The project's homepage 10 | "project_url": "https://quansight.github.io/pnumpy/stable/index.html", 11 | 12 | // The URL or local path of the source code repository for the 13 | // project being benchmarked 14 | "repo": ".", 15 | 16 | // List of branches to benchmark. If not provided, defaults to "master" 17 | // (for git) or "tip" (for mercurial). 18 | "branches": ["main"], 19 | 20 | // The DVCS being used. If not set, it will be automatically 21 | // determined from "repo" by looking at the protocol in the URL 22 | // (if remote), or by looking for special directories, such as 23 | // ".git" (if local). 24 | "dvcs": "git", 25 | 26 | // The tool to use to create environments. May be "conda", 27 | // "virtualenv" or other value depending on the plugins in use. 28 | // If missing or the empty string, the tool will be automatically 29 | // determined by looking for tools on the PATH environment 30 | // variable. 31 | "environment_type": "virtualenv", 32 | 33 | // the base URL to show a commit for the project. 34 | "show_commit_url": "https://github.com/Qaunsight/pnumpy.git", 35 | 36 | // The Pythons you'd like to test against. If not provided, defaults 37 | // to the current version of Python used to run `asv`. 38 | //"pythons": ["3.7"], 39 | 40 | // The matrix of dependencies to test. Each key is the name of a 41 | // package (in PyPI) and the values are version numbers. An empty 42 | // list indicates to just test against the default (latest) 43 | // version. 44 | "matrix": { 45 | "numpy": [], 46 | }, 47 | 48 | // The directory (relative to the current directory) that benchmarks are 49 | // stored in. If not provided, defaults to "benchmarks" 50 | "benchmark_dir": "benchmarks", 51 | 52 | // The directory (relative to the current directory) to cache the Python 53 | // environments in. If not provided, defaults to "env" 54 | "env_dir": ".asv/env", 55 | 56 | 57 | // The directory (relative to the current directory) that raw benchmark 58 | // results are stored in. If not provided, defaults to "results". 59 | "results_dir": ".asv/results", 60 | 61 | // The directory (relative to the current directory) that the html tree 62 | // should be written to. If not provided, defaults to "html". 63 | "html_dir": "html", 64 | 65 | // The number of characters to retain in the commit hashes. 66 | // "hash_length": 8, 67 | 68 | // `asv` will cache wheels of the recent builds in each 69 | // environment, making them faster to install next time. This is 70 | // number of builds to keep, per environment. 71 | "build_cache_size": 8, 72 | 73 | // The commits after which the regression search in `asv publish` 74 | // should start looking for regressions. Dictionary whose keys are 75 | // regexps matching to benchmark names, and values corresponding to 76 | // the commit (exclusive) after which to start looking for 77 | // regressions. The default is to start from the first commit 78 | // with results. If the commit is `null`, regression detection is 79 | // skipped for the matching benchmark. 80 | // 81 | // "regressions_first_commits": { 82 | // "some_benchmark": "352cdf", // Consider regressions only after this commit 83 | // "another_benchmark": null, // Skip regression detection altogether 84 | // } 85 | } 86 | -------------------------------------------------------------------------------- /doc_src/source/index.rst: -------------------------------------------------------------------------------- 1 | .. 
pnumpy documentation master file, created by 2 | sphinx-quickstart on Thu Oct 22 12:01:26 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to PNumPy's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | roadmap 14 | installation 15 | use 16 | 17 | PNumPy seamlessly speeds up NumPy for large arrays (64K+ elements) with *no 18 | change required to your existing NumPy code*. 19 | 20 | This first release speeds up NumPy binary and unary ufuncs such as ``add``, 21 | ``multiply``, ``isnan``, ``abs``, ``sin``, ``log``, ``sum``, ``min`` and many more. 22 | Sped up functions also include: ``sort``, ``argsort``, ``lexsort``, boolean indexing, and 23 | fancy indexing. In the near future we will speed up: ``astype``, ``where``, ``putmask``, 24 | ``arange``, ``searchsorted``. 25 | 26 | Installation 27 | ------------ 28 | 29 | .. code-block:: python 30 | 31 | pip install pnumpy 32 | 33 | To use the project: 34 | 35 | .. code-block:: python 36 | 37 | import pnumpy as pn 38 | 39 | 40 | PNumPy speeds up NumPy silently under the hood. To see some benchmarks 41 | yourself :ref:`run ASV ` or use the built-in :ref:`benchmark 42 | ` function: 43 | 44 | .. code-block:: python 45 | 46 | pn.benchmark() 47 | 48 | .. image:: ../images/bench4graph2.PNG 49 | .. image:: ../images/bench4graph3.PNG 50 | 51 | To get a partial list of functions sped up run 52 | 53 | .. code-block:: python 54 | 55 | pn.atop_info() 56 | 57 | To disable or enable pnumpy run 58 | 59 | .. code-block:: python 60 | 61 | pn.disable() 62 | pn.enable() 63 | 64 | To cap the number of additional worker threads to 3 run 65 | 66 | .. code-block:: python 67 | 68 | pn.thread_setworkers(3) 69 | 70 | .. _ASV: https://asv.readthedocs.io/en/stable/using.html 71 | 72 | 73 | Additional Functionality 74 | ------------------------ 75 | 76 | PNumPy provides additional routines such as converting a NumPy record array to a column major array in parallel (**pn.recarray_to_colmajor**) which is useful for DataFrames. Other routines include **pn.lexsort32**, which performs an indirect sort using **np.int32** instead of **np.int64** consuming half the memory and running faster. 77 | 78 | Threading 79 | --------- 80 | 81 | PNumPy uses a combination of threads and 256 bit vector intrinsics to speed up calculations. By default most operations will only use 3 additional worker threads in combination with the main python thread for a total 4. Large arrays are divided up into 16K chunks and threads are assigned to maintain cache coherency. More threads are dynamically deployed for more intensive CPU problems like **np.sin**. Users can customize threading. The example below shows how 4 threads can work together to quadruple the effective L2 cache size. 82 | 83 | .. image:: ../images/threading_npadd.PNG 84 | 85 | 86 | FAQ 87 | --- 88 | 89 | **Q: If I type np.sort(a) where a is an array, will it be sped up?** 90 | 91 | *A: If len(a) > 65536 and pnumpy has been imported, it will automatically be sped up* 92 | 93 | **Q: How is sort sped up?** 94 | 95 | *A: PNumPy uses additional threads to divide up the sorting job. For example it might perform an 8 way quicksort followed by a 4 way mergesort* 96 | 97 | Development 98 | ----------- 99 | 100 | To run all the tests run: 101 | 102 | .. 
code-block:: python 103 | 104 | python -m pip install pytest 105 | python -m pytest tests 106 | 107 | 108 | Indices and tables 109 | ================== 110 | 111 | * :ref:`genindex` 112 | * :ref:`modindex` 113 | * :ref:`search` 114 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | # publish when a (published) GitHub Release is created 3 | on: 4 | release: 5 | types: 6 | - published 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels ${{ matrix.os }}, ${{ matrix.platform }} 11 | runs-on: ${{ matrix.os }} 12 | defaults: 13 | run: 14 | shell: bash 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | platform: [x64] 19 | python-version: [3.7] # changed from [3.6, 3.7, 3.8] 20 | exclude: 21 | - os: macos-latest 22 | platform: x32 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | submodules: recursive 27 | - name: Set up python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v2 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | 32 | - name: Install cibuildwheel 33 | run: | 34 | python -m pip install cibuildwheel==1.6.3 35 | - name: Build wheels on ${{ matrix.os}} 36 | run: | 37 | RUNNER_OS="${{ runner.os }}" 38 | PLATFORM="${{ matrix.platform }}" 39 | echo $RUNNER_OS 40 | echo $PLATFORM 41 | if [ "$PLATFORM" == "x64" ]; then 42 | export CIBW_SKIP="cp27-* cp35-* *-win32 *-manylinux_i686 *manylinux_aarch64 *manylinux_ppc64le *manylinux_s390x" 43 | elif [ "$PLATFORM" == "x32" ]; then 44 | export CIBW_SKIP="cp27-* cp35-*" 45 | fi 46 | # to exclude manylinux_aarch64 manylinux_ppc64le manylinux_s390x 47 | if [ "$RUNNER_OS" == "Windows" ]; then 48 | if [ "$PLATFORM" == "x64" ]; then 49 | export CIBW_BUILD="cp37-win_amd64" 50 | elif [ "$PLATFORM" == "x32" ]; then 51 | export CIBW_BUILD="cp37-win32" 52 | fi 53 | elif [ "$RUNNER_OS" == "Linux" ]; then 54 | if [ "$PLATFORM" == "x64" ]; then 55 | export CIBW_BUILD="cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64" 56 | elif [ "$PLATFORM" == "x32" ]; then 57 | export CIBW_BUILD="cp36-manylinux_i686 cp37-manylinux_i686 cp38-manylinux_i686" 58 | fi 59 | elif [ "$RUNNER_OS" == "macOS" ]; then 60 | export CIBW_BUILD="cp37-macosx_x86_64" 61 | fi 62 | python -m cibuildwheel --output-dir wheelhouse 63 | - uses: actions/upload-artifact@v2 64 | with: 65 | path: ./wheelhouse/*.whl 66 | 67 | build_sdist: 68 | name: Build source distribution 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v2 72 | 73 | - uses: actions/setup-python@v2 74 | name: Install Python 75 | with: 76 | python-version: '3.7' 77 | 78 | - name: Build sdist 79 | run: | 80 | python -m pip install numpy>=1.19.0 81 | python setup.py sdist 82 | - uses: actions/upload-artifact@v2 83 | with: 84 | path: dist/*.tar.gz 85 | 86 | upload_pypi: 87 | needs: [build_wheels, build_sdist] 88 | runs-on: ubuntu-latest 89 | # publish when a GitHub Release is created 90 | if: github.event_name == 'release' && github.event.action == 'published' 91 | steps: 92 | - uses: actions/download-artifact@v2 93 | with: 94 | name: artifact 95 | path: dist 96 | 97 | - uses: pypa/gh-action-pypi-publish@master 98 | with: 99 | user: __token__ 100 | password: ${{ secrets.test_pypi }} # switch to non-test pwd after testing 101 | repository_url: https://test.pypi.org/legacy/ # remove line after testing 102 | 
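# Note on the wheel selection in the "Build wheels" step above: with those
# CIBW_BUILD / CIBW_SKIP values, x64 Windows runners build only the cp37
# win_amd64 wheel, x64 Linux runners build cp36/cp37/cp38 manylinux_x86_64
# wheels, and macOS runners build the cp37 macosx_x86_64 wheel; cp27, cp35,
# 32-bit and non-x86 manylinux targets are always skipped.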
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PNumPy 2 | Parallel NumPy seamlessly speeds up NumPy for large arrays (64K+ elements) with *no change required to your existing NumPy code*. 3 | 4 | PNumPy supports Linux, Windows, and MacOS for NumPy >= 1.18 for python 3.6, 3.7, 3.8, and 3.9. 5 | 6 | This first release speeds up NumPy binary and unary ufuncs such as **add, multiply, isnan, abs, sin, log, sum, min and many more**. 7 | Sped up functions also include: **sort, argsort, lexsort, arange, boolean indexing, and fancy indexing**. 8 | In the near future we will speed up: **astype, where, putmask, and searchsorted**. 9 | 10 | Other packages that use numpy, such as [scikit-learn](https://scikit-learn.org/stable/) or [pandas](https://github.com/pandas-dev/pandas), will also be sped up for large arrays. 11 | 12 | [![CI Status](https://github.com/Quansight/pnumpy/workflows/tox/badge.svg)](https://github.com/Quansight/pnumpy/actions) 13 | 14 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 15 | 16 | ## Installation 17 | ``` 18 | pip install pnumpy 19 | ``` 20 | 21 | You can also install the latest development versions with 22 | ``` 23 | pip install https://github.com/Quansight/pnumpy/archive/main.zip 24 | ``` 25 | 26 | ## Documentation 27 | 28 | See the [full documentation](https://quansight.github.io/pnumpy/stable/index.html) 29 | 30 | To use the project: 31 | 32 | ```python 33 | import pnumpy as pn 34 | ``` 35 | 36 | Parallel NumPy speeds up NumPy silently under the hood. To see some benchmarks yourself run 37 | ``` 38 | pn.benchmark() 39 | ``` 40 | ![plot](./doc_src/images/bench4graph2.PNG) 41 | ![plot](./doc_src/images/bench4graph3.PNG) 42 | 43 | To get a partial list of functions sped up run 44 | ``` 45 | pn.atop_info() 46 | ``` 47 | 48 | To disable or enable pnumpy run 49 | ``` 50 | pn.disable() 51 | pn.enable() 52 | ``` 53 | 54 | ## Additional Functionality 55 | PNumPy provides additional routines such as converting a NumPy record array to a column major array in parallel (**pn.recarray_to_colmajor**) which is useful for DataFrames. Other routines include **pn.lexsort32**, which performs an indirect sort using **np.int32** instead of **np.int64** consuming half the memory and running faster. 56 | 57 | ## Threading 58 | PNumPy uses a combination of threads and 256 bit vector intrinsics to speed up calculations. By default most operations will only use 3 additional worker threads in combination with the main python thread for a total 4. Large arrays are divided up into 16K chunks and threads are assigned to maintain cache coherency. More threads are dynamically deployed for more intensive CPU problems like **np.sin**. Users can customize threading. The example below shows how 4 threads can work together to quadruple the effective L2 cache size. 
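For a quick, machine-dependent check on your own system, you can time a ufunc with the worker threads toggled off and on using the `pn.thread_disable()` / `pn.thread_enable()` switches described below. This is only a minimal sketch: the array size and iteration count are arbitrary choices, and the measured ratio will vary with CPU, core count, and cache sizes.

```python
import timeit
import numpy as np
import pnumpy as pn

a = np.arange(1_000_000, dtype=np.float64)
b = a.copy()

pn.thread_disable()                      # baseline: main thread only
t_single = timeit.timeit(lambda: np.add(a, b), number=200)

pn.thread_enable()                       # default: worker threads + main thread
t_threaded = timeit.timeit(lambda: np.add(a, b), number=200)

print(f"single: {t_single:.3f}s  threaded: {t_threaded:.3f}s  "
      f"speedup: {t_single / t_threaded:.2f}x")
```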
59 | 60 | ![plot](./doc_src/images/threading_npadd.PNG) 61 | 62 | To cap the number of additional worker threads to 3 run 63 | ``` 64 | pn.thread_setworkers(3) 65 | ``` 66 | 67 | To disable or re-enable threading run 68 | ``` 69 | pn.thread_disable() 70 | pn.thread_enable() 71 | ``` 72 | 73 | To disable or re-enable just the atop engine run 74 | ``` 75 | pn.atop_disable() 76 | pn.atop_enable() 77 | ``` 78 | 79 | ## FAQ 80 | **Q: If I type np.sort(a) where a is an array, will it be sped up?** 81 | 82 | *A: If len(a) > 65536 and pnumpy has been imported, it will automatically be sped up* 83 | 84 | **Q: How is sort sped up?** 85 | 86 | *A: PNumPy uses additional threads to divide up the sorting job. For example it might perform an 8 way quicksort followed by a 4 way mergesort* 87 | 88 | **Q: How is scikit or pandas sped up?** 89 | 90 | *A: PNumPy's vector loops and threads will speed up any package that uses large NumPy arrays* 91 | 92 | ## Development 93 | 94 | To run all the tests run: 95 | 96 | ``` 97 | python -m pip install pytest 98 | python -m pytest tests 99 | ``` 100 | -------------------------------------------------------------------------------- /src/pnumpy/cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | import glob 5 | 6 | import pnumpy._pnumpy as _pnumpy 7 | from pnumpy._pnumpy import atop_enable, atop_disable, atop_isenabled, atop_info, atop_setworkers, cpustring 8 | from pnumpy._pnumpy import thread_enable, thread_disable, thread_isenabled, thread_getworkers, thread_setworkers, thread_zigzag 9 | 10 | __all__ = [ 11 | 'cpu_count_linux', 'init', 'enable', 'disable'] 12 | 13 | # NOTE: code adapted from psinfo 14 | 15 | def open_binary(fname, **kwargs): 16 | return open(fname, "rb", **kwargs) 17 | 18 | def cpu_physical_linux(): 19 | """Return the number of physical cores in the system. 20 | None may be returned on failure. 21 | """ 22 | # Method #1 23 | ls = set() 24 | # These 2 files are the same but */core_cpus_list is newer while 25 | # */thread_siblings_list is deprecated and may disappear in the future. 26 | # https://www.kernel.org/doc/Documentation/admin-guide/cputopology.rst 27 | # https://github.com/giampaolo/psutil/pull/1727#issuecomment-707624964 28 | # https://lkml.org/lkml/2019/2/26/41 29 | p1 = "/sys/devices/system/cpu/cpu[0-9]*/topology/core_cpus_list" 30 | p2 = "/sys/devices/system/cpu/cpu[0-9]*/topology/thread_siblings_list" 31 | for path in glob.glob(p1) or glob.glob(p2): 32 | with open_binary(path) as f: 33 | ls.add(f.read().strip()) 34 | result = len(ls) 35 | if result != 0: 36 | return result 37 | 38 | # Method #2 39 | mapping = {} 40 | current_info = {} 41 | with open_binary('/proc/cpuinfo') as f: 42 | for line in f: 43 | line = line.strip().lower() 44 | if not line: 45 | # new section 46 | try: 47 | mapping[current_info[b'physical id']] = \ 48 | current_info[b'cpu cores'] 49 | except KeyError: 50 | pass 51 | current_info = {} 52 | else: 53 | # ongoing section 54 | if line.startswith((b'physical id', b'cpu cores')): 55 | key, value = line.split(b'\t:', 1) 56 | current_info[key] = int(value) 57 | 58 | result = sum(mapping.values()) 59 | return result or None # mimic os.cpu_count() 60 | 61 | def cpu_count_linux(): 62 | """ 63 | Return the number of logical CPUs and physical cores. 64 | None may be returned on failure. 
65 | """ 66 | try: 67 | num= os.sysconf("SC_NPROCESSORS_ONLN") 68 | except ValueError: 69 | # as a second fallback we try to parse /proc/cpuinfo 70 | num = 0 71 | with open_binary('/proc/cpuinfo') as f: 72 | for line in f: 73 | if line.lower().startswith(b'processor'): 74 | num += 1 75 | 76 | # try to parse /proc/stat as a last resort 77 | if num == 0: 78 | search = re.compile(r'cpu\d') 79 | with open_text('/proc/stat') as f: 80 | for line in f: 81 | line = line.split(' ')[0] 82 | if search.match(line): 83 | num += 1 84 | 85 | if num == 0: 86 | # mimic os.cpu_count() 87 | num=None 88 | return num, cpu_physical_linux() 89 | 90 | def init(): 91 | """ 92 | Called at load time to start the atop and threading engines. 93 | 94 | Parameters 95 | ---------- 96 | None 97 | 98 | See Also 99 | -------- 100 | pn.enable 101 | pn.disable 102 | """ 103 | 104 | import platform 105 | if platform.system() == 'Linux': 106 | logical,physical = cpu_count_linux() 107 | _pnumpy.initialize() 108 | else: 109 | _pnumpy.initialize() 110 | 111 | def enable(): 112 | """ 113 | Call to enable the atop engine, use threads, and hook numpy functions. 114 | 115 | Parameters 116 | ---------- 117 | None 118 | 119 | Returns 120 | ------- 121 | None 122 | 123 | See Also 124 | -------- 125 | pn.disable 126 | pn.atop_info 127 | """ 128 | atop_enable() 129 | thread_enable() 130 | 131 | def disable(): 132 | """ 133 | Call to disable the atop engine, stop any threads, and unhook numpy functions. 134 | 135 | Parameters 136 | ---------- 137 | None 138 | 139 | Returns 140 | ------- 141 | None 142 | 143 | See Also 144 | -------- 145 | pn.enable 146 | pn.atop_info 147 | """ 148 | atop_disable() 149 | thread_disable() 150 | 151 | -------------------------------------------------------------------------------- /_add_newdocs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add docstrings to c-extension modules: 3 | Will create a header file for each module in the current directory, and 4 | fill it with the docstrings. 5 | """ 6 | from collections import defaultdict 7 | import io 8 | import os 9 | srcdir = os.path.join(os.path.dirname(__file__), 'src', 'pnumpy') 10 | 11 | def append_header(fid, function, docstring): 12 | key = function.upper() + "_DOC" 13 | docstring = docstring.replace('"', '\\"') 14 | docstring = docstring.replace('\n', '"\n"') 15 | fid.write("\n") 16 | fid.write(f'static char {key}[] = "{docstring}";') 17 | 18 | headers = defaultdict(io.StringIO) 19 | 20 | def add_newdoc(module, function, docstring): 21 | fid = headers[module.upper() + '.h'] 22 | append_header(fid, function, docstring) 23 | 24 | 25 | add_newdoc('pnumpy', 'initialize', 26 | """ 27 | Initialize the module. Replaces all the ufunc inner loops with a new version 28 | using ``PyUFunc_ReplaceLoopBySignature``. If none of the other options are 29 | enabled, the original inner loop function will be called. Will also call 30 | ``numpy.setbufsize(8192 * 1024)`` to work around numpy issue 17649. 31 | """) 32 | 33 | 34 | add_newdoc('pnumpy', 'atop_enable', 35 | """ 36 | enable the atop inner loop implementations. 37 | """) 38 | 39 | 40 | add_newdoc('pnumpy', 'atop_disable', 41 | """ 42 | disable the atop inner loop implementations. 43 | """) 44 | 45 | 46 | add_newdoc('pnumpy', "atop_isenabled", 47 | "returns True if atop enabled, else False") 48 | 49 | 50 | add_newdoc('pnumpy', "thread_enable", 51 | """ 52 | Enable worker threads for inner loops when they are large enough to justify 53 | the extra overhead. 
54 | """) 55 | 56 | 57 | add_newdoc('pnumpy', "thread_disable", 58 | "Disable worker threads") 59 | 60 | 61 | add_newdoc('pnumpy', "thread_isenabled", 62 | "Returns True if worker threads enabled else False") 63 | 64 | 65 | add_newdoc('pnumpy', "thread_getworkers", 66 | "Get the number of worker threads") 67 | 68 | 69 | add_newdoc('pnumpy', "thread_setworkers", 70 | "Set the number of worker threads, return previous value. Must be at least 1.") 71 | 72 | 73 | add_newdoc('pnumpy', "timer_gettsc", 74 | "Get the time stamp counter") 75 | 76 | 77 | add_newdoc('pnumpy', "timer_getutc", 78 | "Get the time in utc nanos since unix epoch") 79 | 80 | 81 | add_newdoc('pnumpy', "cpustring", 82 | "Cpu brand string plus features") 83 | 84 | 85 | add_newdoc('pnumpy', "oldinit", 86 | "old, deprecated") 87 | 88 | 89 | add_newdoc('pnumpy', "ledger_enable", 90 | """ 91 | Enable ledger debuggging. This collects statistics on each run of a loop: 92 | input signature and dimensions, time to execute the loop and more 93 | """) 94 | 95 | 96 | add_newdoc('pnumpy', "ledger_disable", 97 | "Disable ledger") 98 | 99 | 100 | add_newdoc('pnumpy', "ledger_isenabled", 101 | "Returns True if ledger enabled else False") 102 | 103 | 104 | add_newdoc('pnumpy', "ledger_info", 105 | "Return ledger information") 106 | 107 | add_newdoc('pnumpy', 'recarray_to_colmajor', 108 | ("Converts a numpy record array (void type) to a dictionary of numpy arrays, col major\n" 109 | "Inputs\n" 110 | "------\n" 111 | "item: A numpy recorarray to return as column major\n" 112 | "parallel: Default to True\n" 113 | "\n" 114 | "Returns\n" 115 | "-------\n" 116 | "A dictionary of numpy arrays corresponding to the original numpy record array.\n" 117 | "\n" 118 | "Examples\n" 119 | "--------\n" 120 | ">>> x=np.array([(1.0, 2, 3, 4, 5, 'this is a long test'), (3.0, 4, 5, 6, 7, 'short'), (30.0, 40, 50, 60, 70, '')],\n" 121 | " dtype=[('x', '>> item=np.tile(x,100_000)\n" 123 | ">>> mydict = recarray_to_colmajor(item)" 124 | )) 125 | 126 | add_newdoc('pnumpy', "recycler_enable", 127 | "Enable recycler to compact memory usage") 128 | 129 | 130 | add_newdoc('pnumpy', "recycler_disable", 131 | "Disable recycler") 132 | 133 | 134 | add_newdoc('pnumpy', "recycler_isenabled", 135 | "Returns True if recycler enabled else False") 136 | 137 | 138 | add_newdoc('pnumpy', "recycler_info", 139 | "Return recycler information") 140 | 141 | # Rewrite any of the headers that changed 142 | 143 | def main(): 144 | for k, v in headers.items(): 145 | txt2 = '' 146 | target = os.path.join(srcdir, k) 147 | txt1 = v.getvalue() 148 | if os.path.exists(target): 149 | with open(target) as fid: 150 | txt2 = fid.read() 151 | if txt1 != txt2: 152 | print('writing', target) 153 | with open(target, 'w') as fid: 154 | fid.write(txt1) 155 | 156 | if __name__ == "__main__": 157 | main() 158 | -------------------------------------------------------------------------------- /src/atop/fill.cpp: -------------------------------------------------------------------------------- 1 | #include "common_inc.h" 2 | #include "threads.h" 3 | #include "halffloat.h" 4 | #include 5 | #include 6 | 7 | //#define LOGGING printf 8 | #define LOGGING(...) 
9 | 10 | // NOTES 11 | // FillZeros calls 12 | // PyArray_AssignRawScalar 13 | // raw_array_assign_scalar 14 | // which then calls PyArray_GetDTypeTransferFunction 15 | // /* Process the innermost dimension */ 16 | //stransfer(dst_data, dst_strides_it[0], src_data, 0, 17 | // shape_it[0], src_itemsize, transferdata); 18 | 19 | ////================================================================================ 20 | typedef int(*ARANGE_FILL)(char* pBufferV, void* pFirstV, void* pNextValueV, int64_t start, int64_t length); 21 | 22 | // vector code disabled for now (does not seem much faster) 23 | //template 24 | //static int 25 | //ArangeFillTypeInt32(char* pBufferV, void* pFirstV, void* pNextValueV, int64_t start, int64_t length) 26 | //{ 27 | // //printf("int32 fill\n"); 28 | // TYPE* pBuffer = (TYPE*)pBufferV; 29 | // 30 | // // The start/next are stored by numpy in first two values of array 31 | // TYPE first = *(TYPE*)pFirstV; 32 | // TYPE delta = *(TYPE*)pNextValueV; 33 | // 34 | // delta -= first; 35 | // 36 | // __m256i mstart = _mm256_set_epi32(7,6,5,4,3,2,1,0); 37 | // __m256i madd= _mm256_set1_epi32(sizeof(__m256i)/sizeof(TYPE)); // 8 38 | // __m256i mdelta = _mm256_set1_epi32((int32_t)delta); 39 | // madd = _mm256_mullo_epi32(madd, mdelta); 40 | // mstart = _mm256_add_epi32(mstart, _mm256_set1_epi32((int32_t)start)); 41 | // mstart = _mm256_mullo_epi32(mstart, mdelta); 42 | // 43 | // __m256i* pDest = (__m256i*)(pBuffer + start); 44 | // __m256i* pDestEnd = pDest + (length - start) / 8; 45 | // 46 | // while (pDest != pDestEnd) { 47 | // _mm256_storeu_si256(pDest, mstart); 48 | // mstart = _mm256_add_epi32(mstart, madd); 49 | // pDest++; 50 | // } 51 | // 52 | // start = start + length - (length & 7); 53 | // for (int64_t i = start; i < length; i++) { 54 | // pBuffer[i] = (TYPE)(first + i * delta); 55 | // } 56 | // 57 | // return 0; 58 | //} 59 | 60 | 61 | template 62 | static int 63 | ArangeFillType(char *pBufferV, void* pFirstV, void* pNextValueV, int64_t start, int64_t length) 64 | { 65 | TYPE* pBuffer = (TYPE*)pBufferV; 66 | 67 | // The start/next are stored by numpy in first two values of array 68 | TYPE first = *(TYPE*)pFirstV; 69 | TYPE delta = *(TYPE*)pNextValueV; 70 | 71 | delta -= first; 72 | 73 | // TOOD: vectorize this code 74 | for (int64_t i = start; i < length; i++) { 75 | pBuffer[i] = (TYPE)(first + i * delta); 76 | } 77 | // Path below is slower 78 | //TYPE* pBufferEnd = pBuffer + length; 79 | //while (pBuffer < pBufferEnd) { 80 | // *pBuffer++ = start; 81 | // start += delta; 82 | //} 83 | return 0; 84 | } 85 | 86 | ARANGE_FILL g_ArangeFill[ATOP_LAST] = { 87 | NULL, //ArangeFillType, 88 | ArangeFillType, ArangeFillType, 89 | ArangeFillType, ArangeFillType, 90 | ArangeFillType, ArangeFillType, 91 | ArangeFillType, ArangeFillType, 92 | NULL, NULL, //int128 93 | NULL, ArangeFillType, ArangeFillType, ArangeFillType, 94 | NULL, NULL, NULL, NULL, // Complex 95 | NULL, NULL, NULL // String, unicode, void 96 | }; 97 | 98 | extern "C" int ArangeFill( 99 | int atype, 100 | char* pBuffer, 101 | void* pFirstValue, 102 | void* pSecondValue, 103 | int64_t length, 104 | int32_t threadwakeup) { 105 | 106 | ARANGE_FILL pArangeFill = g_ArangeFill[atype]; 107 | 108 | // check if we have the routine 109 | if (pArangeFill) { 110 | 111 | // Multithreaded callback 112 | struct ArangeCallbackStruct { 113 | ARANGE_FILL pArangeFill; 114 | char* pBuffer; 115 | void* pFirstValue; 116 | void* pSecondValue; 117 | int64_t length; 118 | } stArangeCallback{ pArangeFill, pBuffer, pFirstValue, 
pSecondValue, length }; 119 | 120 | // This is the routine that will be called back from multiple threads 121 | auto lambdaArangeCallback = [](void* callbackArgT, int core, int64_t start, int64_t length) -> int64_t { 122 | LOGGING("[%d] Arange %lld %lld\n", core, start, length); 123 | ArangeCallbackStruct* cb=(ArangeCallbackStruct * )callbackArgT; 124 | cb->pArangeFill(cb->pBuffer, cb->pFirstValue, cb->pSecondValue, start, start + length); 125 | return 1; 126 | }; 127 | 128 | THREADER->DoMultiThreadedChunkWork(length, lambdaArangeCallback, &stArangeCallback, threadwakeup); 129 | return 0; 130 | } 131 | // fail 132 | return -1; 133 | } 134 | 135 | -------------------------------------------------------------------------------- /src/pnumpy/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Python.h" 3 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 4 | // NOTE: See PY_ARRAY_UNIQUE_SYMBOL 5 | // If this is not included, calling PY_ARRAY functions will have a null value 6 | #define PY_ARRAY_UNIQUE_SYMBOL sharedata_ARRAY_API 7 | 8 | #ifndef SHAREDATA_MAIN_C_FILE 9 | #define NO_IMPORT_ARRAY 10 | #endif 11 | 12 | #include "numpy/ndarrayobject.h" 13 | #include "numpy/ufuncobject.h" 14 | #include 15 | #include 16 | #include "../atop/atop.h" 17 | 18 | 19 | int dtype_to_atop(int dtype); 20 | 21 | // Global user settings controlled by python functions 22 | // set to 0 to disable 23 | struct stSettings { 24 | int32_t AtopEnabled; 25 | int32_t LedgerEnabled; 26 | int32_t RecyclerEnabled; 27 | int32_t ZigZag; // set to 0 to disable 28 | int32_t Initialized; 29 | int32_t Reserved; 30 | binaryfunc NumpyGetItem; // optional hook 31 | }; 32 | 33 | extern stSettings g_Settings; 34 | 35 | struct stUFuncToAtop { 36 | const char* str_ufunc_name; 37 | const int atop_op; 38 | }; 39 | 40 | enum OP_CATEGORY:int32_t { 41 | OPCAT_BINARY = 0, 42 | OPCAT_UNARY = 1, 43 | OPCAT_COMPARE = 2, 44 | OPCAT_TRIG = 3, 45 | OPCAT_CONVERT = 4, 46 | OPCAT_SORT = 5, 47 | OPCAT_ARGSORT = 6, 48 | OPCAT_ARANGE = 7, 49 | OPCAT_ARGMINMAX = 8, 50 | OPCAT_LAST = 9, 51 | }; 52 | 53 | struct stOpCategory { 54 | const char* StrName; 55 | int32_t NumOps; 56 | OP_CATEGORY CatEnum; // 57 | stUFuncToAtop* pUFuncToAtop; 58 | }; 59 | 60 | 61 | //--------------------------------------------------------------------- 62 | // NOTE: See SDSArrayInfo and keep same 63 | struct ArrayInfo { 64 | 65 | // Numpy object 66 | PyArrayObject* pObject; 67 | 68 | // First bytes 69 | char* pData; 70 | 71 | // Width in bytes of one row 72 | int64_t ItemSize; 73 | 74 | // total number of items 75 | int64_t ArrayLength; 76 | 77 | int64_t NumBytes; 78 | 79 | int NumpyDType; 80 | int NDim; 81 | 82 | // When calling ensure contiguous, we might make a copy 83 | // if so, pObject is the copy and must be deleted. 
pOriginal was passed in 84 | PyArrayObject* pOriginalObject; 85 | 86 | }; 87 | 88 | extern void* GetDefaultForType(int numpyInType); 89 | extern int64_t CalcArrayLength(int ndim, npy_intp* dims); 90 | extern int64_t ArrayLength(PyArrayObject* inArr); 91 | extern PyArrayObject* AllocateNumpyArray(int ndim, npy_intp* dims, int32_t numpyType, int64_t itemsize = 0, int fortran_array = 0, npy_intp* strides = nullptr); 92 | extern PyArrayObject* AllocateLikeResize(PyArrayObject* inArr, npy_intp rowSize); 93 | extern PyArrayObject* AllocateLikeNumpyArray(PyArrayObject* inArr, int numpyType); 94 | extern BOOL ConvertScalarObject(PyObject* inObject1, void* pDest, int16_t numpyOutType, void** ppDataIn, int64_t* pItemSize); 95 | extern int GetStridesAndContig(PyArrayObject* inArray, int& ndim, int64_t& stride); 96 | 97 | // defined in pnumpy 98 | extern stOpCategory gOpCategory[OPCAT_LAST]; 99 | 100 | extern void LedgerRecord(int32_t op_category, int64_t start_time, int64_t end_time, char** args, const npy_intp* dimensions, const npy_intp* steps, void* innerloop, int funcop, int atype); 101 | extern void LedgerRecord2(int32_t op_category, int64_t start_time, int64_t end_time, int atype, int64_t length); 102 | extern void LedgerInit(); 103 | extern int64_t CalcArrayLength(int ndim, npy_intp* dims); 104 | extern int64_t ArrayLength(PyArrayObject* inArr); 105 | extern PyArrayObject* AllocateNumpyArray(int ndim, npy_intp* dims, int32_t numpyType, int64_t itemsize, int fortran_array, npy_intp* strides); 106 | extern PyArrayObject* AllocateLikeResize(PyArrayObject* inArr, npy_intp rowSize); 107 | extern PyArrayObject* AllocateLikeNumpyArray(PyArrayObject* inArr, int numpyType); 108 | extern ArrayInfo* BuildArrayInfo( 109 | PyObject* listObject, 110 | int64_t* pTupleSize, 111 | int64_t* pTotalItemSize, 112 | BOOL checkrows = TRUE, 113 | BOOL convert = TRUE); 114 | 115 | extern void FreeArrayInfo(ArrayInfo* pAlloc); 116 | 117 | extern PyObject* BooleanIndexInternal(PyArrayObject* aValues, PyArrayObject* aIndex); 118 | extern "C" PyObject *getitem(PyObject * self, PyObject * args); 119 | 120 | #define RETURN_NONE Py_INCREF(Py_None); return Py_None; 121 | #define RETURN_FALSE Py_XINCREF(Py_False); return Py_False; 122 | #define RETURN_TRUE Py_XINCREF(Py_True); return Py_True; 123 | 124 | #define IS_BINARY_REDUCE ((args[0] == args[2])\ 125 | && (steps[0] == steps[2])\ 126 | && (steps[0] == 0)) 127 | 128 | extern PyTypeObject* pPyArray_Type; 129 | 130 | #if defined(_WIN32) && !defined(__GNUC__) 131 | 132 | #define CASE_NPY_INT32 case NPY_INT32: case NPY_INT 133 | #define CASE_NPY_UINT32 case NPY_UINT32: case NPY_UINT 134 | #define CASE_NPY_INT64 case NPY_INT64 135 | #define CASE_NPY_UINT64 case NPY_UINT64 136 | #define CASE_NPY_FLOAT64 case NPY_DOUBLE: case NPY_LONGDOUBLE 137 | 138 | #else 139 | 140 | #define CASE_NPY_INT32 case NPY_INT32 141 | #define CASE_NPY_UINT32 case NPY_UINT32 142 | #define CASE_NPY_INT64 case NPY_INT64: case NPY_LONGLONG 143 | #define CASE_NPY_UINT64 case NPY_UINT64: case NPY_ULONGLONG 144 | #define CASE_NPY_FLOAT64 case NPY_DOUBLE 145 | #endif 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/pnumpy/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | __all__ = [ 5 | 'benchmark','benchmark_func'] 6 | 7 | from pnumpy._pnumpy import atop_enable, atop_disable, atop_isenabled 8 | from pnumpy._pnumpy import thread_enable, thread_disable, thread_isenabled 9 | 
from pnumpy._pnumpy import timer_gettsc, timer_getutc 10 | 11 | import numpy as np 12 | 13 | # TODO: move this to new location 14 | def benchmark_timeit( 15 | func=np.equal, 16 | ctypes=[np.bool_, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64], 17 | scalar=False, 18 | unary = False, 19 | reduct = False, 20 | outdtype=None, 21 | recycle=True, 22 | sizes=[1_000_000]): 23 | ''' 24 | Internal routine to benchmark a function. 25 | 26 | ''' 27 | 28 | def time_func(recycle, c): 29 | if unary is False: 30 | starttime = timer_gettsc() 31 | if recycle: 32 | result=func(a,b,out=c) 33 | else: 34 | result=func(a,b) 35 | delta= timer_gettsc() - starttime 36 | 37 | else: 38 | starttime = timer_gettsc() 39 | if reduct is True: 40 | result=func(a) 41 | else: 42 | if recycle: 43 | result=func(a,out=c) 44 | else: 45 | result=func(a) 46 | 47 | delta= timer_gettsc() - starttime 48 | return delta, result 49 | 50 | timedelta = np.zeros(len(ctypes), np.int64) 51 | 52 | for s in sizes: 53 | slot = 0 54 | loop_size = 100 55 | mtimedelta = np.zeros(loop_size, np.int64) 56 | for ctype in ctypes: 57 | if ctype is np.bool_: 58 | a=np.arange(s, dtype=np.int8).astype(ctype)+1 59 | else: 60 | a=np.arange(s, dtype=ctype) 61 | a=a % 253 62 | a+=1 63 | 64 | if scalar is True: 65 | b=a[5] 66 | else: 67 | b=a.copy() 68 | 69 | # dry run 70 | delta, c=time_func(False, None) 71 | 72 | # main timing loop 73 | for loop in range(loop_size): 74 | delta, result = time_func(recycle, c) 75 | del result 76 | 77 | mtimedelta[loop] = delta 78 | 79 | timedelta[slot] = np.median(mtimedelta) 80 | # print("median is ", timedelta[slot], slot) 81 | slot = slot + 1 82 | return timedelta 83 | 84 | 85 | def benchmark_func( 86 | func=np.equal, 87 | ctypes=[np.bool_, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64], 88 | scalar=False, 89 | unary = False, 90 | reduct = False, 91 | outdtype=None, 92 | recycle=True, 93 | atop=True, 94 | thread=True, 95 | sizes=[1_000_000]): 96 | ''' 97 | Benchmark one function. 98 | 99 | Examples 100 | -------- 101 | 102 | benchmark_func(np.add) 103 | benchmark_func(np.add, sizes=[2**16]) 104 | benchmark_func(np.sqrt, unary=True) 105 | ''' 106 | # disable atop and threading 107 | atop_disable() 108 | thread_disable() 109 | # get original time 110 | t0=benchmark_timeit(func=func, ctypes=ctypes, scalar=scalar, unary=unary, reduct=reduct, outdtype=outdtype, recycle=recycle, sizes=sizes) 111 | 112 | # now possibly enable atop and threading 113 | if atop: 114 | atop_enable() 115 | if thread: 116 | thread_enable() 117 | t1=benchmark_timeit(func=func, ctypes=ctypes, scalar=scalar, unary=unary, reduct=reduct, outdtype=outdtype, recycle=recycle, sizes=sizes) 118 | return t0/t1 119 | 120 | def benchmark( 121 | ctypes=[np.bool_, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64], 122 | recycle=True, 123 | atop=True, 124 | thread=True, 125 | sizes=[1_000_000]): 126 | ''' 127 | Performs a simple benchmark of the ratio of normal numpy (no threading) vs parallel numpy (threaded). 128 | The output is formatted to be copied and pasted in a csv file. 129 | A result above 1.0 indicates an improvement, below 1.0 indicates worse peformance. 130 | 131 | Parameters 132 | ---------- 133 | ctypes :list of numpy dtypes to test, for example [np.int32, npfloat32] 134 | recycle: True or False 135 | atop: True or False. Whether or not the atop engine is used in benchmarking. 136 | thread: True or False. 
137 | sizes: list of array sizes to test, for example [100_000, 1_000_000] 138 | 139 | Returns 140 | ------- 141 | output text formatted for a .csv file 142 | 143 | Examples 144 | -------- 145 | pn.benchmark() 146 | pn.benchmark(thread=False) 147 | pn.benchmark(sizes=[2**16]) 148 | pn.benchmark(ctypes=[np.float32, np.float64]) 149 | ''' 150 | 151 | def ctype_string(ct): 152 | s=f'{sizes[0]} rows,' 153 | for i in ct: 154 | s=s+f'{i.__name__},' 155 | return s 156 | 157 | def output_data(rowname, data): 158 | s=f'{rowname},' 159 | for i in data: 160 | s=s+f'{i:5.2f},' 161 | print(s) 162 | 163 | print(ctype_string(ctypes)) 164 | output_data("a==b", benchmark_func(np.equal, ctypes=ctypes, scalar=False, unary=False, recycle=recycle, atop=atop, thread=thread, outdtype='?', sizes=sizes)) 165 | output_data("a==5", benchmark_func(np.equal, ctypes=ctypes, scalar=True, unary=False, recycle=recycle, atop=atop, thread=thread, outdtype='?', sizes=sizes)) 166 | output_data("a+b", benchmark_func(np.add, ctypes=ctypes, scalar=False, unary=False,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 167 | output_data("a+5", benchmark_func(np.add, ctypes=ctypes, scalar=True, unary=False,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 168 | output_data("a/5", benchmark_func(np.true_divide, ctypes=ctypes, scalar=True, unary=False,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 169 | output_data("abs", benchmark_func(np.abs, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 170 | output_data("isnan", benchmark_func(np.isnan, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 171 | output_data("sin", benchmark_func(np.sin, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 172 | output_data("log", benchmark_func(np.log, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 173 | output_data("sum", benchmark_func(np.sum, ctypes=ctypes, scalar=False, reduct=True, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 174 | output_data("min", benchmark_func(np.min, ctypes=ctypes, scalar=False, reduct=True, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 175 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | 6 | import io 7 | import os 8 | import platform 9 | from glob import glob 10 | from os.path import basename 11 | from os.path import dirname 12 | from os.path import join 13 | from os.path import relpath 14 | from os.path import splitext 15 | 16 | from setuptools import Extension 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | import numpy as np 20 | 21 | try: 22 | from setuptools_scm import get_version 23 | except Exception: 24 | try: 25 | import pip 26 | package='setuptools_scm' 27 | if hasattr(pip, 'main'): 28 | pip.main(['install', package]) 29 | else: 30 | pip._internal.main(['install', package]) 31 | from setuptools_scm import get_version 32 | except Exception: 33 | print("**could not install pip or setuptools_scm, version is defaulted") 34 | 35 | def myversion(): 36 | version = '2.0.23' 37 | try: 38 | mversion = get_version() 39 | s = mversion.split('.') 40 | if len(s) >=3: 41 | 
# see if we can parse the current version 42 | if int(s[0])==2 and int(s[1])==0: 43 | version = '2.0.' 44 | lastnum = s[2] 45 | for i in lastnum: 46 | if i >='0' and i <= '9': 47 | version = version + i 48 | except Exception: 49 | pass 50 | return version 51 | 52 | thisversion=myversion() 53 | 54 | def writeversion(): 55 | text_file = open("src/pnumpy/_version.py", "w") 56 | strver = f"__version__='{thisversion}'" 57 | n = text_file.write(strver) 58 | text_file.close() 59 | return thisversion 60 | 61 | # Enable code coverage for C code: we can't use CFLAGS=-coverage in tox.ini, since that may mess with compiling 62 | # dependencies (e.g. numpy). Therefore we set SETUP_PY_EXT_COVERAGE after deps have been safely installed). 63 | if os.environ.get('SETUP_PY_EXT_COVERAGE') == 'yes' and platform.system() == 'Linux': 64 | CFLAGS = os.environ['CFLAGS'] = '-fprofile-arcs -ftest-coverage -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' 65 | LFLAGS = os.environ['LFLAGS'] = '-lgcov' 66 | else: 67 | CFLAGS = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' 68 | LFLAGS = '' 69 | 70 | if platform.system() == 'Windows': 71 | CFLAGS += ' /Ox /Ob2 /Oi /Ot /d2FH4- /GS- /arch:AVX2' 72 | else: 73 | CFLAGS += ' -mavx2 -fpermissive -Wno-unused-variable -Wno-unused-function -std=c++11 -pthread -falign-functions=32' 74 | 75 | if platform.system() == 'Linux': 76 | LFLAGS += ' -lm' 77 | 78 | def read(*names, **kwargs): 79 | with io.open( 80 | join(dirname(__file__), *names), 81 | encoding=kwargs.get('encoding', 'utf8') 82 | ) as fh: 83 | return fh.read() 84 | 85 | 86 | with open("README.md") as readme: 87 | long_description = readme.read() 88 | 89 | import _add_newdocs 90 | _add_newdocs.main() 91 | 92 | setup( 93 | name='pnumpy', 94 | #version=get_git_version(), #'0.0.0', 95 | version=writeversion(), 96 | license='MIT', 97 | description='Faster loops for NumPy using multithreading and other tricks', 98 | long_description=long_description, 99 | long_description_content_type="text/markdown", 100 | author='Quansight', 101 | author_email='info@quansight.com', 102 | url='https://quansight.github.io/pnumpy/stable/index.html', 103 | packages=find_packages('src'), 104 | package_dir={'': 'src'}, 105 | py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], 106 | include_package_data=True, 107 | zip_safe=False, 108 | classifiers=[ 109 | # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers 110 | "Development Status :: 4 - Beta", 111 | 'Intended Audience :: Developers', 112 | 'License :: OSI Approved :: MIT License', 113 | 'Operating System :: Unix', 114 | 'Operating System :: POSIX', 115 | 'Operating System :: Microsoft :: Windows', 116 | 'Programming Language :: Python', 117 | 'Programming Language :: Python :: 3', 118 | 'Programming Language :: Python :: 3.6', 119 | 'Programming Language :: Python :: 3.7', 120 | 'Programming Language :: Python :: 3.8', 121 | 'Programming Language :: Python :: 3.9', 122 | 'Programming Language :: Python :: Implementation :: CPython', 123 | 'Programming Language :: Python :: Implementation :: PyPy', 124 | 'Topic :: Utilities', 125 | ], 126 | project_urls={ 127 | 'Changelog': 'https://github.com/Quansight/pnumpy/blob/master/CHANGELOG.rst', 128 | 'Issue Tracker': 'https://github.com/Quansight/pnumpy/issues', 129 | }, 130 | keywords=[ 131 | # eg: 'keyword1', 'keyword2', 'keyword3', 132 | ], 133 | #setup_requires=['setuptools_scm'], 134 | #use_scm_version = { 135 | # 'version_scheme': 'post-release', 136 | # 'local_scheme': 'no-local-version', 137 | # 
'write_to': 'src/pnumpy/_version.py', 138 | # 'write_to_template': '__version__ = "{version}"', 139 | #}, 140 | python_requires='>=3.6', 141 | install_requires=[ 142 | # eg: 'aspectlib==1.1.1', 'six>=1.7', 143 | 'numpy>=1.18.0', # has ufunc hooks 144 | ], 145 | extras_require={ 146 | # eg: 147 | # 'rst': ['docutils>=0.11'], 148 | # ':python_version=="2.6"': ['argparse'], 149 | }, 150 | ext_modules=[ 151 | Extension( 152 | 'pnumpy._pnumpy', 153 | sources=['src/pnumpy/_pnumpy.cpp', 154 | 'src/pnumpy/module_init.cpp', 155 | 'src/pnumpy/common.cpp', 156 | 'src/pnumpy/ledger.cpp', 157 | 'src/pnumpy/getitem.cpp', 158 | 'src/pnumpy/conversions.cpp', 159 | 'src/pnumpy/recycler.cpp', 160 | 'src/pnumpy/sorting.cpp', 161 | #'src/pnumpy/arange.cpp', 162 | #'src/pnumpy/item_selection.cpp', 163 | 'src/atop/atop.cpp', 164 | 'src/atop/threads.cpp', 165 | 'src/atop/recarray.cpp', 166 | 'src/atop/sort.cpp', 167 | 'src/atop/fill.cpp', 168 | 'src/atop/ops_binary.cpp', 169 | 'src/atop/ops_compare.cpp', 170 | 'src/atop/ops_unary.cpp', 171 | 'src/atop/ops_trig.cpp', 172 | 'src/atop/ops_log.cpp', 173 | ], 174 | extra_compile_args=CFLAGS.split(), 175 | extra_link_args=LFLAGS.split(), 176 | include_dirs=['src/pnumpy', 'src/atop', np.get_include()], 177 | py_limited_api=True, 178 | ) 179 | ], 180 | ) 181 | -------------------------------------------------------------------------------- /src/pnumpy/module_init.cpp: -------------------------------------------------------------------------------- 1 | #define SHAREDATA_MAIN_C_FILE 2 | #include "common.h" 3 | #include "PNUMPY.h" 4 | //extern "C" void** PyArray_API; 5 | PyTypeObject* pPyArray_Type = NULL; 6 | 7 | /* 8 | * Some C++ compilers do not like mixin non-designated-initializers 9 | * like PyModuleDef_HEAD_INIT with designated-initializers like 10 | * .m_doc, so break this part out into a C file 11 | */ 12 | 13 | 14 | extern "C" PyObject* oldinit(PyObject *self, PyObject *args, PyObject *kwargs); 15 | extern "C" PyObject* newinit(PyObject* self, PyObject* args, PyObject* kwargs); 16 | extern "C" PyObject* atop_enable(PyObject * self, PyObject * args); 17 | extern "C" PyObject* atop_disable(PyObject * self, PyObject * args); 18 | extern "C" PyObject* atop_isenabled(PyObject * self, PyObject * args); 19 | extern "C" PyObject* atop_info(PyObject * self, PyObject * args); 20 | extern "C" PyObject* atop_setworkers(PyObject * self, PyObject * args); 21 | extern "C" PyObject* thread_enable(PyObject * self, PyObject * args); 22 | extern "C" PyObject* thread_disable(PyObject * self, PyObject * args); 23 | extern "C" PyObject* thread_isenabled(PyObject * self, PyObject * args); 24 | extern "C" PyObject* thread_getworkers(PyObject * self, PyObject * args); 25 | extern "C" PyObject* thread_setworkers(PyObject * self, PyObject * args); 26 | extern "C" PyObject* thread_zigzag(PyObject * self, PyObject * args); 27 | 28 | // ledger.cpp 29 | extern "C" PyObject* ledger_enable(PyObject * self, PyObject * args); 30 | extern "C" PyObject* ledger_disable(PyObject * self, PyObject * args); 31 | extern "C" PyObject* ledger_isenabled(PyObject * self, PyObject * args); 32 | extern "C" PyObject* ledger_info(PyObject * self, PyObject * args); 33 | 34 | // recycler.cpp 35 | extern "C" PyObject* recycler_enable(PyObject * self, PyObject * args); 36 | extern "C" PyObject* recycler_disable(PyObject * self, PyObject * args); 37 | extern "C" PyObject* recycler_isenabled(PyObject * self, PyObject * args); 38 | extern "C" PyObject* recycler_info(PyObject * self, PyObject * args); 39 | 40 | extern 
"C" PyObject * hook_enable(PyObject * self, PyObject * args); 41 | extern "C" PyObject * hook_disable(PyObject * self, PyObject * args); 42 | 43 | extern "C" PyObject* timer_gettsc(PyObject * self, PyObject * args); 44 | extern "C" PyObject* timer_getutc(PyObject * self, PyObject * args); 45 | extern "C" PyObject* cpustring(PyObject * self, PyObject * args); 46 | extern "C" PyObject * getitem(PyObject * self, PyObject * args); 47 | extern "C" PyObject * lexsort32(PyObject * self, PyObject * args, PyObject * kwargs); 48 | extern "C" PyObject * lexsort64(PyObject * self, PyObject * args, PyObject * kwargs); 49 | extern "C" PyObject * sort(PyObject * self, PyObject * args, PyObject * kwargs); 50 | 51 | // conversions.cpp 52 | extern "C" PyObject* recarray_to_colmajor(PyObject* self, PyObject* args); 53 | 54 | static char m_doc[] = "Provide methods to override NumPy ufuncs"; 55 | 56 | 57 | PyDoc_STRVAR(oldinit_doc, 58 | "oldinit(ufunc_name:"); 59 | 60 | static PyMethodDef module_functions[] = { 61 | {"initialize", (PyCFunction)newinit, METH_VARARGS | METH_KEYWORDS, INITIALIZE_DOC}, 62 | {"atop_enable", (PyCFunction)atop_enable, METH_VARARGS, ATOP_ENABLE_DOC}, 63 | {"atop_disable", (PyCFunction)atop_disable, METH_VARARGS, ATOP_DISABLE_DOC}, 64 | {"atop_isenabled", (PyCFunction)atop_isenabled, METH_VARARGS, ATOP_ISENABLED_DOC}, 65 | {"atop_info", (PyCFunction)atop_info, METH_VARARGS, "return dict"}, 66 | {"atop_setworkers", (PyCFunction)atop_setworkers, METH_VARARGS, "set workers for a func"}, 67 | {"thread_enable", (PyCFunction)thread_enable, METH_VARARGS, THREAD_ENABLE_DOC}, 68 | {"thread_disable", (PyCFunction)thread_disable, METH_VARARGS, THREAD_DISABLE_DOC}, 69 | {"thread_isenabled", (PyCFunction)thread_isenabled, METH_VARARGS, THREAD_ISENABLED_DOC}, 70 | {"thread_getworkers",(PyCFunction)thread_getworkers, METH_VARARGS, THREAD_GETWORKERS_DOC}, 71 | {"thread_setworkers",(PyCFunction)thread_setworkers, METH_VARARGS, THREAD_SETWORKERS_DOC}, 72 | {"thread_zigzag", (PyCFunction)thread_zigzag, METH_VARARGS, "toggle zigzag mode"}, 73 | {"timer_gettsc", (PyCFunction)timer_gettsc, METH_VARARGS, TIMER_GETTSC_DOC}, 74 | {"timer_getutc", (PyCFunction)timer_getutc, METH_VARARGS, TIMER_GETUTC_DOC}, 75 | {"hook_enable", (PyCFunction)hook_enable, METH_VARARGS, "Enable hook for numpy array __getitem__ for fancy and bool indexing"}, 76 | {"hook_disable", (PyCFunction)hook_disable, METH_VARARGS, "Disable hook for numpy array __getitem__ for fancy and bool indexing"}, 77 | {"ledger_enable", (PyCFunction)ledger_enable, METH_VARARGS, LEDGER_ENABLE_DOC}, 78 | {"ledger_disable", (PyCFunction)ledger_disable, METH_VARARGS, LEDGER_DISABLE_DOC}, 79 | {"ledger_isenabled", (PyCFunction)ledger_isenabled, METH_VARARGS, LEDGER_ISENABLED_DOC}, 80 | {"ledger_info", (PyCFunction)ledger_info, METH_VARARGS, LEDGER_INFO_DOC}, 81 | {"recycler_enable", (PyCFunction)recycler_enable, METH_VARARGS, RECYCLER_ENABLE_DOC}, 82 | {"recycler_disable", (PyCFunction)recycler_disable, METH_VARARGS, RECYCLER_DISABLE_DOC}, 83 | {"recycler_isenabled", (PyCFunction)recycler_isenabled, METH_VARARGS, RECYCLER_ISENABLED_DOC}, 84 | {"recycler_info", (PyCFunction)recycler_info, METH_VARARGS, RECYCLER_INFO_DOC}, 85 | {"cpustring", (PyCFunction)cpustring, METH_VARARGS, CPUSTRING_DOC}, 86 | {"oldinit", (PyCFunction)oldinit, METH_VARARGS | METH_KEYWORDS, OLDINIT_DOC}, 87 | {"recarray_to_colmajor", (PyCFunction)recarray_to_colmajor, METH_VARARGS, "convert record array to col major"}, 88 | {"getitem", (PyCFunction)getitem, METH_VARARGS | METH_KEYWORDS, 
"alternative to fancy index or boolean index"}, 89 | {"lexsort32", (PyCFunction)lexsort32, METH_VARARGS | METH_KEYWORDS, "lexigraphical sort returning int32 fancy indexing"}, 90 | {"lexsort64", (PyCFunction)lexsort64, METH_VARARGS | METH_KEYWORDS, "lexigraphical sort returning int64 fancy indexing"}, 91 | {"sort", (PyCFunction)sort, METH_VARARGS | METH_KEYWORDS, "parallel inplace quicksort, followed by mergesort"}, 92 | {NULL, NULL, 0, NULL} 93 | }; 94 | 95 | 96 | static PyModuleDef moduledef = { 97 | PyModuleDef_HEAD_INIT, 98 | "pnumpy._pnumpy", // Module name 99 | m_doc, // Module description 100 | 0, 101 | module_functions, // Structure that defines the methods 102 | NULL, // slots 103 | NULL, // GC traverse 104 | NULL, // GC 105 | NULL // freefunc 106 | }; 107 | 108 | PyMODINIT_FUNC PyInit__pnumpy(void) { 109 | PyObject *module; 110 | 111 | module = PyModule_Create(&moduledef); 112 | 113 | if (module == NULL) 114 | return NULL; 115 | 116 | // Load numpy for PyArray_Type 117 | import_array(); 118 | pPyArray_Type = &PyArray_Type; 119 | 120 | atop_init(); 121 | LedgerInit(); 122 | 123 | return module; 124 | } 125 | -------------------------------------------------------------------------------- /src/atop/recarray.cpp: -------------------------------------------------------------------------------- 1 | #include "common_inc.h" 2 | #include 3 | #include "invalids.h" 4 | #include "threads.h" 5 | 6 | #if defined(__clang__) 7 | #pragma clang diagnostic ignored "-Wmissing-braces" 8 | #pragma clang diagnostic ignored "-Wunused-function" 9 | #pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) 10 | #endif 11 | 12 | #if defined(__GNUC__) 13 | //#pragma GCC target "arch=core-avx2,tune=core-avx2" 14 | #if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || !defined(__GNUC__) 15 | /* GCC >= 4.4 or clang or non-GCC compilers */ 16 | #include 17 | #elif __GNUC_PREREQ(4, 1) 18 | /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */ 19 | #include 20 | #endif 21 | #endif 22 | 23 | 24 | //#define LOGGING printf 25 | #define LOGGING(...) 
26 | 27 | static const int64_t CHUNKSIZE = 16384; 28 | 29 | // This is used to multiply the strides 30 | const union 31 | { 32 | int32_t i[8]; 33 | __m256i m; 34 | //} __vindex8_strides = { 7, 6, 5, 4, 3, 2, 1, 0 }; 35 | } __vindex8_strides = { 0, 1, 2, 3, 4, 5, 6, 7 }; 36 | 37 | //----------------------------------- 38 | // 39 | void ConvertRecArray(char* pStartOffset, int64_t startRow, int64_t totalRows, stRecarrayOffsets* pstOffset, int64_t numArrays, int64_t itemSize) 40 | { 41 | // Try to keep everything in L1Cache 42 | const int64_t L1CACHE = 32768; 43 | int64_t CHUNKROWS = L1CACHE / (itemSize * 2); 44 | if (CHUNKROWS < 1) { 45 | CHUNKROWS = 1; 46 | } 47 | 48 | __m256i vindex = _mm256_mullo_epi32(_mm256_set1_epi32((int32_t)itemSize), _mm256_loadu_si256(&__vindex8_strides.m)); 49 | __m128i vindex128 = _mm256_extracti128_si256(vindex, 0); 50 | 51 | while (startRow < totalRows) { 52 | 53 | // Calc how many rows to process in this pass 54 | int64_t endRow = startRow + CHUNKROWS; 55 | if (endRow > totalRows) { 56 | endRow = totalRows; 57 | } 58 | 59 | int64_t origRow = startRow; 60 | 61 | //printf("processing %lld\n", startRow); 62 | for (int64_t i = 0; i < numArrays; i++) { 63 | 64 | startRow = origRow; 65 | 66 | // Calculate place to read 67 | char* pRead = pStartOffset + pstOffset[i].readoffset; 68 | char* pWrite = pstOffset[i].pData; 69 | 70 | int64_t arrItemSize = pstOffset[i].itemsize; 71 | 72 | //printf("processing start:%lld end:%lld pRead:%p %p itemsize: %lld\n", startRow, endRow, pRead, pWrite, arrItemSize); 73 | 74 | switch (pstOffset[i].itemsize) { 75 | case 1: 76 | while (startRow < endRow) { 77 | int8_t data = *(int8_t*)(pRead + (startRow * itemSize)); 78 | *(int8_t*)(pWrite + startRow) = data; 79 | startRow++; 80 | } 81 | break; 82 | case 2: 83 | while (startRow < endRow) { 84 | int16_t data = *(int16_t*)(pRead + (startRow * itemSize)); 85 | *(int16_t*)(pWrite + startRow * arrItemSize) = data; 86 | startRow++; 87 | } 88 | break; 89 | case 4: 90 | // ??? 
use _mm256_i32gather_epi32 to speed up 91 | { 92 | int64_t endSubRow = endRow - 8; 93 | while (startRow < endSubRow) { 94 | __m256i m0 = _mm256_i32gather_epi32((int32_t*)(pRead + (startRow * itemSize)), vindex, 1); 95 | _mm256_storeu_si256((__m256i*)(pWrite + (startRow * arrItemSize)), m0); 96 | startRow += 8; 97 | } 98 | while (startRow < endRow) { 99 | int32_t data = *(int32_t*)(pRead + (startRow * itemSize)); 100 | *(int32_t*)(pWrite + startRow * arrItemSize) = data; 101 | startRow++; 102 | } 103 | } 104 | break; 105 | case 8: 106 | { 107 | int64_t endSubRow = endRow - 4; 108 | while (startRow < endSubRow) { 109 | __m256i m0 = _mm256_i32gather_epi64((int64_t*)(pRead + (startRow * itemSize)), vindex128, 1); 110 | _mm256_storeu_si256((__m256i*)(pWrite + (startRow * arrItemSize)), m0); 111 | startRow += 4; 112 | } 113 | while (startRow < endRow) { 114 | int64_t data = *(int64_t*)(pRead + (startRow * itemSize)); 115 | *(int64_t*)(pWrite + startRow * arrItemSize) = data; 116 | startRow++; 117 | } 118 | } 119 | break; 120 | default: 121 | while (startRow < endRow) { 122 | char* pSrc = pRead + (startRow * itemSize); 123 | char* pDest = pWrite + (startRow * arrItemSize); 124 | char* pEnd = pSrc + arrItemSize; 125 | while ((pSrc + 8) < pEnd) { 126 | *(int64_t*)pDest = *(int64_t*)pSrc; 127 | pDest += 8; 128 | pSrc += 8; 129 | } 130 | while (pSrc < pEnd) { 131 | *pDest++ = *pSrc++; 132 | } 133 | startRow++; 134 | } 135 | break; 136 | 137 | } 138 | 139 | } 140 | } 141 | } 142 | 143 | 144 | //============================================== 145 | // totalRows = total number of record array rows 146 | // 147 | extern "C" void RecArrayToColMajor( 148 | stRecarrayOffsets* pstOffset, 149 | char* pStartOffset, 150 | int64_t totalRows, 151 | int64_t numArrays, 152 | int64_t itemSize) { 153 | 154 | static const int64_t CHUNKSIZE = 16384; 155 | 156 | // Try to keep everything in L1Cache 157 | const int64_t L1CACHE = 32768; 158 | int64_t CHUNKROWS = L1CACHE / (itemSize * 2); 159 | if (CHUNKROWS < 1) { 160 | CHUNKROWS = 1; 161 | } 162 | 163 | LOGGING("Chunkrows is %I64d \n", CHUNKROWS); 164 | 165 | int64_t startRow = 0; 166 | 167 | if (THREADER && totalRows > 16384) { 168 | // Prepare for multithreading 169 | struct stConvertRec { 170 | char* pStartOffset; 171 | int64_t startRow; 172 | int64_t totalRows; 173 | stRecarrayOffsets* pstOffset; 174 | int64_t numArrays; 175 | int64_t itemSize; 176 | int64_t lastRow; 177 | } stConvert; 178 | 179 | int64_t items = (totalRows + (CHUNKSIZE - 1)) / CHUNKSIZE; 180 | 181 | stConvert.pStartOffset = pStartOffset; 182 | stConvert.startRow = startRow; 183 | stConvert.totalRows = totalRows; 184 | stConvert.pstOffset = pstOffset; 185 | stConvert.numArrays = numArrays; 186 | stConvert.itemSize = itemSize; 187 | stConvert.lastRow = items - 1; 188 | 189 | auto lambdaConvertRecCallback = [](void* callbackArgT, int core, int64_t workIndex) -> int64_t { 190 | stConvertRec* callbackArg = (stConvertRec*)callbackArgT; 191 | int64_t startRow = callbackArg->startRow + (workIndex * CHUNKSIZE); 192 | int64_t totalRows = startRow + CHUNKSIZE; 193 | 194 | if (totalRows > callbackArg->totalRows) { 195 | totalRows = callbackArg->totalRows; 196 | } 197 | 198 | ConvertRecArray( 199 | callbackArg->pStartOffset, 200 | startRow, 201 | totalRows, 202 | callbackArg->pstOffset, 203 | callbackArg->numArrays, 204 | callbackArg->itemSize); 205 | 206 | LOGGING("[%d] %lld completed\n", core, workIndex); 207 | return 1; 208 | }; 209 | 210 | THREADER->DoMultiThreadedWork((int)items, lambdaConvertRecCallback, 
&stConvert); 211 | 212 | } 213 | else { 214 | ConvertRecArray(pStartOffset, startRow, totalRows, pstOffset, numArrays, itemSize); 215 | } 216 | } 217 | #if defined(__clang__) 218 | #pragma clang attribute pop 219 | #endif 220 | 221 | -------------------------------------------------------------------------------- /.github/workflows/build_uploadpypi.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow that is manually triggered 2 | 3 | name: Manual workflow 4 | 5 | # Controls when the action will run. Workflow runs when manually triggered using the UI 6 | # or API. 7 | on: 8 | workflow_dispatch: 9 | # Inputs the workflow accepts. 10 | inputs: 11 | name: 12 | # Force prompt 13 | description: 'Confirm you wish to run' 14 | # Input has to be provided for the workflow to run 15 | required: true 16 | 17 | jobs: 18 | build: 19 | name: Build wheels ${{ matrix.os }}, ${{ matrix.platform }} 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | matrix: 23 | # the mac computer used by github actions is too old to run the tests 24 | # when fixed, add back macos-latest. notee: mac user can still download and use riptable 25 | os: [ubuntu-latest] #, windows-latest, macos-latest] 26 | python-version: [3.6, 3.7, 3.8, 3.9] 27 | platform: [x64] 28 | 29 | steps: 30 | - uses: actions/checkout@v2 31 | - name: Set up Python ${{ matrix.python-version }} 32 | uses: actions/setup-python@v2 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | architecture: x64 36 | - name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | python -m pip install numpy>=1.19.1 setuptools setuptools_scm cibuildwheel>=1.7.4 40 | 41 | # ======================= BUILD WHEELS AND UPLOAD TO PYPI ================================== 42 | 43 | - name: Build wheels (non-windows) ${{ matrix.python-version }} on ${{ matrix.os }} 44 | if: matrix.python-version == '3.8' && matrix.os != 'windows-latest' 45 | env: 46 | CIBW_BUILD: 'cp36-* cp37-* cp38-* cp39-*' 47 | CIBW_SKIP: 'cp27-* cp35-* *-manylinux_i686 *manylinux_aarch64 *manylinux_ppc64le *manylinux_s390x' 48 | PYPI_PASSWORD: ${{ secrets.pypi_password }} 49 | PYPI_USERNAME: ${{ secrets.pypi_username }} 50 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 51 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 52 | run: | 53 | pip install cibuildwheel pip setuptools_scm twine --upgrade; 54 | python -m cibuildwheel --output-dir dist; 55 | python -m twine upload dist/* --skip-existing --verbose; 56 | # python -m twine upload dist/* -u "$PYPI_USERNAME" -p "$PYPI_PASSWORD" --skip-existing --verbose; 57 | 58 | - name: Build wheels (windows) ${{ matrix.python-version }} on ${{ matrix.os }} 59 | if: matrix.python-version == '3.8' && matrix.os == 'windows-latest' 60 | env: 61 | CIBW_BUILD: 'cp36-* cp37-* cp38-*' 62 | #CIBW_BUILD: 'cp36-*' 63 | CIBW_SKIP: 'cp27-* cp35-* *-win32' 64 | PYPI_PASSWORD: ${{ secrets.pypi_password }} 65 | PYPI_USERNAME: ${{ secrets.pypi_username }} 66 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 67 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 68 | run: | 69 | pip install cibuildwheel twine --upgrade; 70 | python -m cibuildwheel --output-dir dist; 71 | python -m twine upload dist/* --skip-existing --verbose; 72 | # python -m twine upload dist/* --skip-existing --verbose; 73 | 74 | # - name: Build wheels on ${{ matrix.os}} 75 | # run: | 76 | # RUNNER_OS="${{ runner.os }}" 77 | # PLATFORM="${{ matrix.platform }}" 78 | # echo $RUNNER_OS 79 | # echo $PLATFORM 80 | # export 
CIBW_BUILD="cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64" 81 | # if [ "$RUNNER_OS" == "Windows" ]; then 82 | # if [ "$PLATFORM" == "x64" ]; then 83 | # export CIBW_BUILD="cp36-win_amd64 cp37-win_amd64 cp38-win_amd64" 84 | # elif [ "$PLATFORM" == "x32" ]; then 85 | # export CIBW_BUILD="cp37-win32" 86 | # fi 87 | # elif [ "$RUNNER_OS" == "Linux" ]; then 88 | # if [ "$PLATFORM" == "x64" ]; then 89 | # export CIBW_BUILD="cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64" 90 | # elif [ "$PLATFORM" == "x32" ]; then 91 | # export CIBW_BUILD="cp36-manylinux_i686 cp37-manylinux_i686 cp38-manylinux_i686" 92 | # fi 93 | # elif [ "$RUNNER_OS" == "macOS" ]; then 94 | # export CIBW_BUILD="cp36-macosx_x86_64 cp37-macosx_x86_64 cp38-macosx_x86_64" 95 | # fi 96 | # python -m cibuildwheel --output-dir wheelhouse 97 | # - uses: actions/upload-artifact@v2 98 | # with: 99 | # path: ./wheelhouse/*.whl 100 | 101 | # build_sdist: 102 | # name: Build source dist ${{ matrix.os }}, ${{ matrix.platform }} 103 | # runs-on: ${{ matrix.os }} 104 | # strategy: 105 | # matrix: 106 | # # the mac computer used by github actions is too old to run the tests 107 | # # build windows and mac the standward way 108 | # os: [windows-latest, macos-latest] 109 | # python-version: [3.6, 3.7, 3.8] 110 | # platform: [x64] 111 | # exclude: 112 | # - os: macos-latest 113 | # platform: x32 114 | # steps: 115 | # - uses: actions/checkout@v2 116 | # 117 | # - uses: actions/setup-python@v2 118 | # name: Install Python 119 | # with: 120 | # python-version: ${{ matrix.python-version }} 121 | # 122 | # - name: Build sdist 123 | # env: 124 | # TWINE_USERNAME: __token__ 125 | # TWINE_PASSWORD: ${{ secrets.pypi }} 126 | # run: | 127 | # python -m pip install --upgrade pip 128 | # python -m pip install numpy>=1.19.1 setuptools setuptools_scm wheel twine 129 | # python setup.py build --force 130 | # python setup.py install 131 | # python setup.py sdist 132 | # #python -m twine upload dist/* --skip-existing --verbose; 133 | # python -m twine upload dist/* -u "tdimitri" -p "!" 
--skip-existing --verbose; 134 | # # twine upload dist/* --verbose 135 | # - uses: actions/upload-artifact@v2 136 | # with: 137 | # path: dist/*.tar.gz 138 | 139 | wheels: 140 | name: wheels ${{ matrix.os }}, ${{ matrix.platform }} 141 | runs-on: ${{ matrix.os }} 142 | strategy: 143 | matrix: 144 | # the mac computer used by github actions is too old to run the tests 145 | # build windows and mac the standward way 146 | os: [windows-latest, macos-latest] 147 | python-version: [3.6, 3.7, 3.8, 3.9] 148 | platform: [x64] 149 | steps: 150 | - uses: actions/checkout@v2 151 | - name: Set up Python 3.x 152 | uses: actions/setup-python@v2 153 | with: 154 | python-version: ${{ matrix.python-version }} 155 | - name: Install dependencies 156 | run: python -m pip install --upgrade setuptools wheel numpy>=1.19.1 setuptools_scm twine 157 | - name: Build wheels 158 | env: 159 | PYPI_PASSWORD: ${{ secrets.pypi_password }} 160 | PYPI_USERNAME: ${{ secrets.pypi_username }} 161 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 162 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 163 | run: | 164 | python setup.py bdist_wheel 165 | python -m twine upload dist/* --skip-existing --verbose; 166 | - uses: actions/upload-artifact@v2 167 | with: 168 | name: dist 169 | path: dist 170 | 171 | # deploy: 172 | # if: ${{ github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/master' }} 173 | # # pnumpy 174 | # runs-on: ubuntu-latest 175 | # 176 | # steps: 177 | # - uses: actions/checkout@v2 178 | # with: 179 | # # Set fetch-depth to 0 so all history is retrieved; this is needed so we get the git tags 180 | # # which we use for setting the package version (via setuptools-scm). 181 | # fetch-depth: 0 182 | # - name: Set up Python 183 | # uses: actions/setup-python@v2 184 | # with: 185 | # python-version: '3.7' 186 | # - name: Install dependencies 187 | # run: | 188 | # python -m pip install --upgrade pip 189 | # python -m pip install setuptools setuptools_scm wheel twine 190 | # - name: Build and publish 191 | # env: 192 | # TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 193 | # TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 194 | # run: | 195 | # python setup.py sdist 196 | # twine upload dist/* --verbose 197 | 198 | -------------------------------------------------------------------------------- /src/pnumpy/ledger.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #if defined(_WIN32) 4 | 5 | // global scope 6 | typedef VOID(WINAPI* FuncGetSystemTime)(LPFILETIME); 7 | FuncGetSystemTime g_GetSystemTime; 8 | FILETIME g_TimeStart; 9 | static bool g_IsPreciseTime = false; 10 | 11 | 12 | //------------------------------------ 13 | // Returns windows time in Nanos 14 | __inline static uint64_t GetWindowsTime() { 15 | FILETIME timeNow; 16 | g_GetSystemTime(&timeNow); 17 | return (*(uint64_t*)&timeNow * 100) - 11644473600000000000L; 18 | } 19 | 20 | //------------------------------------------------------------------- 21 | // 22 | class CTimeStamp { 23 | public: 24 | CTimeStamp() 25 | { 26 | FARPROC fp; 27 | 28 | g_GetSystemTime = GetSystemTimeAsFileTime; 29 | 30 | HMODULE hModule = LoadLibraryW(L"kernel32.dll"); 31 | 32 | // Use printf instead of logging because logging is probably not up yet 33 | // Logging uses the timestamping, so timestamping loads first 34 | if (hModule != NULL) { 35 | fp = GetProcAddress(hModule, "GetSystemTimePreciseAsFileTime"); 36 | if (fp != NULL) { 37 | g_IsPreciseTime = true; 38 | //printf("Using precise 
GetSystemTimePreciseAsFileTime time...\n"); 39 | g_GetSystemTime = (VOID(WINAPI*)(LPFILETIME)) fp; 40 | } 41 | else { 42 | //LOGGING("**Using imprecise GetSystemTimeAsFileTime...\n"); 43 | } 44 | } 45 | else { 46 | printf("!! error load kernel32\n"); 47 | } 48 | 49 | } 50 | }; 51 | 52 | static CTimeStamp* g_TimeStamp = new CTimeStamp(); 53 | 54 | 55 | //--------------------------------------------------------- 56 | // Returns and int64_t nanosecs since unix epoch 57 | extern "C" 58 | PyObject* timer_getutc(PyObject* self, PyObject* args) { 59 | 60 | // return nano time since Unix Epoch 61 | return PyLong_FromLongLong((long long)GetWindowsTime()); 62 | } 63 | 64 | //--------------------------------------------------------- 65 | // Returns and uint64_t timestamp counter 66 | extern "C" 67 | PyObject* timer_gettsc(PyObject* self, PyObject* args) { 68 | 69 | // return tsc 70 | return PyLong_FromUnsignedLongLong(__rdtsc()); 71 | } 72 | 73 | 74 | 75 | #else 76 | 77 | #include 78 | #include 79 | #include 80 | 81 | uint64_t GetTimeStamp() { 82 | //struct timeval tv; 83 | //gettimeofday(&tv, NULL); 84 | //return tv.tv_sec*(uint64_t)1000000 + tv.tv_usec; 85 | 86 | struct timespec x; 87 | clock_gettime(CLOCK_REALTIME, &x); 88 | return x.tv_sec * 1000000000L + x.tv_nsec; 89 | } 90 | 91 | static __inline__ uint64_t rdtsc(void) 92 | { 93 | unsigned hi, lo; 94 | __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); 95 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 96 | } 97 | 98 | //--------------------------------------------------------- 99 | // Returns and uint64_t timestamp counter 100 | extern "C" 101 | PyObject* timer_gettsc(PyObject* self, PyObject* args) { 102 | 103 | // return tsc 104 | return PyLong_FromUnsignedLongLong(rdtsc()); 105 | } 106 | 107 | //--------------------------------------------------------- 108 | // Returns and int64_t nanosecs since unix epoch 109 | extern "C" 110 | PyObject* timer_getutc(PyObject* self, PyObject* args) { 111 | 112 | // return nano time since Unix Epoch 113 | return PyLong_FromLongLong(GetTimeStamp()); 114 | } 115 | 116 | #endif 117 | 118 | //--------------------------------------------------------- 119 | // Returns nanoseconds since utc epoch 120 | uint64_t GetUTCNanos() { 121 | #if defined(_WIN32) 122 | return GetWindowsTime(); 123 | #else 124 | return GetTimeStamp(); 125 | #endif 126 | } 127 | 128 | // See ATOP_TYPES 129 | static const char* gStrAtopTypes[]= { 130 | "bool", 131 | "int8", "uint8", 132 | "int16", "uint16", 133 | "int32", "uint32", 134 | "int64", "uint64", 135 | "int128", "uint128", 136 | "float16", "float32", "float64", "float80", 137 | "cfloat16", "cfloat32", "cfloat64", "cfloat80", 138 | "string", "unicode", 139 | "void", 140 | "last" 141 | }; 142 | 143 | 144 | struct stLEDGER_ITEM { 145 | const char* StrName; 146 | int64_t StartTime; 147 | int64_t TotalTime; 148 | 149 | int64_t ArrayLength1; 150 | int64_t ArrayLength2; 151 | int64_t ArrayLength3; // not valid for unary 152 | 153 | int32_t ArrayGroup; 154 | int32_t ArrayOp; 155 | int32_t AType; 156 | int32_t Reserved1; 157 | 158 | const char* StrCatName; 159 | const char* StrOpName; 160 | }; 161 | 162 | //----------------------------------------------------------- 163 | // allocated on 64 byte alignment 164 | struct stLedgerRing { 165 | // must be power of 2 for mask to work 166 | static const int64_t RING_BUFFER_SIZE = 8096; 167 | static const int64_t RING_BUFFER_MASK = 8095; 168 | 169 | volatile int64_t Head; 170 | volatile int64_t Tail; 171 | 172 | stLEDGER_ITEM 
LedgerQueue[RING_BUFFER_SIZE]; 173 | 174 | void Init() { 175 | Head = 0; 176 | Tail = 0; 177 | 178 | for (int i = 0; i < RING_BUFFER_SIZE; i++) { 179 | LedgerQueue[i].StrName = 0; 180 | LedgerQueue[i].StartTime = 0; 181 | LedgerQueue[i].TotalTime = 0; 182 | } 183 | } 184 | 185 | // Circular wrap around buffer 186 | // If (Head - Tail) > RING_BUFFER_SIZE then buffer has overflowed 187 | stLEDGER_ITEM* GetNextEntry() { 188 | return &LedgerQueue[RING_BUFFER_MASK & Tail++]; 189 | }; 190 | }; 191 | 192 | // Global ring buffer of last RING_BUFFER_SIZE math operations 193 | static stLedgerRing g_LedgerRing; 194 | 195 | // rough estimate of last op code 196 | #define MAX_FUNCOP 40 197 | const char* g_str_ufunc_name[OPCAT_LAST][MAX_FUNCOP]; 198 | 199 | void LedgerInit() { 200 | 201 | // Init the ring buffer that holds entries 202 | g_LedgerRing.Init(); 203 | 204 | // Build reverse lookup table 205 | for (int i = 0; i < OPCAT_LAST; i++) { 206 | stOpCategory* pstOpCategory = &gOpCategory[i]; 207 | for (int j = 0; j < pstOpCategory->NumOps; j++) { 208 | int k = pstOpCategory->pUFuncToAtop[j].atop_op; 209 | if (k >= 0 && k < MAX_FUNCOP) { 210 | // NOTE: can print out everything we hook here 211 | //printf("%d %d %s\n", i, k, pstOpCategory->pUFuncToAtop[j].str_ufunc_name); 212 | g_str_ufunc_name[i][k] = pstOpCategory->pUFuncToAtop[j].str_ufunc_name; 213 | } 214 | } 215 | } 216 | } 217 | 218 | //-------------------------------------------------- 219 | // When the ufunc is hooked, if the ledger is turned on it can be recorded. 220 | // The recording will go into the ring buffer for later retrieval. 221 | // The ring buffer only holds so much and can overflow 222 | void LedgerRecord(int32_t op_category, int64_t start_time, int64_t end_time, char** args, const npy_intp* dimensions, const npy_intp* steps, void* innerloop, int funcop, int atype) { 223 | int64_t deltaTime = end_time - start_time; 224 | 225 | stOpCategory* pstOpCategory = &gOpCategory[op_category]; 226 | 227 | // Get the next slot in the ring buffer 228 | stLEDGER_ITEM* pEntry = g_LedgerRing.GetNextEntry(); 229 | 230 | pEntry->ArrayGroup = op_category; 231 | pEntry->ArrayOp = funcop; 232 | pEntry->AType = atype; 233 | 234 | const char* strCatName = pstOpCategory->StrName; 235 | 236 | // Check for reduce operation 237 | if (op_category == OPCAT_BINARY && IS_BINARY_REDUCE) { 238 | strCatName = "Reduce"; 239 | } 240 | 241 | pEntry->StrCatName = strCatName; 242 | pEntry->StrOpName = g_str_ufunc_name[op_category][funcop]; 243 | pEntry->ArrayLength1 = (int64_t)dimensions[0]; 244 | pEntry->ArrayLength2 = (int64_t)dimensions[1]; 245 | 246 | // temporary for debugging print out results 247 | printf ("%lld \tlen: %lld %s, %s, %s\n", (long long)deltaTime, (long long)dimensions[0], pEntry->StrOpName, gStrAtopTypes[atype], strCatName); 248 | 249 | } 250 | 251 | void LedgerRecord2(int32_t op_category, int64_t start_time, int64_t end_time, int atype, int64_t length) { 252 | int64_t deltaTime = end_time - start_time; 253 | stOpCategory* pstOpCategory = &gOpCategory[op_category]; 254 | 255 | // Get the next slot in the ring buffer 256 | stLEDGER_ITEM* pEntry = g_LedgerRing.GetNextEntry(); 257 | 258 | pEntry->ArrayGroup = op_category; 259 | pEntry->ArrayOp = 0; 260 | pEntry->AType = atype; 261 | 262 | const char* strCatName = pstOpCategory->StrName; 263 | 264 | pEntry->StrCatName = strCatName; 265 | pEntry->StrOpName = "ledger2"; 266 | pEntry->ArrayLength1 = length; 267 | pEntry->ArrayLength2 = 0; 268 | 269 | // temporary for debugging print out results 270 | 
printf("%lld \tlen: %lld %s, %s, %s\n", (long long)deltaTime, (long long)length, pEntry->StrOpName, gStrAtopTypes[atype], strCatName); 271 | 272 | } 273 | 274 | -------------------------------------------------------------------------------- /src/atop/common_inc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #if defined(_WIN32) && !defined(__GNUC__) 10 | #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers 11 | #define NOMINMAX 12 | // Windows Header Files: 13 | #include 14 | #include 15 | #endif 16 | 17 | 18 | /* 19 | Macro symbol definitions to simplify conditional code compilation within riptide. 20 | 21 | References: 22 | * https://sourceforge.net/p/predef/wiki/Compilers/ 23 | 24 | */ 25 | 26 | /* 27 | Platform/OS detection 28 | */ 29 | 30 | #if defined(_WIN32) 31 | // Target OS is Windows 32 | # define RT_OS_WINDOWS 1 33 | 34 | #elif defined(__linux__) 35 | // Target OS is Linux 36 | # define RT_OS_LINUX 1 37 | 38 | // Target OS is UNIX-like 39 | # define RT_OS_FAMILY_UNIX 1 40 | 41 | #elif defined(__APPLE__) 42 | // Target OS is macOS or iOS 43 | # define RT_OS_DARWIN 1 44 | 45 | // Target OS is UNIX-like 46 | # define RT_OS_FAMILY_UNIX 1 47 | 48 | // Target OS is BSD-like 49 | # define RT_OS_FAMILY_BSD 1 50 | 51 | #elif __FreeBSD__ 52 | // Target OS is FreeBSD 53 | # define RT_OS_FREEBSD 1 54 | 55 | // Target OS is UNIX-like 56 | # define RT_OS_FAMILY_UNIX 1 57 | 58 | // Target OS is BSD-like 59 | # define RT_OS_FAMILY_BSD 1 60 | 61 | #else 62 | // If we can't detect the OS, make it a compiler error; compilation is likely to fail anyway due to 63 | // not having any working implementations of some functions, so at least we can make it obvious why 64 | // the compilation is failing. 65 | # error Unable to detect/classify the target OS. 66 | 67 | #endif /* Platform/OS detection */ 68 | 69 | 70 | /* 71 | Compiler detection. 72 | The order these detection checks operate in is IMPORTANT -- use CAUTION if changing or reordering them! 73 | */ 74 | 75 | #if defined(__clang__) 76 | // Compiler is Clang/LLVM. 77 | # define RT_COMPILER_CLANG 1 78 | 79 | #elif defined(__GNUC__) 80 | // Compiler is GCC/g++. 81 | # define RT_COMPILER_GCC 1 82 | 83 | #elif defined(__INTEL_COMPILER) || defined(_ICC) 84 | // Compiler is the Intel C/C++ compiler. 85 | # define RT_COMPILER_INTEL 1 86 | 87 | #elif defined(_MSC_VER) 88 | /* 89 | This check needs to be towards the end; a number of compilers (e.g. clang, Intel C/C++) 90 | define the _MSC_VER symbol when running on Windows, so putting this check last means we 91 | should have caught any of those already and this should be bona-fide MSVC. 92 | */ 93 | // Compiler is the Microsoft C/C++ compiler. 94 | # define RT_COMPILER_MSVC 1 95 | 96 | #else 97 | // Couldn't detect the compiler. 98 | // We could allow compilation to proceed anyway, but the compiler/platform behavior detection 99 | // below won't pass and it's important for correctness so this is an error. 100 | # error Unable to detect/classify the compiler being used. 101 | 102 | #endif /* compiler detection */ 103 | 104 | 105 | /* 106 | Compiler behavior detection. 107 | For conciseness/correctness in riptide code, we define some additional symbols here specifying certain 108 | compiler behaviors. 
This way any code depending on these behaviors expresses it in terms of the behavior 109 | rather than whether it's being compiled under a specific compiler(s) and/or platforms; this in turn 110 | makes it easier to support new compilers and platforms just by adding the necessary defines here. 111 | */ 112 | 113 | #if !defined(RT_COMPILER_MSVC) 114 | // Indicates whether the targeted compiler/platform defaults to emitting vector load/store operations 115 | // requiring an aligned pointer when a vector pointer is dereferenced (so any such pointers must be 116 | // aligned to prevent segfaults). When zero/false, the targeted compiler/platform emits unaligned 117 | // vector load/store instructions by default. 118 | # define RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED 1 119 | #else 120 | // Indicates whether the targeted compiler/platform defaults to emitting vector load/store operations 121 | // requiring an aligned pointer when a vector pointer is dereferenced (so any such pointers must be 122 | // aligned to prevent segfaults). When zero/false, the targeted compiler/platform emits unaligned 123 | // vector load/store instructions by default. 124 | # define RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED 0 125 | #endif /* RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED */ 126 | 127 | //------------------------------------------- 128 | //------------------------------------------- 129 | #define VOID void 130 | typedef void* PVOID; 131 | typedef void* LPVOID; 132 | typedef void* HANDLE; 133 | 134 | #define TRUE 1 135 | #define FALSE 0 136 | typedef int BOOL; 137 | typedef unsigned char BYTE; 138 | 139 | 140 | #if defined(_WIN32) && !defined(__GNUC__) 141 | #define WINAPI __stdcall 142 | #define InterlockedCompareExchange128 _InterlockedCompareExchange128 143 | #ifndef InterlockedAdd64 144 | #define InterlockedAdd64 _InterlockedAdd64 145 | #endif 146 | #ifndef InterlockedDecrement64 147 | #define InterlockedDecrement64 _InterlockedDecrement64 148 | #define InterlockedIncrement64 _InterlockedIncrement64 149 | #endif 150 | #define InterlockedIncrement _InterlockedIncrement 151 | #define InterlockedDecrement _InterlockedDecrement 152 | 153 | #define AtopInterlockedOr(X,Y) InterlockedOr64((int64_t*)X,Y) 154 | #define AtopInterlockedAnd(X,Y) InterlockedAnd64((int64_t*)X,Y) 155 | #define AtopInterlockedXor(X,Y) InterlockedXor64((int64_t*)X,Y) 156 | 157 | #include 158 | #ifndef MEM_ALIGN 159 | #define MEM_ALIGN(x) __declspec(align(x)) 160 | #define ALIGN(x) __declspec(align(64)) 161 | 162 | #define FORCEINLINE __forceinline 163 | #define FORCE_INLINE __forceinline 164 | 165 | #define ALIGNED_ALLOC(Size,Alignment) _aligned_malloc(Size,Alignment) 166 | #define ALIGNED_FREE(block) _aligned_free(block) 167 | 168 | #define lzcnt_64 _lzcnt_u64 169 | 170 | #endif 171 | #else 172 | 173 | #define WINAPI 174 | #include 175 | 176 | // consider sync_add_and_fetch 177 | #define InterlockedAdd64(val, len) (__sync_fetch_and_add(val, len) + len) 178 | #define InterlockedIncrement64(val) (__sync_fetch_and_add(val, 1) + 1) 179 | #define InterlockedIncrement(val) (__sync_fetch_and_add(val, 1) + 1) 180 | #define InterlockedDecrement(val) (__sync_fetch_and_add(val, -1) - 1) 181 | #define AtopInterlockedOr(val, bitpos) (__sync_fetch_and_or(val, bitpos)) 182 | #define AtopInterlockedAnd(val, bitpos) (__sync_fetch_and_and(val, bitpos)) 183 | #define AtopInterlockedXor(val, bitpos) (__sync_fetch_and_xor(val, bitpos)) 184 | 185 | #ifndef __GNUC_PREREQ 186 | #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 
16) + (minor))) 187 | #endif 188 | #ifndef MEM_ALIGN 189 | #define MEM_ALIGN(x) __attribute__((aligned(x))) 190 | #endif 191 | 192 | #define FORCEINLINE inline __attribute__((always_inline)) 193 | #define FORCE_INLINE inline __attribute__((always_inline)) 194 | #define ALIGN(x) x __attribute__((aligned(64))) 195 | 196 | // Workaround for platforms/compilers which don't support C11 aligned_alloc 197 | // but which do have posix_memalign(). 198 | #ifndef aligned_alloc 199 | 200 | #ifdef posix_memalign 201 | FORCEINLINE void* aligned_alloc(size_t alignment, size_t size) 202 | { 203 | void* buffer = NULL; 204 | posix_memalign(&buffer, alignment, size); 205 | return buffer; 206 | } 207 | 208 | #else 209 | // clang compiler does not support so we default to malloc 210 | //#warning Unable to determine how to perform aligned allocations on this platform. 211 | #define aligned_alloc(alignment, size) malloc(size) 212 | #endif // defined(posix_memalign) 213 | 214 | #endif // !defined(aligned_alloc) 215 | 216 | #define ALIGNED_ALLOC(Size,Alignment) aligned_alloc(Alignment,Size) 217 | #define ALIGNED_FREE(block) free(block) 218 | 219 | #define lzcnt_64 __builtin_clzll 220 | 221 | #endif 222 | 223 | // To detect CPU features like AVX-256 224 | typedef struct { 225 | uint32_t f1c; 226 | uint32_t f1d; 227 | uint32_t f7b; 228 | uint32_t f7c; 229 | } ATOP_cpuid_t; 230 | 231 | // Missing types include 232 | // Half Float 233 | // A bool that takes up one bit 234 | // 2 byte unicode 235 | // pointers to variable length strings of 1,2,4 itemsize 236 | enum ATOP_TYPES { 237 | ATOP_BOOL = 0, 238 | ATOP_INT8, ATOP_UINT8, 239 | ATOP_INT16, ATOP_UINT16, 240 | ATOP_INT32, ATOP_UINT32, 241 | ATOP_INT64, ATOP_UINT64, 242 | ATOP_INT128, ATOP_UINT128, 243 | ATOP_HALF_FLOAT, ATOP_FLOAT, ATOP_DOUBLE, ATOP_LONGDOUBLE, // 11, 12, 13, 14 244 | ATOP_CHALF_FLOAT, ATOP_CFLOAT, ATOP_CDOUBLE, ATOP_CLONGDOUBLE, 245 | ATOP_STRING, ATOP_UNICODE, 246 | ATOP_VOID, 247 | ATOP_LAST 248 | }; 249 | 250 | enum COMP_OPERATION { 251 | // Two inputs, Always return a bool 252 | CMP_EQ = 0, 253 | CMP_NE = 1, 254 | CMP_LT = 2, 255 | CMP_GT = 3, 256 | CMP_LTE = 4, 257 | CMP_GTE = 5, 258 | CMP_LAST = 6, 259 | }; 260 | 261 | enum UNARY_OPERATION { 262 | UNARY_INVALID = 0, 263 | 264 | // One input, returns same data type 265 | ABS = 1, 266 | SIGNBIT = 2, 267 | FABS = 3, 268 | INVERT = 4, 269 | FLOOR = 5, 270 | CEIL = 6, 271 | TRUNC = 7, 272 | ROUND = 8, 273 | NEGATIVE = 9, 274 | POSITIVE = 10, 275 | SIGN = 11, 276 | RINT = 12, 277 | 278 | // One input, always return a float one input 279 | SQRT = 15, 280 | SQUARE = 16, 281 | RECIPROCAL = 17, 282 | 283 | // one input, output bool 284 | LOGICAL_NOT = 18, 285 | ISINF = 19, 286 | ISNAN = 20, 287 | ISFINITE = 21, 288 | ISNORMAL = 22, 289 | 290 | ISNOTINF = 23, 291 | ISNOTNAN = 24, 292 | ISNOTFINITE = 25, 293 | ISNOTNORMAL = 26, 294 | ISNANORZERO = 27, 295 | 296 | // One input, does not allow floats 297 | BITWISE_NOT = 28, // same as invert? 
298 | 299 | UNARY_LAST = 35, 300 | }; 301 | 302 | enum BINARY_OPERATION { 303 | BINARY_INVALID = 0, 304 | 305 | // Two ops, returns same type 306 | ADD = 1, 307 | SUB = 2, 308 | MUL = 3, 309 | MOD = 4, // Warning: there are two mods - C,Java mod and Python mod 310 | 311 | MIN = 5, 312 | MAX = 6, 313 | NANMIN = 7, 314 | NANMAX = 8, 315 | FLOORDIV = 9, 316 | POWER = 10, 317 | REMAINDER = 11, 318 | FMOD = 12, 319 | 320 | // Two ops, always return a double 321 | DIV = 13, 322 | SUBDATETIMES = 14, // returns double 323 | SUBDATES = 15, // returns int 324 | 325 | // Two inputs, Always return a bool 326 | LOGICAL_AND = 16, 327 | LOGICAL_XOR = 17, 328 | LOGICAL_OR = 18, 329 | 330 | // Two inputs, second input must be int based 331 | BITWISE_LSHIFT = 19, //left_shift 332 | BITWISE_RSHIFT = 20, 333 | BITWISE_AND = 21, 334 | BITWISE_XOR = 22, 335 | BITWISE_OR = 23, 336 | BITWISE_ANDNOT = 24, 337 | BITWISE_NOTAND = 25, 338 | BITWISE_XOR_SPECIAL = 26, 339 | 340 | ATAN2 = 27, 341 | HYPOT = 28, 342 | 343 | BINARY_LAST = 29, 344 | }; 345 | 346 | enum TRIG_OPERATION { 347 | TRIG_INVALID = 0, 348 | // One op, returns same type 349 | SIN = 1, 350 | COS = 2, 351 | TAN = 3, 352 | ASIN = 4, 353 | ACOS = 5, 354 | ATAN = 6, 355 | SINH = 7, 356 | COSH = 8, 357 | TANH = 9, 358 | ASINH = 10, 359 | ACOSH = 11, 360 | ATANH = 12, 361 | 362 | LOG = 13, 363 | LOG2 = 14, 364 | LOG10 = 15, 365 | EXP = 16, 366 | EXP2 = 17, 367 | EXPM1 = 18, 368 | LOG1P = 19, 369 | CBRT = 20, 370 | 371 | TRIG_LAST = 21 372 | }; 373 | 374 | //---------------------------------------------------------------------------------- 375 | // Lookup to go from 1 byte to 8 byte boolean values 376 | extern int64_t gBooleanLUT64[256]; 377 | extern int32_t gBooleanLUT32[16]; 378 | 379 | extern int64_t gBooleanLUT64Inverse[256]; 380 | extern int32_t gBooleanLUT32Inverse[16]; 381 | 382 | typedef void(*UNARY_FUNC)(void* pDataIn, void* pDataOut, int64_t len, int64_t strideIn, int64_t strideOut); 383 | // Pass in two vectors and return one vector 384 | // Used for operations like C = A + B 385 | typedef void(*ANY_TWO_FUNC)(void* pDataIn, void* pDataIn2, void* pDataOut, int64_t len, int64_t strideIn1, int64_t strideIn2, int64_t strideOut); 386 | typedef void(*GROUPBY_FUNC)(void* pstGroupBy, int64_t index); 387 | typedef void(*REDUCE_FUNC)(void* pDataIn1X, void* pDataOutX, void* pStartVal, int64_t datalen, int64_t strideIn); 388 | 389 | 390 | //====================================================== 391 | // Unary 392 | //------------------------------------------------------ 393 | // Macro stub for returning None 394 | #define STRIDE_NEXT(_TYPE_, _MEM_, _STRIDE_) (_TYPE_*)((char*)_MEM_ + _STRIDE_) 395 | 396 | 397 | //-------------------------------------------------------------------- 398 | // multithreaded struct used for calling unary op codes 399 | struct UNARY_CALLBACK { 400 | UNARY_FUNC pUnaryCallback; 401 | 402 | char* pDataIn; 403 | char* pDataOut; 404 | 405 | int64_t itemSizeIn; 406 | int64_t itemSizeOut; 407 | }; 408 | 409 | 410 | //==================================================================== 411 | void* FmAlloc(size_t _Size); 412 | void FmFree(void* _Block); 413 | 414 | #define WORKSPACE_ALLOC FmAlloc 415 | #define WORKSPACE_FREE FmFree 416 | 417 | // Default stack size in linux is 320KB 418 | #define MAX_STACK_ALLOC (1024) 419 | 420 | // For small buffers that can be allocated on the stack 421 | #if defined(_WIN32) && !defined(__GNUC__) 422 | #define POSSIBLY_STACK_ALLOC(_alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? 
(char*)WORKSPACE_ALLOC(_alloc_size_) : (char*)_malloca(_alloc_size_); 423 | #define POSSIBLY_STACK_ALLOC_TYPE(_TYPE_, _alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? (_TYPE_)WORKSPACE_ALLOC(_alloc_size_) : (_TYPE_)_malloca(_alloc_size_); 424 | #else 425 | #define POSSIBLY_STACK_ALLOC(_alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? (char*)WORKSPACE_ALLOC(_alloc_size_) : (char*)alloca(_alloc_size_); 426 | #define POSSIBLY_STACK_ALLOC_TYPE(_TYPE_, _alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? (_TYPE_)WORKSPACE_ALLOC(_alloc_size_) : (_TYPE_)alloca(_alloc_size_); 427 | #endif 428 | #define POSSIBLY_STACK_FREE(_alloc_size_, _mem_ptr_) if (_alloc_size_ > MAX_STACK_ALLOC) WORKSPACE_FREE(_mem_ptr_); 429 | 430 | //======================================================================= 431 | // Conversions 432 | struct stRecarrayOffsets { 433 | char* pData; 434 | int64_t readoffset; 435 | int64_t itemsize; 436 | }; 437 | 438 | extern "C" void RecArrayToColMajor( 439 | stRecarrayOffsets* pstOffset, 440 | char* pStartOffset, 441 | int64_t totalRows, 442 | int64_t numArrays, 443 | int64_t itemSize); 444 | 445 | //===================================================================== 446 | // Sorting 447 | enum SORT_MODE { 448 | SORT_MODE_QSORT = 0, 449 | SORT_MODE_HEAP = 1, 450 | SORT_MODE_MERGE = 2, 451 | }; 452 | 453 | extern "C" int64_t IsSorted(void* pDataIn1,int64_t arraySize1, int32_t arrayType1, int64_t itemSize); 454 | extern "C" int SortIndex32( 455 | int64_t * pCutOffs, 456 | int64_t cutOffLength, 457 | void* pDataIn1, 458 | int64_t arraySize1, 459 | int32_t * pDataOut1, 460 | SORT_MODE mode, 461 | int arrayType1, 462 | int64_t strlen); 463 | 464 | extern "C" int SortIndex64( 465 | int64_t * pCutOffs, 466 | int64_t cutOffLength, 467 | void* pDataIn1, 468 | int64_t arraySize1, 469 | int64_t * pDataOut1, 470 | SORT_MODE mode, 471 | int arrayType1, 472 | int64_t strlen); 473 | 474 | 475 | typedef int64_t(*GROUP_INDEX_FUNC)( 476 | void* pDataIn1, 477 | int64_t arraySize1V, 478 | void* pDataIndexInV, 479 | void* pGroupOutV, 480 | void* pFirstOutV, 481 | void* pCountOutV, 482 | bool* pFilter, // optional 483 | int64_t base_index, 484 | int64_t strlen); 485 | 486 | 487 | extern "C" int64_t GroupIndex32( 488 | void* pDataIn1, 489 | int64_t arraySize1V, 490 | void* pDataIndexInV, 491 | void* pGroupOutV, 492 | void* pFirstOutV, 493 | void* pCountOutV, 494 | bool* pFilter, // optional 495 | int64_t base_index, 496 | int64_t strlen); 497 | 498 | extern "C" int64_t GroupIndex64( 499 | void* pDataIn1, 500 | int64_t arraySize1V, 501 | void* pDataIndexInV, 502 | void* pGroupOutV, 503 | void* pFirstOutV, 504 | void* pCountOutV, 505 | bool* pFilter, // optional 506 | int64_t base_index, 507 | int64_t strlen); 508 | 509 | extern "C" int Sort( 510 | SORT_MODE sortmode, 511 | int atype, 512 | void* pDataIn, 513 | int64_t arrayLength, 514 | int64_t stridesIn, 515 | int64_t itemSize, 516 | void* pDataOut1, 517 | int64_t stridesOut); 518 | 519 | extern "C" int ArangeFill( 520 | int atype, 521 | char* pBuffer, 522 | void* pFirstValue, 523 | void* pSecondValue, 524 | int64_t length, 525 | int32_t threadwakeup); 526 | 527 | -------------------------------------------------------------------------------- /src/atop/threads.cpp: -------------------------------------------------------------------------------- 1 | #include "threads.h" 2 | 3 | // to debug thread wakeup allow LOGGING to printf 4 | //#define LOGGING printf 5 | #define LOGGING(...) 
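//-----------------------------------------------------------------------
// Usage sketch (illustrative only; distilled from the call site in
// src/atop/recarray.cpp, not a definitive API description): callers
// split a job into fixed-size chunks and hand it to the worker pool via
// THREADER->DoMultiThreadedWork().  The callback receives the worker
// core number and a chunk index and returns non-zero when it completed
// work.  `numChunks` and `myJob` below are hypothetical names:
//
//   auto cb = [](void* callbackArg, int core, int64_t workIndex) -> int64_t {
//       // process chunk `workIndex` of the job described by callbackArg
//       return 1;
//   };
//   if (THREADER)
//       THREADER->DoMultiThreadedWork((int)numChunks, cb, &myJob);
//   else
//       ;  // fall back to doing the whole job on the calling thread
//-----------------------------------------------------------------------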
6 | #define LOGERROR printf 7 | 8 | #if defined(RT_OS_DARWIN) 9 | /* For MacOS use a conditional wakeup */ 10 | pthread_cond_t g_WakeupCond = PTHREAD_COND_INITIALIZER; 11 | pthread_mutex_t g_WakeupMutex = PTHREAD_MUTEX_INITIALIZER; 12 | #endif 13 | 14 | 15 | #if defined(RT_OS_WINDOWS) 16 | WakeSingleAddress g_WakeSingleAddress = InitWakeCalls(); 17 | WakeAllAddress g_WakeAllAddress; 18 | WaitAddress g_WaitAddress; 19 | 20 | //----------------------------------------------------------------- 21 | // Not every version of Windows has this useful API so we have to check for it dynamically 22 | WakeSingleAddress InitWakeCalls() 23 | { 24 | FARPROC fp; 25 | 26 | HMODULE hModule = LoadLibraryW(L"kernelbase.dll"); 27 | 28 | if (hModule != NULL) { 29 | fp = GetProcAddress(hModule, "WakeByAddressSingle"); 30 | if (fp != NULL) { 31 | //LogInform("**System supports WakeByAddressSingle ...\n"); 32 | g_WakeSingleAddress = (VOID(WINAPI*)(PVOID)) fp; 33 | 34 | fp = GetProcAddress(hModule, "WakeByAddressAll"); 35 | g_WakeAllAddress = (WakeAllAddress)fp; 36 | 37 | fp = GetProcAddress(hModule, "WaitOnAddress"); 38 | g_WaitAddress = (WaitAddress)fp; 39 | 40 | } 41 | else { 42 | LOGERROR("**System does NOT support WakeByAddressSingle ...\n"); 43 | g_WakeSingleAddress = NULL; 44 | g_WakeAllAddress = NULL; 45 | g_WaitAddress = NULL; 46 | 47 | } 48 | } 49 | 50 | return g_WakeSingleAddress; 51 | } 52 | 53 | #else 54 | WakeSingleAddress g_WakeSingleAddress = NULL; 55 | WakeAllAddress g_WakeAllAddress = NULL; 56 | WaitAddress g_WaitAddress = NULL; 57 | #endif 58 | 59 | 60 | //----------------------------------------------------------- 61 | // Main thread loop 62 | // Threads will wait on an address then wake up when there is work 63 | // Linux uses a futex to control how many threads wakeup 64 | // Windows uses a counter 65 | // Darwin (macOS) does not support futexes or WaitOnAddress, so it will need to use one of: 66 | // * POSIX condition variables 67 | // * C++11 condition variables from 68 | // * libdispatch (GCD), using dispatch_semaphore_t (via dispatch_semaphore_create()) to control concurrency; include 69 | // * BSD syscalls like __psynch_cvwait (and other __psynch functions). These are not externally documented -- need to look in github.com/apple/darwin-libpthread to see how things work. 
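// Illustrative sketch (not part of the original source): the condition-
// variable fallback used for macOS reduces to the classic wait/broadcast
// pattern on g_WakeupCond / g_WakeupMutex.  The worker side appears
// verbatim in the RT_OS_DARWIN branch of the loop below; the producer
// side shown here is a hypothetical counterpart for whatever code posts
// new work:
//
//   // worker thread: sleep until something changes
//   pthread_mutex_lock(&g_WakeupMutex);
//   pthread_cond_wait(&g_WakeupCond, &g_WakeupMutex);
//   pthread_mutex_unlock(&g_WakeupMutex);
//
//   // work producer (hypothetical): wake every sleeping worker
//   pthread_cond_broadcast(&g_WakeupCond);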
70 | // 71 | #if defined(RT_OS_WINDOWS) 72 | DWORD WINAPI WorkerThreadFunction(LPVOID lpParam) 73 | #else 74 | void* 75 | WorkerThreadFunction(void* lpParam) 76 | #endif 77 | { 78 | stWorkerRing* pWorkerRing = (stWorkerRing*)lpParam; 79 | 80 | DWORD core = (DWORD)(InterlockedIncrement64(&pWorkerRing->WorkThread)); 81 | // The first 3 get assigned 1,2,3 and main is reserved for 0 and pool 0 82 | // The next (3,4,5,6) are assigned 4,5,6,7 and pool 1 83 | DWORD pool = core / MAX_WORKER_CHANNEL; 84 | 85 | LOGGING("Thread created with parameter: %d %p\n", core, g_WaitAddress); 86 | 87 | // On windows we set the thread affinity mask 88 | if (g_WaitAddress != NULL) { 89 | DWORD tempcore = core - 1; 90 | // If hyperthreading is on skip every other core 91 | if (THREADER->GlobalWorkerParams.HyperThreading) 92 | tempcore = tempcore * 2; 93 | uint64_t ret = SetThreadAffinityMask(GetCurrentThread(), (uint64_t)1 << tempcore); 94 | //uint64_t ret = SetThreadAffinityMask(GetCurrentThread(), 0xFFFFFFFF); 95 | } 96 | 97 | int64_t lastWorkItemCompleted = -1; 98 | 99 | // 100 | // Setting Cancelled will stop all worker threads 101 | // 102 | while (THREADER->GlobalWorkerParams.Cancelled == 0) { 103 | int64_t workIndexCompleted; 104 | int64_t workIndex; 105 | 106 | workIndex = pWorkerRing->Pool[pool].WorkIndex; 107 | workIndexCompleted = pWorkerRing->Pool[pool].WorkIndexCompleted; 108 | 109 | int64_t didSomeWork = 0; 110 | 111 | // See if work to do 112 | if (workIndex > workIndexCompleted) { 113 | stMATH_WORKER_ITEM* pWorkItem = pWorkerRing->GetExistingWorkItem(); 114 | 115 | // check if the work was for our thread 116 | if ((int64_t)core <= pWorkItem->ThreadWakeup) { 117 | LOGGING("Pos Waking %d %d %lld\n", core, pool, workIndex); 118 | didSomeWork = pWorkItem->DoWork(core, pWorkerRing->MainWorkIndex); 119 | } 120 | else { 121 | LOGGING("Not Waking %d %d %lld\n", core, pool, workIndex); 122 | 123 | } 124 | } 125 | 126 | // didSomeWork contains how many work items the thread completed 127 | // TODO: Use core as an index to keep stats track of how many 128 | // work items each thread is completing for future thread tuning 129 | // 130 | // NOTE: if we did some work, we loop back to top while to check for more work 131 | // before waiting again on the worker Q 132 | // 133 | if (!didSomeWork) { 134 | workIndexCompleted = workIndex; 135 | 136 | #if defined(RT_OS_WINDOWS) 137 | //printf("Sleeping %d", core); 138 | if (g_WaitAddress == NULL) { 139 | // For Windows 7 we just sleep 140 | Sleep(THREADER->GlobalWorkerParams.SleepTime); 141 | } 142 | else { 143 | if (!didSomeWork) { 144 | 145 | //workIndexCompleted++; 146 | } 147 | 148 | LOGGING("[%d][%d] WaitAddress %llu %p %d\n", core, pool, workIndexCompleted, &(pWorkerRing->Pool[pool].WorkIndex), (int)didSomeWork); 149 | 150 | // Otherwise wake up using conditional variable 151 | g_WaitAddress( 152 | &(pWorkerRing->Pool[pool].WorkIndex), 153 | (PVOID)&workIndexCompleted, 154 | 8, // The size of the value being waited on (i.e. the number of bytes to read from the two pointers then compare). 
155 | 1000000L); 156 | } 157 | #elif defined(RT_OS_LINUX) 158 | 159 | LOGGING("[%d] WaitAddress %llu %llu %d\n", core, workIndexCompleted, pWorkerRing->Pool[pool].WorkIndex, (int)didSomeWork); 160 | 161 | //int futex(int *uaddr, int futex_op, int val, 162 | // const struct timespec *timeout, /* or: uint32_t val2 */ 163 | // int *uaddr2, int val3); 164 | futex((int*)&(pWorkerRing->Pool[pool].WorkIndex), FUTEX_WAIT, (int)workIndexCompleted, NULL, NULL, 0); 165 | 166 | #elif defined(RT_OS_DARWIN) 167 | LOGGING("[%lu] WaitAddress %llu %llu %d\n", core, workIndexCompleted, pWorkerRing->Pool[pool].WorkIndex, (int)didSomeWork); 168 | 169 | pthread_mutex_lock(&g_WakeupMutex); 170 | pthread_cond_wait(&g_WakeupCond, &g_WakeupMutex); 171 | pthread_mutex_unlock(&g_WakeupMutex); 172 | 173 | #else 174 | #error riptide MathThreads support needs to be implemented for this platform. 175 | 176 | #endif 177 | 178 | LOGGING("Waking %d %d\n", core, pool); 179 | 180 | //YieldProcessor(); 181 | } 182 | //YieldProcessor(); 183 | } 184 | 185 | LOGERROR("Thread %d exiting!!!\n", (int)core); 186 | #if defined(RT_OS_WINDOWS) 187 | return 0; 188 | #else 189 | return NULL; 190 | #endif 191 | } 192 | 193 | 194 | #if defined(RT_OS_WINDOWS) 195 | 196 | //----------------------------------------------------------- 197 | // 198 | THANDLE StartThread(stWorkerRing* pWorkerRing) 199 | { 200 | DWORD dwThreadId; 201 | THANDLE hThread; 202 | 203 | hThread = CreateThread( 204 | NULL, // default security attributes 205 | 0, // use default stack size 206 | WorkerThreadFunction, // thread function 207 | pWorkerRing, // argument to thread function 208 | 0, // use default creation flags 209 | &dwThreadId); // returns the thread identifier 210 | 211 | //printf("The thread ID: %d.\n", dwThreadId); 212 | 213 | // Check the return value for success. If something wrong... 
214 | if (hThread == NULL) { 215 | LOGERROR("CreateThread() failed, error: %d.\n", GetLastError()); 216 | return NULL; 217 | } 218 | 219 | return hThread; 220 | 221 | } 222 | 223 | #else 224 | 225 | //----------------------------------------------------------- 226 | // 227 | THANDLE StartThread(stWorkerRing* pWorkerRing) 228 | { 229 | int err; 230 | THANDLE hThread; 231 | 232 | err = pthread_create(&hThread, NULL, &WorkerThreadFunction, pWorkerRing); 233 | 234 | if (err != 0) { 235 | LOGERROR("*** Cannot create thread :[%s]\n", strerror(err)); 236 | } 237 | 238 | return hThread; 239 | } 240 | #endif 241 | 242 | //============================================================================================ 243 | #if defined(__GNUC__) 244 | # define MEM_STATIC static __inline __attribute__((unused)) 245 | #elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) 246 | # define MEM_STATIC static inline 247 | #elif defined(_MSC_VER) 248 | # define MEM_STATIC static __inline 249 | #else 250 | # define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ 251 | #endif 252 | 253 | typedef unsigned int U32; 254 | 255 | // Taken from the ZSTD project 256 | MEM_STATIC ATOP_cpuid_t ATOP_cpuid(void) { 257 | U32 f1c = 0; 258 | U32 f1d = 0; 259 | U32 f7b = 0; 260 | U32 f7c = 0; 261 | #ifdef _MSC_VER 262 | int reg[4]; 263 | __cpuid((int*)reg, 0); 264 | { 265 | int const n = reg[0]; 266 | if (n >= 1) { 267 | __cpuid((int*)reg, 1); 268 | f1c = (U32)reg[2]; 269 | f1d = (U32)reg[3]; 270 | } 271 | if (n >= 7) { 272 | __cpuidex((int*)reg, 7, 0); 273 | f7b = (U32)reg[1]; 274 | f7c = (U32)reg[2]; 275 | } 276 | } 277 | #elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) 278 | /* The following block like the normal cpuid branch below, but gcc 279 | * reserves ebx for use of its pic register so we must specially 280 | * handle the save and restore to avoid clobbering the register 281 | */ 282 | U32 n; 283 | __asm__( 284 | "pushl %%ebx\n\t" 285 | "cpuid\n\t" 286 | "popl %%ebx\n\t" 287 | : "=a"(n) 288 | : "a"(0) 289 | : "ecx", "edx"); 290 | if (n >= 1) { 291 | U32 f1a; 292 | __asm__( 293 | "pushl %%ebx\n\t" 294 | "cpuid\n\t" 295 | "popl %%ebx\n\t" 296 | : "=a"(f1a), "=c"(f1c), "=d"(f1d) 297 | : "a"(1)); 298 | } 299 | if (n >= 7) { 300 | __asm__( 301 | "pushl %%ebx\n\t" 302 | "cpuid\n\t" 303 | "movl %%ebx, %%eax\n\r" 304 | "popl %%ebx" 305 | : "=a"(f7b), "=c"(f7c) 306 | : "a"(7), "c"(0) 307 | : "edx"); 308 | } 309 | #elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) 310 | U32 n; 311 | __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx"); 312 | if (n >= 1) { 313 | U32 f1a; 314 | __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx"); 315 | } 316 | if (n >= 7) { 317 | U32 f7a; 318 | __asm__("cpuid" 319 | : "=a"(f7a), "=b"(f7b), "=c"(f7c) 320 | : "a"(7), "c"(0) 321 | : "edx"); 322 | } 323 | #endif 324 | { 325 | ATOP_cpuid_t cpuid; 326 | cpuid.f1c = f1c; 327 | cpuid.f1d = f1d; 328 | cpuid.f7b = f7b; 329 | cpuid.f7c = f7c; 330 | return cpuid; 331 | } 332 | } 333 | 334 | #define X(name, r, bit) \ 335 | MEM_STATIC int ATOP_cpuid_##name(ATOP_cpuid_t const cpuid) { \ 336 | return ((cpuid.r) & (1U << bit)) != 0; \ 337 | } 338 | 339 | /* cpuid(1): Processor Info and Feature Bits. 
*/ 340 | #define C(name, bit) X(name, f1c, bit) 341 | C(sse3, 0) 342 | C(pclmuldq, 1) 343 | C(dtes64, 2) 344 | C(monitor, 3) 345 | C(dscpl, 4) 346 | C(vmx, 5) 347 | C(smx, 6) 348 | C(eist, 7) 349 | C(tm2, 8) 350 | C(ssse3, 9) 351 | C(cnxtid, 10) 352 | C(fma, 12) 353 | C(cx16, 13) 354 | C(xtpr, 14) 355 | C(pdcm, 15) 356 | C(pcid, 17) 357 | C(dca, 18) 358 | C(sse41, 19) 359 | C(sse42, 20) 360 | C(x2apic, 21) 361 | C(movbe, 22) 362 | C(popcnt, 23) 363 | C(tscdeadline, 24) 364 | C(aes, 25) 365 | C(xsave, 26) 366 | C(osxsave, 27) 367 | C(avx, 28) 368 | C(f16c, 29) 369 | C(rdrand, 30) 370 | #undef C 371 | #define D(name, bit) X(name, f1d, bit) 372 | D(fpu, 0) 373 | D(vme, 1) 374 | D(de, 2) 375 | D(pse, 3) 376 | D(tsc, 4) 377 | D(msr, 5) 378 | D(pae, 6) 379 | D(mce, 7) 380 | D(cx8, 8) 381 | D(apic, 9) 382 | D(sep, 11) 383 | D(mtrr, 12) 384 | D(pge, 13) 385 | D(mca, 14) 386 | D(cmov, 15) 387 | D(pat, 16) 388 | D(pse36, 17) 389 | D(psn, 18) 390 | D(clfsh, 19) 391 | D(ds, 21) 392 | D(acpi, 22) 393 | D(mmx, 23) 394 | D(fxsr, 24) 395 | D(sse, 25) 396 | D(sse2, 26) 397 | D(ss, 27) 398 | D(htt, 28) 399 | D(tm, 29) 400 | D(pbe, 31) 401 | #undef D 402 | 403 | /* cpuid(7): Extended Features. */ 404 | #define B(name, bit) X(name, f7b, bit) 405 | B(bmi1, 3) 406 | B(hle, 4) 407 | B(avx2, 5) 408 | B(smep, 7) 409 | B(bmi2, 8) 410 | B(erms, 9) 411 | B(invpcid, 10) 412 | B(rtm, 11) 413 | B(mpx, 14) 414 | B(avx512f, 16) 415 | B(avx512dq, 17) 416 | B(rdseed, 18) 417 | B(adx, 19) 418 | B(smap, 20) 419 | B(avx512ifma, 21) 420 | B(pcommit, 22) 421 | B(clflushopt, 23) 422 | B(clwb, 24) 423 | B(avx512pf, 26) 424 | B(avx512er, 27) 425 | B(avx512cd, 28) 426 | B(sha, 29) 427 | B(avx512bw, 30) 428 | B(avx512vl, 31) 429 | #undef B 430 | #define C(name, bit) X(name, f7c, bit) 431 | C(prefetchwt1, 0) 432 | C(avx512vbmi, 1) 433 | #undef C 434 | 435 | #undef X 436 | 437 | extern "C" { 438 | int g_bmi2 = 0; 439 | int g_avx2 = 0; 440 | ATOP_cpuid_t g_cpuid; 441 | }; 442 | 443 | #if defined(RT_OS_WINDOWS) 444 | 445 | void PrintCPUInfo(char* buffer, size_t buffercount) { 446 | int CPUInfo[4] = { -1 }; 447 | unsigned nExIds, i = 0; 448 | char CPUBrandString[0x40]; 449 | // Get the information associated with each extended ID. 
450 | __cpuid(CPUInfo, 0x80000000); 451 | nExIds = CPUInfo[0]; 452 | 453 | for (unsigned int i = 0x80000000; i <= nExIds; ++i) 454 | { 455 | __cpuid(CPUInfo, i); 456 | 457 | if (i == 0x80000002) { 458 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 459 | CPUBrandString[i] = ((char*)CPUInfo)[i]; 460 | } 461 | else if (i == 0x80000003) { 462 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 463 | CPUBrandString[i + 16] = ((char*)CPUInfo)[i]; 464 | } 465 | else if (i == 0x80000004) { 466 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 467 | CPUBrandString[i + 32] = ((char*)CPUInfo)[i]; 468 | } 469 | } 470 | 471 | // NEW CODE 472 | g_cpuid = ATOP_cpuid(); 473 | 474 | g_bmi2 = ATOP_cpuid_bmi2(g_cpuid); 475 | g_avx2 = ATOP_cpuid_avx2(g_cpuid); 476 | 477 | snprintf(buffer, buffercount, "**CPU: %s AVX2:%d BMI2:%d f1c:0x%.8x f1d:0x%.8x f7b:0x%.8x f7c:0x%.8x", CPUBrandString, g_avx2, g_bmi2, g_cpuid.f1c, g_cpuid.f1d, g_cpuid.f7b, g_cpuid.f7c); 478 | if (g_avx2 == 0) { 479 | printf("!!!NOTE: this system does not support AVX2 or BMI2 instructions, and will not work!\n"); 480 | } 481 | 482 | } 483 | 484 | #else 485 | extern "C" { 486 | #include 487 | #include 488 | #include 489 | 490 | #include 491 | #include 492 | 493 | #ifdef RT_OS_FREEBSD 494 | #include // Use thr_self() syscall under FreeBSD to get thread id 495 | #endif // RT_OS_FREEBSD 496 | 497 | pid_t gettid(void) { 498 | #if defined(RT_OS_LINUX) 499 | return syscall(SYS_gettid); 500 | 501 | #elif defined(RT_OS_DARWIN) 502 | uint64_t thread_id; 503 | return pthread_threadid_np(NULL, &thread_id) ? 0 : (pid_t)thread_id; 504 | 505 | #elif defined(RT_OS_FREEBSD) 506 | // https://www.freebsd.org/cgi/man.cgi?query=thr_self 507 | long thread_id; 508 | return thr_self(&thread_id) ? 0 : (pid_t)thread_id; 509 | 510 | #else 511 | #error Cannot determine how to get the identifier for the current thread on this platform. 512 | #endif // defined(RT_OS_LINUX) 513 | } 514 | 515 | 516 | VOID Sleep(DWORD dwMilliseconds) { 517 | usleep(dwMilliseconds * 1000); 518 | } 519 | 520 | BOOL CloseHandle(THANDLE hObject) { 521 | return TRUE; 522 | } 523 | 524 | pid_t GetCurrentThread() { 525 | return gettid(); 526 | } 527 | 528 | uint64_t SetThreadAffinityMask(pid_t hThread, uint64_t dwThreadAffinityMask) { 529 | #if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 530 | cpu_set_t cpuset; 531 | 532 | uint64_t bitpos = 1; 533 | int count = 0; 534 | 535 | while (!(bitpos & dwThreadAffinityMask)) { 536 | bitpos <<= 1; 537 | count++; 538 | if (count > 63) { 539 | break; 540 | } 541 | } 542 | 543 | //printf("**linux setting affinity %d\n", count); 544 | 545 | if (count <= 63) { 546 | 547 | CPU_ZERO(&cpuset); 548 | CPU_SET(count, &cpuset); 549 | //dwThreadAffinityMask 550 | sched_setaffinity(GetCurrentThread(), sizeof(cpuset), &cpuset); 551 | } 552 | 553 | #else 554 | #warning No thread - affinity support implemented for this OS.This does not prevent riptide from running but overall performance may be reduced. 
555 | #endif // defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 556 | 557 | return 0; 558 | } 559 | 560 | BOOL GetProcessAffinityMask(HANDLE hProcess, uint64_t* lpProcessAffinityMask, uint64_t* lpSystemAffinityMask) { 561 | #if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 562 | cpu_set_t cpuset; 563 | sched_getaffinity(getpid(), sizeof(cpuset), &cpuset); 564 | 565 | *lpProcessAffinityMask = 0; 566 | *lpSystemAffinityMask = 0; 567 | 568 | uint64_t bitpos = 1; 569 | for (int i = 0; i < 63; i++) { 570 | if (CPU_ISSET(i, &cpuset)) { 571 | *lpProcessAffinityMask |= bitpos; 572 | *lpSystemAffinityMask |= bitpos; 573 | } 574 | bitpos <<= 1; 575 | } 576 | 577 | if (*lpProcessAffinityMask == 0) { 578 | *lpSystemAffinityMask = 0xFF; 579 | *lpSystemAffinityMask = 0xFF; 580 | } 581 | 582 | //CPU_ISSET = 0xFF; 583 | return TRUE; 584 | 585 | #else 586 | #warning No thread - affinity support implemented for this OS.This does not prevent riptide from running but overall performance may be reduced. 587 | return FALSE; 588 | 589 | #endif // defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 590 | } 591 | 592 | 593 | HANDLE GetCurrentProcess(VOID) { 594 | return NULL; 595 | } 596 | 597 | DWORD GetLastError(VOID) { 598 | return 0; 599 | } 600 | 601 | HANDLE CreateThread(VOID* lpThreadAttributes, SIZE_T dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags, LPDWORD lpThreadId) { 602 | return NULL; 603 | } 604 | 605 | HMODULE LoadLibraryW(const WCHAR* lpLibFileName) { 606 | return NULL; 607 | } 608 | 609 | FARPROC GetProcAddress(HMODULE hModule, const char* lpProcName) { 610 | return NULL; 611 | } 612 | } 613 | 614 | #include 615 | 616 | void PrintCPUInfo(char* buffer, size_t buffercount) { 617 | char CPUBrandString[0x40]; 618 | unsigned int CPUInfo[4] = { 0,0,0,0 }; 619 | 620 | __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); 621 | unsigned int nExIds = CPUInfo[0]; 622 | 623 | for (size_t i = 0; i < sizeof(CPUBrandString); i++) { 624 | CPUBrandString[i] = 0; 625 | } 626 | 627 | for (unsigned int i = 0x80000000; i <= nExIds; ++i) 628 | { 629 | __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); 630 | 631 | if (i == 0x80000002) { 632 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 633 | CPUBrandString[i] = ((char*)CPUInfo)[i]; 634 | } 635 | else if (i == 0x80000003) { 636 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 637 | CPUBrandString[i + 16] = ((char*)CPUInfo)[i]; 638 | } 639 | else if (i == 0x80000004) { 640 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 641 | CPUBrandString[i + 32] = ((char*)CPUInfo)[i]; 642 | } 643 | } 644 | //printf("**CPU: %s\n", CPUBrandString); 645 | 646 | g_cpuid = ATOP_cpuid(); 647 | 648 | g_bmi2 = ATOP_cpuid_bmi2(g_cpuid); 649 | g_avx2 = ATOP_cpuid_avx2(g_cpuid); 650 | 651 | snprintf(buffer, buffercount, "**CPU: %s AVX2:%d BMI2:%d 0x%.8x 0x%.8x 0x%.8x 0x%.8x", CPUBrandString, g_avx2, g_bmi2, g_cpuid.f1c, g_cpuid.f1d, g_cpuid.f7b, g_cpuid.f7c); 652 | if (g_avx2 == 0) { 653 | printf("!!!NOTE: this system does not support AVX2 or BMI2 instructions, and will not work!\n"); 654 | } 655 | 656 | } 657 | 658 | #endif 659 | 660 | 661 | 662 | int GetProcCount() { 663 | 664 | HANDLE proc = GetCurrentProcess(); 665 | 666 | DWORD_PTR mask1; 667 | DWORD_PTR mask2; 668 | int count; 669 | 670 | count = 0; 671 | GetProcessAffinityMask(proc, &mask1, &mask2); 672 | 673 | while (mask1 != 0) { 674 | if (mask1 & 1) count++; 675 | mask1 = mask1 >> 1; 676 | } 677 | 678 | //printf("**Process count: %d riptide_cpp build date and time: %s 
%s\n", count, __DATE__, __TIME__); 679 | 680 | if (count == 0) count = MAX_THREADS_WHEN_CANNOT_DETECT; 681 | 682 | if (count > MAX_THREADS_ALLOWED) count = MAX_THREADS_ALLOWED; 683 | 684 | return count; 685 | 686 | } 687 | -------------------------------------------------------------------------------- /src/pnumpy/sort.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | __all__ = [ 5 | 'lexsort','sort', 'argsort','argmin', 'argmax', 'searchsorted'] 6 | 7 | from pnumpy._pnumpy import getitem, lexsort32, lexsort64 8 | import numpy as np 9 | 10 | from numpy import asarray, array, asanyarray 11 | from numpy import concatenate 12 | 13 | #array_function_dispatch = functools.partial( 14 | # overrides.array_function_dispatch, module='numpy') 15 | 16 | # functions that are now methods 17 | def _wrapit(obj, method, *args, **kwds): 18 | try: 19 | wrap = obj.__array_wrap__ 20 | except AttributeError: 21 | wrap = None 22 | result = getattr(asarray(obj), method)(*args, **kwds) 23 | if wrap: 24 | if not isinstance(result, mu.ndarray): 25 | result = asarray(result) 26 | result = wrap(result) 27 | return result 28 | 29 | 30 | def _wrapfunc(obj, method, *args, **kwds): 31 | bound = getattr(obj, method, None) 32 | if bound is None: 33 | return _wrapit(obj, method, *args, **kwds) 34 | 35 | try: 36 | return bound(*args, **kwds) 37 | except TypeError: 38 | # A TypeError occurs if the object does have such a method in its 39 | # class, but its signature is not identical to that of NumPy's. This 40 | # situation has occurred in the case of a downstream library like 41 | # 'pandas'. 42 | # 43 | # Call _wrapit from within the except clause to ensure a potential 44 | # exception has a traceback chain. 45 | return _wrapit(obj, method, *args, **kwds) 46 | 47 | 48 | def sort(a, axis=-1, kind=None, order=None): 49 | """ 50 | Return a sorted copy of an array. 51 | 52 | Parameters 53 | ---------- 54 | a : array_like 55 | Array to be sorted. 56 | axis : int or None, optional 57 | Axis along which to sort. If None, the array is flattened before 58 | sorting. The default is -1, which sorts along the last axis. 59 | kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional 60 | Sorting algorithm. The default is 'quicksort'. Note that both 'stable' 61 | and 'mergesort' use timsort or radix sort under the covers and, in general, 62 | the actual implementation will vary with data type. The 'mergesort' option 63 | is retained for backwards compatibility. 64 | 65 | .. versionchanged:: 1.15.0. 66 | The 'stable' option was added. 67 | 68 | order : str or list of str, optional 69 | When `a` is an array with fields defined, this argument specifies 70 | which fields to compare first, second, etc. A single field can 71 | be specified as a string, and not all fields need be specified, 72 | but unspecified fields will still be used, in the order in which 73 | they come up in the dtype, to break ties. 74 | 75 | Returns 76 | ------- 77 | sorted_array : ndarray 78 | Array of the same type and shape as `a`. 79 | 80 | Threading 81 | --------- 82 | Up to 8 threads 83 | 84 | See Also 85 | -------- 86 | ndarray.sort : Method to sort an array in-place. 87 | argsort : Indirect sort. 88 | lexsort : Indirect stable sort on multiple keys. 89 | searchsorted : Find elements in a sorted array. 90 | partition : Partial sort. 
91 | 92 | Notes 93 | ----- 94 | The various sorting algorithms are characterized by their average speed, 95 | worst case performance, work space size, and whether they are stable. A 96 | stable sort keeps items with the same key in the same relative 97 | order. The four algorithms implemented in NumPy have the following 98 | properties: 99 | 100 | =========== ======= ============= ============ ======== 101 | kind speed worst case work space stable 102 | =========== ======= ============= ============ ======== 103 | 'quicksort' 1 O(n^2) 0 no 104 | 'heapsort' 3 O(n*log(n)) 0 no 105 | 'mergesort' 2 O(n*log(n)) ~n/2 yes 106 | 'timsort' 2 O(n*log(n)) ~n/2 yes 107 | =========== ======= ============= ============ ======== 108 | 109 | .. note:: The datatype determines which of 'mergesort' or 'timsort' 110 | is actually used, even if 'mergesort' is specified. User selection 111 | at a finer scale is not currently available. 112 | 113 | All the sort algorithms make temporary copies of the data when 114 | sorting along any but the last axis. Consequently, sorting along 115 | the last axis is faster and uses less space than sorting along 116 | any other axis. 117 | 118 | The sort order for complex numbers is lexicographic. If both the real 119 | and imaginary parts are non-nan then the order is determined by the 120 | real parts except when they are equal, in which case the order is 121 | determined by the imaginary parts. 122 | 123 | Previous to numpy 1.4.0 sorting real and complex arrays containing nan 124 | values led to undefined behaviour. In numpy versions >= 1.4.0 nan 125 | values are sorted to the end. The extended sort order is: 126 | 127 | * Real: [R, nan] 128 | * Complex: [R + Rj, R + nanj, nan + Rj, nan + nanj] 129 | 130 | where R is a non-nan real value. Complex values with the same nan 131 | placements are sorted according to the non-nan part if it exists. 132 | Non-nan values are sorted as before. 133 | 134 | .. versionadded:: 1.12.0 135 | 136 | quicksort has been changed to `introsort `_. 137 | When sorting does not make enough progress it switches to 138 | `heapsort `_. 139 | This implementation makes quicksort O(n*log(n)) in the worst case. 140 | 141 | 'stable' automatically chooses the best stable sorting algorithm 142 | for the data type being sorted. 143 | It, along with 'mergesort' is currently mapped to 144 | `timsort `_ 145 | or `radix sort `_ 146 | depending on the data type. 147 | API forward compatibility currently limits the 148 | ability to select the implementation and it is hardwired for the different 149 | data types. 150 | 151 | .. versionadded:: 1.17.0 152 | 153 | Timsort is added for better performance on already or nearly 154 | sorted data. On random data timsort is almost identical to 155 | mergesort. It is now used for stable sort while quicksort is still the 156 | default sort if none is chosen. For timsort details, refer to 157 | `CPython listsort.txt `_. 158 | 'mergesort' and 'stable' are mapped to radix sort for integer data types. Radix sort is an 159 | O(n) sort instead of O(n log n). 160 | 161 | .. versionchanged:: 1.18.0 162 | 163 | NaT now sorts to the end of arrays for consistency with NaN. 
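A short illustration of the NaN ordering described above (output formatting follows recent NumPy):

>>> np.sort(np.array([3.0, np.nan, 1.0]))  # doctest: +SKIP
array([ 1.,  3., nan])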
164 | 165 | Examples 166 | -------- 167 | >>> a = np.array([[1,4],[3,1]]) 168 | >>> np.sort(a) # sort along the last axis 169 | array([[1, 4], 170 | [1, 3]]) 171 | >>> np.sort(a, axis=None) # sort the flattened array 172 | array([1, 1, 3, 4]) 173 | >>> np.sort(a, axis=0) # sort along the first axis 174 | array([[1, 1], 175 | [3, 4]]) 176 | 177 | Use the `order` keyword to specify a field to use when sorting a 178 | structured array: 179 | 180 | >>> dtype = [('name', 'S10'), ('height', float), ('age', int)] 181 | >>> values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38), 182 | ... ('Galahad', 1.7, 38)] 183 | >>> a = np.array(values, dtype=dtype) # create a structured array 184 | >>> np.sort(a, order='height') # doctest: +SKIP 185 | array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41), 186 | ('Lancelot', 1.8999999999999999, 38)], 187 | dtype=[('name', '|S10'), ('height', '>> np.sort(a, order=['age', 'height']) # doctest: +SKIP 192 | array([('Galahad', 1.7, 38), ('Lancelot', 1.8999999999999999, 38), 193 | ('Arthur', 1.8, 41)], 194 | dtype=[('name', '|S10'), ('height', '>> surnames = ('Hertz', 'Galilei', 'Hertz') 255 | >>> first_names = ('Heinrich', 'Galileo', 'Gustav') 256 | >>> ind = np.lexsort((first_names, surnames)) 257 | >>> ind 258 | array([1, 2, 0]) 259 | 260 | >>> [surnames[i] + ", " + first_names[i] for i in ind] 261 | ['Galilei, Galileo', 'Hertz, Gustav', 'Hertz, Heinrich'] 262 | 263 | Sort two columns of numbers: 264 | 265 | >>> a = [1,5,1,4,3,4,4] # First column 266 | >>> b = [9,4,0,4,0,2,1] # Second column 267 | >>> ind = np.lexsort((b,a)) # Sort by a, then by b 268 | >>> ind 269 | array([2, 0, 4, 6, 5, 3, 1]) 270 | 271 | >>> [(a[i],b[i]) for i in ind] 272 | [(1, 0), (1, 9), (3, 0), (4, 1), (4, 2), (4, 4), (5, 4)] 273 | 274 | Note that sorting is first according to the elements of ``a``. 275 | Secondary sorting is according to the elements of ``b``. 276 | 277 | A normal ``argsort`` would have yielded: 278 | 279 | >>> [(a[i],b[i]) for i in np.argsort(a)] 280 | [(1, 9), (1, 0), (3, 0), (4, 4), (4, 2), (4, 1), (5, 4)] 281 | 282 | Structured arrays are sorted lexically by ``argsort``: 283 | 284 | >>> x = np.array([(1,9), (5,4), (1,0), (4,4), (3,0), (4,2), (4,1)], 285 | ... dtype=np.dtype([('x', int), ('y', int)])) 286 | 287 | >>> np.argsort(x) # or np.argsort(x, order=('x', 'y')) 288 | array([2, 0, 4, 6, 5, 3, 1]) 289 | """ 290 | 291 | try: 292 | return lexsort32(*args, **kwargs) 293 | except Exception: 294 | return np.lexsort(*args, **kwargs) 295 | 296 | def argsort(a, axis=-1, kind=None, order=None): 297 | """ 298 | Returns the indices that would sort an array. 299 | 300 | Perform an indirect sort along the given axis using the algorithm specified 301 | by the `kind` keyword. It returns an array of indices of the same shape as 302 | `a` that index data along the given axis in sorted order. 303 | 304 | Parameters 305 | ---------- 306 | a : array_like 307 | Array to sort. 308 | axis : int or None, optional 309 | Axis along which to sort. The default is -1 (the last axis). If None, 310 | the flattened array is used. 311 | kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional 312 | Sorting algorithm. The default is 'quicksort'. Note that both 'stable' 313 | and 'mergesort' use timsort under the covers and, in general, the 314 | actual implementation will vary with data type. The 'mergesort' option 315 | is retained for backwards compatibility. 316 | 317 | .. versionchanged:: 1.15.0. 318 | The 'stable' option was added. 
319 | order : str or list of str, optional 320 | When `a` is an array with fields defined, this argument specifies 321 | which fields to compare first, second, etc. A single field can 322 | be specified as a string, and not all fields need be specified, 323 | but unspecified fields will still be used, in the order in which 324 | they come up in the dtype, to break ties. 325 | 326 | Returns 327 | ------- 328 | index_array : ndarray, int 329 | Array of indices that sort `a` along the specified `axis`. 330 | If `a` is one-dimensional, ``a[index_array]`` yields a sorted `a`. 331 | More generally, ``np.take_along_axis(a, index_array, axis=axis)`` 332 | always yields the sorted `a`, irrespective of dimensionality. 333 | 334 | See Also 335 | -------- 336 | sort : Describes sorting algorithms used. 337 | lexsort : Indirect stable sort with multiple keys. 338 | ndarray.sort : Inplace sort. 339 | argpartition : Indirect partial sort. 340 | take_along_axis : Apply ``index_array`` from argsort 341 | to an array as if by calling sort. 342 | 343 | Notes 344 | ----- 345 | See `sort` for notes on the different sorting algorithms. 346 | 347 | As of NumPy 1.4.0 `argsort` works with real/complex arrays containing 348 | nan values. The enhanced sort order is documented in `sort`. 349 | 350 | Examples 351 | -------- 352 | One dimensional array: 353 | 354 | >>> x = np.array([3, 1, 2]) 355 | >>> np.argsort(x) 356 | array([1, 2, 0]) 357 | 358 | Two-dimensional array: 359 | 360 | >>> x = np.array([[0, 3], [2, 2]]) 361 | >>> x 362 | array([[0, 3], 363 | [2, 2]]) 364 | 365 | >>> ind = np.argsort(x, axis=0) # sorts along first axis (down) 366 | >>> ind 367 | array([[0, 1], 368 | [1, 0]]) 369 | >>> np.take_along_axis(x, ind, axis=0) # same as np.sort(x, axis=0) 370 | array([[0, 2], 371 | [2, 3]]) 372 | 373 | >>> ind = np.argsort(x, axis=1) # sorts along last axis (across) 374 | >>> ind 375 | array([[0, 1], 376 | [0, 1]]) 377 | >>> np.take_along_axis(x, ind, axis=1) # same as np.sort(x, axis=1) 378 | array([[0, 3], 379 | [2, 2]]) 380 | 381 | Indices of the sorted elements of a N-dimensional array: 382 | 383 | >>> ind = np.unravel_index(np.argsort(x, axis=None), x.shape) 384 | >>> ind 385 | (array([0, 1, 1, 0]), array([0, 0, 1, 1])) 386 | >>> x[ind] # same as np.sort(x, axis=None) 387 | array([0, 2, 2, 3]) 388 | 389 | Sorting with keys: 390 | 391 | >>> x = np.array([(1, 0), (0, 1)], dtype=[('x', '>> x 393 | array([(1, 0), (0, 1)], 394 | dtype=[('x', '>> np.argsort(x, order=('x','y')) 397 | array([1, 0]) 398 | 399 | >>> np.argsort(x, order=('y','x')) 400 | array([0, 1]) 401 | 402 | """ 403 | return _wrapfunc(a, 'argsort', axis=axis, kind=kind, order=order) 404 | 405 | 406 | def _argmax_dispatcher(a, axis=None, out=None): 407 | return (a, out) 408 | 409 | 410 | def argmax(a, axis=None, out=None): 411 | """ 412 | Returns the indices of the maximum values along an axis. 413 | 414 | Parameters 415 | ---------- 416 | a : array_like 417 | Input array. 418 | axis : int, optional 419 | By default, the index is into the flattened array, otherwise 420 | along the specified axis. 421 | out : array, optional 422 | If provided, the result will be inserted into this array. It should 423 | be of the appropriate shape and dtype. 424 | 425 | Returns 426 | ------- 427 | index_array : ndarray of ints 428 | Array of indices into the array. It has the same shape as `a.shape` 429 | with the dimension along `axis` removed. 430 | 431 | See Also 432 | -------- 433 | ndarray.argmax, argmin 434 | amax : The maximum value along a given axis. 
435 | unravel_index : Convert a flat index into an index tuple. 436 | take_along_axis : Apply ``np.expand_dims(index_array, axis)`` 437 | from argmax to an array as if by calling max. 438 | 439 | Notes 440 | ----- 441 | In case of multiple occurrences of the maximum values, the indices 442 | corresponding to the first occurrence are returned. 443 | 444 | Examples 445 | -------- 446 | >>> a = np.arange(6).reshape(2,3) + 10 447 | >>> a 448 | array([[10, 11, 12], 449 | [13, 14, 15]]) 450 | >>> np.argmax(a) 451 | 5 452 | >>> np.argmax(a, axis=0) 453 | array([1, 1, 1]) 454 | >>> np.argmax(a, axis=1) 455 | array([2, 2]) 456 | 457 | Indexes of the maximal elements of a N-dimensional array: 458 | 459 | >>> ind = np.unravel_index(np.argmax(a, axis=None), a.shape) 460 | >>> ind 461 | (1, 2) 462 | >>> a[ind] 463 | 15 464 | 465 | >>> b = np.arange(6) 466 | >>> b[1] = 5 467 | >>> b 468 | array([0, 5, 2, 3, 4, 5]) 469 | >>> np.argmax(b) # Only the first occurrence is returned. 470 | 1 471 | 472 | >>> x = np.array([[4,2,3], [1,0,3]]) 473 | >>> index_array = np.argmax(x, axis=-1) 474 | >>> # Same as np.max(x, axis=-1, keepdims=True) 475 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1) 476 | array([[4], 477 | [3]]) 478 | >>> # Same as np.max(x, axis=-1) 479 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1).squeeze(axis=-1) 480 | array([4, 3]) 481 | 482 | """ 483 | return _wrapfunc(a, 'argmax', axis=axis, out=out) 484 | 485 | 486 | def _argmin_dispatcher(a, axis=None, out=None): 487 | return (a, out) 488 | 489 | 490 | def argmin(a, axis=None, out=None): 491 | """ 492 | Returns the indices of the minimum values along an axis. 493 | 494 | Parameters 495 | ---------- 496 | a : array_like 497 | Input array. 498 | axis : int, optional 499 | By default, the index is into the flattened array, otherwise 500 | along the specified axis. 501 | out : array, optional 502 | If provided, the result will be inserted into this array. It should 503 | be of the appropriate shape and dtype. 504 | 505 | Returns 506 | ------- 507 | index_array : ndarray of ints 508 | Array of indices into the array. It has the same shape as `a.shape` 509 | with the dimension along `axis` removed. 510 | 511 | See Also 512 | -------- 513 | ndarray.argmin, argmax 514 | amin : The minimum value along a given axis. 515 | unravel_index : Convert a flat index into an index tuple. 516 | take_along_axis : Apply ``np.expand_dims(index_array, axis)`` 517 | from argmin to an array as if by calling min. 518 | 519 | Notes 520 | ----- 521 | In case of multiple occurrences of the minimum values, the indices 522 | corresponding to the first occurrence are returned. 523 | 524 | Examples 525 | -------- 526 | >>> a = np.arange(6).reshape(2,3) + 10 527 | >>> a 528 | array([[10, 11, 12], 529 | [13, 14, 15]]) 530 | >>> np.argmin(a) 531 | 0 532 | >>> np.argmin(a, axis=0) 533 | array([0, 0, 0]) 534 | >>> np.argmin(a, axis=1) 535 | array([0, 0]) 536 | 537 | Indices of the minimum elements of a N-dimensional array: 538 | 539 | >>> ind = np.unravel_index(np.argmin(a, axis=None), a.shape) 540 | >>> ind 541 | (0, 0) 542 | >>> a[ind] 543 | 10 544 | 545 | >>> b = np.arange(6) + 10 546 | >>> b[4] = 10 547 | >>> b 548 | array([10, 11, 12, 13, 10, 15]) 549 | >>> np.argmin(b) # Only the first occurrence is returned. 
550 | 0 551 | 552 | >>> x = np.array([[4,2,3], [1,0,3]]) 553 | >>> index_array = np.argmin(x, axis=-1) 554 | >>> # Same as np.min(x, axis=-1, keepdims=True) 555 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1) 556 | array([[2], 557 | [0]]) 558 | >>> # Same as np.min(x, axis=-1) 559 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1).squeeze(axis=-1) 560 | array([2, 0]) 561 | 562 | """ 563 | return _wrapfunc(a, 'argmin', axis=axis, out=out) 564 | 565 | 566 | def _searchsorted_dispatcher(a, v, side=None, sorter=None): 567 | return (a, v, sorter) 568 | 569 | 570 | def searchsorted(a, v, side='left', sorter=None): 571 | """ 572 | Find indices where elements should be inserted to maintain order. 573 | 574 | Find the indices into a sorted array `a` such that, if the 575 | corresponding elements in `v` were inserted before the indices, the 576 | order of `a` would be preserved. 577 | 578 | Assuming that `a` is sorted: 579 | 580 | ====== ============================ 581 | `side` returned index `i` satisfies 582 | ====== ============================ 583 | left ``a[i-1] < v <= a[i]`` 584 | right ``a[i-1] <= v < a[i]`` 585 | ====== ============================ 586 | 587 | Parameters 588 | ---------- 589 | a : 1-D array_like 590 | Input array. If `sorter` is None, then it must be sorted in 591 | ascending order, otherwise `sorter` must be an array of indices 592 | that sort it. 593 | v : array_like 594 | Values to insert into `a`. 595 | side : {'left', 'right'}, optional 596 | If 'left', the index of the first suitable location found is given. 597 | If 'right', return the last such index. If there is no suitable 598 | index, return either 0 or N (where N is the length of `a`). 599 | sorter : 1-D array_like, optional 600 | Optional array of integer indices that sort array a into ascending 601 | order. They are typically the result of argsort. 602 | 603 | .. versionadded:: 1.7.0 604 | 605 | Returns 606 | ------- 607 | indices : array of ints 608 | Array of insertion points with the same shape as `v`. 609 | 610 | See Also 611 | -------- 612 | sort : Return a sorted copy of an array. 613 | histogram : Produce histogram from 1-D data. 614 | 615 | Notes 616 | ----- 617 | Binary search is used to find the required insertion points. 618 | 619 | As of NumPy 1.4.0 `searchsorted` works with real/complex arrays containing 620 | `nan` values. The enhanced sort order is documented in `sort`. 621 | 622 | This function uses the same algorithm as the built-in Python `bisect.bisect_left` 623 | (``side='left'``) and `bisect.bisect_right` (``side='right'``) functions, 624 | which is also vectorized in the `v` argument.
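The `sorter` argument described above lets an unsorted array be searched through its ``argsort`` permutation; a small illustration (the returned index refers to positions in the sorted order of `a`):

>>> a = np.array([40, 10, 30, 20])
>>> np.searchsorted(a, 25, sorter=np.argsort(a))
2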
625 | 626 | Examples 627 | -------- 628 | >>> np.searchsorted([1,2,3,4,5], 3) 629 | 2 630 | >>> np.searchsorted([1,2,3,4,5], 3, side='right') 631 | 3 632 | >>> np.searchsorted([1,2,3,4,5], [-10, 10, 2, 3]) 633 | array([0, 5, 1, 2]) 634 | 635 | """ 636 | return _wrapfunc(a, 'searchsorted', v, side=side, sorter=sorter) 637 | 638 | -------------------------------------------------------------------------------- /src/atop/ops_log.cpp: -------------------------------------------------------------------------------- 1 | #include "common_inc.h" 2 | #include 3 | #include "invalids.h" 4 | 5 | #if defined(__clang__) 6 | #pragma clang diagnostic ignored "-Wmissing-braces" 7 | #pragma clang diagnostic ignored "-Wunused-function" 8 | #pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) 9 | #endif 10 | 11 | #if defined(__GNUC__) 12 | //#pragma GCC target "arch=core-avx2,tune=core-avx2" 13 | #if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || !defined(__GNUC__) 14 | /* GCC >= 4.4 or clang or non-GCC compilers */ 15 | #include 16 | #elif __GNUC_PREREQ(4, 1) 17 | /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */ 18 | #include 19 | #endif 20 | #endif 21 | 22 | 23 | //#define LOGGING printf 24 | #define LOGGING(...) 25 | 26 | #if !RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED 27 | // MSVC compiler by default assumed unaligned loads 28 | #define LOADU(X) *(X) 29 | 30 | #else 31 | static const inline __m256d LOADU(__m256d* x) { return _mm256_loadu_pd((double const*)x); }; 32 | static const inline __m256 LOADU(__m256* x) { return _mm256_loadu_ps((float const*)x); }; 33 | static const inline __m256i LOADU(__m256i* x) { return _mm256_loadu_si256((__m256i const*)x); }; 34 | #endif 35 | 36 | static const inline __m256i LOADUI(__m256i* x) { return _mm256_loadu_si256((__m256i const*)x); }; 37 | 38 | static const inline void STOREU(__m256d* x, __m256d y) { _mm256_storeu_pd((double*)x, y); } 39 | static const inline void STOREU(__m256* x, __m256 y) { _mm256_storeu_ps((float*)x, y); } 40 | static const inline void STOREU(__m256i* x, __m256i y) { _mm256_storeu_si256((__m256i*)x, y); } 41 | 42 | // For aligned loads which must be on 32 byte boundary 43 | static const inline __m256d LOADA(__m256d* x) { return _mm256_load_pd((double const*)x); }; 44 | static const inline __m256 LOADA(__m256* x) { return _mm256_load_ps((float const*)x); }; 45 | static const inline __m256i LOADA(__m256i* x) { return _mm256_load_si256((__m256i const*)x); }; 46 | 47 | // Aligned stores 48 | static const inline void STOREA(__m256d* x, __m256d y) { _mm256_store_pd((double*)x, y); } 49 | static const inline void STOREA(__m256* x, __m256 y) { _mm256_store_ps((float*)x, y); } 50 | static const inline void STOREA(__m256i* x, __m256i y) { _mm256_store_si256((__m256i*)x, y); } 51 | 52 | 53 | template static const inline long double LOG_OP(long double x) { return logl(x); } 54 | template static const inline double LOG_OP(double x) { return log(x); } 55 | template static const inline float LOG_OP(float x) { return logf(x); } 56 | 57 | 58 | #if defined(RT_COMPILER_MSVC) 59 | 60 | template static const inline __m256 LOG_OP_256(__m256 x) { return _mm256_log_ps(x); } 61 | template static const inline __m256d LOG_OP_256(__m256d x) { return _mm256_log_pd(x); } 62 | #include 63 | #endif 64 | 65 | #if defined(__GNUC__) 66 | // May require -lm for linker 67 | 68 | extern "C" { 69 | __m256d _ZGVdN4v_log(__m256d x); 70 | __m256 _ZGVdN8v_logf(__m256 x); 71 | } 72 | 73 | template static const 
inline __m256 LOG_OP_256(__m256 x) { return _ZGVdN8v_logf(x); } 74 | template static const inline __m256d LOG_OP_256(__m256d x) { return _ZGVdN4v_log(x); } 75 | 76 | #endif 77 | 78 | 79 | # include 80 | 81 | void npy_set_floatstatus_divbyzero(void) 82 | { 83 | feraiseexcept(FE_DIVBYZERO); 84 | } 85 | 86 | void npy_set_floatstatus_overflow(void) 87 | { 88 | feraiseexcept(FE_OVERFLOW); 89 | } 90 | 91 | void npy_set_floatstatus_underflow(void) 92 | { 93 | feraiseexcept(FE_UNDERFLOW); 94 | } 95 | 96 | void npy_set_floatstatus_invalid(void) 97 | { 98 | feraiseexcept(FE_INVALID); 99 | } 100 | 101 | //------------------------------------------------------------------- 102 | //template 103 | //static void UnaryOpSlow_LOG(void* pDataIn1, void* pDataOut, int64_t len, int64_t strideIn, int64_t strideOut) { 104 | // return UnaryOpSlow(LOG_OP, pDataIn1, pDataOut, len, strideIn, strideOut); 105 | //} 106 | 107 | 108 | /* 109 | * NAN and INFINITY like macros (same behavior as glibc for NAN, same as C99 110 | * for INFINITY) 111 | * 112 | * XXX: I should test whether INFINITY and NAN are available on the platform 113 | */ 114 | static const inline float __npy_inff(void) 115 | { 116 | const union { uint32_t __i; float __f; } __bint = { 0x7f800000UL }; 117 | return __bint.__f; 118 | } 119 | 120 | static const inline float __npy_nanf(void) 121 | { 122 | const union { uint32_t __i; float __f; } __bint = { 0x7fc00000UL }; 123 | return __bint.__f; 124 | } 125 | 126 | static const inline float __npy_pzerof(void) 127 | { 128 | const union { uint32_t __i; float __f; } __bint = { 0x00000000UL }; 129 | return __bint.__f; 130 | } 131 | 132 | static const inline float __npy_nzerof(void) 133 | { 134 | const union { uint32_t __i; float __f; } __bint = { 0x80000000UL }; 135 | return __bint.__f; 136 | } 137 | 138 | #define NPY_INFINITYF __npy_inff() 139 | #define NPY_NANF __npy_nanf() 140 | #define NPY_PZEROF __npy_pzerof() 141 | #define NPY_NZEROF __npy_nzerof() 142 | 143 | #define NPY_INFINITY ((npy_double)NPY_INFINITYF) 144 | #define NPY_NAN ((npy_double)NPY_NANF) 145 | #define NPY_PZERO ((npy_double)NPY_PZEROF) 146 | #define NPY_NZERO ((npy_double)NPY_NZEROF) 147 | 148 | #define NPY_INFINITYL ((npy_longdouble)NPY_INFINITYF) 149 | #define NPY_NANL ((npy_longdouble)NPY_NANF) 150 | #define NPY_PZEROL ((npy_longdouble)NPY_PZEROF) 151 | #define NPY_NZEROL ((npy_longdouble)NPY_NZEROF) 152 | 153 | /* 154 | * Useful constants 155 | */ 156 | #define NPY_E 2.718281828459045235360287471352662498 /* e */ 157 | #define NPY_LOG2E 1.442695040888963407359924681001892137 /* log_2 e */ 158 | #define NPY_LOG10E 0.434294481903251827651128918916605082 /* log_10 e */ 159 | #define NPY_LOGE2 0.693147180559945309417232121458176568 /* log_e 2 */ 160 | #define NPY_LOGE10 2.302585092994045684017991454684364208 /* log_e 10 */ 161 | #define NPY_PI 3.141592653589793238462643383279502884 /* pi */ 162 | #define NPY_PI_2 1.570796326794896619231321691639751442 /* pi/2 */ 163 | #define NPY_PI_4 0.785398163397448309615660845819875721 /* pi/4 */ 164 | #define NPY_1_PI 0.318309886183790671537767526745028724 /* 1/pi */ 165 | #define NPY_2_PI 0.636619772367581343075535053490057448 /* 2/pi */ 166 | #define NPY_EULER 0.577215664901532860606512090082402431 /* Euler constant */ 167 | #define NPY_SQRT2 1.414213562373095048801688724209698079 /* sqrt(2) */ 168 | #define NPY_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */ 169 | 170 | #define NPY_Ef 2.718281828459045235360287471352662498F /* e */ 171 | #define NPY_LOG2Ef 
1.442695040888963407359924681001892137F /* log_2 e */ 172 | #define NPY_LOG10Ef 0.434294481903251827651128918916605082F /* log_10 e */ 173 | #define NPY_LOGE2f 0.693147180559945309417232121458176568F /* log_e 2 */ 174 | #define NPY_LOGE10f 2.302585092994045684017991454684364208F /* log_e 10 */ 175 | #define NPY_PIf 3.141592653589793238462643383279502884F /* pi */ 176 | #define NPY_PI_2f 1.570796326794896619231321691639751442F /* pi/2 */ 177 | #define NPY_PI_4f 0.785398163397448309615660845819875721F /* pi/4 */ 178 | #define NPY_1_PIf 0.318309886183790671537767526745028724F /* 1/pi */ 179 | #define NPY_2_PIf 0.636619772367581343075535053490057448F /* 2/pi */ 180 | #define NPY_EULERf 0.577215664901532860606512090082402431F /* Euler constant */ 181 | #define NPY_SQRT2f 1.414213562373095048801688724209698079F /* sqrt(2) */ 182 | #define NPY_SQRT1_2f 0.707106781186547524400844362104849039F /* 1/sqrt(2) */ 183 | 184 | #define NPY_El 2.718281828459045235360287471352662498L /* e */ 185 | #define NPY_LOG2El 1.442695040888963407359924681001892137L /* log_2 e */ 186 | #define NPY_LOG10El 0.434294481903251827651128918916605082L /* log_10 e */ 187 | #define NPY_LOGE2l 0.693147180559945309417232121458176568L /* log_e 2 */ 188 | #define NPY_LOGE10l 2.302585092994045684017991454684364208L /* log_e 10 */ 189 | #define NPY_PIl 3.141592653589793238462643383279502884L /* pi */ 190 | #define NPY_PI_2l 1.570796326794896619231321691639751442L /* pi/2 */ 191 | #define NPY_PI_4l 0.785398163397448309615660845819875721L /* pi/4 */ 192 | #define NPY_1_PIl 0.318309886183790671537767526745028724L /* 1/pi */ 193 | #define NPY_2_PIl 0.636619772367581343075535053490057448L /* 2/pi */ 194 | #define NPY_EULERl 0.577215664901532860606512090082402431L /* Euler constant */ 195 | #define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */ 196 | #define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */ 197 | 198 | /* 199 | * Constants used in vector implementation of exp(x) 200 | */ 201 | #define NPY_RINT_CVT_MAGICf 0x1.800000p+23f 202 | #define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f 203 | #define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f 204 | #define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f 205 | #define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f 206 | #define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f 207 | #define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f 208 | #define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f 209 | #define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f 210 | #define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f 211 | #define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f 212 | #define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f 213 | 214 | /* 215 | * Constants used in vector implementation of log(x) 216 | */ 217 | #define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f 218 | #define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f 219 | #define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f 220 | #define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f 221 | #define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f 222 | #define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f 223 | #define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f 224 | #define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f 225 | #define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f 226 | #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f 227 | #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f 228 | #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f 229 | /* 230 | 
* Constants used in vector implementation of sinf/cosf(x) 231 | */ 232 | #define NPY_TWO_O_PIf 0x1.45f306p-1f 233 | #define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f 234 | #define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f 235 | #define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f 236 | #define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f 237 | #define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f 238 | #define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f 239 | #define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f 240 | #define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f 241 | #define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f 242 | #define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f 243 | #define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f 244 | #define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f 245 | 246 | #define FLT_MIN 1.175494351e-38F // min normalized positive value 247 | 248 | static const inline int mask_to_int(__m256 _x_) { return _mm256_movemask_ps(_x_); }; 249 | 250 | static const inline __m256 set1_ps(float _x_) { return _mm256_set1_ps(_x_); }; 251 | 252 | //static const inline __mmask16 253 | //get_full_load_mask_ps(void) 254 | //{ 255 | // return 0xFFFF; 256 | //} 257 | // 258 | //static const inline __mmask8 259 | //get_full_load_mask_pd(void) 260 | //{ 261 | // return 0xFF; 262 | //} 263 | // 264 | //static const inline __mmask16 265 | //get_partial_load_mask_ps(const int num_elem, const int total_elem) 266 | //{ 267 | // return (0x0001 << num_elem) - 0x0001; 268 | //} 269 | // 270 | //static const inline __mmask8 271 | //get_partial_load_mask_pd(const int num_elem, const int total_elem) 272 | //{ 273 | // return (0x01 << num_elem) - 0x01; 274 | //} 275 | 276 | 277 | static const inline __m256 278 | get_full_load_mask_ps(void) 279 | { 280 | return _mm256_set1_ps(-1.0); 281 | } 282 | 283 | static const inline __m256i 284 | get_full_load_mask_pd(void) 285 | { 286 | return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); 287 | } 288 | 289 | static const inline __m256 290 | get_partial_load_mask_ps(const int num_elem, const int num_lanes) 291 | { 292 | float maskint[16] = { -1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, 293 | 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 }; 294 | float* addr = maskint + num_lanes - num_elem; 295 | return _mm256_loadu_ps(addr); 296 | } 297 | 298 | static const inline __m256i 299 | get_partial_load_mask_pd(const int num_elem, const int num_lanes) 300 | { 301 | int maskint[16] = { -1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1 }; 302 | int* addr = maskint + 2 * num_lanes - 2 * num_elem; 303 | return _mm256_loadu_si256((__m256i*) addr); 304 | } 305 | 306 | static const inline __m256 307 | masked_gather_ps(__m256 src, 308 | float* addr, 309 | __m256i vindex, 310 | __m256 mask) 311 | { 312 | return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); 313 | } 314 | 315 | static const inline __m256d 316 | masked_gather_pd(__m256d src, 317 | double* addr, 318 | __m128i vindex, 319 | __m256d mask) 320 | { 321 | return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); 322 | } 323 | 324 | static const inline __m256 325 | masked_load_ps(__m256 mask, float* addr) 326 | { 327 | return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); 328 | } 329 | 330 | static const inline __m256d 331 | masked_load_pd(__m256i mask, double* addr) 332 | { 333 | return _mm256_maskload_pd(addr, mask); 334 | } 335 | 336 | static const inline __m256 337 | fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) 338 | { 339 | return _mm256_blendv_ps(x, val, mask); 340 | } 341 | 342 | static const inline __m256d 343 | 
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) 344 | { 345 | return _mm256_blendv_pd(x, val, mask); 346 | } 347 | 348 | static const inline __m256 349 | or_masks(__m256 x, __m256 y) { return _mm256_or_ps(x, y); } 350 | 351 | static const inline __m256 352 | and_masks(__m256 x, __m256 y) { return _mm256_and_ps(x, y); } 353 | 354 | static const inline __m256 355 | xor_masks(__m256 x, __m256 y) { return _mm256_xor_ps(x, y); } 356 | 357 | static const inline __m256 358 | set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) 359 | { 360 | return _mm256_blendv_ps(x, val, mask); 361 | } 362 | 363 | static const inline __m256d 364 | set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) 365 | { 366 | return _mm256_blendv_pd(x, val, mask); 367 | } 368 | 369 | static const inline __m256 370 | blend(__m256 x, __m256 y, __m256 ymask) 371 | { 372 | return _mm256_blendv_ps(x, y, ymask); 373 | } 374 | 375 | template 376 | static inline __m256 377 | cmp_ps(__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, COMP_OP); }; 378 | 379 | static const inline __m256 380 | add_ps(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }; 381 | 382 | static const inline __m256 383 | sub_ps(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }; 384 | 385 | static const inline __m256 386 | div_ps(__m256 x, __m256 y) { return _mm256_div_ps(x, y); }; 387 | 388 | static const inline void 389 | maskstore_ps(float * x, __m256i y, __m256 z) { 390 | return _mm256_maskstore_ps(x, y, z); 391 | }; 392 | 393 | static const inline __m256i 394 | cvtps_epi32(__m256 x) { return _mm256_cvtps_epi32(x); }; 395 | 396 | //static const inline __m256 397 | //fma_add_ps(__m256 x, __m256 y, __m256 z) { return _mm256_fmadd_ps(x, y, z); }; 398 | 399 | static const inline __m256 400 | fma_add_ps(__m256 x, __m256 y, __m256 z) { return _mm256_add_ps(z, _mm256_mul_ps(x, y)); }; 401 | 402 | static const inline __m256 403 | get_exponent(__m256 x) 404 | { 405 | /* 406 | * Special handling of denormals: 407 | * 1) Multiply denormal elements with 2**100 (0x71800000) 408 | * 2) Get the 8 bits of unbiased exponent 409 | * 3) Subtract 100 from exponent of denormals 410 | */ 411 | 412 | __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); 413 | __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); 414 | __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); 415 | 416 | __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); 417 | __m256 temp = _mm256_mul_ps(temp1, two_power_100); 418 | x = _mm256_blendv_ps(x, temp, denormal_mask); 419 | 420 | __m256 exp = _mm256_cvtepi32_ps( 421 | _mm256_sub_epi32( 422 | _mm256_srli_epi32( 423 | _mm256_castps_si256(x), 23), _mm256_set1_epi32(0x7E))); 424 | 425 | __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f)); 426 | return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); 427 | } 428 | 429 | static const inline __m256 430 | get_mantissa(__m256 x) 431 | { 432 | /* 433 | * Special handling of denormals: 434 | * 1) Multiply denormal elements with 2**100 (0x71800000) 435 | * 2) Get the 23 bits of mantissa 436 | * 3) Mantissa for denormals is not affected by the multiplication 437 | */ 438 | 439 | __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); 440 | __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); 441 | __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); 442 | 443 | __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); 444 | 
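/* temp1 has the normal lanes zeroed out, so the multiply below scales only the denormal lanes by 2**100; the final blend keeps the original value in the normal lanes. */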
__m256 temp = _mm256_mul_ps(temp1, two_power_100); 445 | x = _mm256_blendv_ps(x, temp, denormal_mask); 446 | 447 | __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff); 448 | __m256i exp_126_bits = _mm256_set1_epi32(126 << 23); 449 | return _mm256_castsi256_ps( 450 | _mm256_or_si256( 451 | _mm256_and_si256( 452 | _mm256_castps_si256(x), mantissa_bits), exp_126_bits)); 453 | } 454 | 455 | 456 | 457 | 458 | /* 459 | * Vectorized implementation of log using AVX2 and AVX512 460 | * 1) if x < 0.0f; return -NAN (invalid input) 461 | * 2) Range reduction: y = x/2^k; 462 | * a) y = normalized mantissa, k is the exponent (0.5 <= y < 1) 463 | * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q 464 | * b) P = 5th order and Q = 5th order polynomials obtained from Remez's 465 | * algorithm (mini-max polynomial approximation) 466 | * 5) Compute log(x) = log(y) + k*ln(2) 467 | * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945) 468 | * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same 469 | * x = 0x3f486945) 470 | */ 471 | template 472 | static void 473 | log_FLOAT(void* pDataIn, 474 | void* pDataOut, 475 | const int64_t array_size, 476 | const int64_t steps, 477 | const int64_t stepsOut) 478 | { 479 | const int64_t stride = steps / (int64_t)sizeof(float); 480 | const int32_t num_lanes = sizeof(VTYPE) / (int64_t)sizeof(float); 481 | 482 | float* op = (float*)pDataOut; 483 | float* ip = (float*)pDataIn; 484 | 485 | /* 486 | * Note: while generally indices are int64_t, we ensure that our maximum index 487 | * will fit in an int32 as a precondition for this function via 488 | * IS_OUTPUT_BLOCKABLE_UNARY 489 | */ 490 | int32_t indexarr[16]; 491 | for (int32_t ii = 0; ii < 16; ii++) { 492 | indexarr[ii] = ii * (int32_t)stride; 493 | } 494 | 495 | /* Load up frequently used constants */ 496 | VTYPE log_p0 = set1_ps(NPY_COEFF_P0_LOGf); 497 | VTYPE log_p1 = set1_ps(NPY_COEFF_P1_LOGf); 498 | VTYPE log_p2 = set1_ps(NPY_COEFF_P2_LOGf); 499 | VTYPE log_p3 = set1_ps(NPY_COEFF_P3_LOGf); 500 | VTYPE log_p4 = set1_ps(NPY_COEFF_P4_LOGf); 501 | VTYPE log_p5 = set1_ps(NPY_COEFF_P5_LOGf); 502 | VTYPE log_q0 = set1_ps(NPY_COEFF_Q0_LOGf); 503 | VTYPE log_q1 = set1_ps(NPY_COEFF_Q1_LOGf); 504 | VTYPE log_q2 = set1_ps(NPY_COEFF_Q2_LOGf); 505 | VTYPE log_q3 = set1_ps(NPY_COEFF_Q3_LOGf); 506 | VTYPE log_q4 = set1_ps(NPY_COEFF_Q4_LOGf); 507 | VTYPE log_q5 = set1_ps(NPY_COEFF_Q5_LOGf); 508 | VTYPE loge2 = set1_ps(NPY_LOGE2f); 509 | VTYPE nan = set1_ps(NPY_NANF); 510 | VTYPE neg_nan = set1_ps(-NPY_NANF); 511 | VTYPE neg_inf = set1_ps(-NPY_INFINITYF); 512 | VTYPE inf = set1_ps(NPY_INFINITYF); 513 | VTYPE zeros_f = set1_ps(0.0f); 514 | VTYPE ones_f = set1_ps(1.0f); 515 | VTYPEi vindex = LOADUI((VTYPEi*)indexarr); 516 | VTYPE poly, num_poly, denom_poly, exponent; 517 | 518 | VTYPE inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; 519 | VTYPE invalid_mask = get_partial_load_mask_ps(0, num_lanes); 520 | VTYPE divide_by_zero_mask = invalid_mask; 521 | VTYPE load_mask = get_full_load_mask_ps(); 522 | 523 | int64_t num_remaining_elements = array_size; 524 | 525 | while (num_remaining_elements > 0) { 526 | 527 | if (num_remaining_elements < num_lanes) { 528 | load_mask = get_partial_load_mask_ps((int)num_remaining_elements, 529 | num_lanes); 530 | } 531 | 532 | VTYPE x_in; 533 | if (stride == 1) { 534 | x_in = masked_load_ps(load_mask, ip); 535 | } 536 | else { 537 | x_in = masked_gather_ps(zeros_f, ip, vindex, load_mask); 538 | } 539 | 540 | negx_mask = cmp_ps< _CMP_LT_OQ>(x_in, 
zeros_f); 541 | zero_mask = cmp_ps< _CMP_EQ_OQ>(x_in, zeros_f); 542 | inf_mask = cmp_ps< _CMP_EQ_OQ>(x_in, inf); 543 | nan_mask = cmp_ps< _CMP_NEQ_UQ>(x_in, x_in); 544 | 545 | divide_by_zero_mask = or_masks(divide_by_zero_mask, 546 | and_masks(zero_mask, load_mask)); 547 | invalid_mask = or_masks(invalid_mask, negx_mask); 548 | 549 | VTYPE x = set_masked_lanes_ps(x_in, zeros_f, negx_mask); 550 | 551 | /* set x = normalized mantissa */ 552 | exponent = get_exponent(x); 553 | x = get_mantissa(x); 554 | 555 | /* if x < sqrt(2) {exp = exp-1; x = 2*x} */ 556 | sqrt2_mask = cmp_ps< _CMP_LE_OQ>(x, set1_ps(NPY_SQRT1_2f)); 557 | 558 | x = blend(x, add_ps(x, x), sqrt2_mask); 559 | exponent = blend(exponent, sub_ps(exponent, ones_f), sqrt2_mask); 560 | 561 | /* x = x - 1 */ 562 | x = sub_ps(x, ones_f); 563 | 564 | /* Polynomial approximation for log(1+x) */ 565 | num_poly = fma_add_ps(log_p5, x, log_p4); 566 | num_poly = fma_add_ps(num_poly, x, log_p3); 567 | num_poly = fma_add_ps(num_poly, x, log_p2); 568 | num_poly = fma_add_ps(num_poly, x, log_p1); 569 | num_poly = fma_add_ps(num_poly, x, log_p0); 570 | denom_poly = fma_add_ps(log_q5, x, log_q4); 571 | denom_poly = fma_add_ps(denom_poly, x, log_q3); 572 | denom_poly = fma_add_ps(denom_poly, x, log_q2); 573 | denom_poly = fma_add_ps(denom_poly, x, log_q1); 574 | denom_poly = fma_add_ps(denom_poly, x, log_q0); 575 | poly = div_ps(num_poly, denom_poly); 576 | poly = fma_add_ps(exponent, loge2, poly); 577 | 578 | /* 579 | * x < 0.0f; return -NAN 580 | * x = +/- NAN; return NAN 581 | * x = 0.0f; return -INF 582 | */ 583 | poly = set_masked_lanes_ps(poly, nan, nan_mask); 584 | poly = set_masked_lanes_ps(poly, neg_nan, negx_mask); 585 | poly = set_masked_lanes_ps(poly, neg_inf, zero_mask); 586 | poly = set_masked_lanes_ps(poly, inf, inf_mask); 587 | 588 | maskstore_ps(op, cvtps_epi32(load_mask), poly); 589 | 590 | ip += num_lanes * stride; 591 | op += num_lanes; 592 | num_remaining_elements -= num_lanes; 593 | } 594 | 595 | if (mask_to_int(invalid_mask)) { 596 | npy_set_floatstatus_invalid(); 597 | } 598 | if (mask_to_int(divide_by_zero_mask)) { 599 | npy_set_floatstatus_divbyzero(); 600 | } 601 | } 602 | 603 | 604 | extern "C" 605 | UNARY_FUNC GetLogOpFast(int func, int atopInType1, int* wantedOutType) { 606 | 607 | switch (func) { 608 | case TRIG_OPERATION::LOG: 609 | *wantedOutType = ATOP_DOUBLE; 610 | if (atopInType1 == ATOP_FLOAT) { 611 | *wantedOutType = ATOP_FLOAT; 612 | } 613 | switch (atopInType1) { 614 | case ATOP_FLOAT: return log_FLOAT<__m256, __m256i>; 615 | } 616 | } 617 | return NULL; 618 | } 619 | 620 | #if defined(__clang__) 621 | #pragma clang attribute pop 622 | #endif 623 | --------------------------------------------------------------------------------
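// Usage sketch for the GetLogOpFast dispatcher above (assumptions, not confirmed by the
// source shown here: UNARY_FUNC, TRIG_OPERATION and ATOP_FLOAT come from atop.h, and
// UNARY_FUNC matches log_FLOAT's (in, out, length, strideIn, strideOut) shape with the
// strides given in bytes, as implied by the "steps / sizeof(float)" computation in log_FLOAT):
//
//     int wantedOutType = -1;
//     UNARY_FUNC fn = GetLogOpFast(TRIG_OPERATION::LOG, ATOP_FLOAT, &wantedOutType);
//     if (fn) {
//         // contiguous float32 input and output: stride is sizeof(float) bytes
//         fn(input, output, length, sizeof(float), sizeof(float));
//     } else {
//         // no fast path for this dtype; fall back to a scalar log loop
//     }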