├── benchmarks ├── __init__.py └── README.rst ├── src ├── pnumpy │ ├── tests │ │ └── __init__.py │ ├── recycler.cpp │ ├── recarray.py │ ├── __init__.py │ ├── arange.cpp │ ├── conversions.cpp │ ├── cpu.py │ ├── common.h │ ├── benchmark.py │ ├── module_init.cpp │ ├── ledger.cpp │ └── sort.py ├── nte │ └── x64 │ │ └── Release │ │ └── nte.vcxproj.FileListAbsolute.txt └── atop │ ├── readme.txt │ ├── invalids.h │ ├── atop.h │ ├── halffloat.h │ ├── atop.cpp │ ├── fill.cpp │ ├── recarray.cpp │ ├── common_inc.h │ ├── threads.cpp │ └── ops_log.cpp ├── doc_src ├── doc_requirements.txt ├── images │ ├── bench4graph.PNG │ ├── bench4graph2.PNG │ ├── bench4graph3.PNG │ └── threading_npadd.PNG ├── source │ ├── installation.rst │ ├── benchmarking_asv.rst │ ├── use.rst │ ├── roadmap.rst │ ├── conf.py │ └── index.rst ├── Makefile └── make.bat ├── ci └── requirements.txt ├── AUTHORS.rst ├── pyproject.toml ├── CHANGELOG.rst ├── .coveragerc ├── .editorconfig ├── MANIFEST.in ├── .bumpversion.cfg ├── test_requirements.txt ├── tests ├── conftest.py ├── test_pnumpy.py └── test_ufuncs.py ├── LICENSE ├── .gitignore ├── .github └── workflows │ ├── push_docs.yml │ ├── build.yml │ ├── pypi.yml │ └── build_uploadpypi.yml ├── setup.cfg ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── asv.conf.json ├── README.md ├── _add_newdocs.py └── setup.py /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pnumpy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc_src/doc_requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.2.0 2 | -------------------------------------------------------------------------------- /src/nte/x64/Release/nte.vcxproj.FileListAbsolute.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ci/requirements.txt: -------------------------------------------------------------------------------- 1 | virtualenv>=16.6.0 2 | six>=1.14.0 3 | twine 4 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | 2 | Authors 3 | ======= 4 | 5 | * Matti Picus - https://labs.quansight.org/ 6 | -------------------------------------------------------------------------------- /doc_src/images/bench4graph.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/bench4graph.PNG -------------------------------------------------------------------------------- /doc_src/images/bench4graph2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/bench4graph2.PNG -------------------------------------------------------------------------------- /doc_src/images/bench4graph3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/bench4graph3.PNG -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "wheel", 4 | "setuptools", 5 | "numpy<1.20", 6 | ] 7 | -------------------------------------------------------------------------------- /doc_src/images/threading_npadd.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quansight/pnumpy/HEAD/doc_src/images/threading_npadd.PNG -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | 2 | Changelog 3 | ========= 4 | 5 | 0.0.0 (2020-09-09) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /src/atop/readme.txt: -------------------------------------------------------------------------------- 1 | atop : array threaded operation 2 | A library containing vector intrinsic loops and threading to speed up calculations 3 | -------------------------------------------------------------------------------- /src/pnumpy/recycler.cpp: -------------------------------------------------------------------------------- 1 | #include "Python.h" 2 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 3 | #include 4 | #include 5 | #include "../atop/atop.h" 6 | 7 | 8 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [paths] 2 | source = src 3 | 4 | [run] 5 | branch = true 6 | source = 7 | src 8 | tests 9 | parallel = true 10 | 11 | [report] 12 | show_missing = true 13 | precision = 2 14 | omit = *migrations* 15 | -------------------------------------------------------------------------------- /doc_src/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | This is a binary package and requires compilation. 
We recommend using pip or 5 | conda to obtain a pre-built version:: 6 | 7 | $ pip install pnumpy 8 | 9 | To use the package once it is installed:: 10 | 11 | >>> import pnumpy as pn 12 | 13 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # see https://editorconfig.org/ 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | indent_style = space 9 | indent_size = 4 10 | charset = utf-8 11 | 12 | [*.{bat,cmd,ps1}] 13 | end_of_line = crlf 14 | 15 | [*.{yml,yaml}] 16 | indent_size = 2 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | graft src 3 | graft ci 4 | graft tests 5 | 6 | include .bumpversion.cfg 7 | include .coveragerc 8 | include .editorconfig 9 | 10 | include AUTHORS.rst 11 | include CHANGELOG.rst 12 | include CONTRIBUTING.md 13 | include LICENSE 14 | include README.md 15 | 16 | global-exclude *.py[cod] __pycache__/* *.so *.dylib 17 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.0.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:README.rst] 11 | search = v{current_version}. 12 | replace = v{new_version}. 13 | 14 | [bumpversion:file:src/pnumpy/__init__.py] 15 | search = __version__ = '{current_version}' 16 | replace = __version__ = '{new_version}' 17 | -------------------------------------------------------------------------------- /doc_src/source/benchmarking_asv.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | 4 | .. _benchmarking_asv: 5 | 6 | Benchmarking via ASV 7 | -------------------- 8 | 9 | ASV_ is a tool that discovers and runs benchmarks. It saves the state of the 10 | run and the results, and is useful for comparing performance across versions. 11 | To run the benchmarks, do: 12 | 13 | .. code-block:: shell 14 | 15 | $ cd benchmarks 16 | $ asv run 17 | 18 | .. _ASV: https://asv.readthedocs.io/en/stable/using.html 19 | -------------------------------------------------------------------------------- /doc_src/source/use.rst: -------------------------------------------------------------------------------- 1 | How to use pnumpy 2 | ================= 3 | .. 4 | The rest of this is taken from the __init__.py and the functions' docstrings 5 | 6 | pnumpy functions 7 | ---------------- 8 | 9 | .. automodule:: pnumpy 10 | :members: enable, disable, atop_disable, atop_enable, atop_info, atop_isenabled, atop_setworkers, getitem, recarray_to_colmajor, sort, lexsort 11 | 12 | .. _benchmarking: 13 | 14 | Benchmarking 15 | ------------ 16 | 17 | ..
automodule:: pnumpy.benchmark 18 | :members: benchmark 19 | 20 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | cython==0.29.21 2 | wheel<=0.35.1 3 | setuptools<49.2.0 4 | hypothesis==5.41.5 5 | pytest==6.2.0 6 | pytz==2020.4 7 | pytest-cov==2.10.1 8 | pickle5; python_version == '3.7' and platform_python_implementation != 'PyPy' 9 | # for numpy.random.test.test_extending 10 | cffi 11 | # For testing types. Notes on the restrictions: 12 | # - Mypy relies on C API features not present in PyPy 13 | # - Mypy doesn't currently work on Python 3.9 14 | mypy==0.790; platform_python_implementation != "PyPy" 15 | typing_extensions 16 | -------------------------------------------------------------------------------- /doc_src/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -WT --keep-going 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from packaging.utils import Version 4 | HAVE_PNUMPY = True 5 | try: 6 | import pnumpy 7 | except Exception: 8 | HAVE_PNUMPY = False 9 | 10 | old_numpy = Version(np.__version__) < Version('1.18') 11 | 12 | @pytest.fixture(scope='session') 13 | def initialize_pnumpy(): 14 | if HAVE_PNUMPY: 15 | from numpy.core._multiarray_umath import __cpu_features__ as cpu 16 | if not cpu['AVX2']: 17 | pytest.skip('pnumpy.initialize requires AVX2') 18 | pnumpy.initialize() 19 | 20 | @pytest.fixture(scope='function') 21 | def rng(): 22 | if old_numpy: 23 | class OldRNG(np.random.RandomState): 24 | pass 25 | rng = OldRNG(1234) 26 | rng.random = rng.random_sample 27 | rng.integers = rng.randint 28 | return rng 29 | else: 30 | return np.random.default_rng(1234) 31 | -------------------------------------------------------------------------------- /doc_src/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Matti Picus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # generated files 2 | src/pnumpy/PNUMPY.h 3 | 4 | # benchmark environments 5 | .asv 6 | 7 | *.py[cod] 8 | __pycache__ 9 | 10 | # msvc 11 | *.sln 12 | *.vcxproj 13 | *.tlog 14 | *.log 15 | *.db 16 | *.opendb 17 | *.idb 18 | *.pdb 19 | *.ipch 20 | *.sqlite 21 | *.suo 22 | *.obj 23 | *.filters 24 | *.user 25 | *.cod 26 | *.lastbuildstate 27 | 28 | # C extensions 29 | *.so 30 | 31 | # version 32 | src/pnumpy/_version.py 33 | 34 | # Packages 35 | *.egg 36 | *.egg-info 37 | dist 38 | build 39 | eggs 40 | .eggs 41 | parts 42 | bin 43 | var 44 | sdist 45 | wheelhouse 46 | develop-eggs 47 | .installed.cfg 48 | lib 49 | lib64 50 | venv*/ 51 | pyvenv*/ 52 | pip-wheel-metadata/ 53 | 54 | # Installer logs 55 | pip-log.txt 56 | 57 | # Unit test / coverage reports 58 | .coverage 59 | .coverage.* 60 | .pytest_cache/ 61 | nosetests.xml 62 | coverage.xml 63 | htmlcov 64 | 65 | # Translations 66 | *.mo 67 | 68 | # Complexity 69 | output/*.html 70 | output/*/index.html 71 | 72 | # Sphinx 73 | docs/_build 74 | 75 | # Mypy Cache 76 | .mypy_cache/ 77 | 78 | # Editor tmp files 79 | .*.swp 80 | /.vs/VSWorkspaceState.json 81 | /.vs/ProjectSettings.json 82 | -------------------------------------------------------------------------------- /src/atop/invalids.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "common_inc.h" 3 | 4 | // For when integers have an invalid predefined 5 | static const int8_t GetInvalid(int8_t x) { return (int8_t)(0x80); }; 6 | static const int16_t GetInvalid(int16_t x) { return (int16_t)(0x8000); }; 7 | static const int32_t GetInvalid(int32_t x) { return (int32_t)(0x800000000000); }; 8 | static const int64_t GetInvalid(int64_t x) { return (int64_t)(0x8000000000000000); 
}; 9 | 10 | static const uint8_t GetInvalid(uint8_t x) { return (uint8_t)(0xFF); }; 11 | static const uint16_t GetInvalid(uint16_t x) { return (uint16_t)(0xFFFF); }; 12 | static const uint32_t GetInvalid(uint32_t x) { return (uint32_t)(0xFFFFFFFF); }; 13 | static const uint64_t GetInvalid(uint64_t x) { return (uint64_t)(0xFFFFFFFFFFFFFFFF); }; 14 | 15 | static const float GetInvalid(float x) { return std::numeric_limits<float>::quiet_NaN(); }; 16 | static const double GetInvalid(double x) { return std::numeric_limits<double>::quiet_NaN(); }; 17 | static const long double GetInvalid(long double x) { return std::numeric_limits<long double>::quiet_NaN(); }; 18 | 19 | 20 | //------------------------------------------------------------------------- 21 | -------------------------------------------------------------------------------- /doc_src/source/roadmap.rst: -------------------------------------------------------------------------------- 1 | Roadmap 2 | ======= 3 | 4 | Version 2.0 of the package uses multithreaded ufunc loops and parallel sorts. 5 | 6 | Future versions of the package will extend these capabilities to cover more of 7 | the NumPy functionality. Some of these proposed enhancements will require new 8 | APIs from NumPy. 9 | 10 | Conversions 11 | ----------- 12 | 13 | Currently NumPy does not expose a hook for dtype conversions. When one becomes available, 14 | PNumPy will parallelize those conversions. 15 | 16 | Vectorized loops 17 | ---------------- 18 | 19 | NumPy is only now beginning to use `SIMD `_ 20 | instructions to speed up loops. We have a few further enhancements beyond the 21 | current NumPy implementations. Check out the code in the `atop` directory. 22 | 23 | Using a better memory allocator 24 | ------------------------------- 25 | 26 | NumPy uses a small cache for data memory but does not have one for larger 27 | arrays. When the new API is available, we will provide a better cache. 28 | 29 | Ledger 30 | ------ 31 | Everything PNumPy hooks into can be recorded and timed. This built-in profiler will help 32 | you tweak and speed up your code.
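A minimal sketch of how this could look from Python, assuming the ``ledger_enable``, ``ledger_info`` and ``ledger_disable`` functions already exported by the package keep their current names (the exact contents of the report are not settled yet)::

    >>> import numpy as np
    >>> import pnumpy as pn
    >>> pn.ledger_enable()          # start recording hooked loops
    >>> a = np.arange(1_000_000, dtype=np.float64)
    >>> b = np.add(a, a)            # a call that goes through the hooks
    >>> pn.ledger_info()            # report what was recorded
    >>> pn.ledger_disable()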
33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/atop/atop.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "common_inc.h" 3 | 4 | // Export DLL section 5 | #if defined(_WIN32) && !defined(__GNUC__) 6 | 7 | #define DllExport __declspec(dllexport) 8 | 9 | #else 10 | 11 | #define DllExport 12 | 13 | #endif 14 | 15 | 16 | extern "C" { 17 | 18 | // defined in atop.cpp 19 | DllExport BOOL atop_init(); 20 | 21 | // defined in ops_binary.cpp 22 | DllExport ANY_TWO_FUNC GetSimpleMathOpFast(int func, int atopInType1, int atopInType2, int* wantedOutType); 23 | DllExport REDUCE_FUNC GetReduceMathOpFast(int func, int atopInType1); 24 | DllExport ANY_TWO_FUNC GetComparisonOpFast(int func, int atopInType1, int atopInType2, int* wantedOutType); 25 | DllExport UNARY_FUNC GetUnaryOpFast(int func, int atopInType1, int* wantedOutType); 26 | DllExport UNARY_FUNC GetUnaryOpSlow(int func, int atopInType1, int* wantedOutType); 27 | DllExport UNARY_FUNC GetTrigOpFast(int func, int atopInType1, int* wantedOutType); 28 | DllExport UNARY_FUNC GetTrigOpSlow(int func, int atopInType1, int* wantedOutType); 29 | DllExport UNARY_FUNC GetLogOpFast(int func, int atopInType1, int* wantedOutType); 30 | 31 | // CPUID capabilities 32 | extern DllExport int g_bmi2; 33 | extern DllExport int g_avx2; 34 | extern DllExport ATOP_cpuid_t g_cpuid; 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /.github/workflows/push_docs.yml: -------------------------------------------------------------------------------- 1 | name: Push_docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | update_docs: 9 | runs-on: ubuntu-latest 10 | defaults: 11 | run: 12 | shell: bash 13 | 14 | steps: 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | 20 | - uses: actions/checkout@v2 21 | with: 22 | submodules: recursive 23 | fetch-depth: 0 24 | 25 | - name: build docs 26 | run: | 27 | set -ex 28 | python -m pip install . 
29 | pushd doc_src 30 | python -m pip install --progress-bar=off -r doc_requirements.txt 31 | make html 32 | popd 33 | git fetch origin site 34 | git checkout site 35 | target=main # fixme: use $GIT_BRANCH or so 36 | rm -rf docs/$target/* 37 | cp -r doc_src/build/html/* docs/$target 38 | if [ $target == "main" ]; then 39 | rm -rf docs/_static/* 40 | cp -r doc_src/build/html/_static/* docs/_static 41 | fi 42 | git add docs || true 43 | git config user.email "mattigit@picus.org.il" 44 | git config user.name "mattibot" 45 | # If there aren't changes, doesn't make a commit; push will be a no-op 46 | git commit -m "auto-generating sphinx docs" || true 47 | 48 | - name: Push 49 | uses: ad-m/github-push-action@master 50 | with: 51 | github_token: ${{ github.token }} 52 | branch: site 53 | -------------------------------------------------------------------------------- /tests/test_pnumpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | HAVE_PNUMPY = True 4 | try: 5 | import pnumpy as pn 6 | except Exception: 7 | HAVE_PNUMPY = False 8 | 9 | 10 | def test_enable(initialize_pnumpy): 11 | # enable/disable return the previous value 12 | if HAVE_PNUMPY: 13 | old = pn.atop_isenabled() 14 | pn.atop_enable() 15 | assert pn.atop_isenabled() == True 16 | pn.atop_disable() 17 | assert pn.atop_isenabled() == False 18 | 19 | # restore prior state 20 | if old: 21 | pn.atop_enable() 22 | else: 23 | pn.atop_disable() 24 | assert pn.atop_isenabled() == old 25 | 26 | 27 | def test_result(rng): 28 | """ test that the basic idea of rng and ufunc result testing works. 29 | """ 30 | 31 | # this is currently the only test that does not require initialize_pnumpy 32 | # which is useful for CI runs without AVX2. Otherwise all the tests will be 33 | # skipped, and pytest will notice that all the tests are skipped and will 34 | # complain. 35 | 36 | if HAVE_PNUMPY: 37 | print('numpy version', np.__version__) 38 | print(pn.cpustring()) 39 | 40 | m = rng.integers(100, size=(10, 10), dtype=np.int32) 41 | o = np.empty_like(m) 42 | for i in range(m.shape[0]): 43 | for j in range(m.shape[1]): 44 | o[i, j] = m[i, j] + m[i, j] 45 | assert np.all(np.add(m, m) == o) 46 | 47 | 48 | def test_numpy_off(initialize_pnumpy): 49 | if HAVE_PNUMPY: 50 | np.test() 51 | 52 | 53 | def test_numpy_on(initialize_pnumpy): 54 | if HAVE_PNUMPY: 55 | pn.atop_enable() 56 | np.test() 57 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 140 3 | exclude = .eggs,build,dist 4 | 5 | [tool:pytest] 6 | # If a pytest section is found in one of the possible config files 7 | # (pytest.ini, setup.cfg), then pytest will not look for any others, 8 | # so if you add a pytest config section elsewhere, 9 | # you will need to delete this section from setup.cfg. 10 | norecursedirs = 11 | .git 12 | .env 13 | dist 14 | build 15 | migrations 16 | 17 | python_files = 18 | test_*.py 19 | *_test.py 20 | tests.py 21 | addopts = 22 | -ra 23 | --strict-markers 24 | --ignore=docs/conf.py 25 | --ignore=setup.py 26 | --ignore=ci 27 | --ignore=.eggs 28 | --doctest-modules 29 | --doctest-glob=\*.rst 30 | --tb=short 31 | --pyargs 32 | # The order of these options matters. testpaths comes after addopts so that 33 | # pnumpy in testpaths is interpreted as 34 | # --pyargs pnumpy. 
35 | # Any tests in the src/ directory (that is, tests installed with the package) 36 | # can be run by any user with pytest --pyargs pnumpy. 37 | # Packages that are sensitive to the host machine, most famously NumPy, 38 | # include tests with the installed package so that any user can check 39 | # at any time that everything is working properly. 40 | # If you do choose to make installable tests, this will run the installed 41 | # tests as they are actually installed (same principle as when we ensure that 42 | # we always test the installed version of the package). 43 | # If you have no need for this (and your src/ directory is very large), 44 | # you can save a few milliseconds on testing by telling pytest not to search 45 | # the src/ directory by removing 46 | # --pyargs and pnumpy from the options here. 47 | testpaths = 48 | pnumpy 49 | tests/ 50 | filterwarnings = 51 | ignore:.*AVX2 52 | 53 | [tool:isort] 54 | force_single_line = True 55 | line_length = 120 56 | known_first_party = pnumpy 57 | default_section = THIRDPARTY 58 | forced_separate = test_pnumpy 59 | skip = .eggs,build,dist 60 | 61 | [bdist_wheel] 62 | py-limited-api = cp36 63 | -------------------------------------------------------------------------------- /doc_src/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'PNumPy' 21 | copyright = '2020-2021, tdimitri, mattip' 22 | author = 'Quansight' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = 'v2.0.20' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.napoleon', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = [] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = 'alabaster' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 
58 | #html_static_path = ['_static'] 59 | -------------------------------------------------------------------------------- /src/atop/halffloat.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | /* 6 | * Half-precision routines 7 | */ 8 | /* half/float16 isn't a floating-point type in C */ 9 | 10 | typedef uint16_t atop_half; 11 | 12 | ///* Conversions */ 13 | //float atop_half_to_float(atop_half h); 14 | //double atop_half_to_double(atop_half h); 15 | //atop_half atop_float_to_half(float f); 16 | //atop_half atop_double_to_half(double d); 17 | ///* Comparisons */ 18 | //int atop_half_eq(atop_half h1, atop_half h2); 19 | //int atop_half_ne(atop_half h1, atop_half h2); 20 | //int atop_half_le(atop_half h1, atop_half h2); 21 | //int atop_half_lt(atop_half h1, atop_half h2); 22 | //int atop_half_ge(atop_half h1, atop_half h2); 23 | //int atop_half_gt(atop_half h1, atop_half h2); 24 | ///* faster *_nonan variants for when you know h1 and h2 are not NaN */ 25 | //int atop_half_eq_nonan(atop_half h1, atop_half h2); 26 | //int atop_half_lt_nonan(atop_half h1, atop_half h2); 27 | //int atop_half_le_nonan(atop_half h1, atop_half h2); 28 | ///* Miscellaneous functions */ 29 | //int atop_half_iszero(atop_half h); 30 | //int atop_half_isnan(atop_half h); 31 | //int atop_half_isinf(atop_half h); 32 | //int atop_half_isfinite(atop_half h); 33 | //int atop_half_signbit(atop_half h); 34 | //atop_half atop_half_copysign(atop_half x, atop_half y); 35 | //atop_half atop_half_spacing(atop_half h); 36 | //atop_half atop_half_nextafter(atop_half x, atop_half y); 37 | //atop_half atop_half_divmod(atop_half x, atop_half y, atop_half *modulus); 38 | // 39 | ///* 40 | // * Half-precision constants 41 | // */ 42 | // 43 | //#define ATOP_HALF_ZERO (0x0000u) 44 | //#define ATOP_HALF_PZERO (0x0000u) 45 | //#define ATOP_HALF_NZERO (0x8000u) 46 | //#define ATOP_HALF_ONE (0x3c00u) 47 | //#define ATOP_HALF_NEGONE (0xbc00u) 48 | //#define ATOP_HALF_PINF (0x7c00u) 49 | //#define ATOP_HALF_NINF (0xfc00u) 50 | //#define ATOP_HALF_NAN (0x7e00u) 51 | // 52 | //#define ATOP_MAX_HALF (0x7bffu) 53 | // 54 | ///* 55 | // * Bit-level conversions 56 | // */ 57 | 58 | //uint16_t atop_floatbits_to_halfbits(uint32_t f); 59 | //uint16_t atop_doublebits_to_halfbits(uint64_t d); 60 | //uint32_t atop_halfbits_to_floatbits(uint16_t h); 61 | //uint64_t atop_halfbits_to_doublebits(uint16_t h); 62 | 63 | #ifdef __cplusplus 64 | } 65 | #endif 66 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build,test 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | #schedule: 9 | # - cron: '0 0 * * 0' # weekly 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ${{ matrix.os }} 15 | defaults: 16 | run: 17 | shell: bash 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.6, 3.7, 3.8, 3.9] 22 | os: [ubuntu-latest, macos-latest, windows-latest] 23 | platform: [x64] 24 | include: 25 | - python-version: 3.8 26 | os: windows-latest 27 | platform: x86 28 | steps: 29 | - uses: actions/checkout@v2 30 | with: 31 | submodules: recursive 32 | - name: Set up Python ${{ matrix.python-version }} 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | architecture: ${{ matrix.platform }} 37 | - name: Install prerequisits 38 | run: | 
39 | set -ex 40 | python -m pip install --upgrade pip 41 | python -m pip install --progress-bar=off -r ci/requirements.txt 42 | echo $(python -c"import sys; print(sys.version)") 43 | - name: Build and install 44 | run: | 45 | set -ex 46 | python -m pip install . 47 | - name: Test 48 | run: | 49 | python -m pip install -r test_requirements.txt 50 | python -m pytest tests -vv --durations 10 51 | 52 | build_docs: 53 | runs-on: ubuntu-latest 54 | defaults: 55 | run: 56 | shell: bash 57 | 58 | steps: 59 | - name: Set up Python ${{ matrix.python-version }} 60 | uses: actions/setup-python@v2 61 | with: 62 | python-version: 3.8 63 | 64 | - uses: actions/checkout@v2 65 | with: 66 | submodules: recursive 67 | fetch-depth: 0 68 | 69 | - name: build docs 70 | run: | 71 | set -ex 72 | python -m pip install . 73 | pushd doc_src 74 | python -m pip install --progress-bar=off -r doc_requirements.txt 75 | make html 76 | popd 77 | - name: store docs 78 | uses: actions/upload-artifact@v2 79 | with: 80 | name: docs 81 | path: doc_src/build/html 82 | -------------------------------------------------------------------------------- /src/pnumpy/recarray.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | 4 | __all__ = [ 5 | 'recarray_to_colmajor'] 6 | 7 | from pnumpy._pnumpy import recarray_to_colmajor as _recarray_to_colmajor 8 | 9 | 10 | #----------------------------------------------------------------------------------------- 11 | def recarray_to_colmajor(item, parallel=True): 12 | """ 13 | Converts a numpy record array (void type) to a dictionary of numpy arrays, column major 14 | 15 | Returns 16 | ------- 17 | A dictionary of numpy arrays corresponding to the original numpy record array. 18 | 19 | Examples 20 | -------- 21 | >>> x=np.array([(1.0, 2, 3, 4, 5, 'this is a long test'), (3.0, 4, 5, 6, 7, 'short'), (30.0, 40, 50, 60, 70, '')], 22 | dtype=[('x', '>> item=np.tile(x,100_000) 24 | >>> mydict = recarray_to_colmajor(item) 25 | """ 26 | if item.dtype.char == 'V': 27 | # warnings.warn(f"Converting numpy record array. Performance may suffer.") 28 | # flip row-major to column-major 29 | list_types = [*item.dtype.fields.values()] 30 | success = True 31 | for t in list_types: 32 | val = t[0].char 33 | # if the record type has an object or another record type, we cannot handle it 34 | if val == 'O' or val =='V': 35 | success = False 36 | break 37 | 38 | d={} 39 | if success and parallel: 40 | offsets=[] 41 | arrays=np.empty(len(item.dtype.fields), dtype='O') 42 | arrlen = len(item) 43 | count = 0 44 | for name, v in item.dtype.fields.items(): 45 | offsets.append(v[1]) 46 | arr= np.empty(arrlen, dtype=v[0]) 47 | arrays[count] = arr 48 | count += 1 49 | # build dict of names and new arrays 50 | d[name] = arr 51 | 52 | # Call parallel routine to convert 53 | _recarray_to_colmajor(item, np.asarray(offsets, dtype=np.int64), arrays) 54 | 55 | else: 56 | # single thread way 57 | for name in item.dtype.names: 58 | d[name] = item[:][name].copy() 59 | return d 60 | 61 | warnings.warn("The array passed was not a numpy record array.") 62 | return item 63 | 64 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and we will try to give credit appropriately.
5 | 6 | # Bug reports 7 | 8 | Use the [issue tracker](https://github.com/Quansight/pnumpy/issues). 9 | Please include: 10 | 11 | * Your operating system name and version. 12 | * Any details about your local setup that might be helpful in troubleshooting. 13 | * Detailed steps to reproduce the bug. 14 | 15 | # Documentation improvements 16 | 17 | PNumPy could always use more documentation, whether as part of the 18 | official PNumPy docs, in docstrings, or even on the web in blog posts, 19 | articles, and such. 20 | 21 | # Feature requests and feedback 22 | 23 | The best way to send feedback is to file an issue on the issue tracker. 24 | 25 | If you are proposing a feature: 26 | 27 | * Explain in detail how it would work. 28 | * Keep the scope as narrow as possible, to make it easier to implement. 29 | * Remember that this is a volunteer-driven project, and that code contributions are welcome :) 30 | 31 | # Development 32 | 33 | To set up `pnumpy` for local development: 34 | 35 | 1. Fork [pnumpy](https://github.com/Quansight/pnumpy) 36 | (look for the "Fork" button). 37 | 2. Clone your fork locally: 38 | ``` 39 | git clone git@github.com:YOURGITHUBNAME/pnumpy.git 40 | ``` 41 | 42 | 3. Create a branch for local development: 43 | ``` 44 | git checkout -b name-of-your-bugfix-or-feature 45 | ``` 46 | 47 | Now you can make your changes locally. 48 | 49 | 4. When you're done making changes, run all the tests with: 50 | ``` 51 | python setup.py build_ext --inplace 52 | python -m pip install pytest 53 | python -m pytest tests 54 | ``` 55 | 56 | 5. Commit your changes and push your branch to GitHub: 57 | ``` 58 | git add . 59 | git commit -m "Your detailed description of your changes." 60 | git push origin name-of-your-bugfix-or-feature 61 | ``` 62 | 63 | 6. Submit a pull request through the GitHub website. 64 | 65 | ### Pull Request Guidelines 66 | 67 | If you need some code review or feedback while you're developing the code, just make the pull request. 68 | 69 | For merging, you should: 70 | 71 | 1. Update documentation when there's new API, functionality, etc. 72 | 2. Add a note to `CHANGELOG.rst` about the changes. 73 | 3. Add yourself to `AUTHORS.rst`. 74 | 75 | If you don't have all the necessary Python versions available 76 | locally, you can rely on CI - it will [run the 77 | tests](https://travis-ci.org/Quansight/pnumpy/pull_requests) 78 | for each change you add in the pull request. 79 | 80 | It will be slower though ... 81 | 82 | ### Tips 83 | 84 | To run a subset of tests: 85 | ``` 86 | python -m pytest -k test_myfeature 87 | ``` 88 | 89 | 90 | -------------------------------------------------------------------------------- /src/pnumpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PNumPy now calls ``init`` at startup to set up the package. This imports NumPy, 3 | replacing all the inner loops of UFuncs with wrapped versions. Then you can 4 | enable/disable any of the subsystems: 5 | 6 | - threading 7 | 8 | Threading will kick in when the number of elements to be processed is more 9 | than 50,000. It will break the operation into chunks. Each chunk will be 10 | executed in its own thread. 11 | 12 | - ledger 13 | 14 | The ledger records data on each loop execution to 15 | enable more accurate heuristics on memory allocation, threading behavior 16 | and reporting for logging and benchmarking.
17 | 18 | - recycler 19 | 20 | Once we can change the NumPy memory allocation strategy, we can use the 21 | data from the ledger to create more performant memory caches. 22 | 23 | - atop 24 | 25 | Provide faster implementations of NumPy inner loops. 26 | """ 27 | from ._version import __version__ 28 | __all__ = [ 29 | 'initialize', 'atop_enable', 'atop_disable', 'atop_isenabled', 'atop_info', 'atop_setworkers','cpustring', 30 | 'thread_enable', 'thread_disable', 'thread_isenabled', 'thread_getworkers', 'thread_setworkers', 'thread_zigzag', 31 | 'ledger_enable', 'ledger_disable', 'ledger_isenabled', 'ledger_info', 32 | 'recycler_enable', 'recycler_disable', 'recycler_isenabled', 'recycler_info', 33 | 'timer_gettsc','timer_getutc', 'benchmark', 'recarray_to_colmajor', 'init', 'enable', 'disable', 'cpu_count_linux', 34 | 'sort', 'lexsort' 35 | ] 36 | 37 | import numpy as np 38 | import numpy.core._multiarray_umath as umath 39 | 40 | # TODO check for Apple M1 chip (where AVX2 makes no sense) 41 | # TODO check for numpy version 42 | 43 | try: 44 | # Numpy 1.18 does not have __cpu_features 45 | # If we cannot find it, we load anyway because 95% have AVX2 46 | # and we can hook numpy 1.18 ufuncs 47 | # TODO: check for Apple M1 chip 48 | __hasavx2 = umath.__cpu_features__['AVX2'] 49 | except Exception: 50 | __hasavx2 = True 51 | 52 | if not __hasavx2: 53 | raise ValueError(f"PNumPy requires a CPU with AVX2 capability to work") 54 | 55 | del __hasavx2 56 | 57 | import pnumpy._pnumpy as _pnumpy 58 | from pnumpy._pnumpy import atop_enable, atop_disable, atop_isenabled, atop_info, atop_setworkers, cpustring 59 | from pnumpy._pnumpy import thread_enable, thread_disable, thread_isenabled, thread_getworkers, thread_setworkers, thread_zigzag 60 | from pnumpy._pnumpy import timer_gettsc, timer_getutc 61 | from pnumpy._pnumpy import ledger_enable, ledger_disable, ledger_isenabled, ledger_info 62 | from pnumpy._pnumpy import recycler_enable, recycler_disable, recycler_isenabled, recycler_info 63 | from pnumpy._pnumpy import getitem, lexsort32, lexsort64 64 | 65 | from .cpu import cpu_count_linux, init, enable, disable 66 | from .sort import sort, lexsort, argsort, argmin, argmax, searchsorted 67 | from .benchmark import benchmark, benchmark_func 68 | from .recarray import recarray_to_colmajor 69 | 70 | # to be removed 71 | def initialize(): 72 | init() 73 | 74 | # start the engine by default 75 | # TODO: check environment variable 76 | init() 77 | -------------------------------------------------------------------------------- /benchmarks/README.rst: -------------------------------------------------------------------------------- 1 | .. -*- rst -*- 2 | 3 | ========== 4 | Benchmarks 5 | ========== 6 | 7 | This package uses `Airspeed Velocity`_ for benchmarks. The benchmarks are adapted 8 | from the ones in the `NumPy github repo`_ 9 | 10 | 11 | Usage 12 | ----- 13 | 14 | Airspeed Velocity manages building and Python virtualenvs by itself. 15 | 16 | Before beginning, ensure that *airspeed velocity* is installed. 17 | By default, `asv` ships with support for anaconda and virtualenv:: 18 | 19 | pip install asv 20 | pip install virtualenv 21 | 22 | After contributing new benchmarks, you should test them locally 23 | before submitting a pull request. 24 | 25 | To run all benchmarks, navigate to the top-level repo directory via the command 26 | line and execute:: 27 | 28 | asv run 29 | 30 | The first time this is run, it will build a profile of the machine. 
The 31 | information is stored in a top-level `.asv` directory that will be ignored by 32 | git. (Note: running benchmarks could take a while. Each benchmark is run 33 | multiple times to measure the distribution in execution times.) 34 | 35 | To run benchmarks across a series of git commits, `asv` supports git-like 36 | syntax. For example to run benchmarks on all commits on a branch off `main`, 37 | do:: 38 | asv run main..mybranch 39 | 40 | To view benchmarks once run, use ``asv show ``:: 41 | 42 | asv show main 43 | 44 | This will display the results in plain text in the console. For a graphical 45 | view, you can create html via ``asv publish`` and then view the result with 46 | ``asv preview``. 47 | 48 | More on how to use ``asv`` can be found in `ASV documentation`_ 49 | Command-line help is available as usual via ``asv --help`` and 50 | ``asv run --help``. 51 | 52 | .. _ASV documentation: https://asv.readthedocs.io/ 53 | 54 | 55 | Writing benchmarks 56 | ------------------ 57 | 58 | See `Airspeed Velocity`_ documentation for basics on how to write benchmarks. 59 | 60 | Some things to consider: 61 | 62 | - The benchmark suite should be importable with any version of the project. 63 | 64 | - The benchmark parameters etc. should not depend on which version is 65 | installed. 66 | 67 | - Try to keep the runtime of the benchmark reasonable. 68 | 69 | - Prefer ASV's ``time_`` methods for benchmarking times rather than cooking up 70 | time measurements via ``time.clock``, even if it requires some juggling when 71 | writing the benchmark. 72 | 73 | - Preparing arrays etc. should generally be put in the ``setup`` method rather 74 | than the ``time_`` methods, to avoid counting preparation time together with 75 | the time of the benchmarked operation. 76 | 77 | - Be mindful that large arrays created with ``np.empty`` or ``np.zeros`` might 78 | not be allocated in physical memory until the memory is accessed. If this is 79 | desired behaviour, make sure to comment it in your setup function. If 80 | you are benchmarking an algorithm, it is unlikely that a user will be 81 | executing said algorithm on a newly created empty/zero array. One can force 82 | pagefaults to occur in the setup phase either by calling ``np.ones`` or 83 | ``arr.fill(value)`` after creating the array, 84 | 85 | .. _`Airspeed Velocity`: https://asv.readthedocs.io/ 86 | .. _`NumPy github repo`: https://github.com/numpy/numpy 87 | -------------------------------------------------------------------------------- /tests/test_ufuncs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | HAVE_PNUMPY = True 3 | try: 4 | import pnumpy 5 | except Exception: 6 | HAVE_PNUMPY = False 7 | 8 | import pytest 9 | 10 | # pnumpy.initialize() is called from conftest.py 11 | 12 | 13 | def type2dtype(types): 14 | """ 15 | Maps the ufunc.type to the input, output dtypes. 16 | type2dtype('ii->?') -> (np.int32, np.int32), (np.bool_,) 17 | """ 18 | inp, out = types.split('->') 19 | return tuple(np.dtype(c) for c in inp), tuple(np.dtype(c) for c in out) 20 | 21 | 22 | def get_ufuncs_and_types(): 23 | """Create a dictionary with keys of ufunc names and values of all the 24 | supported type signatures 25 | """ 26 | ufuncs = [x for x in dir(np) if isinstance(getattr(np, x), np.ufunc)] 27 | if 'matmul' in ufuncs: 28 | ufuncs.remove('matmul') 29 | # Maybe use a collections.defaultdict instead? 
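    # One possible alternative, sketched only as a comment and not used here:
    # a collections.defaultdict(list) would avoid pre-seeding every key, e.g.
    #
    #     from collections import defaultdict
    #     ret = defaultdict(list)
    #     for name in ufuncs:
    #         ret[name] = [type2dtype(t) for t in getattr(np, name).types]
    #
    # The explicit dict construction below keeps the key set fixed to the
    # discovered ufunc names and never creates entries on a stray lookup.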
30 | ret = dict([[x, []] for x in ufuncs]) 31 | for s in ret: 32 | ret[s] = [type2dtype(t) for t in getattr(np, s).types] 33 | return ret 34 | 35 | 36 | def fill_random(a, rng): 37 | """Fill an ndarray with random values. This will uniformly cover the 38 | bit-valued number space, which in the case of floats is differnt from 39 | rng.uniform() 40 | """ 41 | if a.dtype == 'object': 42 | v = a.reshape(-1) 43 | # Slow !!! 44 | for i in range(v.size): 45 | v[i] = float(rng._bit_generator.random_raw(1)) 46 | else: 47 | v = a.view(np.uint64) 48 | v[:] = rng._bit_generator.random_raw(v.size).reshape(v.shape) 49 | return a 50 | 51 | typemap = get_ufuncs_and_types() 52 | 53 | def data(in_dtypes, out_dtypes, shape, rng): 54 | """ Return two tuples: input and output, with random data dtypes and 55 | shape 56 | """ 57 | ret_in = [fill_random(np.empty(shape, dtype=d), rng) for d in in_dtypes] 58 | ret_out = tuple([fill_random(np.empty(shape, dtype=d), rng) for d in out_dtypes]) 59 | return ret_in, ret_out 60 | 61 | @pytest.mark.filterwarnings("ignore::RuntimeWarning") 62 | @pytest.mark.parametrize(['name', 'types'], ([k, v] for k,v in typemap.items())) 63 | def test_threads(name, types, initialize_pnumpy, rng): 64 | """ Test that enabling the threading does not change the results 65 | """ 66 | if HAVE_PNUMPY: 67 | ufunc = getattr(np, name) 68 | for in_dtypes, out_dtypes in types: 69 | # Skip object dtypes 70 | if any([o == 'object' for o in out_dtypes]): 71 | continue 72 | if any([o == 'object' for o in in_dtypes]): 73 | continue 74 | in_data, out_data = data(in_dtypes, out_dtypes, [1024, 1024], rng) 75 | if (name in ('power',) and 76 | issubclass(in_data[1].dtype.type, np.integer)): 77 | in_data[1] = np.abs(in_data[1]) 78 | in_data[1][in_data[1] < 0] = 0 79 | if len(out_data) == 1: 80 | out_data = out_data[0] 81 | out1 = ufunc(*in_data, out=out_data) 82 | pnumpy.thread_enable() 83 | assert pnumpy.thread_isenabled() 84 | out2 = ufunc(*in_data, out=out_data) 85 | pnumpy.thread_disable() 86 | # may not work on datetime 87 | if not any([o == 'datetime64' for o in out_dtypes]) and not any([o == 'timedelta64' for o in out_dtypes]): 88 | np.testing.assert_allclose(out1, out2, equal_nan=True) 89 | -------------------------------------------------------------------------------- /src/atop/atop.cpp: -------------------------------------------------------------------------------- 1 | #include "atop.h" 2 | #include "threads.h" 3 | 4 | #define LOGGING(...) 
5 | #define LOGERROR printf 6 | 7 | //---------------------------------------------------------------------------------- 8 | // Lookup to go from 1 byte to 8 byte boolean values 9 | int64_t gBooleanLUT64[256]; 10 | int32_t gBooleanLUT32[16]; 11 | 12 | int64_t gBooleanLUT64Inverse[256]; 13 | int32_t gBooleanLUT32Inverse[16]; 14 | 15 | void* g_cMathWorker = NULL; 16 | 17 | // Keep track of stats 18 | static int64_t g_TotalAllocs = 0; 19 | static int64_t g_TotalFree = 0; 20 | static int64_t g_TotalMemoryAllocated = 0; 21 | static int64_t g_TotalMemoryFreed = 0; 22 | 23 | #define MAGIC_PAGE_GUARD 0xDEADBEEFDEADBEEF 24 | //----------------------------------------------- 25 | void* FmAlloc(size_t _Size) { 26 | // make thread safe 27 | uint64_t* pageGuard = (uint64_t*)malloc(_Size + 16); 28 | if (pageGuard) { 29 | InterlockedIncrement64(&g_TotalAllocs); 30 | InterlockedAdd64(&g_TotalMemoryAllocated, _Size); 31 | pageGuard[0] = _Size; 32 | pageGuard[1] = MAGIC_PAGE_GUARD; 33 | 34 | // Skip past guard 35 | return &pageGuard[2]; 36 | } 37 | return NULL; 38 | } 39 | 40 | void FmFree(void* _Block) { 41 | // The C standard requires that free() be a no-op when called with nullptr. 42 | // FmAlloc can return a nullptr, and since we want this function to behave 43 | // like free() we also need to handle the nullptr case here. 44 | if (!_Block) { return; } 45 | 46 | //LOGRECYCLE("Freeing %p\n", _Block); 47 | InterlockedIncrement64(&g_TotalFree); 48 | uint64_t* pageGuard = (uint64_t*)_Block; 49 | pageGuard--; 50 | pageGuard--; 51 | if (pageGuard[1] != MAGIC_PAGE_GUARD) { 52 | LOGERROR("!! User freed bad memory, no page guard %p\n", pageGuard); 53 | } 54 | else { 55 | InterlockedAdd64(&g_TotalMemoryFreed, pageGuard[0]); 56 | // mark so cannot free again 57 | pageGuard[1] = 0; 58 | } 59 | 60 | free(pageGuard); 61 | } 62 | 63 | //==================================================== 64 | // Must be called to initialize atop 65 | // Will start threads and detect the CPU 66 | // Will build runtime lookup tables 67 | // NOTE: return FALSE if ALREADY initialized 68 | BOOL atop_init() { 69 | 70 | // Check if init already called 71 | if (g_cMathWorker) return FALSE; 72 | 73 | // Build LUTs used in comarisons after mask generated 74 | for (int i = 0; i < 256; i++) { 75 | BYTE* pDest = (BYTE*)&gBooleanLUT64[i]; 76 | for (int j = 0; j < 8; j++) { 77 | *pDest++ = ((i >> j) & 1); 78 | } 79 | } 80 | // Build LUTs 81 | for (int i = 0; i < 16; i++) { 82 | BYTE* pDest = (BYTE*)&gBooleanLUT32[i]; 83 | for (int j = 0; j < 4; j++) { 84 | *pDest++ = ((i >> j) & 1); 85 | } 86 | } 87 | 88 | // Build LUTs 89 | for (int i = 0; i < 256; i++) { 90 | gBooleanLUT64Inverse[i] = gBooleanLUT64[i] ^ 0x0101010101010101LL; 91 | } 92 | // Build LUTs 93 | for (int i = 0; i < 16; i++) { 94 | gBooleanLUT32Inverse[i] = gBooleanLUT32[i] ^ 0x01010101; 95 | } 96 | 97 | g_cMathWorker = new CMathWorker(); 98 | 99 | // start up the worker threads now in case we use them 100 | THREADER->StartWorkerThreads(0); 101 | 102 | LOGGING("ATOP loaded\n"); 103 | return TRUE; 104 | } 105 | 106 | -------------------------------------------------------------------------------- /src/pnumpy/arange.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include "../atop/threads.h" 4 | 5 | //------------------------------------------------------ 6 | // The arange routine is largely copied from numpy 7 | // 8 | #define error_converting(x) (((x) == -1) && PyErr_Occurred()) 9 | 10 | /* 11 | * Like 
ceil(value), but check for overflow. 12 | * 13 | * Return 0 on success, -1 on failure. In case of failure, set a PyExc_Overflow 14 | * exception 15 | */ 16 | static npy_intp 17 | _arange_safe_ceil_to_intp(double value) 18 | { 19 | double ivalue; 20 | 21 | ivalue = ceil(value); 22 | /* condition inverted to handle NaN */ 23 | if (isnan(ivalue)) { 24 | PyErr_SetString(PyExc_ValueError, 25 | "arange: cannot compute length"); 26 | return -1; 27 | } 28 | if (!(NPY_MIN_INTP <= ivalue && ivalue <= NPY_MAX_INTP)) { 29 | PyErr_SetString(PyExc_OverflowError, 30 | "arange: overflow while computing length"); 31 | return -1; 32 | } 33 | 34 | return (npy_intp)ivalue; 35 | } 36 | 37 | /*NUMPY_API 38 | Arange, 39 | */ 40 | PyObject* 41 | PArange(double start, double stop, double step, int type_num) 42 | { 43 | npy_intp length; 44 | PyArrayObject* range; 45 | PyArray_ArrFuncs* funcs; 46 | PyObject* obj; 47 | int ret; 48 | double delta, tmp_len; 49 | NPY_BEGIN_THREADS_DEF; 50 | 51 | delta = stop - start; 52 | tmp_len = delta / step; 53 | 54 | /* Underflow and divide-by-inf check */ 55 | if (tmp_len == 0.0 && delta != 0.0) { 56 | if (signbit(tmp_len)) { 57 | length = 0; 58 | } 59 | else { 60 | length = 1; 61 | } 62 | } 63 | else { 64 | length = _arange_safe_ceil_to_intp(tmp_len); 65 | if (error_converting(length)) { 66 | return NULL; 67 | } 68 | } 69 | 70 | if (length <= 0) { 71 | length = 0; 72 | return PyArray_New(&PyArray_Type, 1, &length, type_num, 73 | NULL, NULL, 0, 0, NULL); 74 | } 75 | range = (PyArrayObject*)PyArray_New(&PyArray_Type, 1, &length, type_num, 76 | NULL, NULL, 0, 0, NULL); 77 | if (range == NULL) { 78 | return NULL; 79 | } 80 | funcs = PyArray_DESCR(range)->f; 81 | 82 | /* 83 | * place start in the buffer and the next value in the second position 84 | * if length > 2, then call the inner loop, otherwise stop 85 | */ 86 | obj = PyFloat_FromDouble(start); 87 | ret = funcs->setitem(obj, PyArray_DATA(range), range); 88 | Py_DECREF(obj); 89 | if (ret < 0) { 90 | goto fail; 91 | } 92 | if (length == 1) { 93 | return (PyObject*)range; 94 | } 95 | obj = PyFloat_FromDouble(start + step); 96 | ret = funcs->setitem(obj, PyArray_BYTES(range) + PyArray_ITEMSIZE(range), 97 | range); 98 | Py_DECREF(obj); 99 | if (ret < 0) { 100 | goto fail; 101 | } 102 | if (length == 2) { 103 | return (PyObject*)range; 104 | } 105 | if (!funcs->fill) { 106 | PyErr_SetString(PyExc_ValueError, 107 | "no fill-function for data-type."); 108 | Py_DECREF(range); 109 | return NULL; 110 | } 111 | NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(range)); 112 | funcs->fill(PyArray_DATA(range), length, range); 113 | NPY_END_THREADS; 114 | if (PyErr_Occurred()) { 115 | goto fail; 116 | } 117 | return (PyObject*)range; 118 | 119 | fail: 120 | Py_DECREF(range); 121 | return NULL; 122 | } 123 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at info@quansight.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /src/pnumpy/conversions.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #define LOGGING(...) 
4 | 5 | // TODO: look at casting in ndarraytypes.h, convert_datatype.c 6 | //NPY_NO_EXPORT PyArray_VectorUnaryFunc* 7 | //PyArray_GetCastFunc(PyArray_Descr* descr, int type_num) 8 | //{ 9 | // PyArray_VectorUnaryFunc* castfunc = NULL; 10 | // 11 | // if (type_num < NPY_NTYPES_ABI_COMPATIBLE) { 12 | // castfunc = descr->f->cast[type_num]; 13 | 14 | //----------------------------------- 15 | // Converts (in parallel) a numpy recarray (void type) 16 | // Caller must have PREALLOCATE the colmajor arrays to copy data into 17 | // Caller must also pass the struct offsets (within the recarray) 18 | // 19 | // Input1: the recordarray to convert 20 | // Input2: int64 array of offsets 21 | // Input3: object array of numpy arrays pre allocated that match in order the recarray 22 | extern "C" 23 | PyObject* 24 | recarray_to_colmajor(PyObject* self, PyObject* args) { 25 | 26 | PyArrayObject* inArr = NULL; 27 | PyArrayObject* offsetArr = NULL; 28 | PyArrayObject* arrArr = NULL; 29 | 30 | //if (!PyArg_ParseTuple(args, "O!O!O!:recarray_to_colmajor", 31 | // &PyArray_Type, &inArr, 32 | // &PyArray_Type, &offsetArr, 33 | // &PyArray_Type, &arrArr)) { 34 | // return NULL; 35 | //} 36 | 37 | if (PyTuple_Size(args) == 3) { 38 | inArr = (PyArrayObject*)PyTuple_GetItem(args, 0); 39 | offsetArr = (PyArrayObject*)PyTuple_GetItem(args, 1); 40 | arrArr = (PyArrayObject*)PyTuple_GetItem(args, 2); 41 | } 42 | else { 43 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor must input 3 numpy arrays"); 44 | return NULL; 45 | } 46 | 47 | int64_t itemSize = PyArray_ITEMSIZE(inArr); 48 | 49 | if (itemSize != PyArray_STRIDE(inArr, 0)) { 50 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor cannot yet handle strides"); 51 | return NULL; 52 | } 53 | 54 | if (NPY_VOID != PyArray_TYPE(inArr)) { 55 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor must be void type"); 56 | return NULL; 57 | } 58 | 59 | if (NPY_OBJECT != PyArray_TYPE(arrArr)) { 60 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor third param must be object array"); 61 | return NULL; 62 | } 63 | 64 | int64_t length = ArrayLength(inArr); 65 | int64_t numArrays = ArrayLength(arrArr); 66 | 67 | if (numArrays != ArrayLength(offsetArr)) { 68 | PyErr_Format(PyExc_ValueError, "recarray_to_colmajor inputs do not match"); 69 | return NULL; 70 | } 71 | 72 | int64_t totalRows = length; 73 | int64_t* pOffsets = (int64_t*)PyArray_BYTES(offsetArr); 74 | PyArrayObject** ppArrays = (PyArrayObject**)PyArray_BYTES(arrArr); 75 | 76 | stRecarrayOffsets* pstOffset; 77 | 78 | // TODO allocate this on the stack 79 | pstOffset = (stRecarrayOffsets*)WORKSPACE_ALLOC(sizeof(stRecarrayOffsets) * numArrays); 80 | 81 | for (int64_t i = 0; i < numArrays; i++) { 82 | // Consider adding pOffsets here 83 | pstOffset[i].pData = PyArray_BYTES(ppArrays[i]); 84 | pstOffset[i].readoffset = pOffsets[i]; 85 | pstOffset[i].itemsize = PyArray_ITEMSIZE(ppArrays[i]); 86 | } 87 | 88 | char* pStartOffset = PyArray_BYTES(inArr); 89 | 90 | // Call atop to finish the work 91 | RecArrayToColMajor( 92 | pstOffset, 93 | pStartOffset, 94 | totalRows, 95 | numArrays, 96 | itemSize); 97 | 98 | WORKSPACE_FREE(pstOffset); 99 | 100 | RETURN_NONE; 101 | } 102 | 103 | -------------------------------------------------------------------------------- /asv.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // The version of the config file format. Do not change, unless 3 | // you know what you are doing. 
4 | "version": 1, 5 | 6 | // The name of the project being benchmarked 7 | "project": "pnumpy", 8 | 9 | // The project's homepage 10 | "project_url": "https://quansight.github.io/pnumpy/stable/index.html", 11 | 12 | // The URL or local path of the source code repository for the 13 | // project being benchmarked 14 | "repo": ".", 15 | 16 | // List of branches to benchmark. If not provided, defaults to "master" 17 | // (for git) or "tip" (for mercurial). 18 | "branches": ["main"], 19 | 20 | // The DVCS being used. If not set, it will be automatically 21 | // determined from "repo" by looking at the protocol in the URL 22 | // (if remote), or by looking for special directories, such as 23 | // ".git" (if local). 24 | "dvcs": "git", 25 | 26 | // The tool to use to create environments. May be "conda", 27 | // "virtualenv" or other value depending on the plugins in use. 28 | // If missing or the empty string, the tool will be automatically 29 | // determined by looking for tools on the PATH environment 30 | // variable. 31 | "environment_type": "virtualenv", 32 | 33 | // the base URL to show a commit for the project. 34 | "show_commit_url": "https://github.com/Qaunsight/pnumpy.git", 35 | 36 | // The Pythons you'd like to test against. If not provided, defaults 37 | // to the current version of Python used to run `asv`. 38 | //"pythons": ["3.7"], 39 | 40 | // The matrix of dependencies to test. Each key is the name of a 41 | // package (in PyPI) and the values are version numbers. An empty 42 | // list indicates to just test against the default (latest) 43 | // version. 44 | "matrix": { 45 | "numpy": [], 46 | }, 47 | 48 | // The directory (relative to the current directory) that benchmarks are 49 | // stored in. If not provided, defaults to "benchmarks" 50 | "benchmark_dir": "benchmarks", 51 | 52 | // The directory (relative to the current directory) to cache the Python 53 | // environments in. If not provided, defaults to "env" 54 | "env_dir": ".asv/env", 55 | 56 | 57 | // The directory (relative to the current directory) that raw benchmark 58 | // results are stored in. If not provided, defaults to "results". 59 | "results_dir": ".asv/results", 60 | 61 | // The directory (relative to the current directory) that the html tree 62 | // should be written to. If not provided, defaults to "html". 63 | "html_dir": "html", 64 | 65 | // The number of characters to retain in the commit hashes. 66 | // "hash_length": 8, 67 | 68 | // `asv` will cache wheels of the recent builds in each 69 | // environment, making them faster to install next time. This is 70 | // number of builds to keep, per environment. 71 | "build_cache_size": 8, 72 | 73 | // The commits after which the regression search in `asv publish` 74 | // should start looking for regressions. Dictionary whose keys are 75 | // regexps matching to benchmark names, and values corresponding to 76 | // the commit (exclusive) after which to start looking for 77 | // regressions. The default is to start from the first commit 78 | // with results. If the commit is `null`, regression detection is 79 | // skipped for the matching benchmark. 80 | // 81 | // "regressions_first_commits": { 82 | // "some_benchmark": "352cdf", // Consider regressions only after this commit 83 | // "another_benchmark": null, // Skip regression detection altogether 84 | // } 85 | } 86 | -------------------------------------------------------------------------------- /doc_src/source/index.rst: -------------------------------------------------------------------------------- 1 | .. 
pnumpy documentation master file, created by 2 | sphinx-quickstart on Thu Oct 22 12:01:26 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to PNumPy's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | roadmap 14 | installation 15 | use 16 | 17 | PNumPy seamlessly speeds up NumPy for large arrays (64K+ elements) with *no 18 | change required to your existing NumPy code*. 19 | 20 | This first release speeds up NumPy binary and unary ufuncs such as ``add``, 21 | ``multiply``, ``isnan``, ``abs``, ``sin``, ``log``, ``sum``, ``min`` and many more. 22 | Sped up functions also include: ``sort``, ``argsort``, ``lexsort``, boolean indexing, and 23 | fancy indexing. In the near future we will speed up: ``astype``, ``where``, ``putmask``, 24 | ``arange``, ``searchsorted``. 25 | 26 | Installation 27 | ------------ 28 | 29 | .. code-block:: python 30 | 31 | pip install pnumpy 32 | 33 | To use the project: 34 | 35 | .. code-block:: python 36 | 37 | import pnumpy as pn 38 | 39 | 40 | PNumPy speeds up NumPy silently under the hood. To see some benchmarks 41 | yourself :ref:`run ASV ` or use the built-in :ref:`benchmark 42 | ` function: 43 | 44 | .. code-block:: python 45 | 46 | pn.benchmark() 47 | 48 | .. image:: ../images/bench4graph2.PNG 49 | .. image:: ../images/bench4graph3.PNG 50 | 51 | To get a partial list of functions sped up run 52 | 53 | .. code-block:: python 54 | 55 | pn.atop_info() 56 | 57 | To disable or enable pnumpy run 58 | 59 | .. code-block:: python 60 | 61 | pn.disable() 62 | pn.enable() 63 | 64 | To cap the number of additional worker threads to 3 run 65 | 66 | .. code-block:: python 67 | 68 | pn.thread_setworkers(3) 69 | 70 | .. _ASV: https://asv.readthedocs.io/en/stable/using.html 71 | 72 | 73 | Additional Functionality 74 | ------------------------ 75 | 76 | PNumPy provides additional routines such as converting a NumPy record array to a column major array in parallel (**pn.recarray_to_colmajor**) which is useful for DataFrames. Other routines include **pn.lexsort32**, which performs an indirect sort using **np.int32** instead of **np.int64** consuming half the memory and running faster. 77 | 78 | Threading 79 | --------- 80 | 81 | PNumPy uses a combination of threads and 256 bit vector intrinsics to speed up calculations. By default most operations will only use 3 additional worker threads in combination with the main python thread for a total 4. Large arrays are divided up into 16K chunks and threads are assigned to maintain cache coherency. More threads are dynamically deployed for more intensive CPU problems like **np.sin**. Users can customize threading. The example below shows how 4 threads can work together to quadruple the effective L2 cache size. 82 | 83 | .. image:: ../images/threading_npadd.PNG 84 | 85 | 86 | FAQ 87 | --- 88 | 89 | **Q: If I type np.sort(a) where a is an array, will it be sped up?** 90 | 91 | *A: If len(a) > 65536 and pnumpy has been imported, it will automatically be sped up* 92 | 93 | **Q: How is sort sped up?** 94 | 95 | *A: PNumPy uses additional threads to divide up the sorting job. For example it might perform an 8 way quicksort followed by a 4 way mergesort* 96 | 97 | Development 98 | ----------- 99 | 100 | To run all the tests run: 101 | 102 | .. 
code-block:: python 103 | 104 | python -m pip install pytest 105 | python -m pytest tests 106 | 107 | 108 | Indices and tables 109 | ================== 110 | 111 | * :ref:`genindex` 112 | * :ref:`modindex` 113 | * :ref:`search` 114 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | # publish when a (published) GitHub Release is created 3 | on: 4 | release: 5 | types: 6 | - published 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels ${{ matrix.os }}, ${{ matrix.platform }} 11 | runs-on: ${{ matrix.os }} 12 | defaults: 13 | run: 14 | shell: bash 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | platform: [x64] 19 | python-version: [3.7] # changed from [3.6, 3.7, 3.8] 20 | exclude: 21 | - os: macos-latest 22 | platform: x32 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | submodules: recursive 27 | - name: Set up python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v2 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | 32 | - name: Install cibuildwheel 33 | run: | 34 | python -m pip install cibuildwheel==1.6.3 35 | - name: Build wheels on ${{ matrix.os}} 36 | run: | 37 | RUNNER_OS="${{ runner.os }}" 38 | PLATFORM="${{ matrix.platform }}" 39 | echo $RUNNER_OS 40 | echo $PLATFORM 41 | if [ "$PLATFORM" == "x64" ]; then 42 | export CIBW_SKIP="cp27-* cp35-* *-win32 *-manylinux_i686 *manylinux_aarch64 *manylinux_ppc64le *manylinux_s390x" 43 | elif [ "$PLATFORM" == "x32" ]; then 44 | export CIBW_SKIP="cp27-* cp35-*" 45 | fi 46 | # to exclude manylinux_aarch64 manylinux_ppc64le manylinux_s390x 47 | if [ "$RUNNER_OS" == "Windows" ]; then 48 | if [ "$PLATFORM" == "x64" ]; then 49 | export CIBW_BUILD="cp37-win_amd64" 50 | elif [ "$PLATFORM" == "x32" ]; then 51 | export CIBW_BUILD="cp37-win32" 52 | fi 53 | elif [ "$RUNNER_OS" == "Linux" ]; then 54 | if [ "$PLATFORM" == "x64" ]; then 55 | export CIBW_BUILD="cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64" 56 | elif [ "$PLATFORM" == "x32" ]; then 57 | export CIBW_BUILD="cp36-manylinux_i686 cp37-manylinux_i686 cp38-manylinux_i686" 58 | fi 59 | elif [ "$RUNNER_OS" == "macOS" ]; then 60 | export CIBW_BUILD="cp37-macosx_x86_64" 61 | fi 62 | python -m cibuildwheel --output-dir wheelhouse 63 | - uses: actions/upload-artifact@v2 64 | with: 65 | path: ./wheelhouse/*.whl 66 | 67 | build_sdist: 68 | name: Build source distribution 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v2 72 | 73 | - uses: actions/setup-python@v2 74 | name: Install Python 75 | with: 76 | python-version: '3.7' 77 | 78 | - name: Build sdist 79 | run: | 80 | python -m pip install numpy>=1.19.0 81 | python setup.py sdist 82 | - uses: actions/upload-artifact@v2 83 | with: 84 | path: dist/*.tar.gz 85 | 86 | upload_pypi: 87 | needs: [build_wheels, build_sdist] 88 | runs-on: ubuntu-latest 89 | # publish when a GitHub Release is created 90 | if: github.event_name == 'release' && github.event.action == 'published' 91 | steps: 92 | - uses: actions/download-artifact@v2 93 | with: 94 | name: artifact 95 | path: dist 96 | 97 | - uses: pypa/gh-action-pypi-publish@master 98 | with: 99 | user: __token__ 100 | password: ${{ secrets.test_pypi }} # switch to non-test pwd after testing 101 | repository_url: https://test.pypi.org/legacy/ # remove line after testing 102 | 
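# Note on the wheel selection in the "Build wheels" step above: with those
# CIBW_BUILD / CIBW_SKIP values, x64 Windows runners build only the cp37
# win_amd64 wheel, x64 Linux runners build cp36/cp37/cp38 manylinux_x86_64
# wheels, and macOS runners build the cp37 macosx_x86_64 wheel; cp27, cp35,
# 32-bit and non-x86 manylinux targets are always skipped.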
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PNumPy 2 | Parallel NumPy seamlessly speeds up NumPy for large arrays (64K+ elements) with *no change required to your existing NumPy code*. 3 | 4 | PNumPy supports Linux, Windows, and MacOS for NumPy >= 1.18 for python 3.6, 3.7, 3.8, and 3.9. 5 | 6 | This first release speeds up NumPy binary and unary ufuncs such as **add, multiply, isnan, abs, sin, log, sum, min and many more**. 7 | Sped up functions also include: **sort, argsort, lexsort, arange, boolean indexing, and fancy indexing**. 8 | In the near future we will speed up: **astype, where, putmask, and searchsorted**. 9 | 10 | Other packages that use numpy, such as [scikit-learn](https://scikit-learn.org/stable/) or [pandas](https://github.com/pandas-dev/pandas), will also be sped up for large arrays. 11 | 12 | [![CI Status](https://github.com/Quansight/pnumpy/workflows/tox/badge.svg)](https://github.com/Quansight/pnumpy/actions) 13 | 14 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 15 | 16 | ## Installation 17 | ``` 18 | pip install pnumpy 19 | ``` 20 | 21 | You can also install the latest development versions with 22 | ``` 23 | pip install https://github.com/Quansight/pnumpy/archive/main.zip 24 | ``` 25 | 26 | ## Documentation 27 | 28 | See the [full documentation](https://quansight.github.io/pnumpy/stable/index.html) 29 | 30 | To use the project: 31 | 32 | ```python 33 | import pnumpy as pn 34 | ``` 35 | 36 | Parallel NumPy speeds up NumPy silently under the hood. To see some benchmarks yourself run 37 | ``` 38 | pn.benchmark() 39 | ``` 40 | ![plot](./doc_src/images/bench4graph2.PNG) 41 | ![plot](./doc_src/images/bench4graph3.PNG) 42 | 43 | To get a partial list of functions sped up run 44 | ``` 45 | pn.atop_info() 46 | ``` 47 | 48 | To disable or enable pnumpy run 49 | ``` 50 | pn.disable() 51 | pn.enable() 52 | ``` 53 | 54 | ## Additional Functionality 55 | PNumPy provides additional routines such as converting a NumPy record array to a column major array in parallel (**pn.recarray_to_colmajor**) which is useful for DataFrames. Other routines include **pn.lexsort32**, which performs an indirect sort using **np.int32** instead of **np.int64** consuming half the memory and running faster. 56 | 57 | ## Threading 58 | PNumPy uses a combination of threads and 256 bit vector intrinsics to speed up calculations. By default most operations will only use 3 additional worker threads in combination with the main python thread for a total 4. Large arrays are divided up into 16K chunks and threads are assigned to maintain cache coherency. More threads are dynamically deployed for more intensive CPU problems like **np.sin**. Users can customize threading. The example below shows how 4 threads can work together to quadruple the effective L2 cache size. 
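For a quick, machine-dependent check on your own system, you can time a ufunc with the worker threads toggled off and on using the `pn.thread_disable()` / `pn.thread_enable()` switches described below. This is only a minimal sketch: the array size and iteration count are arbitrary choices, and the measured ratio will vary with CPU, core count, and cache sizes.

```python
import timeit
import numpy as np
import pnumpy as pn

a = np.arange(1_000_000, dtype=np.float64)
b = a.copy()

pn.thread_disable()                      # baseline: main thread only
t_single = timeit.timeit(lambda: np.add(a, b), number=200)

pn.thread_enable()                       # default: worker threads + main thread
t_threaded = timeit.timeit(lambda: np.add(a, b), number=200)

print(f"single: {t_single:.3f}s  threaded: {t_threaded:.3f}s  "
      f"speedup: {t_single / t_threaded:.2f}x")
```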
59 | 60 | ![plot](./doc_src/images/threading_npadd.PNG) 61 | 62 | To cap the number of additional worker threads to 3 run 63 | ``` 64 | pn.thread_setworkers(3) 65 | ``` 66 | 67 | To disable or re-enable threading run 68 | ``` 69 | pn.thread_disable() 70 | pn.thread_enable() 71 | ``` 72 | 73 | To disable or re-enable just the atop engine run 74 | ``` 75 | pn.atop_disable() 76 | pn.atop_enable() 77 | ``` 78 | 79 | ## FAQ 80 | **Q: If I type np.sort(a) where a is an array, will it be sped up?** 81 | 82 | *A: If len(a) > 65536 and pnumpy has been imported, it will automatically be sped up* 83 | 84 | **Q: How is sort sped up?** 85 | 86 | *A: PNumPy uses additional threads to divide up the sorting job. For example it might perform an 8 way quicksort followed by a 4 way mergesort* 87 | 88 | **Q: How is scikit or pandas sped up?** 89 | 90 | *A: PNumPy's vector loops and threads will speed up any package that uses large NumPy arrays* 91 | 92 | ## Development 93 | 94 | To run all the tests run: 95 | 96 | ``` 97 | python -m pip install pytest 98 | python -m pytest tests 99 | ``` 100 | -------------------------------------------------------------------------------- /src/pnumpy/cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | import glob 5 | 6 | import pnumpy._pnumpy as _pnumpy 7 | from pnumpy._pnumpy import atop_enable, atop_disable, atop_isenabled, atop_info, atop_setworkers, cpustring 8 | from pnumpy._pnumpy import thread_enable, thread_disable, thread_isenabled, thread_getworkers, thread_setworkers, thread_zigzag 9 | 10 | __all__ = [ 11 | 'cpu_count_linux', 'init', 'enable', 'disable'] 12 | 13 | # NOTE: code adapted from psinfo 14 | 15 | def open_binary(fname, **kwargs): 16 | return open(fname, "rb", **kwargs) 17 | 18 | def cpu_physical_linux(): 19 | """Return the number of physical cores in the system. 20 | None may be returned on failure. 21 | """ 22 | # Method #1 23 | ls = set() 24 | # These 2 files are the same but */core_cpus_list is newer while 25 | # */thread_siblings_list is deprecated and may disappear in the future. 26 | # https://www.kernel.org/doc/Documentation/admin-guide/cputopology.rst 27 | # https://github.com/giampaolo/psutil/pull/1727#issuecomment-707624964 28 | # https://lkml.org/lkml/2019/2/26/41 29 | p1 = "/sys/devices/system/cpu/cpu[0-9]*/topology/core_cpus_list" 30 | p2 = "/sys/devices/system/cpu/cpu[0-9]*/topology/thread_siblings_list" 31 | for path in glob.glob(p1) or glob.glob(p2): 32 | with open_binary(path) as f: 33 | ls.add(f.read().strip()) 34 | result = len(ls) 35 | if result != 0: 36 | return result 37 | 38 | # Method #2 39 | mapping = {} 40 | current_info = {} 41 | with open_binary('/proc/cpuinfo') as f: 42 | for line in f: 43 | line = line.strip().lower() 44 | if not line: 45 | # new section 46 | try: 47 | mapping[current_info[b'physical id']] = \ 48 | current_info[b'cpu cores'] 49 | except KeyError: 50 | pass 51 | current_info = {} 52 | else: 53 | # ongoing section 54 | if line.startswith((b'physical id', b'cpu cores')): 55 | key, value = line.split(b'\t:', 1) 56 | current_info[key] = int(value) 57 | 58 | result = sum(mapping.values()) 59 | return result or None # mimic os.cpu_count() 60 | 61 | def cpu_count_linux(): 62 | """ 63 | Return the number of logical CPUs and physical cores. 64 | None may be returned on failure. 
65 | """ 66 | try: 67 | num= os.sysconf("SC_NPROCESSORS_ONLN") 68 | except ValueError: 69 | # as a second fallback we try to parse /proc/cpuinfo 70 | num = 0 71 | with open_binary('/proc/cpuinfo') as f: 72 | for line in f: 73 | if line.lower().startswith(b'processor'): 74 | num += 1 75 | 76 | # try to parse /proc/stat as a last resort 77 | if num == 0: 78 | search = re.compile(r'cpu\d') 79 | with open_text('/proc/stat') as f: 80 | for line in f: 81 | line = line.split(' ')[0] 82 | if search.match(line): 83 | num += 1 84 | 85 | if num == 0: 86 | # mimic os.cpu_count() 87 | num=None 88 | return num, cpu_physical_linux() 89 | 90 | def init(): 91 | """ 92 | Called at load time to start the atop and threading engines. 93 | 94 | Parameters 95 | ---------- 96 | None 97 | 98 | See Also 99 | -------- 100 | pn.enable 101 | pn.disable 102 | """ 103 | 104 | import platform 105 | if platform.system() == 'Linux': 106 | logical,physical = cpu_count_linux() 107 | _pnumpy.initialize() 108 | else: 109 | _pnumpy.initialize() 110 | 111 | def enable(): 112 | """ 113 | Call to enable the atop engine, use threads, and hook numpy functions. 114 | 115 | Parameters 116 | ---------- 117 | None 118 | 119 | Returns 120 | ------- 121 | None 122 | 123 | See Also 124 | -------- 125 | pn.disable 126 | pn.atop_info 127 | """ 128 | atop_enable() 129 | thread_enable() 130 | 131 | def disable(): 132 | """ 133 | Call to disable the atop engine, stop any threads, and unhook numpy functions. 134 | 135 | Parameters 136 | ---------- 137 | None 138 | 139 | Returns 140 | ------- 141 | None 142 | 143 | See Also 144 | -------- 145 | pn.enable 146 | pn.atop_info 147 | """ 148 | atop_disable() 149 | thread_disable() 150 | 151 | -------------------------------------------------------------------------------- /_add_newdocs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add docstrings to c-extension modules: 3 | Will create a header file for each module in the current directory, and 4 | fill it with the docstrings. 5 | """ 6 | from collections import defaultdict 7 | import io 8 | import os 9 | srcdir = os.path.join(os.path.dirname(__file__), 'src', 'pnumpy') 10 | 11 | def append_header(fid, function, docstring): 12 | key = function.upper() + "_DOC" 13 | docstring = docstring.replace('"', '\\"') 14 | docstring = docstring.replace('\n', '"\n"') 15 | fid.write("\n") 16 | fid.write(f'static char {key}[] = "{docstring}";') 17 | 18 | headers = defaultdict(io.StringIO) 19 | 20 | def add_newdoc(module, function, docstring): 21 | fid = headers[module.upper() + '.h'] 22 | append_header(fid, function, docstring) 23 | 24 | 25 | add_newdoc('pnumpy', 'initialize', 26 | """ 27 | Initialize the module. Replaces all the ufunc inner loops with a new version 28 | using ``PyUFunc_ReplaceLoopBySignature``. If none of the other options are 29 | enabled, the original inner loop function will be called. Will also call 30 | ``numpy.setbufsize(8192 * 1024)`` to work around numpy issue 17649. 31 | """) 32 | 33 | 34 | add_newdoc('pnumpy', 'atop_enable', 35 | """ 36 | enable the atop inner loop implementations. 37 | """) 38 | 39 | 40 | add_newdoc('pnumpy', 'atop_disable', 41 | """ 42 | disable the atop inner loop implementations. 43 | """) 44 | 45 | 46 | add_newdoc('pnumpy', "atop_isenabled", 47 | "returns True if atop enabled, else False") 48 | 49 | 50 | add_newdoc('pnumpy', "thread_enable", 51 | """ 52 | Enable worker threads for inner loops when they are large enough to justify 53 | the extra overhead. 
54 | """) 55 | 56 | 57 | add_newdoc('pnumpy', "thread_disable", 58 | "Disable worker threads") 59 | 60 | 61 | add_newdoc('pnumpy', "thread_isenabled", 62 | "Returns True if worker threads enabled else False") 63 | 64 | 65 | add_newdoc('pnumpy', "thread_getworkers", 66 | "Get the number of worker threads") 67 | 68 | 69 | add_newdoc('pnumpy', "thread_setworkers", 70 | "Set the number of worker threads, return previous value. Must be at least 1.") 71 | 72 | 73 | add_newdoc('pnumpy', "timer_gettsc", 74 | "Get the time stamp counter") 75 | 76 | 77 | add_newdoc('pnumpy', "timer_getutc", 78 | "Get the time in utc nanos since unix epoch") 79 | 80 | 81 | add_newdoc('pnumpy', "cpustring", 82 | "Cpu brand string plus features") 83 | 84 | 85 | add_newdoc('pnumpy', "oldinit", 86 | "old, deprecated") 87 | 88 | 89 | add_newdoc('pnumpy', "ledger_enable", 90 | """ 91 | Enable ledger debuggging. This collects statistics on each run of a loop: 92 | input signature and dimensions, time to execute the loop and more 93 | """) 94 | 95 | 96 | add_newdoc('pnumpy', "ledger_disable", 97 | "Disable ledger") 98 | 99 | 100 | add_newdoc('pnumpy', "ledger_isenabled", 101 | "Returns True if ledger enabled else False") 102 | 103 | 104 | add_newdoc('pnumpy', "ledger_info", 105 | "Return ledger information") 106 | 107 | add_newdoc('pnumpy', 'recarray_to_colmajor', 108 | ("Converts a numpy record array (void type) to a dictionary of numpy arrays, col major\n" 109 | "Inputs\n" 110 | "------\n" 111 | "item: A numpy recorarray to return as column major\n" 112 | "parallel: Default to True\n" 113 | "\n" 114 | "Returns\n" 115 | "-------\n" 116 | "A dictionary of numpy arrays corresponding to the original numpy record array.\n" 117 | "\n" 118 | "Examples\n" 119 | "--------\n" 120 | ">>> x=np.array([(1.0, 2, 3, 4, 5, 'this is a long test'), (3.0, 4, 5, 6, 7, 'short'), (30.0, 40, 50, 60, 70, '')],\n" 121 | " dtype=[('x', '>> item=np.tile(x,100_000)\n" 123 | ">>> mydict = recarray_to_colmajor(item)" 124 | )) 125 | 126 | add_newdoc('pnumpy', "recycler_enable", 127 | "Enable recycler to compact memory usage") 128 | 129 | 130 | add_newdoc('pnumpy', "recycler_disable", 131 | "Disable recycler") 132 | 133 | 134 | add_newdoc('pnumpy', "recycler_isenabled", 135 | "Returns True if recycler enabled else False") 136 | 137 | 138 | add_newdoc('pnumpy', "recycler_info", 139 | "Return recycler information") 140 | 141 | # Rewrite any of the headers that changed 142 | 143 | def main(): 144 | for k, v in headers.items(): 145 | txt2 = '' 146 | target = os.path.join(srcdir, k) 147 | txt1 = v.getvalue() 148 | if os.path.exists(target): 149 | with open(target) as fid: 150 | txt2 = fid.read() 151 | if txt1 != txt2: 152 | print('writing', target) 153 | with open(target, 'w') as fid: 154 | fid.write(txt1) 155 | 156 | if __name__ == "__main__": 157 | main() 158 | -------------------------------------------------------------------------------- /src/atop/fill.cpp: -------------------------------------------------------------------------------- 1 | #include "common_inc.h" 2 | #include "threads.h" 3 | #include "halffloat.h" 4 | #include 5 | #include 6 | 7 | //#define LOGGING printf 8 | #define LOGGING(...) 
9 | 10 | // NOTES 11 | // FillZeros calls 12 | // PyArray_AssignRawScalar 13 | // raw_array_assign_scalar 14 | // which then calls PyArray_GetDTypeTransferFunction 15 | // /* Process the innermost dimension */ 16 | //stransfer(dst_data, dst_strides_it[0], src_data, 0, 17 | // shape_it[0], src_itemsize, transferdata); 18 | 19 | ////================================================================================ 20 | typedef int(*ARANGE_FILL)(char* pBufferV, void* pFirstV, void* pNextValueV, int64_t start, int64_t length); 21 | 22 | // vector code disabled for now (does not seem much faster) 23 | //template 24 | //static int 25 | //ArangeFillTypeInt32(char* pBufferV, void* pFirstV, void* pNextValueV, int64_t start, int64_t length) 26 | //{ 27 | // //printf("int32 fill\n"); 28 | // TYPE* pBuffer = (TYPE*)pBufferV; 29 | // 30 | // // The start/next are stored by numpy in first two values of array 31 | // TYPE first = *(TYPE*)pFirstV; 32 | // TYPE delta = *(TYPE*)pNextValueV; 33 | // 34 | // delta -= first; 35 | // 36 | // __m256i mstart = _mm256_set_epi32(7,6,5,4,3,2,1,0); 37 | // __m256i madd= _mm256_set1_epi32(sizeof(__m256i)/sizeof(TYPE)); // 8 38 | // __m256i mdelta = _mm256_set1_epi32((int32_t)delta); 39 | // madd = _mm256_mullo_epi32(madd, mdelta); 40 | // mstart = _mm256_add_epi32(mstart, _mm256_set1_epi32((int32_t)start)); 41 | // mstart = _mm256_mullo_epi32(mstart, mdelta); 42 | // 43 | // __m256i* pDest = (__m256i*)(pBuffer + start); 44 | // __m256i* pDestEnd = pDest + (length - start) / 8; 45 | // 46 | // while (pDest != pDestEnd) { 47 | // _mm256_storeu_si256(pDest, mstart); 48 | // mstart = _mm256_add_epi32(mstart, madd); 49 | // pDest++; 50 | // } 51 | // 52 | // start = start + length - (length & 7); 53 | // for (int64_t i = start; i < length; i++) { 54 | // pBuffer[i] = (TYPE)(first + i * delta); 55 | // } 56 | // 57 | // return 0; 58 | //} 59 | 60 | 61 | template 62 | static int 63 | ArangeFillType(char *pBufferV, void* pFirstV, void* pNextValueV, int64_t start, int64_t length) 64 | { 65 | TYPE* pBuffer = (TYPE*)pBufferV; 66 | 67 | // The start/next are stored by numpy in first two values of array 68 | TYPE first = *(TYPE*)pFirstV; 69 | TYPE delta = *(TYPE*)pNextValueV; 70 | 71 | delta -= first; 72 | 73 | // TOOD: vectorize this code 74 | for (int64_t i = start; i < length; i++) { 75 | pBuffer[i] = (TYPE)(first + i * delta); 76 | } 77 | // Path below is slower 78 | //TYPE* pBufferEnd = pBuffer + length; 79 | //while (pBuffer < pBufferEnd) { 80 | // *pBuffer++ = start; 81 | // start += delta; 82 | //} 83 | return 0; 84 | } 85 | 86 | ARANGE_FILL g_ArangeFill[ATOP_LAST] = { 87 | NULL, //ArangeFillType, 88 | ArangeFillType, ArangeFillType, 89 | ArangeFillType, ArangeFillType, 90 | ArangeFillType, ArangeFillType, 91 | ArangeFillType, ArangeFillType, 92 | NULL, NULL, //int128 93 | NULL, ArangeFillType, ArangeFillType, ArangeFillType, 94 | NULL, NULL, NULL, NULL, // Complex 95 | NULL, NULL, NULL // String, unicode, void 96 | }; 97 | 98 | extern "C" int ArangeFill( 99 | int atype, 100 | char* pBuffer, 101 | void* pFirstValue, 102 | void* pSecondValue, 103 | int64_t length, 104 | int32_t threadwakeup) { 105 | 106 | ARANGE_FILL pArangeFill = g_ArangeFill[atype]; 107 | 108 | // check if we have the routine 109 | if (pArangeFill) { 110 | 111 | // Multithreaded callback 112 | struct ArangeCallbackStruct { 113 | ARANGE_FILL pArangeFill; 114 | char* pBuffer; 115 | void* pFirstValue; 116 | void* pSecondValue; 117 | int64_t length; 118 | } stArangeCallback{ pArangeFill, pBuffer, pFirstValue, 
pSecondValue, length }; 119 | 120 | // This is the routine that will be called back from multiple threads 121 | auto lambdaArangeCallback = [](void* callbackArgT, int core, int64_t start, int64_t length) -> int64_t { 122 | LOGGING("[%d] Arange %lld %lld\n", core, start, length); 123 | ArangeCallbackStruct* cb=(ArangeCallbackStruct * )callbackArgT; 124 | cb->pArangeFill(cb->pBuffer, cb->pFirstValue, cb->pSecondValue, start, start + length); 125 | return 1; 126 | }; 127 | 128 | THREADER->DoMultiThreadedChunkWork(length, lambdaArangeCallback, &stArangeCallback, threadwakeup); 129 | return 0; 130 | } 131 | // fail 132 | return -1; 133 | } 134 | 135 | -------------------------------------------------------------------------------- /src/pnumpy/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Python.h" 3 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 4 | // NOTE: See PY_ARRAY_UNIQUE_SYMBOL 5 | // If this is not included, calling PY_ARRAY functions will have a null value 6 | #define PY_ARRAY_UNIQUE_SYMBOL sharedata_ARRAY_API 7 | 8 | #ifndef SHAREDATA_MAIN_C_FILE 9 | #define NO_IMPORT_ARRAY 10 | #endif 11 | 12 | #include "numpy/ndarrayobject.h" 13 | #include "numpy/ufuncobject.h" 14 | #include 15 | #include 16 | #include "../atop/atop.h" 17 | 18 | 19 | int dtype_to_atop(int dtype); 20 | 21 | // Global user settings controlled by python functions 22 | // set to 0 to disable 23 | struct stSettings { 24 | int32_t AtopEnabled; 25 | int32_t LedgerEnabled; 26 | int32_t RecyclerEnabled; 27 | int32_t ZigZag; // set to 0 to disable 28 | int32_t Initialized; 29 | int32_t Reserved; 30 | binaryfunc NumpyGetItem; // optional hook 31 | }; 32 | 33 | extern stSettings g_Settings; 34 | 35 | struct stUFuncToAtop { 36 | const char* str_ufunc_name; 37 | const int atop_op; 38 | }; 39 | 40 | enum OP_CATEGORY:int32_t { 41 | OPCAT_BINARY = 0, 42 | OPCAT_UNARY = 1, 43 | OPCAT_COMPARE = 2, 44 | OPCAT_TRIG = 3, 45 | OPCAT_CONVERT = 4, 46 | OPCAT_SORT = 5, 47 | OPCAT_ARGSORT = 6, 48 | OPCAT_ARANGE = 7, 49 | OPCAT_ARGMINMAX = 8, 50 | OPCAT_LAST = 9, 51 | }; 52 | 53 | struct stOpCategory { 54 | const char* StrName; 55 | int32_t NumOps; 56 | OP_CATEGORY CatEnum; // 57 | stUFuncToAtop* pUFuncToAtop; 58 | }; 59 | 60 | 61 | //--------------------------------------------------------------------- 62 | // NOTE: See SDSArrayInfo and keep same 63 | struct ArrayInfo { 64 | 65 | // Numpy object 66 | PyArrayObject* pObject; 67 | 68 | // First bytes 69 | char* pData; 70 | 71 | // Width in bytes of one row 72 | int64_t ItemSize; 73 | 74 | // total number of items 75 | int64_t ArrayLength; 76 | 77 | int64_t NumBytes; 78 | 79 | int NumpyDType; 80 | int NDim; 81 | 82 | // When calling ensure contiguous, we might make a copy 83 | // if so, pObject is the copy and must be deleted. 
pOriginal was passed in 84 | PyArrayObject* pOriginalObject; 85 | 86 | }; 87 | 88 | extern void* GetDefaultForType(int numpyInType); 89 | extern int64_t CalcArrayLength(int ndim, npy_intp* dims); 90 | extern int64_t ArrayLength(PyArrayObject* inArr); 91 | extern PyArrayObject* AllocateNumpyArray(int ndim, npy_intp* dims, int32_t numpyType, int64_t itemsize = 0, int fortran_array = 0, npy_intp* strides = nullptr); 92 | extern PyArrayObject* AllocateLikeResize(PyArrayObject* inArr, npy_intp rowSize); 93 | extern PyArrayObject* AllocateLikeNumpyArray(PyArrayObject* inArr, int numpyType); 94 | extern BOOL ConvertScalarObject(PyObject* inObject1, void* pDest, int16_t numpyOutType, void** ppDataIn, int64_t* pItemSize); 95 | extern int GetStridesAndContig(PyArrayObject* inArray, int& ndim, int64_t& stride); 96 | 97 | // defined in pnumpy 98 | extern stOpCategory gOpCategory[OPCAT_LAST]; 99 | 100 | extern void LedgerRecord(int32_t op_category, int64_t start_time, int64_t end_time, char** args, const npy_intp* dimensions, const npy_intp* steps, void* innerloop, int funcop, int atype); 101 | extern void LedgerRecord2(int32_t op_category, int64_t start_time, int64_t end_time, int atype, int64_t length); 102 | extern void LedgerInit(); 103 | extern int64_t CalcArrayLength(int ndim, npy_intp* dims); 104 | extern int64_t ArrayLength(PyArrayObject* inArr); 105 | extern PyArrayObject* AllocateNumpyArray(int ndim, npy_intp* dims, int32_t numpyType, int64_t itemsize, int fortran_array, npy_intp* strides); 106 | extern PyArrayObject* AllocateLikeResize(PyArrayObject* inArr, npy_intp rowSize); 107 | extern PyArrayObject* AllocateLikeNumpyArray(PyArrayObject* inArr, int numpyType); 108 | extern ArrayInfo* BuildArrayInfo( 109 | PyObject* listObject, 110 | int64_t* pTupleSize, 111 | int64_t* pTotalItemSize, 112 | BOOL checkrows = TRUE, 113 | BOOL convert = TRUE); 114 | 115 | extern void FreeArrayInfo(ArrayInfo* pAlloc); 116 | 117 | extern PyObject* BooleanIndexInternal(PyArrayObject* aValues, PyArrayObject* aIndex); 118 | extern "C" PyObject *getitem(PyObject * self, PyObject * args); 119 | 120 | #define RETURN_NONE Py_INCREF(Py_None); return Py_None; 121 | #define RETURN_FALSE Py_XINCREF(Py_False); return Py_False; 122 | #define RETURN_TRUE Py_XINCREF(Py_True); return Py_True; 123 | 124 | #define IS_BINARY_REDUCE ((args[0] == args[2])\ 125 | && (steps[0] == steps[2])\ 126 | && (steps[0] == 0)) 127 | 128 | extern PyTypeObject* pPyArray_Type; 129 | 130 | #if defined(_WIN32) && !defined(__GNUC__) 131 | 132 | #define CASE_NPY_INT32 case NPY_INT32: case NPY_INT 133 | #define CASE_NPY_UINT32 case NPY_UINT32: case NPY_UINT 134 | #define CASE_NPY_INT64 case NPY_INT64 135 | #define CASE_NPY_UINT64 case NPY_UINT64 136 | #define CASE_NPY_FLOAT64 case NPY_DOUBLE: case NPY_LONGDOUBLE 137 | 138 | #else 139 | 140 | #define CASE_NPY_INT32 case NPY_INT32 141 | #define CASE_NPY_UINT32 case NPY_UINT32 142 | #define CASE_NPY_INT64 case NPY_INT64: case NPY_LONGLONG 143 | #define CASE_NPY_UINT64 case NPY_UINT64: case NPY_ULONGLONG 144 | #define CASE_NPY_FLOAT64 case NPY_DOUBLE 145 | #endif 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/pnumpy/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | __all__ = [ 5 | 'benchmark','benchmark_func'] 6 | 7 | from pnumpy._pnumpy import atop_enable, atop_disable, atop_isenabled 8 | from pnumpy._pnumpy import thread_enable, thread_disable, thread_isenabled 9 | 
from pnumpy._pnumpy import timer_gettsc, timer_getutc 10 | 11 | import numpy as np 12 | 13 | # TODO: move this to new location 14 | def benchmark_timeit( 15 | func=np.equal, 16 | ctypes=[np.bool_, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64], 17 | scalar=False, 18 | unary = False, 19 | reduct = False, 20 | outdtype=None, 21 | recycle=True, 22 | sizes=[1_000_000]): 23 | ''' 24 | Internal routine to benchmark a function. 25 | 26 | ''' 27 | 28 | def time_func(recycle, c): 29 | if unary is False: 30 | starttime = timer_gettsc() 31 | if recycle: 32 | result=func(a,b,out=c) 33 | else: 34 | result=func(a,b) 35 | delta= timer_gettsc() - starttime 36 | 37 | else: 38 | starttime = timer_gettsc() 39 | if reduct is True: 40 | result=func(a) 41 | else: 42 | if recycle: 43 | result=func(a,out=c) 44 | else: 45 | result=func(a) 46 | 47 | delta= timer_gettsc() - starttime 48 | return delta, result 49 | 50 | timedelta = np.zeros(len(ctypes), np.int64) 51 | 52 | for s in sizes: 53 | slot = 0 54 | loop_size = 100 55 | mtimedelta = np.zeros(loop_size, np.int64) 56 | for ctype in ctypes: 57 | if ctype is np.bool_: 58 | a=np.arange(s, dtype=np.int8).astype(ctype)+1 59 | else: 60 | a=np.arange(s, dtype=ctype) 61 | a=a % 253 62 | a+=1 63 | 64 | if scalar is True: 65 | b=a[5] 66 | else: 67 | b=a.copy() 68 | 69 | # dry run 70 | delta, c=time_func(False, None) 71 | 72 | # main timing loop 73 | for loop in range(loop_size): 74 | delta, result = time_func(recycle, c) 75 | del result 76 | 77 | mtimedelta[loop] = delta 78 | 79 | timedelta[slot] = np.median(mtimedelta) 80 | # print("median is ", timedelta[slot], slot) 81 | slot = slot + 1 82 | return timedelta 83 | 84 | 85 | def benchmark_func( 86 | func=np.equal, 87 | ctypes=[np.bool_, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64], 88 | scalar=False, 89 | unary = False, 90 | reduct = False, 91 | outdtype=None, 92 | recycle=True, 93 | atop=True, 94 | thread=True, 95 | sizes=[1_000_000]): 96 | ''' 97 | Benchmark one function. 98 | 99 | Examples 100 | -------- 101 | 102 | benchmark_func(np.add) 103 | benchmark_func(np.add, sizes=[2**16]) 104 | benchmark_func(np.sqrt, unary=True) 105 | ''' 106 | # disable atop and threading 107 | atop_disable() 108 | thread_disable() 109 | # get original time 110 | t0=benchmark_timeit(func=func, ctypes=ctypes, scalar=scalar, unary=unary, reduct=reduct, outdtype=outdtype, recycle=recycle, sizes=sizes) 111 | 112 | # now possibly enable atop and threading 113 | if atop: 114 | atop_enable() 115 | if thread: 116 | thread_enable() 117 | t1=benchmark_timeit(func=func, ctypes=ctypes, scalar=scalar, unary=unary, reduct=reduct, outdtype=outdtype, recycle=recycle, sizes=sizes) 118 | return t0/t1 119 | 120 | def benchmark( 121 | ctypes=[np.bool_, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64], 122 | recycle=True, 123 | atop=True, 124 | thread=True, 125 | sizes=[1_000_000]): 126 | ''' 127 | Performs a simple benchmark of the ratio of normal numpy (no threading) vs parallel numpy (threaded). 128 | The output is formatted to be copied and pasted in a csv file. 129 | A result above 1.0 indicates an improvement, below 1.0 indicates worse peformance. 130 | 131 | Parameters 132 | ---------- 133 | ctypes :list of numpy dtypes to test, for example [np.int32, npfloat32] 134 | recycle: True or False 135 | atop: True or False. Whether or not the atop engine is used in benchmarking. 136 | thread: True or False. 
137 | sizes: list of array sizes to test, for example [100_000, 1_000_000] 138 | 139 | Returns 140 | ------- 141 | output text formatted for a .csv file 142 | 143 | Examples 144 | -------- 145 | pn.benchmark() 146 | pn.benchmark(thread=False) 147 | pn.benchmark(sizes=[2**16]) 148 | pn.benchmark(ctypes=[np.float32, np.float64]) 149 | ''' 150 | 151 | def ctype_string(ct): 152 | s=f'{sizes[0]} rows,' 153 | for i in ct: 154 | s=s+f'{i.__name__},' 155 | return s 156 | 157 | def output_data(rowname, data): 158 | s=f'{rowname},' 159 | for i in data: 160 | s=s+f'{i:5.2f},' 161 | print(s) 162 | 163 | print(ctype_string(ctypes)) 164 | output_data("a==b", benchmark_func(np.equal, ctypes=ctypes, scalar=False, unary=False, recycle=recycle, atop=atop, thread=thread, outdtype='?', sizes=sizes)) 165 | output_data("a==5", benchmark_func(np.equal, ctypes=ctypes, scalar=True, unary=False, recycle=recycle, atop=atop, thread=thread, outdtype='?', sizes=sizes)) 166 | output_data("a+b", benchmark_func(np.add, ctypes=ctypes, scalar=False, unary=False,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 167 | output_data("a+5", benchmark_func(np.add, ctypes=ctypes, scalar=True, unary=False,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 168 | output_data("a/5", benchmark_func(np.true_divide, ctypes=ctypes, scalar=True, unary=False,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 169 | output_data("abs", benchmark_func(np.abs, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 170 | output_data("isnan", benchmark_func(np.isnan, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 171 | output_data("sin", benchmark_func(np.sin, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 172 | output_data("log", benchmark_func(np.log, ctypes=ctypes, scalar=False, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 173 | output_data("sum", benchmark_func(np.sum, ctypes=ctypes, scalar=False, reduct=True, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 174 | output_data("min", benchmark_func(np.min, ctypes=ctypes, scalar=False, reduct=True, unary=True,recycle=recycle, atop=atop, thread=thread, sizes=sizes)) 175 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | 6 | import io 7 | import os 8 | import platform 9 | from glob import glob 10 | from os.path import basename 11 | from os.path import dirname 12 | from os.path import join 13 | from os.path import relpath 14 | from os.path import splitext 15 | 16 | from setuptools import Extension 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | import numpy as np 20 | 21 | try: 22 | from setuptools_scm import get_version 23 | except Exception: 24 | try: 25 | import pip 26 | package='setuptools_scm' 27 | if hasattr(pip, 'main'): 28 | pip.main(['install', package]) 29 | else: 30 | pip._internal.main(['install', package]) 31 | from setuptools_scm import get_version 32 | except Exception: 33 | print("**could not install pip or setuptools_scm, version is defaulted") 34 | 35 | def myversion(): 36 | version = '2.0.23' 37 | try: 38 | mversion = get_version() 39 | s = mversion.split('.') 40 | if len(s) >=3: 41 | 
# see if we can parse the current version 42 | if int(s[0])==2 and int(s[1])==0: 43 | version = '2.0.' 44 | lastnum = s[2] 45 | for i in lastnum: 46 | if i >='0' and i <= '9': 47 | version = version + i 48 | except Exception: 49 | pass 50 | return version 51 | 52 | thisversion=myversion() 53 | 54 | def writeversion(): 55 | text_file = open("src/pnumpy/_version.py", "w") 56 | strver = f"__version__='{thisversion}'" 57 | n = text_file.write(strver) 58 | text_file.close() 59 | return thisversion 60 | 61 | # Enable code coverage for C code: we can't use CFLAGS=-coverage in tox.ini, since that may mess with compiling 62 | # dependencies (e.g. numpy). Therefore we set SETUP_PY_EXT_COVERAGE after deps have been safely installed). 63 | if os.environ.get('SETUP_PY_EXT_COVERAGE') == 'yes' and platform.system() == 'Linux': 64 | CFLAGS = os.environ['CFLAGS'] = '-fprofile-arcs -ftest-coverage -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' 65 | LFLAGS = os.environ['LFLAGS'] = '-lgcov' 66 | else: 67 | CFLAGS = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' 68 | LFLAGS = '' 69 | 70 | if platform.system() == 'Windows': 71 | CFLAGS += ' /Ox /Ob2 /Oi /Ot /d2FH4- /GS- /arch:AVX2' 72 | else: 73 | CFLAGS += ' -mavx2 -fpermissive -Wno-unused-variable -Wno-unused-function -std=c++11 -pthread -falign-functions=32' 74 | 75 | if platform.system() == 'Linux': 76 | LFLAGS += ' -lm' 77 | 78 | def read(*names, **kwargs): 79 | with io.open( 80 | join(dirname(__file__), *names), 81 | encoding=kwargs.get('encoding', 'utf8') 82 | ) as fh: 83 | return fh.read() 84 | 85 | 86 | with open("README.md") as readme: 87 | long_description = readme.read() 88 | 89 | import _add_newdocs 90 | _add_newdocs.main() 91 | 92 | setup( 93 | name='pnumpy', 94 | #version=get_git_version(), #'0.0.0', 95 | version=writeversion(), 96 | license='MIT', 97 | description='Faster loops for NumPy using multithreading and other tricks', 98 | long_description=long_description, 99 | long_description_content_type="text/markdown", 100 | author='Quansight', 101 | author_email='info@quansight.com', 102 | url='https://quansight.github.io/pnumpy/stable/index.html', 103 | packages=find_packages('src'), 104 | package_dir={'': 'src'}, 105 | py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], 106 | include_package_data=True, 107 | zip_safe=False, 108 | classifiers=[ 109 | # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers 110 | "Development Status :: 4 - Beta", 111 | 'Intended Audience :: Developers', 112 | 'License :: OSI Approved :: MIT License', 113 | 'Operating System :: Unix', 114 | 'Operating System :: POSIX', 115 | 'Operating System :: Microsoft :: Windows', 116 | 'Programming Language :: Python', 117 | 'Programming Language :: Python :: 3', 118 | 'Programming Language :: Python :: 3.6', 119 | 'Programming Language :: Python :: 3.7', 120 | 'Programming Language :: Python :: 3.8', 121 | 'Programming Language :: Python :: 3.9', 122 | 'Programming Language :: Python :: Implementation :: CPython', 123 | 'Programming Language :: Python :: Implementation :: PyPy', 124 | 'Topic :: Utilities', 125 | ], 126 | project_urls={ 127 | 'Changelog': 'https://github.com/Quansight/pnumpy/blob/master/CHANGELOG.rst', 128 | 'Issue Tracker': 'https://github.com/Quansight/pnumpy/issues', 129 | }, 130 | keywords=[ 131 | # eg: 'keyword1', 'keyword2', 'keyword3', 132 | ], 133 | #setup_requires=['setuptools_scm'], 134 | #use_scm_version = { 135 | # 'version_scheme': 'post-release', 136 | # 'local_scheme': 'no-local-version', 137 | # 
'write_to': 'src/pnumpy/_version.py', 138 | # 'write_to_template': '__version__ = "{version}"', 139 | #}, 140 | python_requires='>=3.6', 141 | install_requires=[ 142 | # eg: 'aspectlib==1.1.1', 'six>=1.7', 143 | 'numpy>=1.18.0', # has ufunc hooks 144 | ], 145 | extras_require={ 146 | # eg: 147 | # 'rst': ['docutils>=0.11'], 148 | # ':python_version=="2.6"': ['argparse'], 149 | }, 150 | ext_modules=[ 151 | Extension( 152 | 'pnumpy._pnumpy', 153 | sources=['src/pnumpy/_pnumpy.cpp', 154 | 'src/pnumpy/module_init.cpp', 155 | 'src/pnumpy/common.cpp', 156 | 'src/pnumpy/ledger.cpp', 157 | 'src/pnumpy/getitem.cpp', 158 | 'src/pnumpy/conversions.cpp', 159 | 'src/pnumpy/recycler.cpp', 160 | 'src/pnumpy/sorting.cpp', 161 | #'src/pnumpy/arange.cpp', 162 | #'src/pnumpy/item_selection.cpp', 163 | 'src/atop/atop.cpp', 164 | 'src/atop/threads.cpp', 165 | 'src/atop/recarray.cpp', 166 | 'src/atop/sort.cpp', 167 | 'src/atop/fill.cpp', 168 | 'src/atop/ops_binary.cpp', 169 | 'src/atop/ops_compare.cpp', 170 | 'src/atop/ops_unary.cpp', 171 | 'src/atop/ops_trig.cpp', 172 | 'src/atop/ops_log.cpp', 173 | ], 174 | extra_compile_args=CFLAGS.split(), 175 | extra_link_args=LFLAGS.split(), 176 | include_dirs=['src/pnumpy', 'src/atop', np.get_include()], 177 | py_limited_api=True, 178 | ) 179 | ], 180 | ) 181 | -------------------------------------------------------------------------------- /src/pnumpy/module_init.cpp: -------------------------------------------------------------------------------- 1 | #define SHAREDATA_MAIN_C_FILE 2 | #include "common.h" 3 | #include "PNUMPY.h" 4 | //extern "C" void** PyArray_API; 5 | PyTypeObject* pPyArray_Type = NULL; 6 | 7 | /* 8 | * Some C++ compilers do not like mixin non-designated-initializers 9 | * like PyModuleDef_HEAD_INIT with designated-initializers like 10 | * .m_doc, so break this part out into a C file 11 | */ 12 | 13 | 14 | extern "C" PyObject* oldinit(PyObject *self, PyObject *args, PyObject *kwargs); 15 | extern "C" PyObject* newinit(PyObject* self, PyObject* args, PyObject* kwargs); 16 | extern "C" PyObject* atop_enable(PyObject * self, PyObject * args); 17 | extern "C" PyObject* atop_disable(PyObject * self, PyObject * args); 18 | extern "C" PyObject* atop_isenabled(PyObject * self, PyObject * args); 19 | extern "C" PyObject* atop_info(PyObject * self, PyObject * args); 20 | extern "C" PyObject* atop_setworkers(PyObject * self, PyObject * args); 21 | extern "C" PyObject* thread_enable(PyObject * self, PyObject * args); 22 | extern "C" PyObject* thread_disable(PyObject * self, PyObject * args); 23 | extern "C" PyObject* thread_isenabled(PyObject * self, PyObject * args); 24 | extern "C" PyObject* thread_getworkers(PyObject * self, PyObject * args); 25 | extern "C" PyObject* thread_setworkers(PyObject * self, PyObject * args); 26 | extern "C" PyObject* thread_zigzag(PyObject * self, PyObject * args); 27 | 28 | // ledger.cpp 29 | extern "C" PyObject* ledger_enable(PyObject * self, PyObject * args); 30 | extern "C" PyObject* ledger_disable(PyObject * self, PyObject * args); 31 | extern "C" PyObject* ledger_isenabled(PyObject * self, PyObject * args); 32 | extern "C" PyObject* ledger_info(PyObject * self, PyObject * args); 33 | 34 | // recycler.cpp 35 | extern "C" PyObject* recycler_enable(PyObject * self, PyObject * args); 36 | extern "C" PyObject* recycler_disable(PyObject * self, PyObject * args); 37 | extern "C" PyObject* recycler_isenabled(PyObject * self, PyObject * args); 38 | extern "C" PyObject* recycler_info(PyObject * self, PyObject * args); 39 | 40 | extern 
"C" PyObject * hook_enable(PyObject * self, PyObject * args); 41 | extern "C" PyObject * hook_disable(PyObject * self, PyObject * args); 42 | 43 | extern "C" PyObject* timer_gettsc(PyObject * self, PyObject * args); 44 | extern "C" PyObject* timer_getutc(PyObject * self, PyObject * args); 45 | extern "C" PyObject* cpustring(PyObject * self, PyObject * args); 46 | extern "C" PyObject * getitem(PyObject * self, PyObject * args); 47 | extern "C" PyObject * lexsort32(PyObject * self, PyObject * args, PyObject * kwargs); 48 | extern "C" PyObject * lexsort64(PyObject * self, PyObject * args, PyObject * kwargs); 49 | extern "C" PyObject * sort(PyObject * self, PyObject * args, PyObject * kwargs); 50 | 51 | // conversions.cpp 52 | extern "C" PyObject* recarray_to_colmajor(PyObject* self, PyObject* args); 53 | 54 | static char m_doc[] = "Provide methods to override NumPy ufuncs"; 55 | 56 | 57 | PyDoc_STRVAR(oldinit_doc, 58 | "oldinit(ufunc_name:"); 59 | 60 | static PyMethodDef module_functions[] = { 61 | {"initialize", (PyCFunction)newinit, METH_VARARGS | METH_KEYWORDS, INITIALIZE_DOC}, 62 | {"atop_enable", (PyCFunction)atop_enable, METH_VARARGS, ATOP_ENABLE_DOC}, 63 | {"atop_disable", (PyCFunction)atop_disable, METH_VARARGS, ATOP_DISABLE_DOC}, 64 | {"atop_isenabled", (PyCFunction)atop_isenabled, METH_VARARGS, ATOP_ISENABLED_DOC}, 65 | {"atop_info", (PyCFunction)atop_info, METH_VARARGS, "return dict"}, 66 | {"atop_setworkers", (PyCFunction)atop_setworkers, METH_VARARGS, "set workers for a func"}, 67 | {"thread_enable", (PyCFunction)thread_enable, METH_VARARGS, THREAD_ENABLE_DOC}, 68 | {"thread_disable", (PyCFunction)thread_disable, METH_VARARGS, THREAD_DISABLE_DOC}, 69 | {"thread_isenabled", (PyCFunction)thread_isenabled, METH_VARARGS, THREAD_ISENABLED_DOC}, 70 | {"thread_getworkers",(PyCFunction)thread_getworkers, METH_VARARGS, THREAD_GETWORKERS_DOC}, 71 | {"thread_setworkers",(PyCFunction)thread_setworkers, METH_VARARGS, THREAD_SETWORKERS_DOC}, 72 | {"thread_zigzag", (PyCFunction)thread_zigzag, METH_VARARGS, "toggle zigzag mode"}, 73 | {"timer_gettsc", (PyCFunction)timer_gettsc, METH_VARARGS, TIMER_GETTSC_DOC}, 74 | {"timer_getutc", (PyCFunction)timer_getutc, METH_VARARGS, TIMER_GETUTC_DOC}, 75 | {"hook_enable", (PyCFunction)hook_enable, METH_VARARGS, "Enable hook for numpy array __getitem__ for fancy and bool indexing"}, 76 | {"hook_disable", (PyCFunction)hook_disable, METH_VARARGS, "Disable hook for numpy array __getitem__ for fancy and bool indexing"}, 77 | {"ledger_enable", (PyCFunction)ledger_enable, METH_VARARGS, LEDGER_ENABLE_DOC}, 78 | {"ledger_disable", (PyCFunction)ledger_disable, METH_VARARGS, LEDGER_DISABLE_DOC}, 79 | {"ledger_isenabled", (PyCFunction)ledger_isenabled, METH_VARARGS, LEDGER_ISENABLED_DOC}, 80 | {"ledger_info", (PyCFunction)ledger_info, METH_VARARGS, LEDGER_INFO_DOC}, 81 | {"recycler_enable", (PyCFunction)recycler_enable, METH_VARARGS, RECYCLER_ENABLE_DOC}, 82 | {"recycler_disable", (PyCFunction)recycler_disable, METH_VARARGS, RECYCLER_DISABLE_DOC}, 83 | {"recycler_isenabled", (PyCFunction)recycler_isenabled, METH_VARARGS, RECYCLER_ISENABLED_DOC}, 84 | {"recycler_info", (PyCFunction)recycler_info, METH_VARARGS, RECYCLER_INFO_DOC}, 85 | {"cpustring", (PyCFunction)cpustring, METH_VARARGS, CPUSTRING_DOC}, 86 | {"oldinit", (PyCFunction)oldinit, METH_VARARGS | METH_KEYWORDS, OLDINIT_DOC}, 87 | {"recarray_to_colmajor", (PyCFunction)recarray_to_colmajor, METH_VARARGS, "convert record array to col major"}, 88 | {"getitem", (PyCFunction)getitem, METH_VARARGS | METH_KEYWORDS, 
"alternative to fancy index or boolean index"}, 89 | {"lexsort32", (PyCFunction)lexsort32, METH_VARARGS | METH_KEYWORDS, "lexigraphical sort returning int32 fancy indexing"}, 90 | {"lexsort64", (PyCFunction)lexsort64, METH_VARARGS | METH_KEYWORDS, "lexigraphical sort returning int64 fancy indexing"}, 91 | {"sort", (PyCFunction)sort, METH_VARARGS | METH_KEYWORDS, "parallel inplace quicksort, followed by mergesort"}, 92 | {NULL, NULL, 0, NULL} 93 | }; 94 | 95 | 96 | static PyModuleDef moduledef = { 97 | PyModuleDef_HEAD_INIT, 98 | "pnumpy._pnumpy", // Module name 99 | m_doc, // Module description 100 | 0, 101 | module_functions, // Structure that defines the methods 102 | NULL, // slots 103 | NULL, // GC traverse 104 | NULL, // GC 105 | NULL // freefunc 106 | }; 107 | 108 | PyMODINIT_FUNC PyInit__pnumpy(void) { 109 | PyObject *module; 110 | 111 | module = PyModule_Create(&moduledef); 112 | 113 | if (module == NULL) 114 | return NULL; 115 | 116 | // Load numpy for PyArray_Type 117 | import_array(); 118 | pPyArray_Type = &PyArray_Type; 119 | 120 | atop_init(); 121 | LedgerInit(); 122 | 123 | return module; 124 | } 125 | -------------------------------------------------------------------------------- /src/atop/recarray.cpp: -------------------------------------------------------------------------------- 1 | #include "common_inc.h" 2 | #include 3 | #include "invalids.h" 4 | #include "threads.h" 5 | 6 | #if defined(__clang__) 7 | #pragma clang diagnostic ignored "-Wmissing-braces" 8 | #pragma clang diagnostic ignored "-Wunused-function" 9 | #pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) 10 | #endif 11 | 12 | #if defined(__GNUC__) 13 | //#pragma GCC target "arch=core-avx2,tune=core-avx2" 14 | #if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || !defined(__GNUC__) 15 | /* GCC >= 4.4 or clang or non-GCC compilers */ 16 | #include 17 | #elif __GNUC_PREREQ(4, 1) 18 | /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */ 19 | #include 20 | #endif 21 | #endif 22 | 23 | 24 | //#define LOGGING printf 25 | #define LOGGING(...) 
26 | 27 | static const int64_t CHUNKSIZE = 16384; 28 | 29 | // This is used to multiply the strides 30 | const union 31 | { 32 | int32_t i[8]; 33 | __m256i m; 34 | //} __vindex8_strides = { 7, 6, 5, 4, 3, 2, 1, 0 }; 35 | } __vindex8_strides = { 0, 1, 2, 3, 4, 5, 6, 7 }; 36 | 37 | //----------------------------------- 38 | // 39 | void ConvertRecArray(char* pStartOffset, int64_t startRow, int64_t totalRows, stRecarrayOffsets* pstOffset, int64_t numArrays, int64_t itemSize) 40 | { 41 | // Try to keep everything in L1Cache 42 | const int64_t L1CACHE = 32768; 43 | int64_t CHUNKROWS = L1CACHE / (itemSize * 2); 44 | if (CHUNKROWS < 1) { 45 | CHUNKROWS = 1; 46 | } 47 | 48 | __m256i vindex = _mm256_mullo_epi32(_mm256_set1_epi32((int32_t)itemSize), _mm256_loadu_si256(&__vindex8_strides.m)); 49 | __m128i vindex128 = _mm256_extracti128_si256(vindex, 0); 50 | 51 | while (startRow < totalRows) { 52 | 53 | // Calc how many rows to process in this pass 54 | int64_t endRow = startRow + CHUNKROWS; 55 | if (endRow > totalRows) { 56 | endRow = totalRows; 57 | } 58 | 59 | int64_t origRow = startRow; 60 | 61 | //printf("processing %lld\n", startRow); 62 | for (int64_t i = 0; i < numArrays; i++) { 63 | 64 | startRow = origRow; 65 | 66 | // Calculate place to read 67 | char* pRead = pStartOffset + pstOffset[i].readoffset; 68 | char* pWrite = pstOffset[i].pData; 69 | 70 | int64_t arrItemSize = pstOffset[i].itemsize; 71 | 72 | //printf("processing start:%lld end:%lld pRead:%p %p itemsize: %lld\n", startRow, endRow, pRead, pWrite, arrItemSize); 73 | 74 | switch (pstOffset[i].itemsize) { 75 | case 1: 76 | while (startRow < endRow) { 77 | int8_t data = *(int8_t*)(pRead + (startRow * itemSize)); 78 | *(int8_t*)(pWrite + startRow) = data; 79 | startRow++; 80 | } 81 | break; 82 | case 2: 83 | while (startRow < endRow) { 84 | int16_t data = *(int16_t*)(pRead + (startRow * itemSize)); 85 | *(int16_t*)(pWrite + startRow * arrItemSize) = data; 86 | startRow++; 87 | } 88 | break; 89 | case 4: 90 | // ??? 
use _mm256_i32gather_epi32 to speed up 91 | { 92 | int64_t endSubRow = endRow - 8; 93 | while (startRow < endSubRow) { 94 | __m256i m0 = _mm256_i32gather_epi32((int32_t*)(pRead + (startRow * itemSize)), vindex, 1); 95 | _mm256_storeu_si256((__m256i*)(pWrite + (startRow * arrItemSize)), m0); 96 | startRow += 8; 97 | } 98 | while (startRow < endRow) { 99 | int32_t data = *(int32_t*)(pRead + (startRow * itemSize)); 100 | *(int32_t*)(pWrite + startRow * arrItemSize) = data; 101 | startRow++; 102 | } 103 | } 104 | break; 105 | case 8: 106 | { 107 | int64_t endSubRow = endRow - 4; 108 | while (startRow < endSubRow) { 109 | __m256i m0 = _mm256_i32gather_epi64((int64_t*)(pRead + (startRow * itemSize)), vindex128, 1); 110 | _mm256_storeu_si256((__m256i*)(pWrite + (startRow * arrItemSize)), m0); 111 | startRow += 4; 112 | } 113 | while (startRow < endRow) { 114 | int64_t data = *(int64_t*)(pRead + (startRow * itemSize)); 115 | *(int64_t*)(pWrite + startRow * arrItemSize) = data; 116 | startRow++; 117 | } 118 | } 119 | break; 120 | default: 121 | while (startRow < endRow) { 122 | char* pSrc = pRead + (startRow * itemSize); 123 | char* pDest = pWrite + (startRow * arrItemSize); 124 | char* pEnd = pSrc + arrItemSize; 125 | while ((pSrc + 8) < pEnd) { 126 | *(int64_t*)pDest = *(int64_t*)pSrc; 127 | pDest += 8; 128 | pSrc += 8; 129 | } 130 | while (pSrc < pEnd) { 131 | *pDest++ = *pSrc++; 132 | } 133 | startRow++; 134 | } 135 | break; 136 | 137 | } 138 | 139 | } 140 | } 141 | } 142 | 143 | 144 | //============================================== 145 | // totalRows = total number of record array rows 146 | // 147 | extern "C" void RecArrayToColMajor( 148 | stRecarrayOffsets* pstOffset, 149 | char* pStartOffset, 150 | int64_t totalRows, 151 | int64_t numArrays, 152 | int64_t itemSize) { 153 | 154 | static const int64_t CHUNKSIZE = 16384; 155 | 156 | // Try to keep everything in L1Cache 157 | const int64_t L1CACHE = 32768; 158 | int64_t CHUNKROWS = L1CACHE / (itemSize * 2); 159 | if (CHUNKROWS < 1) { 160 | CHUNKROWS = 1; 161 | } 162 | 163 | LOGGING("Chunkrows is %I64d \n", CHUNKROWS); 164 | 165 | int64_t startRow = 0; 166 | 167 | if (THREADER && totalRows > 16384) { 168 | // Prepare for multithreading 169 | struct stConvertRec { 170 | char* pStartOffset; 171 | int64_t startRow; 172 | int64_t totalRows; 173 | stRecarrayOffsets* pstOffset; 174 | int64_t numArrays; 175 | int64_t itemSize; 176 | int64_t lastRow; 177 | } stConvert; 178 | 179 | int64_t items = (totalRows + (CHUNKSIZE - 1)) / CHUNKSIZE; 180 | 181 | stConvert.pStartOffset = pStartOffset; 182 | stConvert.startRow = startRow; 183 | stConvert.totalRows = totalRows; 184 | stConvert.pstOffset = pstOffset; 185 | stConvert.numArrays = numArrays; 186 | stConvert.itemSize = itemSize; 187 | stConvert.lastRow = items - 1; 188 | 189 | auto lambdaConvertRecCallback = [](void* callbackArgT, int core, int64_t workIndex) -> int64_t { 190 | stConvertRec* callbackArg = (stConvertRec*)callbackArgT; 191 | int64_t startRow = callbackArg->startRow + (workIndex * CHUNKSIZE); 192 | int64_t totalRows = startRow + CHUNKSIZE; 193 | 194 | if (totalRows > callbackArg->totalRows) { 195 | totalRows = callbackArg->totalRows; 196 | } 197 | 198 | ConvertRecArray( 199 | callbackArg->pStartOffset, 200 | startRow, 201 | totalRows, 202 | callbackArg->pstOffset, 203 | callbackArg->numArrays, 204 | callbackArg->itemSize); 205 | 206 | LOGGING("[%d] %lld completed\n", core, workIndex); 207 | return 1; 208 | }; 209 | 210 | THREADER->DoMultiThreadedWork((int)items, lambdaConvertRecCallback, 
&stConvert); 211 | 212 | } 213 | else { 214 | ConvertRecArray(pStartOffset, startRow, totalRows, pstOffset, numArrays, itemSize); 215 | } 216 | } 217 | #if defined(__clang__) 218 | #pragma clang attribute pop 219 | #endif 220 | 221 | -------------------------------------------------------------------------------- /.github/workflows/build_uploadpypi.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow that is manually triggered 2 | 3 | name: Manual workflow 4 | 5 | # Controls when the action will run. Workflow runs when manually triggered using the UI 6 | # or API. 7 | on: 8 | workflow_dispatch: 9 | # Inputs the workflow accepts. 10 | inputs: 11 | name: 12 | # Force prompt 13 | description: 'Confirm you wish to run' 14 | # Input has to be provided for the workflow to run 15 | required: true 16 | 17 | jobs: 18 | build: 19 | name: Build wheels ${{ matrix.os }}, ${{ matrix.platform }} 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | matrix: 23 | # the mac computer used by github actions is too old to run the tests 24 | # when fixed, add back macos-latest. notee: mac user can still download and use riptable 25 | os: [ubuntu-latest] #, windows-latest, macos-latest] 26 | python-version: [3.6, 3.7, 3.8, 3.9] 27 | platform: [x64] 28 | 29 | steps: 30 | - uses: actions/checkout@v2 31 | - name: Set up Python ${{ matrix.python-version }} 32 | uses: actions/setup-python@v2 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | architecture: x64 36 | - name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | python -m pip install numpy>=1.19.1 setuptools setuptools_scm cibuildwheel>=1.7.4 40 | 41 | # ======================= BUILD WHEELS AND UPLOAD TO PYPI ================================== 42 | 43 | - name: Build wheels (non-windows) ${{ matrix.python-version }} on ${{ matrix.os }} 44 | if: matrix.python-version == '3.8' && matrix.os != 'windows-latest' 45 | env: 46 | CIBW_BUILD: 'cp36-* cp37-* cp38-* cp39-*' 47 | CIBW_SKIP: 'cp27-* cp35-* *-manylinux_i686 *manylinux_aarch64 *manylinux_ppc64le *manylinux_s390x' 48 | PYPI_PASSWORD: ${{ secrets.pypi_password }} 49 | PYPI_USERNAME: ${{ secrets.pypi_username }} 50 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 51 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 52 | run: | 53 | pip install cibuildwheel pip setuptools_scm twine --upgrade; 54 | python -m cibuildwheel --output-dir dist; 55 | python -m twine upload dist/* --skip-existing --verbose; 56 | # python -m twine upload dist/* -u "$PYPI_USERNAME" -p "$PYPI_PASSWORD" --skip-existing --verbose; 57 | 58 | - name: Build wheels (windows) ${{ matrix.python-version }} on ${{ matrix.os }} 59 | if: matrix.python-version == '3.8' && matrix.os == 'windows-latest' 60 | env: 61 | CIBW_BUILD: 'cp36-* cp37-* cp38-*' 62 | #CIBW_BUILD: 'cp36-*' 63 | CIBW_SKIP: 'cp27-* cp35-* *-win32' 64 | PYPI_PASSWORD: ${{ secrets.pypi_password }} 65 | PYPI_USERNAME: ${{ secrets.pypi_username }} 66 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 67 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 68 | run: | 69 | pip install cibuildwheel twine --upgrade; 70 | python -m cibuildwheel --output-dir dist; 71 | python -m twine upload dist/* --skip-existing --verbose; 72 | # python -m twine upload dist/* --skip-existing --verbose; 73 | 74 | # - name: Build wheels on ${{ matrix.os}} 75 | # run: | 76 | # RUNNER_OS="${{ runner.os }}" 77 | # PLATFORM="${{ matrix.platform }}" 78 | # echo $RUNNER_OS 79 | # echo $PLATFORM 80 | # export 
CIBW_BUILD="cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64" 81 | # if [ "$RUNNER_OS" == "Windows" ]; then 82 | # if [ "$PLATFORM" == "x64" ]; then 83 | # export CIBW_BUILD="cp36-win_amd64 cp37-win_amd64 cp38-win_amd64" 84 | # elif [ "$PLATFORM" == "x32" ]; then 85 | # export CIBW_BUILD="cp37-win32" 86 | # fi 87 | # elif [ "$RUNNER_OS" == "Linux" ]; then 88 | # if [ "$PLATFORM" == "x64" ]; then 89 | # export CIBW_BUILD="cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64" 90 | # elif [ "$PLATFORM" == "x32" ]; then 91 | # export CIBW_BUILD="cp36-manylinux_i686 cp37-manylinux_i686 cp38-manylinux_i686" 92 | # fi 93 | # elif [ "$RUNNER_OS" == "macOS" ]; then 94 | # export CIBW_BUILD="cp36-macosx_x86_64 cp37-macosx_x86_64 cp38-macosx_x86_64" 95 | # fi 96 | # python -m cibuildwheel --output-dir wheelhouse 97 | # - uses: actions/upload-artifact@v2 98 | # with: 99 | # path: ./wheelhouse/*.whl 100 | 101 | # build_sdist: 102 | # name: Build source dist ${{ matrix.os }}, ${{ matrix.platform }} 103 | # runs-on: ${{ matrix.os }} 104 | # strategy: 105 | # matrix: 106 | # # the mac computer used by github actions is too old to run the tests 107 | # # build windows and mac the standward way 108 | # os: [windows-latest, macos-latest] 109 | # python-version: [3.6, 3.7, 3.8] 110 | # platform: [x64] 111 | # exclude: 112 | # - os: macos-latest 113 | # platform: x32 114 | # steps: 115 | # - uses: actions/checkout@v2 116 | # 117 | # - uses: actions/setup-python@v2 118 | # name: Install Python 119 | # with: 120 | # python-version: ${{ matrix.python-version }} 121 | # 122 | # - name: Build sdist 123 | # env: 124 | # TWINE_USERNAME: __token__ 125 | # TWINE_PASSWORD: ${{ secrets.pypi }} 126 | # run: | 127 | # python -m pip install --upgrade pip 128 | # python -m pip install numpy>=1.19.1 setuptools setuptools_scm wheel twine 129 | # python setup.py build --force 130 | # python setup.py install 131 | # python setup.py sdist 132 | # #python -m twine upload dist/* --skip-existing --verbose; 133 | # python -m twine upload dist/* -u "tdimitri" -p "!" 
--skip-existing --verbose; 134 | # # twine upload dist/* --verbose 135 | # - uses: actions/upload-artifact@v2 136 | # with: 137 | # path: dist/*.tar.gz 138 | 139 | wheels: 140 | name: wheels ${{ matrix.os }}, ${{ matrix.platform }} 141 | runs-on: ${{ matrix.os }} 142 | strategy: 143 | matrix: 144 | # the mac computer used by github actions is too old to run the tests 145 | # build windows and mac the standward way 146 | os: [windows-latest, macos-latest] 147 | python-version: [3.6, 3.7, 3.8, 3.9] 148 | platform: [x64] 149 | steps: 150 | - uses: actions/checkout@v2 151 | - name: Set up Python 3.x 152 | uses: actions/setup-python@v2 153 | with: 154 | python-version: ${{ matrix.python-version }} 155 | - name: Install dependencies 156 | run: python -m pip install --upgrade setuptools wheel numpy>=1.19.1 setuptools_scm twine 157 | - name: Build wheels 158 | env: 159 | PYPI_PASSWORD: ${{ secrets.pypi_password }} 160 | PYPI_USERNAME: ${{ secrets.pypi_username }} 161 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 162 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 163 | run: | 164 | python setup.py bdist_wheel 165 | python -m twine upload dist/* --skip-existing --verbose; 166 | - uses: actions/upload-artifact@v2 167 | with: 168 | name: dist 169 | path: dist 170 | 171 | # deploy: 172 | # if: ${{ github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/master' }} 173 | # # pnumpy 174 | # runs-on: ubuntu-latest 175 | # 176 | # steps: 177 | # - uses: actions/checkout@v2 178 | # with: 179 | # # Set fetch-depth to 0 so all history is retrieved; this is needed so we get the git tags 180 | # # which we use for setting the package version (via setuptools-scm). 181 | # fetch-depth: 0 182 | # - name: Set up Python 183 | # uses: actions/setup-python@v2 184 | # with: 185 | # python-version: '3.7' 186 | # - name: Install dependencies 187 | # run: | 188 | # python -m pip install --upgrade pip 189 | # python -m pip install setuptools setuptools_scm wheel twine 190 | # - name: Build and publish 191 | # env: 192 | # TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 193 | # TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 194 | # run: | 195 | # python setup.py sdist 196 | # twine upload dist/* --verbose 197 | 198 | -------------------------------------------------------------------------------- /src/pnumpy/ledger.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #if defined(_WIN32) 4 | 5 | // global scope 6 | typedef VOID(WINAPI* FuncGetSystemTime)(LPFILETIME); 7 | FuncGetSystemTime g_GetSystemTime; 8 | FILETIME g_TimeStart; 9 | static bool g_IsPreciseTime = false; 10 | 11 | 12 | //------------------------------------ 13 | // Returns windows time in Nanos 14 | __inline static uint64_t GetWindowsTime() { 15 | FILETIME timeNow; 16 | g_GetSystemTime(&timeNow); 17 | return (*(uint64_t*)&timeNow * 100) - 11644473600000000000L; 18 | } 19 | 20 | //------------------------------------------------------------------- 21 | // 22 | class CTimeStamp { 23 | public: 24 | CTimeStamp() 25 | { 26 | FARPROC fp; 27 | 28 | g_GetSystemTime = GetSystemTimeAsFileTime; 29 | 30 | HMODULE hModule = LoadLibraryW(L"kernel32.dll"); 31 | 32 | // Use printf instead of logging because logging is probably not up yet 33 | // Logging uses the timestamping, so timestamping loads first 34 | if (hModule != NULL) { 35 | fp = GetProcAddress(hModule, "GetSystemTimePreciseAsFileTime"); 36 | if (fp != NULL) { 37 | g_IsPreciseTime = true; 38 | //printf("Using precise 
GetSystemTimePreciseAsFileTime time...\n"); 39 | g_GetSystemTime = (VOID(WINAPI*)(LPFILETIME)) fp; 40 | } 41 | else { 42 | //LOGGING("**Using imprecise GetSystemTimeAsFileTime...\n"); 43 | } 44 | } 45 | else { 46 | printf("!! error load kernel32\n"); 47 | } 48 | 49 | } 50 | }; 51 | 52 | static CTimeStamp* g_TimeStamp = new CTimeStamp(); 53 | 54 | 55 | //--------------------------------------------------------- 56 | // Returns and int64_t nanosecs since unix epoch 57 | extern "C" 58 | PyObject* timer_getutc(PyObject* self, PyObject* args) { 59 | 60 | // return nano time since Unix Epoch 61 | return PyLong_FromLongLong((long long)GetWindowsTime()); 62 | } 63 | 64 | //--------------------------------------------------------- 65 | // Returns and uint64_t timestamp counter 66 | extern "C" 67 | PyObject* timer_gettsc(PyObject* self, PyObject* args) { 68 | 69 | // return tsc 70 | return PyLong_FromUnsignedLongLong(__rdtsc()); 71 | } 72 | 73 | 74 | 75 | #else 76 | 77 | #include 78 | #include 79 | #include 80 | 81 | uint64_t GetTimeStamp() { 82 | //struct timeval tv; 83 | //gettimeofday(&tv, NULL); 84 | //return tv.tv_sec*(uint64_t)1000000 + tv.tv_usec; 85 | 86 | struct timespec x; 87 | clock_gettime(CLOCK_REALTIME, &x); 88 | return x.tv_sec * 1000000000L + x.tv_nsec; 89 | } 90 | 91 | static __inline__ uint64_t rdtsc(void) 92 | { 93 | unsigned hi, lo; 94 | __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); 95 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 96 | } 97 | 98 | //--------------------------------------------------------- 99 | // Returns and uint64_t timestamp counter 100 | extern "C" 101 | PyObject* timer_gettsc(PyObject* self, PyObject* args) { 102 | 103 | // return tsc 104 | return PyLong_FromUnsignedLongLong(rdtsc()); 105 | } 106 | 107 | //--------------------------------------------------------- 108 | // Returns and int64_t nanosecs since unix epoch 109 | extern "C" 110 | PyObject* timer_getutc(PyObject* self, PyObject* args) { 111 | 112 | // return nano time since Unix Epoch 113 | return PyLong_FromLongLong(GetTimeStamp()); 114 | } 115 | 116 | #endif 117 | 118 | //--------------------------------------------------------- 119 | // Returns nanoseconds since utc epoch 120 | uint64_t GetUTCNanos() { 121 | #if defined(_WIN32) 122 | return GetWindowsTime(); 123 | #else 124 | return GetTimeStamp(); 125 | #endif 126 | } 127 | 128 | // See ATOP_TYPES 129 | static const char* gStrAtopTypes[]= { 130 | "bool", 131 | "int8", "uint8", 132 | "int16", "uint16", 133 | "int32", "uint32", 134 | "int64", "uint64", 135 | "int128", "uint128", 136 | "float16", "float32", "float64", "float80", 137 | "cfloat16", "cfloat32", "cfloat64", "cfloat80", 138 | "string", "unicode", 139 | "void", 140 | "last" 141 | }; 142 | 143 | 144 | struct stLEDGER_ITEM { 145 | const char* StrName; 146 | int64_t StartTime; 147 | int64_t TotalTime; 148 | 149 | int64_t ArrayLength1; 150 | int64_t ArrayLength2; 151 | int64_t ArrayLength3; // not valid for unary 152 | 153 | int32_t ArrayGroup; 154 | int32_t ArrayOp; 155 | int32_t AType; 156 | int32_t Reserved1; 157 | 158 | const char* StrCatName; 159 | const char* StrOpName; 160 | }; 161 | 162 | //----------------------------------------------------------- 163 | // allocated on 64 byte alignment 164 | struct stLedgerRing { 165 | // must be power of 2 for mask to work 166 | static const int64_t RING_BUFFER_SIZE = 8096; 167 | static const int64_t RING_BUFFER_MASK = 8095; 168 | 169 | volatile int64_t Head; 170 | volatile int64_t Tail; 171 | 172 | stLEDGER_ITEM 
LedgerQueue[RING_BUFFER_SIZE]; 173 | 174 | void Init() { 175 | Head = 0; 176 | Tail = 0; 177 | 178 | for (int i = 0; i < RING_BUFFER_SIZE; i++) { 179 | LedgerQueue[i].StrName = 0; 180 | LedgerQueue[i].StartTime = 0; 181 | LedgerQueue[i].TotalTime = 0; 182 | } 183 | } 184 | 185 | // Circular wrap around buffer 186 | // If (Head - Tail) > RING_BUFFER_SIZE then buffer has overflowed 187 | stLEDGER_ITEM* GetNextEntry() { 188 | return &LedgerQueue[RING_BUFFER_MASK & Tail++]; 189 | }; 190 | }; 191 | 192 | // Global ring buffer of last RING_BUFFER_SIZE math operations 193 | static stLedgerRing g_LedgerRing; 194 | 195 | // rough estimate of last op code 196 | #define MAX_FUNCOP 40 197 | const char* g_str_ufunc_name[OPCAT_LAST][MAX_FUNCOP]; 198 | 199 | void LedgerInit() { 200 | 201 | // Init the ring buffer that holds entries 202 | g_LedgerRing.Init(); 203 | 204 | // Build reverse lookup table 205 | for (int i = 0; i < OPCAT_LAST; i++) { 206 | stOpCategory* pstOpCategory = &gOpCategory[i]; 207 | for (int j = 0; j < pstOpCategory->NumOps; j++) { 208 | int k = pstOpCategory->pUFuncToAtop[j].atop_op; 209 | if (k >= 0 && k < MAX_FUNCOP) { 210 | // NOTE: can print out everything we hook here 211 | //printf("%d %d %s\n", i, k, pstOpCategory->pUFuncToAtop[j].str_ufunc_name); 212 | g_str_ufunc_name[i][k] = pstOpCategory->pUFuncToAtop[j].str_ufunc_name; 213 | } 214 | } 215 | } 216 | } 217 | 218 | //-------------------------------------------------- 219 | // When the ufunc is hooked, if the ledger is turned on it can be recorded. 220 | // The recording will go into the ring buffer for later retrieval. 221 | // The ring buffer only holds so much and can overflow 222 | void LedgerRecord(int32_t op_category, int64_t start_time, int64_t end_time, char** args, const npy_intp* dimensions, const npy_intp* steps, void* innerloop, int funcop, int atype) { 223 | int64_t deltaTime = end_time - start_time; 224 | 225 | stOpCategory* pstOpCategory = &gOpCategory[op_category]; 226 | 227 | // Get the next slot in the ring buffer 228 | stLEDGER_ITEM* pEntry = g_LedgerRing.GetNextEntry(); 229 | 230 | pEntry->ArrayGroup = op_category; 231 | pEntry->ArrayOp = funcop; 232 | pEntry->AType = atype; 233 | 234 | const char* strCatName = pstOpCategory->StrName; 235 | 236 | // Check for reduce operation 237 | if (op_category == OPCAT_BINARY && IS_BINARY_REDUCE) { 238 | strCatName = "Reduce"; 239 | } 240 | 241 | pEntry->StrCatName = strCatName; 242 | pEntry->StrOpName = g_str_ufunc_name[op_category][funcop]; 243 | pEntry->ArrayLength1 = (int64_t)dimensions[0]; 244 | pEntry->ArrayLength2 = (int64_t)dimensions[1]; 245 | 246 | // temporary for debugging print out results 247 | printf ("%lld \tlen: %lld %s, %s, %s\n", (long long)deltaTime, (long long)dimensions[0], pEntry->StrOpName, gStrAtopTypes[atype], strCatName); 248 | 249 | } 250 | 251 | void LedgerRecord2(int32_t op_category, int64_t start_time, int64_t end_time, int atype, int64_t length) { 252 | int64_t deltaTime = end_time - start_time; 253 | stOpCategory* pstOpCategory = &gOpCategory[op_category]; 254 | 255 | // Get the next slot in the ring buffer 256 | stLEDGER_ITEM* pEntry = g_LedgerRing.GetNextEntry(); 257 | 258 | pEntry->ArrayGroup = op_category; 259 | pEntry->ArrayOp = 0; 260 | pEntry->AType = atype; 261 | 262 | const char* strCatName = pstOpCategory->StrName; 263 | 264 | pEntry->StrCatName = strCatName; 265 | pEntry->StrOpName = "ledger2"; 266 | pEntry->ArrayLength1 = length; 267 | pEntry->ArrayLength2 = 0; 268 | 269 | // temporary for debugging print out results 270 | 
printf("%lld \tlen: %lld %s, %s, %s\n", (long long)deltaTime, (long long)length, pEntry->StrOpName, gStrAtopTypes[atype], strCatName); 271 | 272 | } 273 | 274 | -------------------------------------------------------------------------------- /src/atop/common_inc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #if defined(_WIN32) && !defined(__GNUC__) 10 | #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers 11 | #define NOMINMAX 12 | // Windows Header Files: 13 | #include 14 | #include 15 | #endif 16 | 17 | 18 | /* 19 | Macro symbol definitions to simplify conditional code compilation within riptide. 20 | 21 | References: 22 | * https://sourceforge.net/p/predef/wiki/Compilers/ 23 | 24 | */ 25 | 26 | /* 27 | Platform/OS detection 28 | */ 29 | 30 | #if defined(_WIN32) 31 | // Target OS is Windows 32 | # define RT_OS_WINDOWS 1 33 | 34 | #elif defined(__linux__) 35 | // Target OS is Linux 36 | # define RT_OS_LINUX 1 37 | 38 | // Target OS is UNIX-like 39 | # define RT_OS_FAMILY_UNIX 1 40 | 41 | #elif defined(__APPLE__) 42 | // Target OS is macOS or iOS 43 | # define RT_OS_DARWIN 1 44 | 45 | // Target OS is UNIX-like 46 | # define RT_OS_FAMILY_UNIX 1 47 | 48 | // Target OS is BSD-like 49 | # define RT_OS_FAMILY_BSD 1 50 | 51 | #elif __FreeBSD__ 52 | // Target OS is FreeBSD 53 | # define RT_OS_FREEBSD 1 54 | 55 | // Target OS is UNIX-like 56 | # define RT_OS_FAMILY_UNIX 1 57 | 58 | // Target OS is BSD-like 59 | # define RT_OS_FAMILY_BSD 1 60 | 61 | #else 62 | // If we can't detect the OS, make it a compiler error; compilation is likely to fail anyway due to 63 | // not having any working implementations of some functions, so at least we can make it obvious why 64 | // the compilation is failing. 65 | # error Unable to detect/classify the target OS. 66 | 67 | #endif /* Platform/OS detection */ 68 | 69 | 70 | /* 71 | Compiler detection. 72 | The order these detection checks operate in is IMPORTANT -- use CAUTION if changing or reordering them! 73 | */ 74 | 75 | #if defined(__clang__) 76 | // Compiler is Clang/LLVM. 77 | # define RT_COMPILER_CLANG 1 78 | 79 | #elif defined(__GNUC__) 80 | // Compiler is GCC/g++. 81 | # define RT_COMPILER_GCC 1 82 | 83 | #elif defined(__INTEL_COMPILER) || defined(_ICC) 84 | // Compiler is the Intel C/C++ compiler. 85 | # define RT_COMPILER_INTEL 1 86 | 87 | #elif defined(_MSC_VER) 88 | /* 89 | This check needs to be towards the end; a number of compilers (e.g. clang, Intel C/C++) 90 | define the _MSC_VER symbol when running on Windows, so putting this check last means we 91 | should have caught any of those already and this should be bona-fide MSVC. 92 | */ 93 | // Compiler is the Microsoft C/C++ compiler. 94 | # define RT_COMPILER_MSVC 1 95 | 96 | #else 97 | // Couldn't detect the compiler. 98 | // We could allow compilation to proceed anyway, but the compiler/platform behavior detection 99 | // below won't pass and it's important for correctness so this is an error. 100 | # error Unable to detect/classify the compiler being used. 101 | 102 | #endif /* compiler detection */ 103 | 104 | 105 | /* 106 | Compiler behavior detection. 107 | For conciseness/correctness in riptide code, we define some additional symbols here specifying certain 108 | compiler behaviors. 
This way any code depending on these behaviors expresses it in terms of the behavior 109 | rather than whether it's being compiled under a specific compiler(s) and/or platforms; this in turn 110 | makes it easier to support new compilers and platforms just by adding the necessary defines here. 111 | */ 112 | 113 | #if !defined(RT_COMPILER_MSVC) 114 | // Indicates whether the targeted compiler/platform defaults to emitting vector load/store operations 115 | // requiring an aligned pointer when a vector pointer is dereferenced (so any such pointers must be 116 | // aligned to prevent segfaults). When zero/false, the targeted compiler/platform emits unaligned 117 | // vector load/store instructions by default. 118 | # define RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED 1 119 | #else 120 | // Indicates whether the targeted compiler/platform defaults to emitting vector load/store operations 121 | // requiring an aligned pointer when a vector pointer is dereferenced (so any such pointers must be 122 | // aligned to prevent segfaults). When zero/false, the targeted compiler/platform emits unaligned 123 | // vector load/store instructions by default. 124 | # define RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED 0 125 | #endif /* RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED */ 126 | 127 | //------------------------------------------- 128 | //------------------------------------------- 129 | #define VOID void 130 | typedef void* PVOID; 131 | typedef void* LPVOID; 132 | typedef void* HANDLE; 133 | 134 | #define TRUE 1 135 | #define FALSE 0 136 | typedef int BOOL; 137 | typedef unsigned char BYTE; 138 | 139 | 140 | #if defined(_WIN32) && !defined(__GNUC__) 141 | #define WINAPI __stdcall 142 | #define InterlockedCompareExchange128 _InterlockedCompareExchange128 143 | #ifndef InterlockedAdd64 144 | #define InterlockedAdd64 _InterlockedAdd64 145 | #endif 146 | #ifndef InterlockedDecrement64 147 | #define InterlockedDecrement64 _InterlockedDecrement64 148 | #define InterlockedIncrement64 _InterlockedIncrement64 149 | #endif 150 | #define InterlockedIncrement _InterlockedIncrement 151 | #define InterlockedDecrement _InterlockedDecrement 152 | 153 | #define AtopInterlockedOr(X,Y) InterlockedOr64((int64_t*)X,Y) 154 | #define AtopInterlockedAnd(X,Y) InterlockedAnd64((int64_t*)X,Y) 155 | #define AtopInterlockedXor(X,Y) InterlockedXor64((int64_t*)X,Y) 156 | 157 | #include 158 | #ifndef MEM_ALIGN 159 | #define MEM_ALIGN(x) __declspec(align(x)) 160 | #define ALIGN(x) __declspec(align(64)) 161 | 162 | #define FORCEINLINE __forceinline 163 | #define FORCE_INLINE __forceinline 164 | 165 | #define ALIGNED_ALLOC(Size,Alignment) _aligned_malloc(Size,Alignment) 166 | #define ALIGNED_FREE(block) _aligned_free(block) 167 | 168 | #define lzcnt_64 _lzcnt_u64 169 | 170 | #endif 171 | #else 172 | 173 | #define WINAPI 174 | #include 175 | 176 | // consider sync_add_and_fetch 177 | #define InterlockedAdd64(val, len) (__sync_fetch_and_add(val, len) + len) 178 | #define InterlockedIncrement64(val) (__sync_fetch_and_add(val, 1) + 1) 179 | #define InterlockedIncrement(val) (__sync_fetch_and_add(val, 1) + 1) 180 | #define InterlockedDecrement(val) (__sync_fetch_and_add(val, -1) - 1) 181 | #define AtopInterlockedOr(val, bitpos) (__sync_fetch_and_or(val, bitpos)) 182 | #define AtopInterlockedAnd(val, bitpos) (__sync_fetch_and_and(val, bitpos)) 183 | #define AtopInterlockedXor(val, bitpos) (__sync_fetch_and_xor(val, bitpos)) 184 | 185 | #ifndef __GNUC_PREREQ 186 | #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 
16) + (minor))) 187 | #endif 188 | #ifndef MEM_ALIGN 189 | #define MEM_ALIGN(x) __attribute__((aligned(x))) 190 | #endif 191 | 192 | #define FORCEINLINE inline __attribute__((always_inline)) 193 | #define FORCE_INLINE inline __attribute__((always_inline)) 194 | #define ALIGN(x) x __attribute__((aligned(64))) 195 | 196 | // Workaround for platforms/compilers which don't support C11 aligned_alloc 197 | // but which do have posix_memalign(). 198 | #ifndef aligned_alloc 199 | 200 | #ifdef posix_memalign 201 | FORCEINLINE void* aligned_alloc(size_t alignment, size_t size) 202 | { 203 | void* buffer = NULL; 204 | posix_memalign(&buffer, alignment, size); 205 | return buffer; 206 | } 207 | 208 | #else 209 | // clang compiler does not support so we default to malloc 210 | //#warning Unable to determine how to perform aligned allocations on this platform. 211 | #define aligned_alloc(alignment, size) malloc(size) 212 | #endif // defined(posix_memalign) 213 | 214 | #endif // !defined(aligned_alloc) 215 | 216 | #define ALIGNED_ALLOC(Size,Alignment) aligned_alloc(Alignment,Size) 217 | #define ALIGNED_FREE(block) free(block) 218 | 219 | #define lzcnt_64 __builtin_clzll 220 | 221 | #endif 222 | 223 | // To detect CPU features like AVX-256 224 | typedef struct { 225 | uint32_t f1c; 226 | uint32_t f1d; 227 | uint32_t f7b; 228 | uint32_t f7c; 229 | } ATOP_cpuid_t; 230 | 231 | // Missing types include 232 | // Half Float 233 | // A bool that takes up one bit 234 | // 2 byte unicode 235 | // pointers to variable length strings of 1,2,4 itemsize 236 | enum ATOP_TYPES { 237 | ATOP_BOOL = 0, 238 | ATOP_INT8, ATOP_UINT8, 239 | ATOP_INT16, ATOP_UINT16, 240 | ATOP_INT32, ATOP_UINT32, 241 | ATOP_INT64, ATOP_UINT64, 242 | ATOP_INT128, ATOP_UINT128, 243 | ATOP_HALF_FLOAT, ATOP_FLOAT, ATOP_DOUBLE, ATOP_LONGDOUBLE, // 11, 12, 13, 14 244 | ATOP_CHALF_FLOAT, ATOP_CFLOAT, ATOP_CDOUBLE, ATOP_CLONGDOUBLE, 245 | ATOP_STRING, ATOP_UNICODE, 246 | ATOP_VOID, 247 | ATOP_LAST 248 | }; 249 | 250 | enum COMP_OPERATION { 251 | // Two inputs, Always return a bool 252 | CMP_EQ = 0, 253 | CMP_NE = 1, 254 | CMP_LT = 2, 255 | CMP_GT = 3, 256 | CMP_LTE = 4, 257 | CMP_GTE = 5, 258 | CMP_LAST = 6, 259 | }; 260 | 261 | enum UNARY_OPERATION { 262 | UNARY_INVALID = 0, 263 | 264 | // One input, returns same data type 265 | ABS = 1, 266 | SIGNBIT = 2, 267 | FABS = 3, 268 | INVERT = 4, 269 | FLOOR = 5, 270 | CEIL = 6, 271 | TRUNC = 7, 272 | ROUND = 8, 273 | NEGATIVE = 9, 274 | POSITIVE = 10, 275 | SIGN = 11, 276 | RINT = 12, 277 | 278 | // One input, always return a float one input 279 | SQRT = 15, 280 | SQUARE = 16, 281 | RECIPROCAL = 17, 282 | 283 | // one input, output bool 284 | LOGICAL_NOT = 18, 285 | ISINF = 19, 286 | ISNAN = 20, 287 | ISFINITE = 21, 288 | ISNORMAL = 22, 289 | 290 | ISNOTINF = 23, 291 | ISNOTNAN = 24, 292 | ISNOTFINITE = 25, 293 | ISNOTNORMAL = 26, 294 | ISNANORZERO = 27, 295 | 296 | // One input, does not allow floats 297 | BITWISE_NOT = 28, // same as invert? 
298 | 299 | UNARY_LAST = 35, 300 | }; 301 | 302 | enum BINARY_OPERATION { 303 | BINARY_INVALID = 0, 304 | 305 | // Two ops, returns same type 306 | ADD = 1, 307 | SUB = 2, 308 | MUL = 3, 309 | MOD = 4, // Warning: there are two mods - C,Java mod and Python mod 310 | 311 | MIN = 5, 312 | MAX = 6, 313 | NANMIN = 7, 314 | NANMAX = 8, 315 | FLOORDIV = 9, 316 | POWER = 10, 317 | REMAINDER = 11, 318 | FMOD = 12, 319 | 320 | // Two ops, always return a double 321 | DIV = 13, 322 | SUBDATETIMES = 14, // returns double 323 | SUBDATES = 15, // returns int 324 | 325 | // Two inputs, Always return a bool 326 | LOGICAL_AND = 16, 327 | LOGICAL_XOR = 17, 328 | LOGICAL_OR = 18, 329 | 330 | // Two inputs, second input must be int based 331 | BITWISE_LSHIFT = 19, //left_shift 332 | BITWISE_RSHIFT = 20, 333 | BITWISE_AND = 21, 334 | BITWISE_XOR = 22, 335 | BITWISE_OR = 23, 336 | BITWISE_ANDNOT = 24, 337 | BITWISE_NOTAND = 25, 338 | BITWISE_XOR_SPECIAL = 26, 339 | 340 | ATAN2 = 27, 341 | HYPOT = 28, 342 | 343 | BINARY_LAST = 29, 344 | }; 345 | 346 | enum TRIG_OPERATION { 347 | TRIG_INVALID = 0, 348 | // One op, returns same type 349 | SIN = 1, 350 | COS = 2, 351 | TAN = 3, 352 | ASIN = 4, 353 | ACOS = 5, 354 | ATAN = 6, 355 | SINH = 7, 356 | COSH = 8, 357 | TANH = 9, 358 | ASINH = 10, 359 | ACOSH = 11, 360 | ATANH = 12, 361 | 362 | LOG = 13, 363 | LOG2 = 14, 364 | LOG10 = 15, 365 | EXP = 16, 366 | EXP2 = 17, 367 | EXPM1 = 18, 368 | LOG1P = 19, 369 | CBRT = 20, 370 | 371 | TRIG_LAST = 21 372 | }; 373 | 374 | //---------------------------------------------------------------------------------- 375 | // Lookup to go from 1 byte to 8 byte boolean values 376 | extern int64_t gBooleanLUT64[256]; 377 | extern int32_t gBooleanLUT32[16]; 378 | 379 | extern int64_t gBooleanLUT64Inverse[256]; 380 | extern int32_t gBooleanLUT32Inverse[16]; 381 | 382 | typedef void(*UNARY_FUNC)(void* pDataIn, void* pDataOut, int64_t len, int64_t strideIn, int64_t strideOut); 383 | // Pass in two vectors and return one vector 384 | // Used for operations like C = A + B 385 | typedef void(*ANY_TWO_FUNC)(void* pDataIn, void* pDataIn2, void* pDataOut, int64_t len, int64_t strideIn1, int64_t strideIn2, int64_t strideOut); 386 | typedef void(*GROUPBY_FUNC)(void* pstGroupBy, int64_t index); 387 | typedef void(*REDUCE_FUNC)(void* pDataIn1X, void* pDataOutX, void* pStartVal, int64_t datalen, int64_t strideIn); 388 | 389 | 390 | //====================================================== 391 | // Unary 392 | //------------------------------------------------------ 393 | // Macro stub for returning None 394 | #define STRIDE_NEXT(_TYPE_, _MEM_, _STRIDE_) (_TYPE_*)((char*)_MEM_ + _STRIDE_) 395 | 396 | 397 | //-------------------------------------------------------------------- 398 | // multithreaded struct used for calling unary op codes 399 | struct UNARY_CALLBACK { 400 | UNARY_FUNC pUnaryCallback; 401 | 402 | char* pDataIn; 403 | char* pDataOut; 404 | 405 | int64_t itemSizeIn; 406 | int64_t itemSizeOut; 407 | }; 408 | 409 | 410 | //==================================================================== 411 | void* FmAlloc(size_t _Size); 412 | void FmFree(void* _Block); 413 | 414 | #define WORKSPACE_ALLOC FmAlloc 415 | #define WORKSPACE_FREE FmFree 416 | 417 | // Default stack size in linux is 320KB 418 | #define MAX_STACK_ALLOC (1024) 419 | 420 | // For small buffers that can be allocated on the stack 421 | #if defined(_WIN32) && !defined(__GNUC__) 422 | #define POSSIBLY_STACK_ALLOC(_alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? 
(char*)WORKSPACE_ALLOC(_alloc_size_) : (char*)_malloca(_alloc_size_); 423 | #define POSSIBLY_STACK_ALLOC_TYPE(_TYPE_, _alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? (_TYPE_)WORKSPACE_ALLOC(_alloc_size_) : (_TYPE_)_malloca(_alloc_size_); 424 | #else 425 | #define POSSIBLY_STACK_ALLOC(_alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? (char*)WORKSPACE_ALLOC(_alloc_size_) : (char*)alloca(_alloc_size_); 426 | #define POSSIBLY_STACK_ALLOC_TYPE(_TYPE_, _alloc_size_) _alloc_size_ > MAX_STACK_ALLOC ? (_TYPE_)WORKSPACE_ALLOC(_alloc_size_) : (_TYPE_)alloca(_alloc_size_); 427 | #endif 428 | #define POSSIBLY_STACK_FREE(_alloc_size_, _mem_ptr_) if (_alloc_size_ > MAX_STACK_ALLOC) WORKSPACE_FREE(_mem_ptr_); 429 | 430 | //======================================================================= 431 | // Conversions 432 | struct stRecarrayOffsets { 433 | char* pData; 434 | int64_t readoffset; 435 | int64_t itemsize; 436 | }; 437 | 438 | extern "C" void RecArrayToColMajor( 439 | stRecarrayOffsets* pstOffset, 440 | char* pStartOffset, 441 | int64_t totalRows, 442 | int64_t numArrays, 443 | int64_t itemSize); 444 | 445 | //===================================================================== 446 | // Sorting 447 | enum SORT_MODE { 448 | SORT_MODE_QSORT = 0, 449 | SORT_MODE_HEAP = 1, 450 | SORT_MODE_MERGE = 2, 451 | }; 452 | 453 | extern "C" int64_t IsSorted(void* pDataIn1,int64_t arraySize1, int32_t arrayType1, int64_t itemSize); 454 | extern "C" int SortIndex32( 455 | int64_t * pCutOffs, 456 | int64_t cutOffLength, 457 | void* pDataIn1, 458 | int64_t arraySize1, 459 | int32_t * pDataOut1, 460 | SORT_MODE mode, 461 | int arrayType1, 462 | int64_t strlen); 463 | 464 | extern "C" int SortIndex64( 465 | int64_t * pCutOffs, 466 | int64_t cutOffLength, 467 | void* pDataIn1, 468 | int64_t arraySize1, 469 | int64_t * pDataOut1, 470 | SORT_MODE mode, 471 | int arrayType1, 472 | int64_t strlen); 473 | 474 | 475 | typedef int64_t(*GROUP_INDEX_FUNC)( 476 | void* pDataIn1, 477 | int64_t arraySize1V, 478 | void* pDataIndexInV, 479 | void* pGroupOutV, 480 | void* pFirstOutV, 481 | void* pCountOutV, 482 | bool* pFilter, // optional 483 | int64_t base_index, 484 | int64_t strlen); 485 | 486 | 487 | extern "C" int64_t GroupIndex32( 488 | void* pDataIn1, 489 | int64_t arraySize1V, 490 | void* pDataIndexInV, 491 | void* pGroupOutV, 492 | void* pFirstOutV, 493 | void* pCountOutV, 494 | bool* pFilter, // optional 495 | int64_t base_index, 496 | int64_t strlen); 497 | 498 | extern "C" int64_t GroupIndex64( 499 | void* pDataIn1, 500 | int64_t arraySize1V, 501 | void* pDataIndexInV, 502 | void* pGroupOutV, 503 | void* pFirstOutV, 504 | void* pCountOutV, 505 | bool* pFilter, // optional 506 | int64_t base_index, 507 | int64_t strlen); 508 | 509 | extern "C" int Sort( 510 | SORT_MODE sortmode, 511 | int atype, 512 | void* pDataIn, 513 | int64_t arrayLength, 514 | int64_t stridesIn, 515 | int64_t itemSize, 516 | void* pDataOut1, 517 | int64_t stridesOut); 518 | 519 | extern "C" int ArangeFill( 520 | int atype, 521 | char* pBuffer, 522 | void* pFirstValue, 523 | void* pSecondValue, 524 | int64_t length, 525 | int32_t threadwakeup); 526 | 527 | -------------------------------------------------------------------------------- /src/atop/threads.cpp: -------------------------------------------------------------------------------- 1 | #include "threads.h" 2 | 3 | // to debug thread wakeup allow LOGGING to printf 4 | //#define LOGGING printf 5 | #define LOGGING(...) 
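//-----------------------------------------------------------------------
// Usage sketch (illustrative only; distilled from the call site in
// src/atop/recarray.cpp, not a definitive API description): callers
// split a job into fixed-size chunks and hand it to the worker pool via
// THREADER->DoMultiThreadedWork().  The callback receives the worker
// core number and a chunk index and returns non-zero when it completed
// work.  `numChunks` and `myJob` below are hypothetical names:
//
//   auto cb = [](void* callbackArg, int core, int64_t workIndex) -> int64_t {
//       // process chunk `workIndex` of the job described by callbackArg
//       return 1;
//   };
//   if (THREADER)
//       THREADER->DoMultiThreadedWork((int)numChunks, cb, &myJob);
//   else
//       ;  // fall back to doing the whole job on the calling thread
//-----------------------------------------------------------------------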
6 | #define LOGERROR printf 7 | 8 | #if defined(RT_OS_DARWIN) 9 | /* For MacOS use a conditional wakeup */ 10 | pthread_cond_t g_WakeupCond = PTHREAD_COND_INITIALIZER; 11 | pthread_mutex_t g_WakeupMutex = PTHREAD_MUTEX_INITIALIZER; 12 | #endif 13 | 14 | 15 | #if defined(RT_OS_WINDOWS) 16 | WakeSingleAddress g_WakeSingleAddress = InitWakeCalls(); 17 | WakeAllAddress g_WakeAllAddress; 18 | WaitAddress g_WaitAddress; 19 | 20 | //----------------------------------------------------------------- 21 | // Not every version of Windows has this useful API so we have to check for it dynamically 22 | WakeSingleAddress InitWakeCalls() 23 | { 24 | FARPROC fp; 25 | 26 | HMODULE hModule = LoadLibraryW(L"kernelbase.dll"); 27 | 28 | if (hModule != NULL) { 29 | fp = GetProcAddress(hModule, "WakeByAddressSingle"); 30 | if (fp != NULL) { 31 | //LogInform("**System supports WakeByAddressSingle ...\n"); 32 | g_WakeSingleAddress = (VOID(WINAPI*)(PVOID)) fp; 33 | 34 | fp = GetProcAddress(hModule, "WakeByAddressAll"); 35 | g_WakeAllAddress = (WakeAllAddress)fp; 36 | 37 | fp = GetProcAddress(hModule, "WaitOnAddress"); 38 | g_WaitAddress = (WaitAddress)fp; 39 | 40 | } 41 | else { 42 | LOGERROR("**System does NOT support WakeByAddressSingle ...\n"); 43 | g_WakeSingleAddress = NULL; 44 | g_WakeAllAddress = NULL; 45 | g_WaitAddress = NULL; 46 | 47 | } 48 | } 49 | 50 | return g_WakeSingleAddress; 51 | } 52 | 53 | #else 54 | WakeSingleAddress g_WakeSingleAddress = NULL; 55 | WakeAllAddress g_WakeAllAddress = NULL; 56 | WaitAddress g_WaitAddress = NULL; 57 | #endif 58 | 59 | 60 | //----------------------------------------------------------- 61 | // Main thread loop 62 | // Threads will wait on an address then wake up when there is work 63 | // Linux uses a futex to control how many threads wakeup 64 | // Windows uses a counter 65 | // Darwin (macOS) does not support futexes or WaitOnAddress, so it will need to use one of: 66 | // * POSIX condition variables 67 | // * C++11 condition variables from 68 | // * libdispatch (GCD), using dispatch_semaphore_t (via dispatch_semaphore_create()) to control concurrency; include 69 | // * BSD syscalls like __psynch_cvwait (and other __psynch functions). These are not externally documented -- need to look in github.com/apple/darwin-libpthread to see how things work. 
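// Illustrative sketch (not part of the original source): the condition-
// variable fallback used for macOS reduces to the classic wait/broadcast
// pattern on g_WakeupCond / g_WakeupMutex.  The worker side appears
// verbatim in the RT_OS_DARWIN branch of the loop below; the producer
// side shown here is a hypothetical counterpart for whatever code posts
// new work:
//
//   // worker thread: sleep until something changes
//   pthread_mutex_lock(&g_WakeupMutex);
//   pthread_cond_wait(&g_WakeupCond, &g_WakeupMutex);
//   pthread_mutex_unlock(&g_WakeupMutex);
//
//   // work producer (hypothetical): wake every sleeping worker
//   pthread_cond_broadcast(&g_WakeupCond);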
70 | // 71 | #if defined(RT_OS_WINDOWS) 72 | DWORD WINAPI WorkerThreadFunction(LPVOID lpParam) 73 | #else 74 | void* 75 | WorkerThreadFunction(void* lpParam) 76 | #endif 77 | { 78 | stWorkerRing* pWorkerRing = (stWorkerRing*)lpParam; 79 | 80 | DWORD core = (DWORD)(InterlockedIncrement64(&pWorkerRing->WorkThread)); 81 | // The first 3 get assigned 1,2,3 and main is reserved for 0 and pool 0 82 | // The next (3,4,5,6) are assigned 4,5,6,7 and pool 1 83 | DWORD pool = core / MAX_WORKER_CHANNEL; 84 | 85 | LOGGING("Thread created with parameter: %d %p\n", core, g_WaitAddress); 86 | 87 | // On windows we set the thread affinity mask 88 | if (g_WaitAddress != NULL) { 89 | DWORD tempcore = core - 1; 90 | // If hyperthreading is on skip every other core 91 | if (THREADER->GlobalWorkerParams.HyperThreading) 92 | tempcore = tempcore * 2; 93 | uint64_t ret = SetThreadAffinityMask(GetCurrentThread(), (uint64_t)1 << tempcore); 94 | //uint64_t ret = SetThreadAffinityMask(GetCurrentThread(), 0xFFFFFFFF); 95 | } 96 | 97 | int64_t lastWorkItemCompleted = -1; 98 | 99 | // 100 | // Setting Cancelled will stop all worker threads 101 | // 102 | while (THREADER->GlobalWorkerParams.Cancelled == 0) { 103 | int64_t workIndexCompleted; 104 | int64_t workIndex; 105 | 106 | workIndex = pWorkerRing->Pool[pool].WorkIndex; 107 | workIndexCompleted = pWorkerRing->Pool[pool].WorkIndexCompleted; 108 | 109 | int64_t didSomeWork = 0; 110 | 111 | // See if work to do 112 | if (workIndex > workIndexCompleted) { 113 | stMATH_WORKER_ITEM* pWorkItem = pWorkerRing->GetExistingWorkItem(); 114 | 115 | // check if the work was for our thread 116 | if ((int64_t)core <= pWorkItem->ThreadWakeup) { 117 | LOGGING("Pos Waking %d %d %lld\n", core, pool, workIndex); 118 | didSomeWork = pWorkItem->DoWork(core, pWorkerRing->MainWorkIndex); 119 | } 120 | else { 121 | LOGGING("Not Waking %d %d %lld\n", core, pool, workIndex); 122 | 123 | } 124 | } 125 | 126 | // didSomeWork contains how many work items the thread completed 127 | // TODO: Use core as an index to keep stats track of how many 128 | // work items each thread is completing for future thread tuning 129 | // 130 | // NOTE: if we did some work, we loop back to top while to check for more work 131 | // before waiting again on the worker Q 132 | // 133 | if (!didSomeWork) { 134 | workIndexCompleted = workIndex; 135 | 136 | #if defined(RT_OS_WINDOWS) 137 | //printf("Sleeping %d", core); 138 | if (g_WaitAddress == NULL) { 139 | // For Windows 7 we just sleep 140 | Sleep(THREADER->GlobalWorkerParams.SleepTime); 141 | } 142 | else { 143 | if (!didSomeWork) { 144 | 145 | //workIndexCompleted++; 146 | } 147 | 148 | LOGGING("[%d][%d] WaitAddress %llu %p %d\n", core, pool, workIndexCompleted, &(pWorkerRing->Pool[pool].WorkIndex), (int)didSomeWork); 149 | 150 | // Otherwise wake up using conditional variable 151 | g_WaitAddress( 152 | &(pWorkerRing->Pool[pool].WorkIndex), 153 | (PVOID)&workIndexCompleted, 154 | 8, // The size of the value being waited on (i.e. the number of bytes to read from the two pointers then compare). 
155 | 1000000L); 156 | } 157 | #elif defined(RT_OS_LINUX) 158 | 159 | LOGGING("[%d] WaitAddress %llu %llu %d\n", core, workIndexCompleted, pWorkerRing->Pool[pool].WorkIndex, (int)didSomeWork); 160 | 161 | //int futex(int *uaddr, int futex_op, int val, 162 | // const struct timespec *timeout, /* or: uint32_t val2 */ 163 | // int *uaddr2, int val3); 164 | futex((int*)&(pWorkerRing->Pool[pool].WorkIndex), FUTEX_WAIT, (int)workIndexCompleted, NULL, NULL, 0); 165 | 166 | #elif defined(RT_OS_DARWIN) 167 | LOGGING("[%lu] WaitAddress %llu %llu %d\n", core, workIndexCompleted, pWorkerRing->Pool[pool].WorkIndex, (int)didSomeWork); 168 | 169 | pthread_mutex_lock(&g_WakeupMutex); 170 | pthread_cond_wait(&g_WakeupCond, &g_WakeupMutex); 171 | pthread_mutex_unlock(&g_WakeupMutex); 172 | 173 | #else 174 | #error riptide MathThreads support needs to be implemented for this platform. 175 | 176 | #endif 177 | 178 | LOGGING("Waking %d %d\n", core, pool); 179 | 180 | //YieldProcessor(); 181 | } 182 | //YieldProcessor(); 183 | } 184 | 185 | LOGERROR("Thread %d exiting!!!\n", (int)core); 186 | #if defined(RT_OS_WINDOWS) 187 | return 0; 188 | #else 189 | return NULL; 190 | #endif 191 | } 192 | 193 | 194 | #if defined(RT_OS_WINDOWS) 195 | 196 | //----------------------------------------------------------- 197 | // 198 | THANDLE StartThread(stWorkerRing* pWorkerRing) 199 | { 200 | DWORD dwThreadId; 201 | THANDLE hThread; 202 | 203 | hThread = CreateThread( 204 | NULL, // default security attributes 205 | 0, // use default stack size 206 | WorkerThreadFunction, // thread function 207 | pWorkerRing, // argument to thread function 208 | 0, // use default creation flags 209 | &dwThreadId); // returns the thread identifier 210 | 211 | //printf("The thread ID: %d.\n", dwThreadId); 212 | 213 | // Check the return value for success. If something wrong... 
214 | if (hThread == NULL) { 215 | LOGERROR("CreateThread() failed, error: %d.\n", GetLastError()); 216 | return NULL; 217 | } 218 | 219 | return hThread; 220 | 221 | } 222 | 223 | #else 224 | 225 | //----------------------------------------------------------- 226 | // 227 | THANDLE StartThread(stWorkerRing* pWorkerRing) 228 | { 229 | int err; 230 | THANDLE hThread; 231 | 232 | err = pthread_create(&hThread, NULL, &WorkerThreadFunction, pWorkerRing); 233 | 234 | if (err != 0) { 235 | LOGERROR("*** Cannot create thread :[%s]\n", strerror(err)); 236 | } 237 | 238 | return hThread; 239 | } 240 | #endif 241 | 242 | //============================================================================================ 243 | #if defined(__GNUC__) 244 | # define MEM_STATIC static __inline __attribute__((unused)) 245 | #elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) 246 | # define MEM_STATIC static inline 247 | #elif defined(_MSC_VER) 248 | # define MEM_STATIC static __inline 249 | #else 250 | # define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ 251 | #endif 252 | 253 | typedef unsigned int U32; 254 | 255 | // Taken from the ZSTD project 256 | MEM_STATIC ATOP_cpuid_t ATOP_cpuid(void) { 257 | U32 f1c = 0; 258 | U32 f1d = 0; 259 | U32 f7b = 0; 260 | U32 f7c = 0; 261 | #ifdef _MSC_VER 262 | int reg[4]; 263 | __cpuid((int*)reg, 0); 264 | { 265 | int const n = reg[0]; 266 | if (n >= 1) { 267 | __cpuid((int*)reg, 1); 268 | f1c = (U32)reg[2]; 269 | f1d = (U32)reg[3]; 270 | } 271 | if (n >= 7) { 272 | __cpuidex((int*)reg, 7, 0); 273 | f7b = (U32)reg[1]; 274 | f7c = (U32)reg[2]; 275 | } 276 | } 277 | #elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) 278 | /* The following block like the normal cpuid branch below, but gcc 279 | * reserves ebx for use of its pic register so we must specially 280 | * handle the save and restore to avoid clobbering the register 281 | */ 282 | U32 n; 283 | __asm__( 284 | "pushl %%ebx\n\t" 285 | "cpuid\n\t" 286 | "popl %%ebx\n\t" 287 | : "=a"(n) 288 | : "a"(0) 289 | : "ecx", "edx"); 290 | if (n >= 1) { 291 | U32 f1a; 292 | __asm__( 293 | "pushl %%ebx\n\t" 294 | "cpuid\n\t" 295 | "popl %%ebx\n\t" 296 | : "=a"(f1a), "=c"(f1c), "=d"(f1d) 297 | : "a"(1)); 298 | } 299 | if (n >= 7) { 300 | __asm__( 301 | "pushl %%ebx\n\t" 302 | "cpuid\n\t" 303 | "movl %%ebx, %%eax\n\r" 304 | "popl %%ebx" 305 | : "=a"(f7b), "=c"(f7c) 306 | : "a"(7), "c"(0) 307 | : "edx"); 308 | } 309 | #elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) 310 | U32 n; 311 | __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx"); 312 | if (n >= 1) { 313 | U32 f1a; 314 | __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx"); 315 | } 316 | if (n >= 7) { 317 | U32 f7a; 318 | __asm__("cpuid" 319 | : "=a"(f7a), "=b"(f7b), "=c"(f7c) 320 | : "a"(7), "c"(0) 321 | : "edx"); 322 | } 323 | #endif 324 | { 325 | ATOP_cpuid_t cpuid; 326 | cpuid.f1c = f1c; 327 | cpuid.f1d = f1d; 328 | cpuid.f7b = f7b; 329 | cpuid.f7c = f7c; 330 | return cpuid; 331 | } 332 | } 333 | 334 | #define X(name, r, bit) \ 335 | MEM_STATIC int ATOP_cpuid_##name(ATOP_cpuid_t const cpuid) { \ 336 | return ((cpuid.r) & (1U << bit)) != 0; \ 337 | } 338 | 339 | /* cpuid(1): Processor Info and Feature Bits. 
*/ 340 | #define C(name, bit) X(name, f1c, bit) 341 | C(sse3, 0) 342 | C(pclmuldq, 1) 343 | C(dtes64, 2) 344 | C(monitor, 3) 345 | C(dscpl, 4) 346 | C(vmx, 5) 347 | C(smx, 6) 348 | C(eist, 7) 349 | C(tm2, 8) 350 | C(ssse3, 9) 351 | C(cnxtid, 10) 352 | C(fma, 12) 353 | C(cx16, 13) 354 | C(xtpr, 14) 355 | C(pdcm, 15) 356 | C(pcid, 17) 357 | C(dca, 18) 358 | C(sse41, 19) 359 | C(sse42, 20) 360 | C(x2apic, 21) 361 | C(movbe, 22) 362 | C(popcnt, 23) 363 | C(tscdeadline, 24) 364 | C(aes, 25) 365 | C(xsave, 26) 366 | C(osxsave, 27) 367 | C(avx, 28) 368 | C(f16c, 29) 369 | C(rdrand, 30) 370 | #undef C 371 | #define D(name, bit) X(name, f1d, bit) 372 | D(fpu, 0) 373 | D(vme, 1) 374 | D(de, 2) 375 | D(pse, 3) 376 | D(tsc, 4) 377 | D(msr, 5) 378 | D(pae, 6) 379 | D(mce, 7) 380 | D(cx8, 8) 381 | D(apic, 9) 382 | D(sep, 11) 383 | D(mtrr, 12) 384 | D(pge, 13) 385 | D(mca, 14) 386 | D(cmov, 15) 387 | D(pat, 16) 388 | D(pse36, 17) 389 | D(psn, 18) 390 | D(clfsh, 19) 391 | D(ds, 21) 392 | D(acpi, 22) 393 | D(mmx, 23) 394 | D(fxsr, 24) 395 | D(sse, 25) 396 | D(sse2, 26) 397 | D(ss, 27) 398 | D(htt, 28) 399 | D(tm, 29) 400 | D(pbe, 31) 401 | #undef D 402 | 403 | /* cpuid(7): Extended Features. */ 404 | #define B(name, bit) X(name, f7b, bit) 405 | B(bmi1, 3) 406 | B(hle, 4) 407 | B(avx2, 5) 408 | B(smep, 7) 409 | B(bmi2, 8) 410 | B(erms, 9) 411 | B(invpcid, 10) 412 | B(rtm, 11) 413 | B(mpx, 14) 414 | B(avx512f, 16) 415 | B(avx512dq, 17) 416 | B(rdseed, 18) 417 | B(adx, 19) 418 | B(smap, 20) 419 | B(avx512ifma, 21) 420 | B(pcommit, 22) 421 | B(clflushopt, 23) 422 | B(clwb, 24) 423 | B(avx512pf, 26) 424 | B(avx512er, 27) 425 | B(avx512cd, 28) 426 | B(sha, 29) 427 | B(avx512bw, 30) 428 | B(avx512vl, 31) 429 | #undef B 430 | #define C(name, bit) X(name, f7c, bit) 431 | C(prefetchwt1, 0) 432 | C(avx512vbmi, 1) 433 | #undef C 434 | 435 | #undef X 436 | 437 | extern "C" { 438 | int g_bmi2 = 0; 439 | int g_avx2 = 0; 440 | ATOP_cpuid_t g_cpuid; 441 | }; 442 | 443 | #if defined(RT_OS_WINDOWS) 444 | 445 | void PrintCPUInfo(char* buffer, size_t buffercount) { 446 | int CPUInfo[4] = { -1 }; 447 | unsigned nExIds, i = 0; 448 | char CPUBrandString[0x40]; 449 | // Get the information associated with each extended ID. 
450 | __cpuid(CPUInfo, 0x80000000); 451 | nExIds = CPUInfo[0]; 452 | 453 | for (unsigned int i = 0x80000000; i <= nExIds; ++i) 454 | { 455 | __cpuid(CPUInfo, i); 456 | 457 | if (i == 0x80000002) { 458 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 459 | CPUBrandString[i] = ((char*)CPUInfo)[i]; 460 | } 461 | else if (i == 0x80000003) { 462 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 463 | CPUBrandString[i + 16] = ((char*)CPUInfo)[i]; 464 | } 465 | else if (i == 0x80000004) { 466 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 467 | CPUBrandString[i + 32] = ((char*)CPUInfo)[i]; 468 | } 469 | } 470 | 471 | // NEW CODE 472 | g_cpuid = ATOP_cpuid(); 473 | 474 | g_bmi2 = ATOP_cpuid_bmi2(g_cpuid); 475 | g_avx2 = ATOP_cpuid_avx2(g_cpuid); 476 | 477 | snprintf(buffer, buffercount, "**CPU: %s AVX2:%d BMI2:%d f1c:0x%.8x f1d:0x%.8x f7b:0x%.8x f7c:0x%.8x", CPUBrandString, g_avx2, g_bmi2, g_cpuid.f1c, g_cpuid.f1d, g_cpuid.f7b, g_cpuid.f7c); 478 | if (g_avx2 == 0) { 479 | printf("!!!NOTE: this system does not support AVX2 or BMI2 instructions, and will not work!\n"); 480 | } 481 | 482 | } 483 | 484 | #else 485 | extern "C" { 486 | #include 487 | #include 488 | #include 489 | 490 | #include 491 | #include 492 | 493 | #ifdef RT_OS_FREEBSD 494 | #include // Use thr_self() syscall under FreeBSD to get thread id 495 | #endif // RT_OS_FREEBSD 496 | 497 | pid_t gettid(void) { 498 | #if defined(RT_OS_LINUX) 499 | return syscall(SYS_gettid); 500 | 501 | #elif defined(RT_OS_DARWIN) 502 | uint64_t thread_id; 503 | return pthread_threadid_np(NULL, &thread_id) ? 0 : (pid_t)thread_id; 504 | 505 | #elif defined(RT_OS_FREEBSD) 506 | // https://www.freebsd.org/cgi/man.cgi?query=thr_self 507 | long thread_id; 508 | return thr_self(&thread_id) ? 0 : (pid_t)thread_id; 509 | 510 | #else 511 | #error Cannot determine how to get the identifier for the current thread on this platform. 512 | #endif // defined(RT_OS_LINUX) 513 | } 514 | 515 | 516 | VOID Sleep(DWORD dwMilliseconds) { 517 | usleep(dwMilliseconds * 1000); 518 | } 519 | 520 | BOOL CloseHandle(THANDLE hObject) { 521 | return TRUE; 522 | } 523 | 524 | pid_t GetCurrentThread() { 525 | return gettid(); 526 | } 527 | 528 | uint64_t SetThreadAffinityMask(pid_t hThread, uint64_t dwThreadAffinityMask) { 529 | #if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 530 | cpu_set_t cpuset; 531 | 532 | uint64_t bitpos = 1; 533 | int count = 0; 534 | 535 | while (!(bitpos & dwThreadAffinityMask)) { 536 | bitpos <<= 1; 537 | count++; 538 | if (count > 63) { 539 | break; 540 | } 541 | } 542 | 543 | //printf("**linux setting affinity %d\n", count); 544 | 545 | if (count <= 63) { 546 | 547 | CPU_ZERO(&cpuset); 548 | CPU_SET(count, &cpuset); 549 | //dwThreadAffinityMask 550 | sched_setaffinity(GetCurrentThread(), sizeof(cpuset), &cpuset); 551 | } 552 | 553 | #else 554 | #warning No thread - affinity support implemented for this OS.This does not prevent riptide from running but overall performance may be reduced. 
555 | #endif // defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 556 | 557 | return 0; 558 | } 559 | 560 | BOOL GetProcessAffinityMask(HANDLE hProcess, uint64_t* lpProcessAffinityMask, uint64_t* lpSystemAffinityMask) { 561 | #if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 562 | cpu_set_t cpuset; 563 | sched_getaffinity(getpid(), sizeof(cpuset), &cpuset); 564 | 565 | *lpProcessAffinityMask = 0; 566 | *lpSystemAffinityMask = 0; 567 | 568 | uint64_t bitpos = 1; 569 | for (int i = 0; i < 63; i++) { 570 | if (CPU_ISSET(i, &cpuset)) { 571 | *lpProcessAffinityMask |= bitpos; 572 | *lpSystemAffinityMask |= bitpos; 573 | } 574 | bitpos <<= 1; 575 | } 576 | 577 | if (*lpProcessAffinityMask == 0) { 578 | *lpSystemAffinityMask = 0xFF; 579 | *lpSystemAffinityMask = 0xFF; 580 | } 581 | 582 | //CPU_ISSET = 0xFF; 583 | return TRUE; 584 | 585 | #else 586 | #warning No thread - affinity support implemented for this OS.This does not prevent riptide from running but overall performance may be reduced. 587 | return FALSE; 588 | 589 | #endif // defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) 590 | } 591 | 592 | 593 | HANDLE GetCurrentProcess(VOID) { 594 | return NULL; 595 | } 596 | 597 | DWORD GetLastError(VOID) { 598 | return 0; 599 | } 600 | 601 | HANDLE CreateThread(VOID* lpThreadAttributes, SIZE_T dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags, LPDWORD lpThreadId) { 602 | return NULL; 603 | } 604 | 605 | HMODULE LoadLibraryW(const WCHAR* lpLibFileName) { 606 | return NULL; 607 | } 608 | 609 | FARPROC GetProcAddress(HMODULE hModule, const char* lpProcName) { 610 | return NULL; 611 | } 612 | } 613 | 614 | #include 615 | 616 | void PrintCPUInfo(char* buffer, size_t buffercount) { 617 | char CPUBrandString[0x40]; 618 | unsigned int CPUInfo[4] = { 0,0,0,0 }; 619 | 620 | __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); 621 | unsigned int nExIds = CPUInfo[0]; 622 | 623 | for (size_t i = 0; i < sizeof(CPUBrandString); i++) { 624 | CPUBrandString[i] = 0; 625 | } 626 | 627 | for (unsigned int i = 0x80000000; i <= nExIds; ++i) 628 | { 629 | __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); 630 | 631 | if (i == 0x80000002) { 632 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 633 | CPUBrandString[i] = ((char*)CPUInfo)[i]; 634 | } 635 | else if (i == 0x80000003) { 636 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 637 | CPUBrandString[i + 16] = ((char*)CPUInfo)[i]; 638 | } 639 | else if (i == 0x80000004) { 640 | for (size_t i = 0; i < sizeof(CPUInfo); i++) 641 | CPUBrandString[i + 32] = ((char*)CPUInfo)[i]; 642 | } 643 | } 644 | //printf("**CPU: %s\n", CPUBrandString); 645 | 646 | g_cpuid = ATOP_cpuid(); 647 | 648 | g_bmi2 = ATOP_cpuid_bmi2(g_cpuid); 649 | g_avx2 = ATOP_cpuid_avx2(g_cpuid); 650 | 651 | snprintf(buffer, buffercount, "**CPU: %s AVX2:%d BMI2:%d 0x%.8x 0x%.8x 0x%.8x 0x%.8x", CPUBrandString, g_avx2, g_bmi2, g_cpuid.f1c, g_cpuid.f1d, g_cpuid.f7b, g_cpuid.f7c); 652 | if (g_avx2 == 0) { 653 | printf("!!!NOTE: this system does not support AVX2 or BMI2 instructions, and will not work!\n"); 654 | } 655 | 656 | } 657 | 658 | #endif 659 | 660 | 661 | 662 | int GetProcCount() { 663 | 664 | HANDLE proc = GetCurrentProcess(); 665 | 666 | DWORD_PTR mask1; 667 | DWORD_PTR mask2; 668 | int count; 669 | 670 | count = 0; 671 | GetProcessAffinityMask(proc, &mask1, &mask2); 672 | 673 | while (mask1 != 0) { 674 | if (mask1 & 1) count++; 675 | mask1 = mask1 >> 1; 676 | } 677 | 678 | //printf("**Process count: %d riptide_cpp build date and time: %s 
%s\n", count, __DATE__, __TIME__); 679 | 680 | if (count == 0) count = MAX_THREADS_WHEN_CANNOT_DETECT; 681 | 682 | if (count > MAX_THREADS_ALLOWED) count = MAX_THREADS_ALLOWED; 683 | 684 | return count; 685 | 686 | } 687 | -------------------------------------------------------------------------------- /src/pnumpy/sort.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | __all__ = [ 5 | 'lexsort','sort', 'argsort','argmin', 'argmax', 'searchsorted'] 6 | 7 | from pnumpy._pnumpy import getitem, lexsort32, lexsort64 8 | import numpy as np 9 | 10 | from numpy import asarray, array, asanyarray 11 | from numpy import concatenate 12 | 13 | #array_function_dispatch = functools.partial( 14 | # overrides.array_function_dispatch, module='numpy') 15 | 16 | # functions that are now methods 17 | def _wrapit(obj, method, *args, **kwds): 18 | try: 19 | wrap = obj.__array_wrap__ 20 | except AttributeError: 21 | wrap = None 22 | result = getattr(asarray(obj), method)(*args, **kwds) 23 | if wrap: 24 | if not isinstance(result, mu.ndarray): 25 | result = asarray(result) 26 | result = wrap(result) 27 | return result 28 | 29 | 30 | def _wrapfunc(obj, method, *args, **kwds): 31 | bound = getattr(obj, method, None) 32 | if bound is None: 33 | return _wrapit(obj, method, *args, **kwds) 34 | 35 | try: 36 | return bound(*args, **kwds) 37 | except TypeError: 38 | # A TypeError occurs if the object does have such a method in its 39 | # class, but its signature is not identical to that of NumPy's. This 40 | # situation has occurred in the case of a downstream library like 41 | # 'pandas'. 42 | # 43 | # Call _wrapit from within the except clause to ensure a potential 44 | # exception has a traceback chain. 45 | return _wrapit(obj, method, *args, **kwds) 46 | 47 | 48 | def sort(a, axis=-1, kind=None, order=None): 49 | """ 50 | Return a sorted copy of an array. 51 | 52 | Parameters 53 | ---------- 54 | a : array_like 55 | Array to be sorted. 56 | axis : int or None, optional 57 | Axis along which to sort. If None, the array is flattened before 58 | sorting. The default is -1, which sorts along the last axis. 59 | kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional 60 | Sorting algorithm. The default is 'quicksort'. Note that both 'stable' 61 | and 'mergesort' use timsort or radix sort under the covers and, in general, 62 | the actual implementation will vary with data type. The 'mergesort' option 63 | is retained for backwards compatibility. 64 | 65 | .. versionchanged:: 1.15.0. 66 | The 'stable' option was added. 67 | 68 | order : str or list of str, optional 69 | When `a` is an array with fields defined, this argument specifies 70 | which fields to compare first, second, etc. A single field can 71 | be specified as a string, and not all fields need be specified, 72 | but unspecified fields will still be used, in the order in which 73 | they come up in the dtype, to break ties. 74 | 75 | Returns 76 | ------- 77 | sorted_array : ndarray 78 | Array of the same type and shape as `a`. 79 | 80 | Threading 81 | --------- 82 | Up to 8 threads 83 | 84 | See Also 85 | -------- 86 | ndarray.sort : Method to sort an array in-place. 87 | argsort : Indirect sort. 88 | lexsort : Indirect stable sort on multiple keys. 89 | searchsorted : Find elements in a sorted array. 90 | partition : Partial sort. 
91 | 92 | Notes 93 | ----- 94 | The various sorting algorithms are characterized by their average speed, 95 | worst case performance, work space size, and whether they are stable. A 96 | stable sort keeps items with the same key in the same relative 97 | order. The four algorithms implemented in NumPy have the following 98 | properties: 99 | 100 | =========== ======= ============= ============ ======== 101 | kind speed worst case work space stable 102 | =========== ======= ============= ============ ======== 103 | 'quicksort' 1 O(n^2) 0 no 104 | 'heapsort' 3 O(n*log(n)) 0 no 105 | 'mergesort' 2 O(n*log(n)) ~n/2 yes 106 | 'timsort' 2 O(n*log(n)) ~n/2 yes 107 | =========== ======= ============= ============ ======== 108 | 109 | .. note:: The datatype determines which of 'mergesort' or 'timsort' 110 | is actually used, even if 'mergesort' is specified. User selection 111 | at a finer scale is not currently available. 112 | 113 | All the sort algorithms make temporary copies of the data when 114 | sorting along any but the last axis. Consequently, sorting along 115 | the last axis is faster and uses less space than sorting along 116 | any other axis. 117 | 118 | The sort order for complex numbers is lexicographic. If both the real 119 | and imaginary parts are non-nan then the order is determined by the 120 | real parts except when they are equal, in which case the order is 121 | determined by the imaginary parts. 122 | 123 | Previous to numpy 1.4.0 sorting real and complex arrays containing nan 124 | values led to undefined behaviour. In numpy versions >= 1.4.0 nan 125 | values are sorted to the end. The extended sort order is: 126 | 127 | * Real: [R, nan] 128 | * Complex: [R + Rj, R + nanj, nan + Rj, nan + nanj] 129 | 130 | where R is a non-nan real value. Complex values with the same nan 131 | placements are sorted according to the non-nan part if it exists. 132 | Non-nan values are sorted as before. 133 | 134 | .. versionadded:: 1.12.0 135 | 136 | quicksort has been changed to `introsort `_. 137 | When sorting does not make enough progress it switches to 138 | `heapsort `_. 139 | This implementation makes quicksort O(n*log(n)) in the worst case. 140 | 141 | 'stable' automatically chooses the best stable sorting algorithm 142 | for the data type being sorted. 143 | It, along with 'mergesort' is currently mapped to 144 | `timsort `_ 145 | or `radix sort `_ 146 | depending on the data type. 147 | API forward compatibility currently limits the 148 | ability to select the implementation and it is hardwired for the different 149 | data types. 150 | 151 | .. versionadded:: 1.17.0 152 | 153 | Timsort is added for better performance on already or nearly 154 | sorted data. On random data timsort is almost identical to 155 | mergesort. It is now used for stable sort while quicksort is still the 156 | default sort if none is chosen. For timsort details, refer to 157 | `CPython listsort.txt `_. 158 | 'mergesort' and 'stable' are mapped to radix sort for integer data types. Radix sort is an 159 | O(n) sort instead of O(n log n). 160 | 161 | .. versionchanged:: 1.18.0 162 | 163 | NaT now sorts to the end of arrays for consistency with NaN. 
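A short illustration of the NaN ordering described above (output formatting follows recent NumPy):

>>> np.sort(np.array([3.0, np.nan, 1.0]))  # doctest: +SKIP
array([ 1.,  3., nan])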
164 | 165 | Examples 166 | -------- 167 | >>> a = np.array([[1,4],[3,1]]) 168 | >>> np.sort(a) # sort along the last axis 169 | array([[1, 4], 170 | [1, 3]]) 171 | >>> np.sort(a, axis=None) # sort the flattened array 172 | array([1, 1, 3, 4]) 173 | >>> np.sort(a, axis=0) # sort along the first axis 174 | array([[1, 1], 175 | [3, 4]]) 176 | 177 | Use the `order` keyword to specify a field to use when sorting a 178 | structured array: 179 | 180 | >>> dtype = [('name', 'S10'), ('height', float), ('age', int)] 181 | >>> values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38), 182 | ... ('Galahad', 1.7, 38)] 183 | >>> a = np.array(values, dtype=dtype) # create a structured array 184 | >>> np.sort(a, order='height') # doctest: +SKIP 185 | array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41), 186 | ('Lancelot', 1.8999999999999999, 38)], 187 | dtype=[('name', '|S10'), ('height', '>> np.sort(a, order=['age', 'height']) # doctest: +SKIP 192 | array([('Galahad', 1.7, 38), ('Lancelot', 1.8999999999999999, 38), 193 | ('Arthur', 1.8, 41)], 194 | dtype=[('name', '|S10'), ('height', '>> surnames = ('Hertz', 'Galilei', 'Hertz') 255 | >>> first_names = ('Heinrich', 'Galileo', 'Gustav') 256 | >>> ind = np.lexsort((first_names, surnames)) 257 | >>> ind 258 | array([1, 2, 0]) 259 | 260 | >>> [surnames[i] + ", " + first_names[i] for i in ind] 261 | ['Galilei, Galileo', 'Hertz, Gustav', 'Hertz, Heinrich'] 262 | 263 | Sort two columns of numbers: 264 | 265 | >>> a = [1,5,1,4,3,4,4] # First column 266 | >>> b = [9,4,0,4,0,2,1] # Second column 267 | >>> ind = np.lexsort((b,a)) # Sort by a, then by b 268 | >>> ind 269 | array([2, 0, 4, 6, 5, 3, 1]) 270 | 271 | >>> [(a[i],b[i]) for i in ind] 272 | [(1, 0), (1, 9), (3, 0), (4, 1), (4, 2), (4, 4), (5, 4)] 273 | 274 | Note that sorting is first according to the elements of ``a``. 275 | Secondary sorting is according to the elements of ``b``. 276 | 277 | A normal ``argsort`` would have yielded: 278 | 279 | >>> [(a[i],b[i]) for i in np.argsort(a)] 280 | [(1, 9), (1, 0), (3, 0), (4, 4), (4, 2), (4, 1), (5, 4)] 281 | 282 | Structured arrays are sorted lexically by ``argsort``: 283 | 284 | >>> x = np.array([(1,9), (5,4), (1,0), (4,4), (3,0), (4,2), (4,1)], 285 | ... dtype=np.dtype([('x', int), ('y', int)])) 286 | 287 | >>> np.argsort(x) # or np.argsort(x, order=('x', 'y')) 288 | array([2, 0, 4, 6, 5, 3, 1]) 289 | """ 290 | 291 | try: 292 | return lexsort32(*args, **kwargs) 293 | except Exception: 294 | return np.lexsort(*args, **kwargs) 295 | 296 | def argsort(a, axis=-1, kind=None, order=None): 297 | """ 298 | Returns the indices that would sort an array. 299 | 300 | Perform an indirect sort along the given axis using the algorithm specified 301 | by the `kind` keyword. It returns an array of indices of the same shape as 302 | `a` that index data along the given axis in sorted order. 303 | 304 | Parameters 305 | ---------- 306 | a : array_like 307 | Array to sort. 308 | axis : int or None, optional 309 | Axis along which to sort. The default is -1 (the last axis). If None, 310 | the flattened array is used. 311 | kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional 312 | Sorting algorithm. The default is 'quicksort'. Note that both 'stable' 313 | and 'mergesort' use timsort under the covers and, in general, the 314 | actual implementation will vary with data type. The 'mergesort' option 315 | is retained for backwards compatibility. 316 | 317 | .. versionchanged:: 1.15.0. 318 | The 'stable' option was added. 
319 | order : str or list of str, optional 320 | When `a` is an array with fields defined, this argument specifies 321 | which fields to compare first, second, etc. A single field can 322 | be specified as a string, and not all fields need be specified, 323 | but unspecified fields will still be used, in the order in which 324 | they come up in the dtype, to break ties. 325 | 326 | Returns 327 | ------- 328 | index_array : ndarray, int 329 | Array of indices that sort `a` along the specified `axis`. 330 | If `a` is one-dimensional, ``a[index_array]`` yields a sorted `a`. 331 | More generally, ``np.take_along_axis(a, index_array, axis=axis)`` 332 | always yields the sorted `a`, irrespective of dimensionality. 333 | 334 | See Also 335 | -------- 336 | sort : Describes sorting algorithms used. 337 | lexsort : Indirect stable sort with multiple keys. 338 | ndarray.sort : Inplace sort. 339 | argpartition : Indirect partial sort. 340 | take_along_axis : Apply ``index_array`` from argsort 341 | to an array as if by calling sort. 342 | 343 | Notes 344 | ----- 345 | See `sort` for notes on the different sorting algorithms. 346 | 347 | As of NumPy 1.4.0 `argsort` works with real/complex arrays containing 348 | nan values. The enhanced sort order is documented in `sort`. 349 | 350 | Examples 351 | -------- 352 | One dimensional array: 353 | 354 | >>> x = np.array([3, 1, 2]) 355 | >>> np.argsort(x) 356 | array([1, 2, 0]) 357 | 358 | Two-dimensional array: 359 | 360 | >>> x = np.array([[0, 3], [2, 2]]) 361 | >>> x 362 | array([[0, 3], 363 | [2, 2]]) 364 | 365 | >>> ind = np.argsort(x, axis=0) # sorts along first axis (down) 366 | >>> ind 367 | array([[0, 1], 368 | [1, 0]]) 369 | >>> np.take_along_axis(x, ind, axis=0) # same as np.sort(x, axis=0) 370 | array([[0, 2], 371 | [2, 3]]) 372 | 373 | >>> ind = np.argsort(x, axis=1) # sorts along last axis (across) 374 | >>> ind 375 | array([[0, 1], 376 | [0, 1]]) 377 | >>> np.take_along_axis(x, ind, axis=1) # same as np.sort(x, axis=1) 378 | array([[0, 3], 379 | [2, 2]]) 380 | 381 | Indices of the sorted elements of a N-dimensional array: 382 | 383 | >>> ind = np.unravel_index(np.argsort(x, axis=None), x.shape) 384 | >>> ind 385 | (array([0, 1, 1, 0]), array([0, 0, 1, 1])) 386 | >>> x[ind] # same as np.sort(x, axis=None) 387 | array([0, 2, 2, 3]) 388 | 389 | Sorting with keys: 390 | 391 | >>> x = np.array([(1, 0), (0, 1)], dtype=[('x', '>> x 393 | array([(1, 0), (0, 1)], 394 | dtype=[('x', '>> np.argsort(x, order=('x','y')) 397 | array([1, 0]) 398 | 399 | >>> np.argsort(x, order=('y','x')) 400 | array([0, 1]) 401 | 402 | """ 403 | return _wrapfunc(a, 'argsort', axis=axis, kind=kind, order=order) 404 | 405 | 406 | def _argmax_dispatcher(a, axis=None, out=None): 407 | return (a, out) 408 | 409 | 410 | def argmax(a, axis=None, out=None): 411 | """ 412 | Returns the indices of the maximum values along an axis. 413 | 414 | Parameters 415 | ---------- 416 | a : array_like 417 | Input array. 418 | axis : int, optional 419 | By default, the index is into the flattened array, otherwise 420 | along the specified axis. 421 | out : array, optional 422 | If provided, the result will be inserted into this array. It should 423 | be of the appropriate shape and dtype. 424 | 425 | Returns 426 | ------- 427 | index_array : ndarray of ints 428 | Array of indices into the array. It has the same shape as `a.shape` 429 | with the dimension along `axis` removed. 430 | 431 | See Also 432 | -------- 433 | ndarray.argmax, argmin 434 | amax : The maximum value along a given axis. 
435 | unravel_index : Convert a flat index into an index tuple. 436 | take_along_axis : Apply ``np.expand_dims(index_array, axis)`` 437 | from argmax to an array as if by calling max. 438 | 439 | Notes 440 | ----- 441 | In case of multiple occurrences of the maximum values, the indices 442 | corresponding to the first occurrence are returned. 443 | 444 | Examples 445 | -------- 446 | >>> a = np.arange(6).reshape(2,3) + 10 447 | >>> a 448 | array([[10, 11, 12], 449 | [13, 14, 15]]) 450 | >>> np.argmax(a) 451 | 5 452 | >>> np.argmax(a, axis=0) 453 | array([1, 1, 1]) 454 | >>> np.argmax(a, axis=1) 455 | array([2, 2]) 456 | 457 | Indexes of the maximal elements of a N-dimensional array: 458 | 459 | >>> ind = np.unravel_index(np.argmax(a, axis=None), a.shape) 460 | >>> ind 461 | (1, 2) 462 | >>> a[ind] 463 | 15 464 | 465 | >>> b = np.arange(6) 466 | >>> b[1] = 5 467 | >>> b 468 | array([0, 5, 2, 3, 4, 5]) 469 | >>> np.argmax(b) # Only the first occurrence is returned. 470 | 1 471 | 472 | >>> x = np.array([[4,2,3], [1,0,3]]) 473 | >>> index_array = np.argmax(x, axis=-1) 474 | >>> # Same as np.max(x, axis=-1, keepdims=True) 475 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1) 476 | array([[4], 477 | [3]]) 478 | >>> # Same as np.max(x, axis=-1) 479 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1).squeeze(axis=-1) 480 | array([4, 3]) 481 | 482 | """ 483 | return _wrapfunc(a, 'argmax', axis=axis, out=out) 484 | 485 | 486 | def _argmin_dispatcher(a, axis=None, out=None): 487 | return (a, out) 488 | 489 | 490 | def argmin(a, axis=None, out=None): 491 | """ 492 | Returns the indices of the minimum values along an axis. 493 | 494 | Parameters 495 | ---------- 496 | a : array_like 497 | Input array. 498 | axis : int, optional 499 | By default, the index is into the flattened array, otherwise 500 | along the specified axis. 501 | out : array, optional 502 | If provided, the result will be inserted into this array. It should 503 | be of the appropriate shape and dtype. 504 | 505 | Returns 506 | ------- 507 | index_array : ndarray of ints 508 | Array of indices into the array. It has the same shape as `a.shape` 509 | with the dimension along `axis` removed. 510 | 511 | See Also 512 | -------- 513 | ndarray.argmin, argmax 514 | amin : The minimum value along a given axis. 515 | unravel_index : Convert a flat index into an index tuple. 516 | take_along_axis : Apply ``np.expand_dims(index_array, axis)`` 517 | from argmin to an array as if by calling min. 518 | 519 | Notes 520 | ----- 521 | In case of multiple occurrences of the minimum values, the indices 522 | corresponding to the first occurrence are returned. 523 | 524 | Examples 525 | -------- 526 | >>> a = np.arange(6).reshape(2,3) + 10 527 | >>> a 528 | array([[10, 11, 12], 529 | [13, 14, 15]]) 530 | >>> np.argmin(a) 531 | 0 532 | >>> np.argmin(a, axis=0) 533 | array([0, 0, 0]) 534 | >>> np.argmin(a, axis=1) 535 | array([0, 0]) 536 | 537 | Indices of the minimum elements of a N-dimensional array: 538 | 539 | >>> ind = np.unravel_index(np.argmin(a, axis=None), a.shape) 540 | >>> ind 541 | (0, 0) 542 | >>> a[ind] 543 | 10 544 | 545 | >>> b = np.arange(6) + 10 546 | >>> b[4] = 10 547 | >>> b 548 | array([10, 11, 12, 13, 10, 15]) 549 | >>> np.argmin(b) # Only the first occurrence is returned. 
550 | 0 551 | 552 | >>> x = np.array([[4,2,3], [1,0,3]]) 553 | >>> index_array = np.argmin(x, axis=-1) 554 | >>> # Same as np.min(x, axis=-1, keepdims=True) 555 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1) 556 | array([[2], 557 | [0]]) 558 | >>> # Same as np.min(x, axis=-1) 559 | >>> np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1).squeeze(axis=-1) 560 | array([2, 0]) 561 | 562 | """ 563 | return _wrapfunc(a, 'argmin', axis=axis, out=out) 564 | 565 | 566 | def _searchsorted_dispatcher(a, v, side=None, sorter=None): 567 | return (a, v, sorter) 568 | 569 | 570 | def searchsorted(a, v, side='left', sorter=None): 571 | """ 572 | Find indices where elements should be inserted to maintain order. 573 | 574 | Find the indices into a sorted array `a` such that, if the 575 | corresponding elements in `v` were inserted before the indices, the 576 | order of `a` would be preserved. 577 | 578 | Assuming that `a` is sorted: 579 | 580 | ====== ============================ 581 | `side` returned index `i` satisfies 582 | ====== ============================ 583 | left ``a[i-1] < v <= a[i]`` 584 | right ``a[i-1] <= v < a[i]`` 585 | ====== ============================ 586 | 587 | Parameters 588 | ---------- 589 | a : 1-D array_like 590 | Input array. If `sorter` is None, then it must be sorted in 591 | ascending order, otherwise `sorter` must be an array of indices 592 | that sort it. 593 | v : array_like 594 | Values to insert into `a`. 595 | side : {'left', 'right'}, optional 596 | If 'left', the index of the first suitable location found is given. 597 | If 'right', return the last such index. If there is no suitable 598 | index, return either 0 or N (where N is the length of `a`). 599 | sorter : 1-D array_like, optional 600 | Optional array of integer indices that sort array a into ascending 601 | order. They are typically the result of argsort. 602 | 603 | .. versionadded:: 1.7.0 604 | 605 | Returns 606 | ------- 607 | indices : array of ints 608 | Array of insertion points with the same shape as `v`. 609 | 610 | See Also 611 | -------- 612 | sort : Return a sorted copy of an array. 613 | histogram : Produce histogram from 1-D data. 614 | 615 | Notes 616 | ----- 617 | Binary search is used to find the required insertion points. 618 | 619 | As of NumPy 1.4.0 `searchsorted` works with real/complex arrays containing 620 | `nan` values. The enhanced sort order is documented in `sort`. 621 | 622 | This function uses the same algorithm as the built-in Python `bisect.bisect_left` 623 | (``side='left'``) and `bisect.bisect_right` (``side='right'``) functions, 624 | which is also vectorized in the `v` argument.
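The `sorter` argument described above lets an unsorted array be searched through its ``argsort`` permutation; a small illustration (the returned index refers to positions in the sorted order of `a`):

>>> a = np.array([40, 10, 30, 20])
>>> np.searchsorted(a, 25, sorter=np.argsort(a))
2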
625 | 626 | Examples 627 | -------- 628 | >>> np.searchsorted([1,2,3,4,5], 3) 629 | 2 630 | >>> np.searchsorted([1,2,3,4,5], 3, side='right') 631 | 3 632 | >>> np.searchsorted([1,2,3,4,5], [-10, 10, 2, 3]) 633 | array([0, 5, 1, 2]) 634 | 635 | """ 636 | return _wrapfunc(a, 'searchsorted', v, side=side, sorter=sorter) 637 | 638 | -------------------------------------------------------------------------------- /src/atop/ops_log.cpp: -------------------------------------------------------------------------------- 1 | #include "common_inc.h" 2 | #include 3 | #include "invalids.h" 4 | 5 | #if defined(__clang__) 6 | #pragma clang diagnostic ignored "-Wmissing-braces" 7 | #pragma clang diagnostic ignored "-Wunused-function" 8 | #pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) 9 | #endif 10 | 11 | #if defined(__GNUC__) 12 | //#pragma GCC target "arch=core-avx2,tune=core-avx2" 13 | #if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || !defined(__GNUC__) 14 | /* GCC >= 4.4 or clang or non-GCC compilers */ 15 | #include 16 | #elif __GNUC_PREREQ(4, 1) 17 | /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */ 18 | #include 19 | #endif 20 | #endif 21 | 22 | 23 | //#define LOGGING printf 24 | #define LOGGING(...) 25 | 26 | #if !RT_TARGET_VECTOR_MEMOP_DEFAULT_ALIGNED 27 | // MSVC compiler by default assumed unaligned loads 28 | #define LOADU(X) *(X) 29 | 30 | #else 31 | static const inline __m256d LOADU(__m256d* x) { return _mm256_loadu_pd((double const*)x); }; 32 | static const inline __m256 LOADU(__m256* x) { return _mm256_loadu_ps((float const*)x); }; 33 | static const inline __m256i LOADU(__m256i* x) { return _mm256_loadu_si256((__m256i const*)x); }; 34 | #endif 35 | 36 | static const inline __m256i LOADUI(__m256i* x) { return _mm256_loadu_si256((__m256i const*)x); }; 37 | 38 | static const inline void STOREU(__m256d* x, __m256d y) { _mm256_storeu_pd((double*)x, y); } 39 | static const inline void STOREU(__m256* x, __m256 y) { _mm256_storeu_ps((float*)x, y); } 40 | static const inline void STOREU(__m256i* x, __m256i y) { _mm256_storeu_si256((__m256i*)x, y); } 41 | 42 | // For aligned loads which must be on 32 byte boundary 43 | static const inline __m256d LOADA(__m256d* x) { return _mm256_load_pd((double const*)x); }; 44 | static const inline __m256 LOADA(__m256* x) { return _mm256_load_ps((float const*)x); }; 45 | static const inline __m256i LOADA(__m256i* x) { return _mm256_load_si256((__m256i const*)x); }; 46 | 47 | // Aligned stores 48 | static const inline void STOREA(__m256d* x, __m256d y) { _mm256_store_pd((double*)x, y); } 49 | static const inline void STOREA(__m256* x, __m256 y) { _mm256_store_ps((float*)x, y); } 50 | static const inline void STOREA(__m256i* x, __m256i y) { _mm256_store_si256((__m256i*)x, y); } 51 | 52 | 53 | template static const inline long double LOG_OP(long double x) { return logl(x); } 54 | template static const inline double LOG_OP(double x) { return log(x); } 55 | template static const inline float LOG_OP(float x) { return logf(x); } 56 | 57 | 58 | #if defined(RT_COMPILER_MSVC) 59 | 60 | template static const inline __m256 LOG_OP_256(__m256 x) { return _mm256_log_ps(x); } 61 | template static const inline __m256d LOG_OP_256(__m256d x) { return _mm256_log_pd(x); } 62 | #include 63 | #endif 64 | 65 | #if defined(__GNUC__) 66 | // May require -lm for linker 67 | 68 | extern "C" { 69 | __m256d _ZGVdN4v_log(__m256d x); 70 | __m256 _ZGVdN8v_logf(__m256 x); 71 | } 72 | 73 | template static const 
inline __m256 LOG_OP_256(__m256 x) { return _ZGVdN8v_logf(x); } 74 | template static const inline __m256d LOG_OP_256(__m256d x) { return _ZGVdN4v_log(x); } 75 | 76 | #endif 77 | 78 | 79 | # include 80 | 81 | void npy_set_floatstatus_divbyzero(void) 82 | { 83 | feraiseexcept(FE_DIVBYZERO); 84 | } 85 | 86 | void npy_set_floatstatus_overflow(void) 87 | { 88 | feraiseexcept(FE_OVERFLOW); 89 | } 90 | 91 | void npy_set_floatstatus_underflow(void) 92 | { 93 | feraiseexcept(FE_UNDERFLOW); 94 | } 95 | 96 | void npy_set_floatstatus_invalid(void) 97 | { 98 | feraiseexcept(FE_INVALID); 99 | } 100 | 101 | //------------------------------------------------------------------- 102 | //template 103 | //static void UnaryOpSlow_LOG(void* pDataIn1, void* pDataOut, int64_t len, int64_t strideIn, int64_t strideOut) { 104 | // return UnaryOpSlow(LOG_OP, pDataIn1, pDataOut, len, strideIn, strideOut); 105 | //} 106 | 107 | 108 | /* 109 | * NAN and INFINITY like macros (same behavior as glibc for NAN, same as C99 110 | * for INFINITY) 111 | * 112 | * XXX: I should test whether INFINITY and NAN are available on the platform 113 | */ 114 | static const inline float __npy_inff(void) 115 | { 116 | const union { uint32_t __i; float __f; } __bint = { 0x7f800000UL }; 117 | return __bint.__f; 118 | } 119 | 120 | static const inline float __npy_nanf(void) 121 | { 122 | const union { uint32_t __i; float __f; } __bint = { 0x7fc00000UL }; 123 | return __bint.__f; 124 | } 125 | 126 | static const inline float __npy_pzerof(void) 127 | { 128 | const union { uint32_t __i; float __f; } __bint = { 0x00000000UL }; 129 | return __bint.__f; 130 | } 131 | 132 | static const inline float __npy_nzerof(void) 133 | { 134 | const union { uint32_t __i; float __f; } __bint = { 0x80000000UL }; 135 | return __bint.__f; 136 | } 137 | 138 | #define NPY_INFINITYF __npy_inff() 139 | #define NPY_NANF __npy_nanf() 140 | #define NPY_PZEROF __npy_pzerof() 141 | #define NPY_NZEROF __npy_nzerof() 142 | 143 | #define NPY_INFINITY ((npy_double)NPY_INFINITYF) 144 | #define NPY_NAN ((npy_double)NPY_NANF) 145 | #define NPY_PZERO ((npy_double)NPY_PZEROF) 146 | #define NPY_NZERO ((npy_double)NPY_NZEROF) 147 | 148 | #define NPY_INFINITYL ((npy_longdouble)NPY_INFINITYF) 149 | #define NPY_NANL ((npy_longdouble)NPY_NANF) 150 | #define NPY_PZEROL ((npy_longdouble)NPY_PZEROF) 151 | #define NPY_NZEROL ((npy_longdouble)NPY_NZEROF) 152 | 153 | /* 154 | * Useful constants 155 | */ 156 | #define NPY_E 2.718281828459045235360287471352662498 /* e */ 157 | #define NPY_LOG2E 1.442695040888963407359924681001892137 /* log_2 e */ 158 | #define NPY_LOG10E 0.434294481903251827651128918916605082 /* log_10 e */ 159 | #define NPY_LOGE2 0.693147180559945309417232121458176568 /* log_e 2 */ 160 | #define NPY_LOGE10 2.302585092994045684017991454684364208 /* log_e 10 */ 161 | #define NPY_PI 3.141592653589793238462643383279502884 /* pi */ 162 | #define NPY_PI_2 1.570796326794896619231321691639751442 /* pi/2 */ 163 | #define NPY_PI_4 0.785398163397448309615660845819875721 /* pi/4 */ 164 | #define NPY_1_PI 0.318309886183790671537767526745028724 /* 1/pi */ 165 | #define NPY_2_PI 0.636619772367581343075535053490057448 /* 2/pi */ 166 | #define NPY_EULER 0.577215664901532860606512090082402431 /* Euler constant */ 167 | #define NPY_SQRT2 1.414213562373095048801688724209698079 /* sqrt(2) */ 168 | #define NPY_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */ 169 | 170 | #define NPY_Ef 2.718281828459045235360287471352662498F /* e */ 171 | #define NPY_LOG2Ef 
1.442695040888963407359924681001892137F /* log_2 e */ 172 | #define NPY_LOG10Ef 0.434294481903251827651128918916605082F /* log_10 e */ 173 | #define NPY_LOGE2f 0.693147180559945309417232121458176568F /* log_e 2 */ 174 | #define NPY_LOGE10f 2.302585092994045684017991454684364208F /* log_e 10 */ 175 | #define NPY_PIf 3.141592653589793238462643383279502884F /* pi */ 176 | #define NPY_PI_2f 1.570796326794896619231321691639751442F /* pi/2 */ 177 | #define NPY_PI_4f 0.785398163397448309615660845819875721F /* pi/4 */ 178 | #define NPY_1_PIf 0.318309886183790671537767526745028724F /* 1/pi */ 179 | #define NPY_2_PIf 0.636619772367581343075535053490057448F /* 2/pi */ 180 | #define NPY_EULERf 0.577215664901532860606512090082402431F /* Euler constant */ 181 | #define NPY_SQRT2f 1.414213562373095048801688724209698079F /* sqrt(2) */ 182 | #define NPY_SQRT1_2f 0.707106781186547524400844362104849039F /* 1/sqrt(2) */ 183 | 184 | #define NPY_El 2.718281828459045235360287471352662498L /* e */ 185 | #define NPY_LOG2El 1.442695040888963407359924681001892137L /* log_2 e */ 186 | #define NPY_LOG10El 0.434294481903251827651128918916605082L /* log_10 e */ 187 | #define NPY_LOGE2l 0.693147180559945309417232121458176568L /* log_e 2 */ 188 | #define NPY_LOGE10l 2.302585092994045684017991454684364208L /* log_e 10 */ 189 | #define NPY_PIl 3.141592653589793238462643383279502884L /* pi */ 190 | #define NPY_PI_2l 1.570796326794896619231321691639751442L /* pi/2 */ 191 | #define NPY_PI_4l 0.785398163397448309615660845819875721L /* pi/4 */ 192 | #define NPY_1_PIl 0.318309886183790671537767526745028724L /* 1/pi */ 193 | #define NPY_2_PIl 0.636619772367581343075535053490057448L /* 2/pi */ 194 | #define NPY_EULERl 0.577215664901532860606512090082402431L /* Euler constant */ 195 | #define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */ 196 | #define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */ 197 | 198 | /* 199 | * Constants used in vector implementation of exp(x) 200 | */ 201 | #define NPY_RINT_CVT_MAGICf 0x1.800000p+23f 202 | #define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f 203 | #define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f 204 | #define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f 205 | #define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f 206 | #define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f 207 | #define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f 208 | #define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f 209 | #define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f 210 | #define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f 211 | #define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f 212 | #define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f 213 | 214 | /* 215 | * Constants used in vector implementation of log(x) 216 | */ 217 | #define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f 218 | #define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f 219 | #define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f 220 | #define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f 221 | #define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f 222 | #define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f 223 | #define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f 224 | #define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f 225 | #define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f 226 | #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f 227 | #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f 228 | #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f 229 | /* 230 | 
* Constants used in vector implementation of sinf/cosf(x) 231 | */ 232 | #define NPY_TWO_O_PIf 0x1.45f306p-1f 233 | #define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f 234 | #define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f 235 | #define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f 236 | #define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f 237 | #define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f 238 | #define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f 239 | #define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f 240 | #define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f 241 | #define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f 242 | #define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f 243 | #define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f 244 | #define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f 245 | 246 | #define FLT_MIN 1.175494351e-38F // min normalized positive value 247 | 248 | static const inline int mask_to_int(__m256 _x_) { return _mm256_movemask_ps(_x_); }; 249 | 250 | static const inline __m256 set1_ps(float _x_) { return _mm256_set1_ps(_x_); }; 251 | 252 | //static const inline __mmask16 253 | //get_full_load_mask_ps(void) 254 | //{ 255 | // return 0xFFFF; 256 | //} 257 | // 258 | //static const inline __mmask8 259 | //get_full_load_mask_pd(void) 260 | //{ 261 | // return 0xFF; 262 | //} 263 | // 264 | //static const inline __mmask16 265 | //get_partial_load_mask_ps(const int num_elem, const int total_elem) 266 | //{ 267 | // return (0x0001 << num_elem) - 0x0001; 268 | //} 269 | // 270 | //static const inline __mmask8 271 | //get_partial_load_mask_pd(const int num_elem, const int total_elem) 272 | //{ 273 | // return (0x01 << num_elem) - 0x01; 274 | //} 275 | 276 | 277 | static const inline __m256 278 | get_full_load_mask_ps(void) 279 | { 280 | return _mm256_set1_ps(-1.0); 281 | } 282 | 283 | static const inline __m256i 284 | get_full_load_mask_pd(void) 285 | { 286 | return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); 287 | } 288 | 289 | static const inline __m256 290 | get_partial_load_mask_ps(const int num_elem, const int num_lanes) 291 | { 292 | float maskint[16] = { -1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, 293 | 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 }; 294 | float* addr = maskint + num_lanes - num_elem; 295 | return _mm256_loadu_ps(addr); 296 | } 297 | 298 | static const inline __m256i 299 | get_partial_load_mask_pd(const int num_elem, const int num_lanes) 300 | { 301 | int maskint[16] = { -1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1 }; 302 | int* addr = maskint + 2 * num_lanes - 2 * num_elem; 303 | return _mm256_loadu_si256((__m256i*) addr); 304 | } 305 | 306 | static const inline __m256 307 | masked_gather_ps(__m256 src, 308 | float* addr, 309 | __m256i vindex, 310 | __m256 mask) 311 | { 312 | return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); 313 | } 314 | 315 | static const inline __m256d 316 | masked_gather_pd(__m256d src, 317 | double* addr, 318 | __m128i vindex, 319 | __m256d mask) 320 | { 321 | return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); 322 | } 323 | 324 | static const inline __m256 325 | masked_load_ps(__m256 mask, float* addr) 326 | { 327 | return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); 328 | } 329 | 330 | static const inline __m256d 331 | masked_load_pd(__m256i mask, double* addr) 332 | { 333 | return _mm256_maskload_pd(addr, mask); 334 | } 335 | 336 | static const inline __m256 337 | fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) 338 | { 339 | return _mm256_blendv_ps(x, val, mask); 340 | } 341 | 342 | static const inline __m256d 343 | 
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) 344 | { 345 | return _mm256_blendv_pd(x, val, mask); 346 | } 347 | 348 | static const inline __m256 349 | or_masks(__m256 x, __m256 y) { return _mm256_or_ps(x, y); } 350 | 351 | static const inline __m256 352 | and_masks(__m256 x, __m256 y) { return _mm256_and_ps(x, y); } 353 | 354 | static const inline __m256 355 | xor_masks(__m256 x, __m256 y) { return _mm256_xor_ps(x, y); } 356 | 357 | static const inline __m256 358 | set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) 359 | { 360 | return _mm256_blendv_ps(x, val, mask); 361 | } 362 | 363 | static const inline __m256d 364 | set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) 365 | { 366 | return _mm256_blendv_pd(x, val, mask); 367 | } 368 | 369 | static const inline __m256 370 | blend(__m256 x, __m256 y, __m256 ymask) 371 | { 372 | return _mm256_blendv_ps(x, y, ymask); 373 | } 374 | 375 | template 376 | static inline __m256 377 | cmp_ps(__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, COMP_OP); }; 378 | 379 | static const inline __m256 380 | add_ps(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }; 381 | 382 | static const inline __m256 383 | sub_ps(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }; 384 | 385 | static const inline __m256 386 | div_ps(__m256 x, __m256 y) { return _mm256_div_ps(x, y); }; 387 | 388 | static const inline void 389 | maskstore_ps(float * x, __m256i y, __m256 z) { 390 | return _mm256_maskstore_ps(x, y, z); 391 | }; 392 | 393 | static const inline __m256i 394 | cvtps_epi32(__m256 x) { return _mm256_cvtps_epi32(x); }; 395 | 396 | //static const inline __m256 397 | //fma_add_ps(__m256 x, __m256 y, __m256 z) { return _mm256_fmadd_ps(x, y, z); }; 398 | 399 | static const inline __m256 400 | fma_add_ps(__m256 x, __m256 y, __m256 z) { return _mm256_add_ps(z, _mm256_mul_ps(x, y)); }; 401 | 402 | static const inline __m256 403 | get_exponent(__m256 x) 404 | { 405 | /* 406 | * Special handling of denormals: 407 | * 1) Multiply denormal elements with 2**100 (0x71800000) 408 | * 2) Get the 8 bits of unbiased exponent 409 | * 3) Subtract 100 from exponent of denormals 410 | */ 411 | 412 | __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); 413 | __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); 414 | __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); 415 | 416 | __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); 417 | __m256 temp = _mm256_mul_ps(temp1, two_power_100); 418 | x = _mm256_blendv_ps(x, temp, denormal_mask); 419 | 420 | __m256 exp = _mm256_cvtepi32_ps( 421 | _mm256_sub_epi32( 422 | _mm256_srli_epi32( 423 | _mm256_castps_si256(x), 23), _mm256_set1_epi32(0x7E))); 424 | 425 | __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f)); 426 | return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); 427 | } 428 | 429 | static const inline __m256 430 | get_mantissa(__m256 x) 431 | { 432 | /* 433 | * Special handling of denormals: 434 | * 1) Multiply denormal elements with 2**100 (0x71800000) 435 | * 2) Get the 23 bits of mantissa 436 | * 3) Mantissa for denormals is not affected by the multiplication 437 | */ 438 | 439 | __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); 440 | __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); 441 | __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); 442 | 443 | __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); 444 | 
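/* temp1 has the normal lanes zeroed out, so the multiply below scales only the denormal lanes by 2**100; the final blend keeps the original value in the normal lanes. */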
__m256 temp = _mm256_mul_ps(temp1, two_power_100); 445 | x = _mm256_blendv_ps(x, temp, denormal_mask); 446 | 447 | __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff); 448 | __m256i exp_126_bits = _mm256_set1_epi32(126 << 23); 449 | return _mm256_castsi256_ps( 450 | _mm256_or_si256( 451 | _mm256_and_si256( 452 | _mm256_castps_si256(x), mantissa_bits), exp_126_bits)); 453 | } 454 | 455 | 456 | 457 | 458 | /* 459 | * Vectorized implementation of log using AVX2 and AVX512 460 | * 1) if x < 0.0f; return -NAN (invalid input) 461 | * 2) Range reduction: y = x/2^k; 462 | * a) y = normalized mantissa, k is the exponent (0.5 <= y < 1) 463 | * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q 464 | * b) P = 5th order and Q = 5th order polynomials obtained from Remez's 465 | * algorithm (mini-max polynomial approximation) 466 | * 5) Compute log(x) = log(y) + k*ln(2) 467 | * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945) 468 | * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same 469 | * x = 0x3f486945) 470 | */ 471 | template 472 | static void 473 | log_FLOAT(void* pDataIn, 474 | void* pDataOut, 475 | const int64_t array_size, 476 | const int64_t steps, 477 | const int64_t stepsOut) 478 | { 479 | const int64_t stride = steps / (int64_t)sizeof(float); 480 | const int32_t num_lanes = sizeof(VTYPE) / (int64_t)sizeof(float); 481 | 482 | float* op = (float*)pDataOut; 483 | float* ip = (float*)pDataIn; 484 | 485 | /* 486 | * Note: while generally indices are int64_t, we ensure that our maximum index 487 | * will fit in an int32 as a precondition for this function via 488 | * IS_OUTPUT_BLOCKABLE_UNARY 489 | */ 490 | int32_t indexarr[16]; 491 | for (int32_t ii = 0; ii < 16; ii++) { 492 | indexarr[ii] = ii * (int32_t)stride; 493 | } 494 | 495 | /* Load up frequently used constants */ 496 | VTYPE log_p0 = set1_ps(NPY_COEFF_P0_LOGf); 497 | VTYPE log_p1 = set1_ps(NPY_COEFF_P1_LOGf); 498 | VTYPE log_p2 = set1_ps(NPY_COEFF_P2_LOGf); 499 | VTYPE log_p3 = set1_ps(NPY_COEFF_P3_LOGf); 500 | VTYPE log_p4 = set1_ps(NPY_COEFF_P4_LOGf); 501 | VTYPE log_p5 = set1_ps(NPY_COEFF_P5_LOGf); 502 | VTYPE log_q0 = set1_ps(NPY_COEFF_Q0_LOGf); 503 | VTYPE log_q1 = set1_ps(NPY_COEFF_Q1_LOGf); 504 | VTYPE log_q2 = set1_ps(NPY_COEFF_Q2_LOGf); 505 | VTYPE log_q3 = set1_ps(NPY_COEFF_Q3_LOGf); 506 | VTYPE log_q4 = set1_ps(NPY_COEFF_Q4_LOGf); 507 | VTYPE log_q5 = set1_ps(NPY_COEFF_Q5_LOGf); 508 | VTYPE loge2 = set1_ps(NPY_LOGE2f); 509 | VTYPE nan = set1_ps(NPY_NANF); 510 | VTYPE neg_nan = set1_ps(-NPY_NANF); 511 | VTYPE neg_inf = set1_ps(-NPY_INFINITYF); 512 | VTYPE inf = set1_ps(NPY_INFINITYF); 513 | VTYPE zeros_f = set1_ps(0.0f); 514 | VTYPE ones_f = set1_ps(1.0f); 515 | VTYPEi vindex = LOADUI((VTYPEi*)indexarr); 516 | VTYPE poly, num_poly, denom_poly, exponent; 517 | 518 | VTYPE inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; 519 | VTYPE invalid_mask = get_partial_load_mask_ps(0, num_lanes); 520 | VTYPE divide_by_zero_mask = invalid_mask; 521 | VTYPE load_mask = get_full_load_mask_ps(); 522 | 523 | int64_t num_remaining_elements = array_size; 524 | 525 | while (num_remaining_elements > 0) { 526 | 527 | if (num_remaining_elements < num_lanes) { 528 | load_mask = get_partial_load_mask_ps((int)num_remaining_elements, 529 | num_lanes); 530 | } 531 | 532 | VTYPE x_in; 533 | if (stride == 1) { 534 | x_in = masked_load_ps(load_mask, ip); 535 | } 536 | else { 537 | x_in = masked_gather_ps(zeros_f, ip, vindex, load_mask); 538 | } 539 | 540 | negx_mask = cmp_ps< _CMP_LT_OQ>(x_in, 
zeros_f); 541 | zero_mask = cmp_ps< _CMP_EQ_OQ>(x_in, zeros_f); 542 | inf_mask = cmp_ps< _CMP_EQ_OQ>(x_in, inf); 543 | nan_mask = cmp_ps< _CMP_NEQ_UQ>(x_in, x_in); 544 | 545 | divide_by_zero_mask = or_masks(divide_by_zero_mask, 546 | and_masks(zero_mask, load_mask)); 547 | invalid_mask = or_masks(invalid_mask, negx_mask); 548 | 549 | VTYPE x = set_masked_lanes_ps(x_in, zeros_f, negx_mask); 550 | 551 | /* set x = normalized mantissa */ 552 | exponent = get_exponent(x); 553 | x = get_mantissa(x); 554 | 555 | /* if x < sqrt(2) {exp = exp-1; x = 2*x} */ 556 | sqrt2_mask = cmp_ps< _CMP_LE_OQ>(x, set1_ps(NPY_SQRT1_2f)); 557 | 558 | x = blend(x, add_ps(x, x), sqrt2_mask); 559 | exponent = blend(exponent, sub_ps(exponent, ones_f), sqrt2_mask); 560 | 561 | /* x = x - 1 */ 562 | x = sub_ps(x, ones_f); 563 | 564 | /* Polynomial approximation for log(1+x) */ 565 | num_poly = fma_add_ps(log_p5, x, log_p4); 566 | num_poly = fma_add_ps(num_poly, x, log_p3); 567 | num_poly = fma_add_ps(num_poly, x, log_p2); 568 | num_poly = fma_add_ps(num_poly, x, log_p1); 569 | num_poly = fma_add_ps(num_poly, x, log_p0); 570 | denom_poly = fma_add_ps(log_q5, x, log_q4); 571 | denom_poly = fma_add_ps(denom_poly, x, log_q3); 572 | denom_poly = fma_add_ps(denom_poly, x, log_q2); 573 | denom_poly = fma_add_ps(denom_poly, x, log_q1); 574 | denom_poly = fma_add_ps(denom_poly, x, log_q0); 575 | poly = div_ps(num_poly, denom_poly); 576 | poly = fma_add_ps(exponent, loge2, poly); 577 | 578 | /* 579 | * x < 0.0f; return -NAN 580 | * x = +/- NAN; return NAN 581 | * x = 0.0f; return -INF 582 | */ 583 | poly = set_masked_lanes_ps(poly, nan, nan_mask); 584 | poly = set_masked_lanes_ps(poly, neg_nan, negx_mask); 585 | poly = set_masked_lanes_ps(poly, neg_inf, zero_mask); 586 | poly = set_masked_lanes_ps(poly, inf, inf_mask); 587 | 588 | maskstore_ps(op, cvtps_epi32(load_mask), poly); 589 | 590 | ip += num_lanes * stride; 591 | op += num_lanes; 592 | num_remaining_elements -= num_lanes; 593 | } 594 | 595 | if (mask_to_int(invalid_mask)) { 596 | npy_set_floatstatus_invalid(); 597 | } 598 | if (mask_to_int(divide_by_zero_mask)) { 599 | npy_set_floatstatus_divbyzero(); 600 | } 601 | } 602 | 603 | 604 | extern "C" 605 | UNARY_FUNC GetLogOpFast(int func, int atopInType1, int* wantedOutType) { 606 | 607 | switch (func) { 608 | case TRIG_OPERATION::LOG: 609 | *wantedOutType = ATOP_DOUBLE; 610 | if (atopInType1 == ATOP_FLOAT) { 611 | *wantedOutType = ATOP_FLOAT; 612 | } 613 | switch (atopInType1) { 614 | case ATOP_FLOAT: return log_FLOAT<__m256, __m256i>; 615 | } 616 | } 617 | return NULL; 618 | } 619 | 620 | #if defined(__clang__) 621 | #pragma clang attribute pop 622 | #endif 623 | --------------------------------------------------------------------------------
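// Usage sketch for the GetLogOpFast dispatcher above (assumptions, not confirmed by the
// source shown here: UNARY_FUNC, TRIG_OPERATION and ATOP_FLOAT come from atop.h, and
// UNARY_FUNC matches log_FLOAT's (in, out, length, strideIn, strideOut) shape with the
// strides given in bytes, as implied by the "steps / sizeof(float)" computation in log_FLOAT):
//
//     int wantedOutType = -1;
//     UNARY_FUNC fn = GetLogOpFast(TRIG_OPERATION::LOG, ATOP_FLOAT, &wantedOutType);
//     if (fn) {
//         // contiguous float32 input and output: stride is sizeof(float) bytes
//         fn(input, output, length, sizeof(float), sizeof(float));
//     } else {
//         // no fast path for this dtype; fall back to a scalar log loop
//     }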