├── .ci └── hack-intel-cl-into-conda-env.sh ├── .conda-ci-build-configure.sh ├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── autopush.yml │ ├── ci.yml │ └── wheels.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── .pylintrc-local.yml ├── .test-conda-env-py3.yml ├── CITATION.cff ├── CMakeLists.txt ├── LICENSE ├── README.rst ├── TODOs ├── contrib ├── cldis.py ├── fortran-to-opencl │ ├── README │ └── translate.py └── pyopencl.vim ├── doc ├── .gitignore ├── Makefile ├── algorithm.rst ├── array.rst ├── conf.py ├── howto.rst ├── index.rst ├── make_constants.py ├── misc.rst ├── runtime.rst ├── runtime_const.rst ├── runtime_gl.rst ├── runtime_memory.rst ├── runtime_platform.rst ├── runtime_program.rst ├── runtime_queue.rst ├── subst.rst ├── tools.rst ├── types.rst └── upload-docs.sh ├── examples ├── .gitignore ├── black-hole-accretion.py ├── demo-struct-reduce.py ├── demo.py ├── demo_array.py ├── demo_array_svm.py ├── demo_elementwise.py ├── demo_elementwise_complex.py ├── demo_mandelbrot.py ├── demo_meta_codepy.py ├── demo_meta_template.py ├── dump-performance.py ├── dump-properties.py ├── gl_interop_demo.py ├── gl_particle_animation.py ├── image_filters_using_image2d_t.py ├── ipython-demo.ipynb ├── median-filter.py ├── n-body.py ├── narray.py ├── noisyImage.jpg ├── pi-monte-carlo.py ├── svm.py └── transpose.py ├── pyopencl ├── __init__.py ├── _cluda.py ├── _mymako.py ├── algorithm.py ├── array.py ├── bitonic_sort.py ├── bitonic_sort_templates.py ├── cache.py ├── capture_call.py ├── characterize │ ├── __init__.py │ └── performance.py ├── cl │ ├── pyopencl-airy.cl │ ├── pyopencl-bessel-j-complex.cl │ ├── pyopencl-bessel-j.cl │ ├── pyopencl-bessel-y.cl │ ├── pyopencl-complex.h │ ├── pyopencl-eval-tbl.cl │ ├── pyopencl-hankel-complex.cl │ └── pyopencl-random123 │ │ ├── array.h │ │ ├── openclfeatures.h │ │ ├── philox.cl │ │ └── threefry.cl ├── clmath.py ├── 
clrandom.py ├── cltypes.py ├── elementwise.py ├── invoker.py ├── ipython_ext.py ├── reduction.py ├── scan.py ├── tools.py └── version.py ├── pyproject.toml ├── run-mypy.sh ├── run-pylint.sh ├── scripts ├── build-ocl-macos.sh ├── build-ocl-windows.sh └── build-ocl.sh ├── src ├── bitlog.cpp ├── bitlog.hpp ├── clinfo_ext.h ├── mempool.hpp ├── pyopencl_ext.h ├── tools.hpp ├── wrap_cl.cpp ├── wrap_cl.hpp ├── wrap_cl_part_1.cpp ├── wrap_cl_part_2.cpp ├── wrap_constants.cpp ├── wrap_helpers.hpp └── wrap_mempool.cpp └── test ├── add-vectors-32.spv ├── add-vectors-64.spv ├── empty-header.h ├── test_algorithm.py ├── test_array.py ├── test_arrays_in_structs.py ├── test_clmath.py ├── test_clrandom.py ├── test_enqueue_copy.py └── test_wrapper.py /.ci/hack-intel-cl-into-conda-env.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # https://github.com/conda-forge/intel-compiler-repack-feedstock/issues/7 4 | sed -i 's/- pocl/- intel-opencl-rt!=2022.2/g' "$CONDA_ENVIRONMENT" 5 | -------------------------------------------------------------------------------- /.conda-ci-build-configure.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inducer/pyopencl/b8b8d4d852e8a26356861ffda578874dc064e54c/.conda-ci-build-configure.sh -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org/ 2 | # https://github.com/editorconfig/editorconfig-vim 3 | # https://github.com/editorconfig/editorconfig-emacs 4 | 5 | root = true 6 | 7 | [*] 8 | indent_style = space 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.py] 15 | indent_size = 4 16 | 17 | [*.rst] 18 | indent_size = 4 19 | 20 | [*.cpp] 21 | indent_size = 2 22 | 23 | [*.hpp] 24 | indent_size = 2 25 
| 26 | # There may be one in doc/ 27 | [Makefile] 28 | indent_style = tab 29 | 30 | # https://github.com/microsoft/vscode/issues/1679 31 | [*.md] 32 | trim_trailing_whitespace = false 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Environment (please complete the following information):** 24 | - OS: [e.g. Linux] 25 | - ICD Loader and version: [e.g. ocl-icd 2.3.1] 26 | - ICD and version: [e.g. pocl 1.8] 27 | - CPU/GPU: [e.g. Nvidia Titan V] 28 | - Python version: [e.g. 3.10] 29 | - PyOpenCL version: [e.g. 2021.1] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 
33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: ❓ Question 4 | url: https://github.com/inducer/pyopencl/discussions/categories/q-a 5 | about: Ask and answer questions about PyOpenCL on Discussions 6 | - name: 🔧 Troubleshooting 7 | url: https://github.com/inducer/pyopencl/discussions/categories/troubleshooting 8 | about: For troubleshooting help, see the Discussions 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Set update schedule for GitHub Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | 9 | # vim: sw=4 10 | -------------------------------------------------------------------------------- /.github/workflows/autopush.yml: -------------------------------------------------------------------------------- 1 | name: Gitlab mirror 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | autopush: 9 | name: Automatic push to gitlab.tiker.net 10 | if: startsWith(github.repository, 'inducer/') 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - run: | 15 | curl -L -O https://tiker.net/ci-support-v0 16 | . ./ci-support-v0 17 | mirror_github_to_gitlab 18 | 19 | env: 20 | GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }} 21 | 22 | # vim: sw=4 23 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - main 6 | tags: 7 | - v* 8 | pull_request: 9 | schedule: 10 | - cron: '17 3 * * 0' 11 | 12 | jobs: 13 | ruff: 14 | name: Ruff 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: true 20 | - uses: actions/setup-python@v5 21 | - name: "Main Script" 22 | run: | 23 | pip install ruff 24 | ruff check 25 | 26 | typos: 27 | name: Typos 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v4 31 | - uses: crate-ci/typos@master 32 | 33 | pylint: 34 | name: Pylint 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: "Main Script" 39 | run: | 40 | CONDA_ENVIRONMENT=.test-conda-env-py3.yml 41 | echo "- 
matplotlib" >> $CONDA_ENVIRONMENT 42 | echo "- pyopengl" >> $CONDA_ENVIRONMENT 43 | echo "- ipython" >> $CONDA_ENVIRONMENT 44 | 45 | curl -L -O https://tiker.net/ci-support-v0 46 | . ci-support-v0 47 | build_py_project_in_conda_env 48 | 49 | # Avoid linting local directory, where native module 50 | # cannot be imported. 51 | rm -Rf "$(get_proj_name)" 52 | 53 | run_pylint "$(get_proj_name)" test/*.py 54 | 55 | mypy: 56 | name: Mypy 57 | runs-on: ubuntu-latest 58 | steps: 59 | - uses: actions/checkout@v4 60 | - name: "Main Script" 61 | run: | 62 | curl -L -O https://tiker.net/ci-support-v0 63 | . ci-support-v0 64 | export CL_USE_SHIPPED_EXT=on 65 | build_py_project_in_conda_env 66 | 67 | python -m pip install mypy importlib-resources 68 | ./run-mypy.sh 69 | 70 | pytest: 71 | name: Pytest Linux POCL 72 | runs-on: ubuntu-latest 73 | steps: 74 | - uses: actions/checkout@v4 75 | - name: "Main Script" 76 | run: | 77 | curl -L -O https://tiker.net/ci-support-v0 78 | . ci-support-v0 79 | export CL_USE_SHIPPED_EXT=on 80 | build_py_project_in_conda_env 81 | test_py_project 82 | 83 | pytest_intel: 84 | name: Pytest Linux Intel CL 85 | runs-on: ubuntu-latest 86 | steps: 87 | - uses: actions/checkout@v4 88 | - name: "Main Script" 89 | run: | 90 | export CONDA_ENVIRONMENT=.test-conda-env-py3.yml 91 | .ci/hack-intel-cl-into-conda-env.sh 92 | 93 | curl -L -O https://tiker.net/ci-support-v0 94 | . 
ci-support-v0 95 | export CL_USE_SHIPPED_EXT=on 96 | build_py_project_in_conda_env 97 | test_py_project 98 | 99 | pytest_win: 100 | name: Pytest Windows Intel CL 101 | runs-on: windows-latest 102 | steps: 103 | - uses: actions/checkout@v4 104 | - name: "Main Script" 105 | shell: bash 106 | run: | 107 | set -x 108 | export CONDA_ENVIRONMENT=.test-conda-env-py3.yml 109 | 110 | sed -i 's/- ocl-icd/- khronos-opencl-icd-loader/g' "$CONDA_ENVIRONMENT" 111 | sed -i '/- git/d' "$CONDA_ENVIRONMENT" 112 | 113 | .ci/hack-intel-cl-into-conda-env.sh 114 | 115 | curl -L -O https://tiker.net/ci-support-v0 116 | . ci-support-v0 117 | export CL_USE_SHIPPED_EXT=on 118 | build_py_project_in_conda_env 119 | test_py_project 120 | 121 | pytest_mac: 122 | name: Pytest Mac POCL 123 | runs-on: macos-latest 124 | steps: 125 | - uses: actions/checkout@v4 126 | - name: "Main Script" 127 | run: | 128 | export CC=gcc 129 | CONDA_ENVIRONMENT=.test-conda-env.yml 130 | grep -v ocl-icd .test-conda-env-py3.yml > $CONDA_ENVIRONMENT 131 | 132 | curl -L -O https://tiker.net/ci-support-v0 133 | . ci-support-v0 134 | build_py_project_in_conda_env 135 | test_py_project 136 | 137 | docs: 138 | name: Documentation 139 | runs-on: ubuntu-latest 140 | steps: 141 | - uses: actions/checkout@v4 142 | - 143 | uses: actions/setup-python@v5 144 | with: 145 | python-version: '3.x' 146 | - name: "Main Script" 147 | run: | 148 | CONDA_ENVIRONMENT=.test-conda-env-py3.yml 149 | 150 | curl -L -O https://tiker.net/ci-support-v0 151 | . ci-support-v0 152 | export CL_USE_SHIPPED_EXT=on 153 | build_py_project_in_conda_env 154 | build_docs 155 | 156 | examples: 157 | name: Examples 158 | runs-on: ubuntu-latest 159 | steps: 160 | - uses: actions/checkout@v4 161 | - name: "Main Script" 162 | run: | 163 | EXTRA_INSTALL="pillow cgen mako imageio" 164 | 165 | curl -L -O https://tiker.net/ci-support-v0 166 | . 
ci-support-v0 167 | build_py_project_in_conda_env 168 | (cd examples; rm -f gl_*) 169 | run_examples --no-require-main 170 | 171 | downstream_tests: 172 | strategy: 173 | matrix: 174 | downstream_project: [loopy, boxtree, meshmode] 175 | name: Tests for downstream project ${{ matrix.downstream_project }} 176 | runs-on: ubuntu-latest 177 | steps: 178 | - uses: actions/checkout@v4 179 | - name: "Main Script" 180 | env: 181 | DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }} 182 | run: | 183 | curl -L -O https://tiker.net/ci-support-v0 184 | . ci-support-v0 185 | 186 | prepare_downstream_build "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" 187 | sed -i 's/pyopencl/ocl-icd/' .test-conda-env-py3.yml 188 | build_py_project_in_conda_env 189 | test_py_project 190 | 191 | # vim: sw=4 192 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | # Build on every branch push, tag push, and pull request change: 4 | on: 5 | push: 6 | branches: 7 | - main 8 | tags: 9 | - v* 10 | pull_request: 11 | schedule: 12 | - cron: '17 3 * * 0' 13 | 14 | jobs: 15 | build_wheels: 16 | name: Build wheels on ${{ matrix.os }} 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | os: [ubuntu-latest, windows-latest, macos-13, macos-14] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | submodules: 'true' 27 | 28 | - uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.x' 31 | 32 | - name: Install cibuildwheel 33 | run: python -m pip install cibuildwheel==2.22.0 34 | 35 | - name: Build wheels 36 | shell: bash 37 | run: | 38 | set -x 39 | if [[ ${{ matrix.os }} == windows-* ]]; then 40 | export CL_INC_DIR="D:/a/pyopencl/pyopencl/OpenCL-Headers/install/include" 41 | export CL_LIB_DIR="C:/Program Files/OpenCL-ICD-Loader/lib" 42 | fi 43 | python -m 
cibuildwheel --output-dir wheelhouse 44 | 45 | - uses: actions/upload-artifact@v4 46 | with: 47 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 48 | path: ./wheelhouse/*.whl 49 | 50 | build_sdist: 51 | name: Build source distribution 52 | runs-on: ubuntu-latest 53 | steps: 54 | - uses: actions/checkout@v4 55 | with: 56 | submodules: 'true' 57 | 58 | - name: Build sdist 59 | run: pipx run build --sdist 60 | 61 | - uses: actions/upload-artifact@v4 62 | with: 63 | name: cibw-sdist 64 | path: dist/*.tar.gz 65 | 66 | upload_pypi: 67 | needs: [build_wheels, build_sdist] 68 | 69 | environment: pypi 70 | permissions: 71 | id-token: write 72 | 73 | runs-on: ubuntu-latest 74 | # upload to PyPI on every tag starting with 'v' 75 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 76 | # alternatively, to publish when a GitHub Release is created, use the following rule: 77 | # if: github.event_name == 'release' && github.event.action == 'published' 78 | steps: 79 | - uses: actions/download-artifact@v4 80 | with: 81 | pattern: cibw-* 82 | path: dist 83 | merge-multiple: true 84 | 85 | - uses: pypa/gh-action-pypi-publish@release/v1 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _skbuild 2 | 3 | .pydevproject 4 | .project 5 | .settings 6 | *~ 7 | .*.sw[po] 8 | .sw[po] 9 | *.dat 10 | *.pyc 11 | build 12 | *.prof 13 | doc/hedge-notes.pdf 14 | *.vtk 15 | *.silo 16 | *.session 17 | dump.py 18 | *.orig 19 | /Makefile 20 | *.png 21 | tags 22 | *.vtu 23 | *.pvtu 24 | *.pvd 25 | doc/user-reference 26 | doc/dev-reference 27 | *.poly 28 | *.node 29 | *.bak 30 | *.pdf 31 | *.tif 32 | *.so 33 | *.pyd 34 | *.mpeg 35 | *-journal 36 | visitlog.py 37 | *.log 38 | .figleaf 39 | dist 40 | *.egg* 41 | MANIFEST 42 | *.patch 43 | *.LOCAL.[0-9]* 44 | *.REMOTE.[0-9]* 45 | *.BASE.[0-9]* 46 | tmp 47 | temp* 48 | setuptools.pth 49 | 
distribute-*.tar.gz 50 | core 51 | *.sess 52 | _build 53 | __pycache__ 54 | *.o 55 | .ipynb_checkpoints 56 | cscope.* 57 | 58 | # needed by jenkins env 59 | .env 60 | virtualenv-[0-9]* 61 | pytest.xml 62 | setuptools*tar.gz 63 | build-and-test-py-project.sh 64 | 65 | cffi_build.py 66 | 67 | .cache 68 | .pytest_cache 69 | .idea 70 | 71 | wheelhouse 72 | 73 | memray-*.bin 74 | memray-*.html 75 | 76 | .pylintrc.yml 77 | .run-pylint.py 78 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | GIT_SUBMODULE_STRATEGY: recursive 3 | 4 | Python 3 Intel CPU: 5 | script: | 6 | source /opt/enable-intel-cl.sh 7 | export PYOPENCL_TEST="intel(r):pu" 8 | export EXTRA_INSTALL="numpy mako" 9 | 10 | curl -L -O https://tiker.net/ci-support-v0 11 | . ci-support-v0 12 | build_py_project_in_venv 13 | test_py_project 14 | tags: 15 | - python3 16 | - intel-cl-cpu 17 | except: 18 | - tags 19 | artifacts: 20 | reports: 21 | junit: test/pytest.xml 22 | 23 | Python 3 Nvidia Titan V: 24 | script: | 25 | export PYOPENCL_TEST=nvi:titan 26 | export EXTRA_INSTALL="numpy mako" 27 | 28 | curl -L -O https://tiker.net/ci-support-v0 29 | . ci-support-v0 30 | build_py_project_in_venv 31 | test_py_project 32 | tags: 33 | - python3 34 | - nvidia-titan-v 35 | except: 36 | - tags 37 | artifacts: 38 | reports: 39 | junit: test/pytest.xml 40 | 41 | Python 3 POCL: 42 | script: | 43 | export PYOPENCL_TEST=portable:cpu 44 | export EXTRA_INSTALL="numpy mako" 45 | 46 | curl -L -O https://tiker.net/ci-support-v0 47 | . 
ci-support-v0 48 | build_py_project_in_venv 49 | test_py_project 50 | tags: 51 | - python3 52 | - pocl 53 | except: 54 | - tags 55 | artifacts: 56 | reports: 57 | junit: test/pytest.xml 58 | 59 | Python 3 POCL CL 1.1: 60 | script: | 61 | export PYOPENCL_TEST=portable:cpu 62 | export EXTRA_INSTALL="numpy mako" 63 | export PYOPENCL_PRETEND_CL_VERSION='1.1' 64 | 65 | curl -L -O https://tiker.net/ci-support-v0 66 | . ci-support-v0 67 | build_py_project_in_venv 68 | test_py_project 69 | tags: 70 | - python3 71 | - pocl 72 | except: 73 | - tags 74 | artifacts: 75 | reports: 76 | junit: test/pytest.xml 77 | 78 | Python 3 POCL Titan V: 79 | script: | 80 | export PYOPENCL_TEST=portable:titan 81 | export EXTRA_INSTALL="numpy mako" 82 | 83 | curl -L -O https://tiker.net/ci-support-v0 84 | . ci-support-v0 85 | build_py_project_in_venv 86 | test_py_project 87 | tags: 88 | - python3 89 | - pocl 90 | - nvidia-titan-v 91 | except: 92 | - tags 93 | artifacts: 94 | reports: 95 | junit: test/pytest.xml 96 | 97 | Python 3 POCL (+GL and special functions): 98 | script: | 99 | export PYOPENCL_TEST=portable:cpu 100 | export EXTRA_INSTALL="numpy mako scipy pyfmmlib" 101 | export PYOPENCL_ENABLE_GL=ON 102 | 103 | curl -L -O https://tiker.net/ci-support-v0 104 | . ci-support-v0 105 | build_py_project_in_venv 106 | test_py_project 107 | tags: 108 | - python3 109 | - pocl 110 | except: 111 | - tags 112 | artifacts: 113 | reports: 114 | junit: test/pytest.xml 115 | 116 | Ruff: 117 | script: | 118 | pipx install ruff 119 | ruff check 120 | tags: 121 | - docker-runner 122 | except: 123 | - tags 124 | 125 | Pylint: 126 | script: | 127 | export EXTRA_INSTALL="numpy mako matplotlib PyOpenGl IPython" 128 | 129 | curl -L -O https://tiker.net/ci-support-v0 130 | . ci-support-v0 131 | 132 | build_py_project_in_venv 133 | 134 | # Avoid linting local directory, where native module 135 | # cannot be imported. 
136 | rm -Rf "$(get_proj_name)" 137 | 138 | run_pylint "$(get_proj_name)" test/*.py 139 | tags: 140 | - python3 141 | except: 142 | - tags 143 | 144 | Mypy: 145 | script: | 146 | export EXTRA_INSTALL="numpy mako mypy importlib-resources" 147 | 148 | curl -L -O https://tiker.net/ci-support-v0 149 | . ci-support-v0 150 | build_py_project_in_venv 151 | python -m mypy --show-error-codes pyopencl test 152 | tags: 153 | - python3 154 | except: 155 | - tags 156 | 157 | Documentation: 158 | script: | 159 | export EXTRA_INSTALL="numpy mako" 160 | 161 | curl -L -O https://tiker.net/ci-support-v0 162 | . ci-support-v0 163 | build_py_project_in_venv 164 | build_docs 165 | maybe_upload_docs 166 | tags: 167 | - linux 168 | 169 | Examples: 170 | script: | 171 | export EXTRA_INSTALL="pillow cgen mako imageio" 172 | 173 | curl -L -O https://tiker.net/ci-support-v0 174 | . ci-support-v0 175 | build_py_project_in_venv 176 | (cd examples; rm -f gl_*) 177 | run_examples --no-require-main 178 | except: 179 | - tags 180 | tags: 181 | - python3 182 | - pocl 183 | 184 | Downstream: 185 | parallel: 186 | matrix: 187 | - DOWNSTREAM_PROJECT: [loopy, boxtree, meshmode] 188 | tags: 189 | - large-node 190 | - docker-runner 191 | script: | 192 | curl -L -O https://tiker.net/ci-support-v0 193 | . 
ci-support-v0 194 | 195 | prepare_downstream_build "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" 196 | sed -i 's/pyopencl/ocl-icd/' .test-conda-env-py3.yml 197 | build_py_project_in_conda_env 198 | test_py_project 199 | 200 | # vim: sw=2 201 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pyopencl/compyte"] 2 | path = pyopencl/compyte 3 | url = https://github.com/inducer/compyte 4 | -------------------------------------------------------------------------------- /.pylintrc-local.yml: -------------------------------------------------------------------------------- 1 | - arg: ignore 2 | val: compyte 3 | - arg: generated-members 4 | val: 5 | - cltypes.* 6 | - gl_platform.* 7 | - mako.template 8 | -------------------------------------------------------------------------------- /.test-conda-env-py3.yml: -------------------------------------------------------------------------------- 1 | name: test-conda-env 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | 6 | dependencies: 7 | - python=3 8 | - git 9 | - numpy 10 | - ocl-icd 11 | - pocl 12 | - mako 13 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Kloeckner" 5 | given-names: "Andreas" 6 | orcid: "https://orcid.org/0000-0003-1228-519X" 7 | - family-names: "Yu" 8 | given-names: "Yichao" 9 | - family-names: "Wala" 10 | given-names: "Matt" 11 | - family-names: "Fernando" 12 | given-names: "Isuru" 13 | - family-names: "Bencun" 14 | given-names: "Marko" 15 | - family-names: "Kulkarni" 16 | given-names: "Kaushik" 17 | - family-names: "Diener" 18 | given-names: "Matthias" 19 | - family-names: "Gao" 20 | given-names: "Hao" 21 | - family-names: "Fikl" 22 | given-names: "Alex" 23 | - family-names: "Weiner" 24 | given-names: "Zach" 25 | - family-names: "Weigert" 26 | given-names: "Martin" 27 | - family-names: "Palmer" 28 | given-names: "Rebecca" 29 | - family-names: "Latham" 30 | given-names: "Shane" 31 | - family-names: "Magno" 32 | given-names: "Gonçalo" 33 | - family-names: "Fuller" 34 | given-names: "Henry" 35 | - family-names: "Mackenzie" 36 | given-names: "Jonathan" 37 | - family-names: "Niarchos" 38 | given-names: "Sotiris" 39 | - family-names: "Gill" 40 | given-names: "Shahzaib" 41 | - family-names: "Gohlke" 42 | given-names: "Christoph" 43 | - family-names: "Bhosale" 44 | given-names: "Aditya" 45 | - family-names: "Rothberg" 46 | given-names: "Alex" 47 | - family-names: "Ey" 48 | given-names: "Emanuel" 49 | - family-names: "Rapp" 50 | given-names: "Holger" 51 | - family-names: "van der Walt" 52 | given-names: "Stefan" 53 | # Removed pending resolution of https://github.com/zenodo/zenodo/issues/2343 54 | # - alias: "gw0" 55 | - family-names: "Thalhammer" 56 | given-names: "Gregor" 57 | - family-names: "Kieffer" 58 | given-names: "Jerome" 59 | - family-names: "Poliarnyi" 60 | given-names: "Nikolai" 61 | - family-names: "Bollinger" 62 | given-names: "Drew" 63 | - family-names: "Nitz" 64 | given-names: "Alex" 65 | - family-names: "Bokota" 66 | given-names: "Grzegorz" 67 | orcid: 'https://orcid.org/0000-0002-5470-1676' 68 | 69 | title: "PyOpenCL" 70 | version: 2022.1.3 71 | doi: 
10.5281/zenodo.6533956 72 | date-released: 2022-03-10 73 | url: "https://github.com/inducer/pyopencl" 74 | license: MIT 75 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17...3.26) 2 | 3 | project(pyopencl LANGUAGES CXX VERSION ${SKBUILD_PROJECT_VERSION}) 4 | 5 | if(NOT SKBUILD) 6 | message(WARNING "\ 7 | This CMake file is meant to be executed using 'scikit-build'. Running 8 | it directly will almost certainly not produce the desired result. If 9 | you are a user trying to install this package, please use the command 10 | below, which will install all necessary build dependencies, compile 11 | the package in an isolated environment, and then install it. 12 | ===================================================================== 13 | $ pip install . 14 | ===================================================================== 15 | If you are a software developer, and this is your own package, then 16 | it is usually much more efficient to install the build dependencies 17 | in your environment once and use the following command that avoids 18 | a costly creation of a new virtual environment at every compilation: 19 | ===================================================================== 20 | $ pip install nanobind scikit-build-core[pyproject] 21 | $ pip install --no-build-isolation -ve . 22 | ===================================================================== 23 | You may optionally add -Ceditable.rebuild=true to auto-rebuild when 24 | the package is imported. 
Otherwise, you need to re-run the above 25 | after editing C++ files.") 26 | endif() 27 | 28 | # {{{ Options 29 | 30 | option(PYOPENCL_TRACE "Enable OpenCL tracing" $ENV{PYOPENCL_TRACE}) 31 | option(PYOPENCL_ENABLE_GL "Enable OpenGL interoperability" $ENV{PYOPENCL_ENABLE_GL}) 32 | option(PYOPENCL_USE_SHIPPED_EXT "Use shipped CL extension header" $ENV{PYOPENCL_USE_SHIPPED_EXT}) 33 | 34 | set(CL_INC_DIR CACHE STRING "OpenCL include directory") 35 | set(CL_LIB_DIR CACHE STRING "OpenCL library directory") 36 | set(CL_LIBNAME CACHE STRING "OpenCL library name") 37 | 38 | set(PYOPENCL_PRETEND_CL_VERSION CACHE STRING "Pretend to be a different OpenCL version") 39 | 40 | if(NOT CL_INC_DIR) 41 | message(STATUS "CL_INC_DIR not set, trying to guess it from environment variables.") 42 | if(DEFINED ENV{CL_INC_DIR}) 43 | message(STATUS "Using OpenCL include directory from environment '$ENV{CL_INC_DIR}'") 44 | set(CL_INC_DIR $ENV{CL_INC_DIR}) 45 | endif() 46 | 47 | if(DEFINED ENV{CL_LIB_DIR}) 48 | message(STATUS "Using OpenCL library directory from environment '$ENV{CL_LIB_DIR}'") 49 | set(CL_LIB_DIR $ENV{CL_LIB_DIR}) 50 | endif() 51 | 52 | if(DEFINED ENV{CL_LIBNAME}) 53 | message(STATUS "Using OpenCL library name from environment '$ENV{CL_LIBNAME}'") 54 | set(CL_LIBNAME $ENV{CL_LIBNAME}) 55 | endif() 56 | endif(NOT CL_INC_DIR) 57 | 58 | if(NOT CL_INC_DIR) 59 | message(STATUS "CL_INC_DIR not set, trying to guess it from conda environment.") 60 | if(DEFINED ENV{CONDA_PREFIX}) 61 | # Linux/MacOS: 62 | if(EXISTS $ENV{CONDA_PREFIX}/lib/libOpenCL${CMAKE_SHARED_LIBRARY_SUFFIX}) 63 | message(STATUS "Found OpenCL in conda environment '$ENV{CONDA_PREFIX}'") 64 | set(CL_INC_DIR $ENV{CONDA_PREFIX}/include) 65 | set(CL_LIB_DIR $ENV{CONDA_PREFIX}/lib) 66 | set(CL_LIBNAME OpenCL) 67 | # Windows: 68 | elseif(EXISTS $ENV{CONDA_PREFIX}/Library/lib/OpenCL${CMAKE_STATIC_LIBRARY_SUFFIX}) 69 | message(STATUS "Found OpenCL in conda environment '$ENV{CONDA_PREFIX}'") 70 | set(CL_INC_DIR 
$ENV{CONDA_PREFIX}/Library/include) 71 | set(CL_LIB_DIR $ENV{CONDA_PREFIX}/Library/lib) 72 | set(CL_LIBNAME OpenCL) 73 | endif() 74 | 75 | endif(DEFINED ENV{CONDA_PREFIX}) 76 | endif(NOT CL_INC_DIR) 77 | 78 | if(NOT PYOPENCL_PRETEND_CL_VERSION) 79 | if(DEFINED ENV{PYOPENCL_PRETEND_CL_VERSION}) 80 | set(PYOPENCL_PRETEND_CL_VERSION $ENV{PYOPENCL_PRETEND_CL_VERSION}) 81 | endif() 82 | endif() 83 | 84 | if(PYOPENCL_PRETEND_CL_VERSION) 85 | # Split the version string into a list 86 | string(REPLACE "." ";" VERSION_LIST ${PYOPENCL_PRETEND_CL_VERSION}) 87 | 88 | # Get the major and minor version numbers 89 | list(GET VERSION_LIST 0 MAJOR) 90 | list(GET VERSION_LIST 1 MINOR) 91 | 92 | # Calculate the numerical value 93 | math(EXPR ARG "0x1000*${MAJOR} + 0x10*${MINOR}") 94 | message(STATUS "Pretending to use OpenCL version ${PYOPENCL_PRETEND_CL_VERSION} (${ARG})") 95 | set(PYOPENCL_PRETEND_CL_VERSION ${ARG}) 96 | endif() 97 | 98 | message(STATUS "CL_INC_DIR ${CL_INC_DIR}") 99 | message(STATUS "CL_LIB_DIR ${CL_LIB_DIR}") 100 | message(STATUS "CL_LIBNAME ${CL_LIBNAME}") 101 | 102 | # }}} 103 | 104 | # {{{ Get version information 105 | 106 | find_program(GIT git) 107 | 108 | if(GIT AND EXISTS ${CMAKE_SOURCE_DIR}/.git) 109 | # Exact tag match => released version 110 | execute_process(COMMAND git describe --exact-match --dirty=* 111 | OUTPUT_VARIABLE PYOPENCL_VERSION_GIT 112 | RESULT_VARIABLE git_result 113 | OUTPUT_STRIP_TRAILING_WHITESPACE 114 | ERROR_QUIET 115 | ) 116 | if(NOT ${git_result} EQUAL 0) 117 | # No exact tag match => development version 118 | execute_process(COMMAND git describe --long --always --dirty=* 119 | OUTPUT_VARIABLE PYOPENCL_VERSION_GIT 120 | OUTPUT_STRIP_TRAILING_WHITESPACE 121 | ) 122 | set(PYOPENCL_REL "(dev)") 123 | else() 124 | set(PYOPENCL_REL "(release)") 125 | endif() 126 | else() 127 | set(PYOPENCL_VERSION_GIT "v${PROJECT_VERSION}") 128 | set(PYOPENCL_REL "(non-git)") 129 | endif() 130 | 131 | # }}} 132 | 133 | find_package(Python COMPONENTS 
Interpreter Development.Module NumPy REQUIRED) 134 | 135 | if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 136 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) 137 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 138 | endif() 139 | 140 | # {{{ Detect nanobind and import it 141 | 142 | execute_process( 143 | COMMAND 144 | "${PYTHON_EXECUTABLE}" -c "import nanobind; print(nanobind.__version__)" 145 | OUTPUT_VARIABLE NANOBIND_VERSION 146 | OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT) 147 | 148 | execute_process( 149 | COMMAND 150 | "${PYTHON_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())" 151 | OUTPUT_VARIABLE NANOBIND_DIR 152 | OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT) 153 | list(APPEND CMAKE_PREFIX_PATH "${NANOBIND_DIR}") 154 | 155 | # }}} 156 | 157 | link_directories(${CL_LIB_DIR}) 158 | include_directories(${CL_INC_DIR} ${Python_NumPy_INCLUDE_DIRS}) 159 | 160 | find_package(nanobind CONFIG REQUIRED) 161 | 162 | set(OpenCL_ROOT ${CL_LIB_DIR}) 163 | set(OpenCL_INCLUDE_DIR ${CL_INC_DIR}) 164 | set(OpenCL_LIBRARY ${CL_LIBNAME}) 165 | find_package(OpenCL REQUIRED) 166 | 167 | nanobind_add_module( 168 | _cl 169 | NB_STATIC # Build static libnanobind (the extension module itself remains a shared library) 170 | LTO 171 | NOMINSIZE 172 | src/wrap_constants.cpp 173 | src/wrap_cl.cpp 174 | src/wrap_cl_part_1.cpp 175 | src/wrap_cl_part_2.cpp 176 | src/wrap_mempool.cpp 177 | src/bitlog.cpp 178 | ) 179 | 180 | target_link_libraries(_cl PRIVATE ${OpenCL_LIBRARY}) 181 | 182 | target_compile_definitions(_cl 183 | PRIVATE 184 | PYGPU_PACKAGE=pyopencl 185 | PYGPU_PYOPENCL 186 | ) 187 | 188 | if (PYOPENCL_PRETEND_CL_VERSION) 189 | target_compile_definitions( 190 | _cl PRIVATE PYOPENCL_PRETEND_CL_VERSION=${PYOPENCL_PRETEND_CL_VERSION}) 191 | endif() 192 | 193 | if (PYOPENCL_ENABLE_GL) 194 | target_compile_definitions(_cl PRIVATE HAVE_GL=1) 195 | endif() 
196 | 197 | if (PYOPENCL_TRACE) 198 | target_compile_definitions(_cl PRIVATE PYOPENCL_TRACE=1) 199 | endif() 200 | 201 | if (PYOPENCL_USE_SHIPPED_EXT) 202 | target_compile_definitions(_cl PRIVATE PYOPENCL_USE_SHIPPED_EXT=1) 203 | endif() 204 | 205 | install(TARGETS _cl LIBRARY DESTINATION pyopencl) 206 | 207 | 208 | # {{{ Print configuration 209 | 210 | message("==============================") 211 | message("PyOpenCL ${PYOPENCL_VERSION_GIT} ${PYOPENCL_REL} configuration: ") 212 | message(" PyOpenCL options: PYOPENCL_TRACE=${PYOPENCL_TRACE} PYOPENCL_ENABLE_GL=${PYOPENCL_ENABLE_GL} PYOPENCL_USE_SHIPPED_EXT=${PYOPENCL_USE_SHIPPED_EXT} PYOPENCL_PRETEND_CL_VERSION=${PYOPENCL_PRETEND_CL_VERSION}") 213 | message(" OpenCL: ${OpenCL_LIBRARIES} [${OpenCL_VERSION_STRING}]") 214 | message(" Python: ${Python_EXECUTABLE} [${Python_VERSION}]") 215 | message(" Build type: ${CMAKE_BUILD_TYPE}") 216 | message(" C++ compiler: ${CMAKE_CXX_COMPILER} [${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}]") 217 | message(" CMake: ${CMAKE_COMMAND} [${CMAKE_VERSION}]") 218 | message(" Nanobind: ${NANOBIND_DIR} [${NANOBIND_VERSION}]") 219 | message(" Build tool: ${CMAKE_MAKE_PROGRAM}") 220 | message("==============================") 221 | 222 | # }}} 223 | 224 | # vim: foldmethod=marker:sw=2 225 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | PyOpenCL is licensed to you under the MIT/X Consortium license: 2 | 3 | Copyright (c) 2009-13 Andreas Klöckner and Contributors. 
4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | 26 | PyOpenCL includes derivatives of parts of the `Thrust 27 | `_ computing package (in particular the scan 28 | implementation). These parts are licensed as follows: 29 | 30 | Copyright 2008-2011 NVIDIA Corporation 31 | 32 | Licensed under the Apache License, Version 2.0 (the "License"); 33 | you may not use this file except in compliance with the License. 34 | You may obtain a copy of the License at 35 | 36 | 37 | 38 | Unless required by applicable law or agreed to in writing, software 39 | distributed under the License is distributed on an "AS IS" BASIS, 40 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 41 | See the License for the specific language governing permissions and 42 | limitations under the License. 43 | 44 | .. 
note:: 45 | 46 | If you use Apache-licensed parts, be aware that these may be incompatible 47 | with software licensed exclusively under GPL2. (Most software is licensed 48 | as GPL2 or later, in which case this is not an issue.) 49 | 50 | PyOpenCL includes parts of the Random123 suite of random number generators: 51 | 52 | Copyright 2010-2012, D. E. Shaw Research. 53 | All rights reserved. 54 | 55 | Redistribution and use in source and binary forms, with or without 56 | modification, are permitted provided that the following conditions are 57 | met: 58 | 59 | * Redistributions of source code must retain the above copyright 60 | notice, this list of conditions, and the following disclaimer. 61 | 62 | * Redistributions in binary form must reproduce the above copyright 63 | notice, this list of conditions, and the following disclaimer in the 64 | documentation and/or other materials provided with the distribution. 65 | 66 | * Neither the name of D. E. Shaw Research nor the names of its 67 | contributors may be used to endorse or promote products derived from 68 | this software without specific prior written permission. 69 | 70 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 71 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 72 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 73 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 74 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 75 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 76 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 77 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 78 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 79 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 80 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
81 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyOpenCL: Pythonic Access to OpenCL, with Arrays and Algorithms 2 | =============================================================== 3 | 4 | .. |badge-gitlab-ci| image:: https://gitlab.tiker.net/inducer/pyopencl/badges/main/pipeline.svg 5 | :alt: Gitlab Build Status 6 | :target: https://gitlab.tiker.net/inducer/pyopencl/commits/main 7 | .. |badge-github-ci| image:: https://github.com/inducer/pyopencl/actions/workflows/ci.yml/badge.svg 8 | :alt: Github Build Status 9 | :target: https://github.com/inducer/pyopencl/actions/workflows/ci.yml 10 | .. |badge-pypi| image:: https://badge.fury.io/py/pyopencl.svg 11 | :alt: Python Package Index Release Page 12 | :target: https://pypi.org/project/pyopencl/ 13 | .. |badge-zenodo| image:: https://zenodo.org/badge/1575307.svg 14 | :alt: Zenodo DOI for latest release 15 | :target: https://zenodo.org/badge/latestdoi/1575307 16 | 17 | |badge-gitlab-ci| |badge-github-ci| |badge-pypi| |badge-zenodo| 18 | 19 | PyOpenCL lets you access GPUs and other massively parallel compute 20 | devices from Python. It tries to offer computing goodness in the 21 | spirit of its sister project `PyCUDA `__: 22 | 23 | * Object cleanup tied to lifetime of objects. This idiom, often 24 | called `RAII `__ 25 | in C++, makes it much easier to write correct, leak- and 26 | crash-free code. 27 | 28 | * Completeness. PyOpenCL puts the full power of OpenCL's API at 29 | your disposal, if you wish. Every obscure ``get_info()`` query and 30 | all CL calls are accessible. 31 | 32 | * Automatic Error Checking. All CL errors are automatically 33 | translated into Python exceptions. 34 | 35 | * Speed. PyOpenCL's base layer is written in C++, so all the niceties 36 | above are virtually free. 37 | 38 | * Helpful and complete `Documentation `__ 39 | as well as a `Wiki `__. 
40 | 41 | * Liberal license. PyOpenCL is open-source under the 42 | `MIT license `__ 43 | and free for commercial, academic, and private use. 44 | 45 | * Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's 46 | CL implementations. 47 | 48 | Simple 4-step `install instructions `__ 49 | using Conda on Linux and macOS (that also install a working OpenCL implementation!) 50 | can be found in the `documentation `__. 51 | 52 | What you'll need if you do *not* want to use the convenient instructions above and 53 | instead build from source: 54 | 55 | * g++/clang new enough to be compatible with nanobind (specifically, full support of C++17 is needed) 56 | * `numpy `__, and 57 | * an OpenCL implementation. (See this `howto `__ 58 | for how to get one.) 59 | 60 | Links 61 | ----- 62 | 63 | * `Documentation `__ 64 | (read how things work) 65 | * `Python package index `__ 66 | (download releases, including binary wheels for Linux, macOS, Windows) 67 | * `Conda Forge `__ 68 | (download binary packages for Linux, macOS, Windows) 69 | * `Github `__ 70 | (get latest source code, file bugs) 71 | -------------------------------------------------------------------------------- /TODOs: -------------------------------------------------------------------------------- 1 | - *_from_int_ptr, register with metaclass 2 | - generic_info 3 | - Incorporate fixes in C++ stuff from after the fork 4 | - compare and tests 5 | - MemoryPool 6 | - enqueue_nd_range_kernel size/offset mess 7 | 8 | - CommandQueue.set_property 9 | - GLBuffer 10 | - GLRenderBuffer 11 | - GLTexture 12 | - get_gl_context_info_khr 13 | - ?clEnqueueNativeKernel 14 | 15 | - Buffer interface functions should really use new-style buffer interface 16 | (old-style does not work in Py3) 17 | https://github.com/numpy/numpy/issues/4747 18 | -------------------------------------------------------------------------------- /contrib/cldis.py: 
-------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (C) 2022 Isuru Fernando" 2 | 3 | __license__ = """ 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | """ 22 | 23 | """ 24 | cldis.py 25 | 26 | A script to compile and print the native code for a OpenCL kernel. 
def main(ctx, tmp_dir, cl_str, output=None, build_options=()):
    """Build *cl_str* for the first device of *ctx* and print its native code.

    :arg ctx: the context to build in (created via
        ``pyopencl.create_some_context()`` by the script entry point). Only
        ``ctx.devices[0]`` is inspected to decide which kinds of output
        are available.
    :arg tmp_dir: scratch directory. The program build cache, the
        intermediate ``cl.ptx`` file and the ``ptxas`` output are placed
        there, and it is searched for ``*.so`` files for ``"asm"`` output.
    :arg cl_str: OpenCL C source code of the program.
    :arg output: one of ``"ptx"``, ``"sass"`` or ``"asm"``. *None* selects
        the first kind of output supported by the device.
    :arg build_options: options passed on to ``Program.build``.

    :raises NotImplementedError: if the platform or device is not one of
        the known NVIDIA/pocl combinations.
    :raises ValueError: if *output* is not supported on the device, or if
        a program binary does not contain the expected PTX marker.
    :raises RuntimeError: if the PTX has no ``.target`` directive.
    :raises FileNotFoundError: if no shared object is found below *tmp_dir*
        when ``"asm"`` output was requested.
    """
    device = ctx.devices[0]
    platform = device.platform
    if platform.name == "NVIDIA CUDA":
        supported_outputs = ["ptx", "sass"]
    elif platform.name == "Portable Computing Language":
        if device.name.startswith("NVIDIA"):
            supported_outputs = ["ptx", "sass"]
        elif device.name.startswith(("pthread", "cpu")):
            supported_outputs = ["asm"]
        else:
            raise NotImplementedError(f"Unknown pocl device '{device.name}'")
    else:
        raise NotImplementedError(f"Unknown opencl device '{device}'")

    if output is None:
        output = supported_outputs[0]
    elif output not in supported_outputs:
        # Explicit exception instead of 'assert': the value comes straight
        # from the command line and asserts vanish under 'python -O'.
        raise ValueError(
            f"Unsupported output '{output}' for this device, "
            f"expected one of: {', '.join(supported_outputs)}")

    # Imported here (after argument validation) so that main() also works
    # when this file is imported as a module; when run as a script, the
    # entry point below has already imported pyopencl at this point.
    import pyopencl as cl

    prg = cl.Program(ctx, cl_str).build(
        options=build_options,
        cache_dir=os.path.join(tmp_dir, "cache"))

    for binary in prg.binaries:
        if output in ("ptx", "sass"):
            # Skip whatever precedes the PTX text proper.
            ptx_start = binary.find(b"// Generated")
            if ptx_start < 0:
                raise ValueError(
                    "program binary does not contain a '// Generated' "
                    "PTX header")
            res = binary[ptx_start:].decode("utf-8")

            if output == "sass":
                with open(os.path.join(tmp_dir, "cl.ptx"), "w",
                          encoding="utf-8") as f:
                    f.write(res)

                # The PTX '.target' directive names the architecture
                # (e.g. 'sm_75', 'sm_90a') that ptxas must assemble for.
                target = re.search(r"\.target\s+(sm_[0-9]+[a-z]?)", res)
                if target is None:
                    raise RuntimeError(
                        "no '.target sm_NN' directive found in PTX")
                gpu_name = target.group(1)

                # Name the output file explicitly so that the cuobjdump
                # call below does not rely on ptxas' default file name.
                subprocess.check_call(
                    ["ptxas", "cl.ptx", "--verbose",
                     f"--gpu-name={gpu_name}", "--warn-on-spills",
                     "--output-file", "elf.o"],
                    cwd=tmp_dir)
                res = subprocess.check_output(
                    ["cuobjdump", "-sass", "elf.o"],
                    cwd=tmp_dir).decode("utf-8")
        else:
            # output == "asm": only offered for pocl CPU devices above.
            shared_objects = glob.glob(f"{tmp_dir}/**/*.so", recursive=True)
            if not shared_objects:
                raise FileNotFoundError(
                    f"no compiled shared object (*.so) found below "
                    f"'{tmp_dir}'")
            res = subprocess.check_output(
                ["objdump", "-d", shared_objects[0]]).decode("utf-8")

        print(res)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("Usage: python cldis.py prog.cl [output [build_options...]]")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Set before pyopencl is imported, as in the original script --
        # presumably so that pocl places its compiled kernels below
        # tmp_dir, which is where main() looks for '*.so' files.
        os.environ["POCL_CACHE_DIR"] = os.path.join(tmp_dir, "pocl_cache")
        import pyopencl as cl

        ctx = cl.create_some_context()
        with open(sys.argv[1]) as f:
            cl_str = f.read()
        output = sys.argv[2] if len(sys.argv) >= 3 else None
        # Slicing past the end of argv yields an empty list.
        build_options = sys.argv[3:]
        main(ctx, tmp_dir, cl_str, output, build_options)
7 | " 8 | " Installation: 9 | " Just drop this file into ~/.vim/syntax/pyopencl.vim 10 | " 11 | " Then do 12 | " :set filetype=pyopencl 13 | " and use 14 | " """//CL// ...code...""" 15 | " for OpenCL code included in your Python file. 16 | " 17 | " You may also include a line 18 | " vim: filetype=pyopencl.python 19 | " at the end of your file to set the file type automatically. 20 | " 21 | " Optional: Install opencl.vim from 22 | " http://www.vim.org/scripts/script.php?script_id=3157 23 | 24 | runtime! syntax/python.vim 25 | 26 | unlet b:current_syntax 27 | try 28 | syntax include @clCode syntax/opencl.vim 29 | catch 30 | syntax include @clCode syntax/c.vim 31 | endtry 32 | 33 | unlet b:current_syntax 34 | syn include @pythonTop syntax/python.vim 35 | 36 | " {{{ mako 37 | 38 | syn region clmakoLine start="^\s*%" skip="\\$" end="$" 39 | syn region clmakoVariable start=#\${# end=#}# contains=@pythonTop 40 | syn region clmakoBlock start=#<%!# end=#%># keepend contains=@pythonTop 41 | 42 | syn match clmakoAttributeKey containedin=clmakoTag contained "[a-zA-Z_][a-zA-Z0-9_]*=" 43 | syn region clmakoAttributeValue containedin=clmakoTag contained start=/"/ skip=/\\"/ end=/"/ 44 | syn region clmakoAttributeValue containedin=clmakoTag contained start=/'/ skip=/\\'/ end=/'/ 45 | 46 | syn region clmakoTag start="" end="/\?>" 47 | 48 | " The C highlighter's paren error detection screws up highlighting of 49 | " Mako variables in C parens--turn it off. 
50 | 51 | syn clear cParen 52 | syn clear cParenError 53 | if !exists("c_no_bracket_error") 54 | syn clear cBracket 55 | endif 56 | 57 | syn cluster clmakoCode contains=clmakoLine,clmakoVariable,clmakoBlock,clmakoTag 58 | 59 | hi link clmakoLine Preproc 60 | hi link clmakoVariable Preproc 61 | hi link clmakoBlock Preproc 62 | hi link clmakoTag Define 63 | hi link clmakoAttributeKey String 64 | hi link clmakoAttributeValue String 65 | 66 | " }}} 67 | 68 | syn region pythonCLString 69 | \ start=+[uU]\=\z('''\|"""\)//CL\(:[a-zA-Z_0-9]\+\)\?//+ end="\z1" keepend 70 | \ contains=@clCode,@clmakoCode 71 | 72 | syn region pythonCLRawString 73 | \ start=+[uU]\=[rR]\z('''\|"""\)//CL\(:[a-zA-Z_0-9]\+\)\?//+ end="\z1" keepend 74 | \ contains=@clCode,@clmakoCode 75 | 76 | " Uncomment if you still want the code highlighted as a string. 77 | " hi link pythonCLString String 78 | " hi link pythonCLRawString String 79 | 80 | syntax sync fromstart 81 | 82 | let b:current_syntax = "pyopencl" 83 | 84 | " vim: foldmethod=marker 85 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | constants.inc 2 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -W -n 7 | SPHINXBUILD ?= python $(shell which sphinx-build) 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | constants: 16 | python make_constants.py > constants.inc 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile constants 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /doc/algorithm.rst: -------------------------------------------------------------------------------- 1 | Parallel Algorithms 2 | =================== 3 | 4 | .. include:: subst.rst 5 | 6 | Element-wise expression evaluation ("map") 7 | ------------------------------------------ 8 | 9 | .. module:: pyopencl.elementwise 10 | 11 | Evaluating involved expressions on :class:`pyopencl.array.Array` instances by 12 | using overloaded operators can be somewhat inefficient, because a new temporary 13 | is created for each intermediate result. The functionality in the module 14 | :mod:`pyopencl.elementwise` contains tools to help generate kernels that 15 | evaluate multi-stage expressions on one or several operands in a single pass. 16 | 17 | .. autoclass:: ElementwiseKernel 18 | 19 | Here's a usage example: 20 | 21 | .. literalinclude:: ../examples/demo_elementwise.py 22 | 23 | (You can find this example as 24 | :download:`examples/demo_elementwise.py <../examples/demo_elementwise.py>` 25 | in the PyOpenCL distribution.) 26 | 27 | .. _custom-reductions: 28 | 29 | Sums and counts ("reduce") 30 | -------------------------- 31 | 32 | .. module:: pyopencl.reduction 33 | 34 | .. 
autoclass:: ReductionKernel 35 | 36 | Here's a usage example:: 37 | 38 | a = pyopencl.array.arange(queue, 400, dtype=numpy.float32) 39 | b = pyopencl.array.arange(queue, 400, dtype=numpy.float32) 40 | 41 | krnl = ReductionKernel(ctx, numpy.float32, neutral="0", 42 | reduce_expr="a+b", map_expr="x[i]*y[i]", 43 | arguments="__global float *x, __global float *y") 44 | 45 | my_dot_prod = krnl(a, b).get() 46 | 47 | .. _custom-scan: 48 | 49 | Prefix Sums ("scan") 50 | -------------------- 51 | 52 | .. module:: pyopencl.scan 53 | 54 | .. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying 55 | extra arguments to pass to the scan procedure. For version 2013.1, 56 | *value* must be of a :mod:`numpy` sized scalar type. As of version 2013.2, 57 | *value* may also be a :class:`pyopencl.array.Array`. 58 | .. |preamble| replace:: A snippet of C that is inserted into the compiled kernel 59 | before the actual kernel function. May be used for, e.g. type definitions 60 | or include statements. 61 | 62 | A prefix sum is a running sum of an array, as provided by 63 | e.g. :func:`numpy.cumsum`:: 64 | 65 | >>> import numpy as np 66 | >>> a = [1,1,1,1,1,2,2,2,2,2] 67 | >>> np.cumsum(a) 68 | array([ 1, 2, 3, 4, 5, 7, 9, 11, 13, 15]) 69 | 70 | This is a very simple example of what a scan can do. It turns out that scans 71 | are significantly more versatile. They are a basic building block of many 72 | non-trivial parallel algorithms. Many of the operations enabled by scans seem 73 | difficult to parallelize because of loop-carried dependencies. 74 | 75 | .. seealso:: 76 | 77 | `Prefix sums and their applications `__, by Guy Blelloch. 78 | This article gives an overview of some surprising applications of scans. 79 | 80 | :ref:`predefined-scans` 81 | These operations built into PyOpenCL are realized using 82 | :class:`GenericScanKernel`.
83 | 84 | Usage Example 85 | ^^^^^^^^^^^^^ 86 | 87 | This example illustrates the implementation of a simplified version of 88 | :func:`pyopencl.algorithm.copy_if`, 89 | which copies integers from an array into the (variable-size) output if they are 90 | greater than 300:: 91 | 92 | knl = GenericScanKernel( 93 | ctx, np.int32, 94 | arguments="__global int *ary, __global int *out", 95 | input_expr="(ary[i] > 300) ? 1 : 0", 96 | scan_expr="a+b", neutral="0", 97 | output_statement=""" 98 | if (prev_item != item) out[item-1] = ary[i]; 99 | """) 100 | 101 | out = a.copy() 102 | knl(a, out) 103 | 104 | a_host = a.get() 105 | out_host = a_host[a_host > 300] 106 | 107 | assert (out_host == out.get()[:len(out_host)]).all() 108 | 109 | The value being scanned over is a number of flags indicating whether each array 110 | element is greater than 300. These flags are computed by *input_expr*. The 111 | prefix sum over this array gives a running count of array items greater than 112 | 300. The *output_statement* then compares ``prev_item`` (the previous item's scan 113 | result, i.e. index) to ``item`` (the current item's scan result, i.e. 114 | index). If they differ, i.e. if the predicate was satisfied at this 115 | position, then the item is stored in the output at the computed index. 116 | 117 | This example does not make use of the following advanced features also available 118 | in PyOpenCL: 119 | 120 | * Segmented scans 121 | 122 | * Access to the previous item in *input_expr* (e.g. for comparisons) 123 | See the `implementation `__ 124 | of :func:`pyopencl.algorithm.unique` for an example. 125 | 126 | Making Custom Scan Kernels 127 | ^^^^^^^^^^^^^^^^^^^^^^^^^^ 128 | 129 | .. versionadded:: 2013.1 130 | 131 | .. autoclass:: GenericScanKernel 132 | 133 | Debugging aids 134 | ~~~~~~~~~~~~~~ 135 | 136 | .. autoclass:: GenericDebugScanKernel 137 | 138 | .. _predefined-scans: 139 | 140 | Simple / Legacy Interface 141 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 142 | 143 | ..
class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None) 144 | 145 | Generates a kernel that can compute a `prefix sum 146 | `__ 147 | using any associative operation given as *scan_expr*. 148 | *scan_expr* uses the formal values "a" and "b" to indicate two operands of 149 | an associative binary operation. *neutral* is the neutral element 150 | of *scan_expr*, obeying *scan_expr(a, neutral) == a*. 151 | 152 | *dtype* specifies the type of the arrays being operated on. 153 | *name_prefix* is used for kernel names to ensure recognizability 154 | in profiles and logs. *options* is a list of compiler options to use 155 | when building. *preamble* specifies a string of code that is 156 | inserted before the actual kernels. *devices* may be used to restrict 157 | the set of devices on which the kernel is meant to run. (defaults 158 | to all devices in the context *ctx*.) 159 | 160 | .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None) 161 | 162 | .. class:: InclusiveScanKernel(ctx, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) 163 | 164 | Works like :class:`ExclusiveScanKernel`. 165 | 166 | .. versionchanged:: 2013.1 167 | *neutral* is now always required. 168 | 169 | For the array ``[1, 2, 3]``, inclusive scan results in ``[1, 3, 6]``, and exclusive 170 | scan results in ``[0, 1, 3]``. 171 | 172 | Here's a usage example:: 173 | 174 | knl = InclusiveScanKernel(context, np.int32, "a+b") 175 | 176 | n = 2**20-2**18+5 177 | rng = np.random.default_rng(seed=42) 178 | host_data = rng.integers(0, 10, size=n, dtype=np.int32) 179 | dev_data = cl_array.to_device(queue, host_data) 180 | 181 | knl(dev_data) 182 | assert (dev_data.get() == np.cumsum(host_data, axis=0)).all() 183 | 184 | Predicated copies ("partition", "unique", ...) 185 | ---------------------------------------------- 186 | 187 | .. module:: pyopencl.algorithm 188 | 189 | ..
from urllib.request import urlopen


# Download the Sphinx configuration published in the inducer/sphinxconfig
# repository and execute it in this module's namespace, so that every name
# it defines becomes a setting of this conf.py. The assignments further
# down run afterwards and therefore take precedence over it.
# NOTE(review): this fetches and runs remote code whenever the docs are
# built, so a doc build needs network access and trusts that URL.
_conf_url = \
        "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
with urlopen(_conf_url) as _inf:
    exec(compile(_inf.read(), _conf_url, "exec"), globals())

# subst.rst is excluded from the set of source documents here; other
# .rst files pull it in through '.. include:: subst.rst'.
exclude_patterns = ["subst.rst"]

copyright = "2009-21, Andreas Kloeckner"

# Execute pyopencl/version.py into a scratch dict to read the version
# constants from it without importing the pyopencl package itself.
ver_dic = {}
with open("../pyopencl/version.py") as ver_file:
    ver_src = ver_file.read()
exec(compile(ver_src, "../pyopencl/version.py", "exec"), ver_dic)
# Dotted form of the VERSION sequence.
version = ".".join(str(x) for x in ver_dic["VERSION"])
# The full version, including alpha/beta/rc tags.
release = ver_dic["VERSION_TEXT"]

# Documentation sets of other projects that cross-references may point to.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "numpy": ("https://numpy.org/doc/stable/", None),
    "mako": ("https://docs.makotemplates.org/en/latest", None),
    "pytools": ("https://documen.tician.de/pytools", None),
}
doctest:: 62 | 63 | >>> ary_host = np.empty(20, my_struct) 64 | >>> ary_host["field1"].fill(217) 65 | >>> ary_host["field2"].fill(1000) 66 | >>> ary_host[13]["field2"] = 12 67 | >>> print(ary_host) #doctest: +NORMALIZE_WHITESPACE 68 | [(217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) 69 | (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) 70 | (217, 1000.) (217, 1000.) (217, 1000.) (217, 12.) (217, 1000.) 71 | (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.)] 72 | 73 | >>> ary = cl.array.to_device(queue, ary_host) 74 | 75 | We can then operate on the array with our own kernels: 76 | 77 | .. doctest:: 78 | 79 | >>> prg = cl.Program(ctx, my_struct_c_decl + """ 80 | ... __kernel void set_to_1(__global my_struct *a) 81 | ... { 82 | ... a[get_global_id(0)].field1 = 1; 83 | ... } 84 | ... """).build() 85 | 86 | >>> evt = prg.set_to_1(queue, ary.shape, None, ary.data) 87 | >>> print(ary) #doctest: +NORMALIZE_WHITESPACE 88 | [(1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) 89 | (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) 90 | (1, 1000.) (1, 12.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) 91 | (1, 1000.) (1, 1000.)] 92 | 93 | as well as with PyOpenCL's built-in operations: 94 | 95 | .. doctest:: 96 | 97 | >>> from pyopencl.elementwise import ElementwiseKernel 98 | >>> elwise = ElementwiseKernel(ctx, "my_struct *a", "a[i].field1 = 2;", 99 | ... preamble=my_struct_c_decl) 100 | >>> evt = elwise(ary) 101 | >>> print(ary) #doctest: +NORMALIZE_WHITESPACE 102 | [(2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) 103 | (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) 104 | (2, 1000.) (2, 12.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) 105 | (2, 1000.) 
(2, 1000.)] 106 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to PyOpenCL's documentation! 2 | ==================================== 3 | 4 | PyOpenCL gives you easy, Pythonic access to the `OpenCL 5 | `__ parallel computation API. 6 | What makes PyOpenCL special? 7 | 8 | * Object cleanup tied to lifetime of objects. This idiom, 9 | often called 10 | `RAII `__ 11 | in C++, makes it much easier to write correct, leak- and 12 | crash-free code. 13 | 14 | * Completeness. PyOpenCL puts the full power of OpenCL's API at your 15 | disposal, if you wish. Every obscure ``get_info()`` query and 16 | all CL calls are accessible. 17 | 18 | * Automatic Error Checking. All errors are automatically translated 19 | into Python exceptions. 20 | 21 | * Speed. PyOpenCL's base layer is written in C++, so all the niceties above 22 | are virtually free. 23 | 24 | * Helpful Documentation. You're looking at it. ;) 25 | 26 | * Liberal license. PyOpenCL is open-source under the 27 | :ref:`MIT license ` 28 | and free for commercial, academic, and private use. 29 | 30 | Here's an example, to give you an impression: 31 | 32 | .. literalinclude:: ../examples/demo.py 33 | 34 | (You can find this example as 35 | :download:`examples/demo.py <../examples/demo.py>` in the PyOpenCL 36 | source distribution.) 37 | 38 | Tutorials 39 | ========= 40 | 41 | * Gaston Hillar's `two-part article series 42 | `__ 43 | in Dr. Dobb's Journal provides a friendly introduction to PyOpenCL. 44 | * `Simon McIntosh-Smith `__ 45 | and `Tom Deakin `__'s course 46 | `Hands-on OpenCL `__ contains 47 | both `lecture slides `__ 48 | and `exercises (with solutions) `__ 49 | (The course covers PyOpenCL as well as OpenCL's C and C++ APIs.) 
50 | * PyOpenCL course at `PASI `__: Parts 51 | `1 `__ 52 | `2 `__ 53 | `3 `__ 54 | `4 `__ 55 | (YouTube, 2011) 56 | * PyOpenCL course at `DTU GPULab `__ and 57 | `Simula `__ (2011): 58 | `Lecture 1 `__ 59 | `Lecture 2 `__ 60 | `Problem set 1 `__ 61 | `Problem set 2 `__ 62 | * Ian Johnson's `PyOpenCL tutorial `__. 63 | 64 | Software that works with or enhances PyOpenCL 65 | ============================================= 66 | 67 | * Jon Roose's `pyclblas `__ 68 | (`code `__) 69 | makes BLAS in the form of `clBLAS `__ 70 | available from within :mod:`pyopencl` code. 71 | 72 | Two earlier wrappers continue to be available: 73 | one by `Eric Hunsberger `__ and one 74 | by `Lars Ericson `__. 75 | 76 | * Cedric Nugteren provides a wrapper for the 77 | `CLBlast `__ 78 | OpenCL BLAS library: 79 | `PyCLBlast `__. 80 | 81 | * Gregor Thalhammer's `gpyfft `__ provides a 82 | Python wrapper for the OpenCL FFT library clFFT from AMD. 83 | 84 | * Bogdan Opanchuk's `reikna `__ offers a 85 | variety of GPU-based algorithms (FFT, random number generation, matrix 86 | multiplication) designed to work with :class:`pyopencl.array.Array` objects. 87 | 88 | * Troels Henriksen, Ken Friis Larsen, and Cosmin Oancea's `Futhark 89 | `__ programming language offers a nice way to code 90 | nested-parallel programs with reductions and scans on data in 91 | :class:`pyopencl.array.Array` instances. 92 | 93 | * Robbert Harms and Alard Roebroeck's `MOT `__ 94 | offers a variety of GPU-enabled non-linear optimization algorithms and MCMC 95 | sampling routines for parallel optimization and sampling of multiple problems. 96 | 97 | * Vincent Favre-Nicolin's `pyvkfft `__ 98 | makes `vkfft `__ accessible from PyOpenCL. 99 | 100 | If you know of a piece of software you feel that should be on this list, please 101 | let me know, or, even better, send a patch! 102 | 103 | Contents 104 | ======== 105 | 106 | .. 
toctree:: 107 | :maxdepth: 2 108 | 109 | runtime 110 | runtime_const 111 | runtime_platform 112 | runtime_queue 113 | runtime_memory 114 | runtime_program 115 | runtime_gl 116 | tools 117 | array 118 | types 119 | algorithm 120 | howto 121 | misc 122 | 🚀 Github 123 | 💾 Download Releases 124 | 125 | Note that this guide does not explain OpenCL programming and technology. Please 126 | refer to the official `Khronos OpenCL documentation `__ 127 | for that. 128 | 129 | PyOpenCL also has its own `web site `__, 130 | where you can find updates, new versions, documentation, and support. 131 | 132 | Indices and tables 133 | ================== 134 | 135 | * :ref:`genindex` 136 | * :ref:`modindex` 137 | * :ref:`search` 138 | -------------------------------------------------------------------------------- /doc/runtime.rst: -------------------------------------------------------------------------------- 1 | .. _reference-doc: 2 | 3 | .. include:: subst.rst 4 | 5 | OpenCL Runtime: Basics 6 | ====================== 7 | 8 | Version Queries 9 | --------------- 10 | 11 | .. module:: pyopencl 12 | .. moduleauthor:: Andreas Kloeckner 13 | 14 | .. data:: VERSION 15 | 16 | Gives the numeric version of PyOpenCL as a variable-length tuple 17 | of integers. Enables easy version checks such as 18 | ``VERSION >= (0, 93)``. 19 | 20 | .. data:: VERSION_STATUS 21 | 22 | A text string such as ``"rc4"`` or ``"beta"`` qualifying the status 23 | of the release. 24 | 25 | .. data:: VERSION_TEXT 26 | 27 | The full release name (such as ``"0.93rc4"``) in string form. 28 | 29 | .. function:: get_cl_header_version() 30 | 31 | Return a variable-length tuple of integers representing the 32 | version of the OpenCL header against which PyOpenCL was 33 | compiled. 34 | 35 | .. versionadded:: 0.92 36 | 37 | .. _errors: 38 | 39 | Error Reporting 40 | --------------- 41 | 42 | .. class:: Error 43 | 44 | Base class for all PyOpenCL exceptions. 45 | 46 | .. class:: MemoryError 47 | 48 | .. 
class:: LogicError 49 | 50 | .. class:: RuntimeError 51 | 52 | -------------------------------------------------------------------------------- /doc/runtime_const.rst: -------------------------------------------------------------------------------- 1 | OpenCL Runtime: Constants 2 | ========================= 3 | 4 | .. currentmodule:: pyopencl 5 | 6 | .. include:: constants.inc 7 | 8 | .. class:: NameVersion 9 | 10 | Describes the version of a specific feature. 11 | 12 | .. note:: 13 | 14 | Only available with OpenCL 3.0 or newer. 15 | 16 | .. versionadded:: 2020.3 17 | 18 | .. method:: __init__(version, name) 19 | .. attribute:: version 20 | .. attribute:: name 21 | 22 | .. class:: DeviceTopologyAmd 23 | 24 | .. method:: __init__(bus, device, function) 25 | .. attribute:: type 26 | .. attribute:: bus 27 | .. attribute:: device 28 | .. attribute:: function 29 | 30 | .. vim: shiftwidth=4 31 | -------------------------------------------------------------------------------- /doc/runtime_gl.rst: -------------------------------------------------------------------------------- 1 | .. include:: subst.rst 2 | 3 | .. _gl-interop: 4 | 5 | OpenCL Runtime: OpenGL Interoperability 6 | ======================================= 7 | 8 | .. currentmodule:: pyopencl 9 | 10 | Functionality in this section is only available when PyOpenCL is compiled 11 | with GL support. See :func:`have_gl`. 12 | 13 | .. versionadded:: 0.91 14 | 15 | .. function:: have_gl() 16 | 17 | Return *True* if PyOpenCL was compiled with OpenGL interoperability, 18 | otherwise *False*. 19 | 20 | .. function:: get_gl_sharing_context_properties() 21 | 22 | Return a :class:`list` of :class:`context_properties` that will 23 | allow a newly created context to share the currently active GL 24 | context. 25 | 26 | .. function:: get_apple_cgl_share_group() 27 | 28 | Get share group handle for current CGL context. 29 | 30 | Apple OS X only. 31 | 32 | .. versionadded:: 2011.1 33 | 34 | .. 
class:: GLBuffer(context, flags, bufobj) 35 | 36 | :class:`GLBuffer` inherits from :class:`MemoryObject`. 37 | 38 | .. attribute:: gl_object 39 | 40 | .. class:: GLRenderBuffer(context, flags, bufobj) 41 | 42 | :class:`GLRenderBuffer` inherits from :class:`MemoryObject`. 43 | 44 | .. attribute:: gl_object 45 | 46 | .. class:: GLTexture(context, flags, texture_target, miplevel, texture, dims) 47 | 48 | :class:`GLTexture` inherits from :class:`Image`. Only available in OpenCL 1.2 49 | and newer. 50 | 51 | .. attribute:: gl_object 52 | 53 | .. method:: get_gl_texture_info(param) 54 | 55 | See ``gl_texture_info`` for values of *param*. Only available when 56 | PyOpenCL is compiled with GL support. See :func:`have_gl`. 57 | 58 | .. function:: enqueue_acquire_gl_objects(queue, mem_objects, wait_for=None) 59 | 60 | *mem_objects* is a list of :class:`MemoryObject` instances. 61 | |std-enqueue-blurb| 62 | 63 | .. function:: enqueue_release_gl_objects(queue, mem_objects, wait_for=None) 64 | 65 | *mem_objects* is a list of :class:`MemoryObject` instances. |std-enqueue-blurb| 66 | 67 | .. function:: get_gl_context_info_khr(properties, param_name, platform=None) 68 | 69 | Get information on which CL device corresponds to a given 70 | GL/EGL/WGL/CGL device. 71 | 72 | See the :class:`Context` constructor for the meaning of 73 | *properties* and :class:`gl_context_info` for *param_name*. 74 | 75 | 76 | .. versionchanged:: 2011.2 77 | Accepts the *platform* argument. Using *platform* equal to None is 78 | deprecated as of PyOpenCL 2011.2. 79 | -------------------------------------------------------------------------------- /doc/runtime_platform.rst: -------------------------------------------------------------------------------- 1 | .. include:: subst.rst 2 | 3 | OpenCL Runtime: Platforms, Devices and Contexts 4 | =============================================== 5 | 6 | .. currentmodule:: pyopencl 7 | 8 | Platform 9 | -------- 10 | 11 | .. 
function:: get_platforms() 12 | 13 | Return a list of :class:`Platform` instances. 14 | 15 | .. class:: Platform 16 | 17 | .. attribute:: info 18 | 19 | Lower case versions of the :class:`platform_info` constants 20 | may be used as attributes on instances of this class 21 | to directly query info attributes. 22 | 23 | .. method:: get_info(param) 24 | 25 | See :class:`platform_info` for values of *param*. 26 | 27 | .. method:: get_devices(device_type=device_type.ALL) 28 | 29 | Return a list of devices matching *device_type*. 30 | See :class:`device_type` for values of *device_type*. 31 | 32 | .. versionchanged:: 2013.2 33 | 34 | This used to raise an exception if no matching 35 | devices were found. Now, it will simply return 36 | an empty list. 37 | 38 | .. automethod:: from_int_ptr 39 | .. autoattribute:: int_ptr 40 | 41 | |comparable| 42 | 43 | Device 44 | ------ 45 | 46 | .. class:: Device 47 | 48 | Two instances of this class may be compared using *"=="* and *"!="*. 49 | 50 | .. attribute:: info 51 | 52 | Lower case versions of the :class:`device_info` constants 53 | may be used as attributes on instances of this class 54 | to directly query info attributes. 55 | 56 | .. method:: get_info(param) 57 | 58 | See :class:`device_info` for values of *param*. 59 | 60 | .. automethod:: from_int_ptr 61 | .. autoattribute:: int_ptr 62 | 63 | .. attribute :: hashable_model_and_version_identifier 64 | 65 | An unspecified data type that can be used to (as precisely as possible, 66 | given identifying information available in OpenCL) identify a given 67 | model and software stack version of a compute device. Note that this 68 | identifier does not differentiate between different instances of the 69 | same device installed in a single host. 70 | 71 | The returned data type is hashable. 72 | 73 | .. versionadded:: 2020.1 74 | 75 | ..
method:: create_sub_devices(properties) 76 | 77 | *properties* is an array of one (or more) of the forms:: 78 | 79 | [ dpp.EQUALLY, 8] 80 | [ dpp.BY_COUNTS, 5, 7, 9, dpp.PARTITION_BY_COUNTS_LIST_END] 81 | [ dpp.BY_NAMES, 5, 7, 9, dpp.PARTITION_BY_NAMES_LIST_END] 82 | [ dpp.BY_AFFINITY_DOMAIN, dad.L1_CACHE] 83 | 84 | where ``dpp`` represents :class:`device_partition_property` 85 | and ``dad`` represents :class:`device_affinity_domain`. 86 | 87 | ``PROPERTIES_LIST_END_EXT`` is added automatically. 88 | 89 | Only available with CL 1.2. 90 | 91 | .. versionadded:: 2011.2 92 | 93 | .. method:: device_and_host_timer 94 | 95 | :returns: a tuple ``(device_timestamp, host_timestamp)``. 96 | 97 | Only available with CL 2.0. 98 | 99 | .. versionadded:: 2020.3 100 | 101 | .. method:: host_timer 102 | 103 | Only available with CL 2.0. 104 | 105 | .. versionadded:: 2020.3 106 | 107 | .. autofunction:: choose_devices 108 | 109 | Context 110 | ------- 111 | 112 | .. class:: Context(devices=None, properties=None, dev_type=None) 113 | 114 | Create a new context. *properties* is a list of key-value 115 | tuples, where each key must be one of :class:`context_properties`. 116 | At most one of *devices* and *dev_type* may be not *None*, where 117 | *devices* is a list of :class:`Device` instances, and 118 | *dev_type* is one of the :class:`device_type` constants. 119 | If neither is specified, a context with a *dev_type* of 120 | :attr:`device_type.DEFAULT` is created. 121 | 122 | .. note:: 123 | 124 | Calling the constructor with no arguments may fail for 125 | CL drivers that support the OpenCL ICD (which applies to most modern systems). 126 | If you want similar, just-give-me-a-context-already behavior, we recommend 127 | :func:`create_some_context`. 128 | 129 | See e.g.
this 130 | `explanation by AMD 131 | `__: 132 | 133 | **What has changed?** 134 | 135 | In previous beta releases functions such as clGetDeviceIDs() and clCreateContext() 136 | accepted a NULL value for the platform parameter. This release no longer 137 | allows this - the platform must be a valid one obtained by using the platform API. 138 | 139 | .. note:: 140 | 141 | Because of how OpenCL changed in order to support Installable Client 142 | Drivers (ICDs) in OpenCL 1.1, the following will *look* reasonable 143 | but often will not actually work:: 144 | 145 | import pyopencl as cl 146 | ctx = cl.Context(dev_type=cl.device_type.ALL) 147 | 148 | Instead, make sure to choose a platform when choosing a device by type:: 149 | 150 | import pyopencl as cl 151 | 152 | platforms = cl.get_platforms() 153 | ctx = cl.Context( 154 | dev_type=cl.device_type.ALL, 155 | properties=[(cl.context_properties.PLATFORM, platforms[0])]) 156 | 157 | .. note:: 158 | 159 | For 160 | ``context_properties.CL_GL_CONTEXT_KHR``, 161 | ``context_properties.CL_EGL_DISPLAY_KHR``, 162 | ``context_properties.CL_GLX_DISPLAY_KHR``, 163 | ``context_properties.CL_WGL_HDC_KHR``, 164 | ``context_properties.CL_CGL_SHAREGROUP_KHR``, and 165 | ``context_properties.CL_CGL_SHAREGROUP_APPLE``, 166 | the value in the key-value pair is a PyOpenGL context or display 167 | instance. 168 | 169 | .. versionchanged:: 0.91.2 170 | Constructor argument *dev_type* added. 171 | 172 | .. attribute:: info 173 | 174 | Lower case versions of the :class:`context_info` constants 175 | may be used as attributes on instances of this class 176 | to directly query info attributes. 177 | 178 | .. method:: get_info(param) 179 | 180 | See :class:`context_info` for values of *param*. 181 | 182 | .. automethod:: from_int_ptr 183 | .. autoattribute:: int_ptr 184 | 185 | .. method:: set_default_device_command_queue(dev, queue) 186 | 187 | |comparable| 188 | 189 | ..
autofunction:: create_some_context 190 | -------------------------------------------------------------------------------- /doc/runtime_queue.rst: -------------------------------------------------------------------------------- 1 | .. include:: subst.rst 2 | 3 | OpenCL Runtime: Command Queues and Events 4 | ========================================= 5 | 6 | .. currentmodule:: pyopencl 7 | 8 | Command Queue 9 | ------------- 10 | 11 | .. class:: CommandQueue(context, device=None, properties=None) 12 | 13 | Create a new command queue. *properties* is a bit field 14 | consisting of :class:`command_queue_properties` values. 15 | 16 | If *device* is None, one of the devices in *context* is chosen 17 | in an implementation-defined manner. 18 | 19 | *properties* may be a bitwise combination of values from 20 | :class:`queue_properties` (or *None* which is equivalent to 21 | passing *0*). This is compatible with both OpenCL 1.x and 2.x. 22 | 23 | For OpenCL 2.0 and above, *properties* may also be a sequence 24 | of keys and values from :class:`queue_properties` as accepted 25 | by :c:func:`clCreateCommandQueueWithProperties` (see the OpenCL 26 | spec for details). The trailing *0* is added automatically 27 | and does not need to be included. 28 | 29 | A :class:`CommandQueue` may be used as a context manager, like this:: 30 | 31 | with cl.CommandQueue(self.cl_context) as queue: 32 | enqueue_stuff(queue, ...) 33 | 34 | :meth:`finish` is automatically called at the end of the ``with``-delimited 35 | context, and further operations on the queue are considered an error. 36 | 37 | .. versionadded:: 2013.1 38 | 39 | Context manager capability. 40 | 41 | .. versionchanged:: 2018.2 42 | 43 | Added the sequence-of-properties interface for OpenCL 2. 44 | 45 | .. versionchanged:: 2022.1.4 46 | 47 | Use of a command queue after its context manager completes 48 | is now considered an error. 
:mod:`pyopencl` will warn about this 49 | for a transitionary period and will start raising an exception 50 | in 2023. 51 | 52 | .. attribute:: info 53 | 54 | Lower case versions of the :class:`command_queue_info` constants 55 | may be used as attributes on instances of this class 56 | to directly query info attributes. 57 | 58 | .. method:: get_info(param) 59 | 60 | See :class:`command_queue_info` for values of *param*. 61 | 62 | .. method:: set_property(prop, enable) 63 | 64 | See :class:`command_queue_properties` for possible values of *prop*. 65 | *enable* is a :class:`bool`. 66 | 67 | Unavailable in OpenCL 1.1 and newer. 68 | 69 | .. method:: flush() 70 | .. method:: finish() 71 | 72 | .. automethod:: from_int_ptr 73 | .. autoattribute:: int_ptr 74 | 75 | |comparable| 76 | 77 | Event 78 | ----- 79 | 80 | .. class:: Event 81 | 82 | .. attribute:: info 83 | 84 | Lower case versions of the :class:`event_info` constants 85 | may be used as attributes on instances of this class 86 | to directly query info attributes. 87 | 88 | .. attribute:: profile 89 | 90 | An instance of :class:`ProfilingInfoGetter`. 91 | 92 | .. method:: get_info(param) 93 | 94 | See :class:`event_info` for values of *param*. 95 | 96 | .. method:: get_profiling_info(param) 97 | 98 | See :class:`profiling_info` for values of *param*. 99 | See :attr:`profile` for an easier way of obtaining 100 | the same information. 101 | 102 | .. method:: wait() 103 | 104 | .. automethod:: from_int_ptr 105 | .. autoattribute:: int_ptr 106 | 107 | .. method:: set_callback(type, cb) 108 | 109 | Add the callback *cb* with signature ``cb(status)`` to the callback 110 | queue for the event status *type* (one of the values of 111 | :class:`command_execution_status`, except :attr:`command_execution_status.QUEUED`). 112 | 113 | See the OpenCL specification for restrictions on what *cb* may and may not do. 114 | 115 | .. versionadded:: 2015.2 116 | 117 | |comparable| 118 | 119 | .. 
class:: ProfilingInfoGetter 120 | 121 | .. attribute:: info 122 | 123 | Lower case versions of the :class:`profiling_info` constants 124 | may be used as attributes on the attribute ``profile`` of this 125 | class to directly query profiling info. 126 | 127 | For example, you may use *evt.profile.end* instead of 128 | *evt.get_profiling_info(pyopencl.profiling_info.END)*. 129 | 130 | Event Subclasses 131 | ---------------- 132 | 133 | .. class:: UserEvent(context) 134 | 135 | A subclass of :class:`Event`. Only available with OpenCL 1.1 and newer. 136 | 137 | .. versionadded:: 0.92 138 | 139 | .. method:: set_status(status) 140 | 141 | See :class:`command_execution_status` for possible values of *status*. 142 | 143 | .. class:: NannyEvent 144 | 145 | Transfers between host and device return events of this type. They hold 146 | a reference to the host-side buffer and wait for the transfer to complete 147 | when they are freed. Therefore, they can safely release the reference to 148 | the object they're guarding upon destruction. 149 | 150 | A subclass of :class:`Event`. 151 | 152 | .. versionadded:: 2011.2 153 | 154 | .. method:: get_ward() 155 | 156 | .. method:: wait() 157 | 158 | In addition to performing the same wait as :meth:`Event.wait()`, this 159 | method also releases the reference to the guarded object. 160 | 161 | Synchronization Functions 162 | ------------------------- 163 | 164 | .. function:: wait_for_events(events) 165 | 166 | .. function:: enqueue_barrier(queue, wait_for=None) 167 | 168 | Enqueues a barrier operation, which ensures that all queued commands in 169 | *queue* have finished execution. This command is a synchronization 170 | point. 171 | 172 | .. versionadded:: 0.91.5 173 | .. versionchanged:: 2011.2 174 | Takes *wait_for* and returns an :class:`Event` 175 | 176 | .. function:: enqueue_marker(queue, wait_for=None) 177 | 178 | Returns an :class:`Event`. 179 | 180 | .. versionchanged:: 2011.2 181 | Takes *wait_for*.
182 | 183 | -------------------------------------------------------------------------------- /doc/subst.rst: -------------------------------------------------------------------------------- 1 | .. |comparable| replace:: Instances of this class are hashable, and two 2 | instances of this class may be compared using *"=="* and *"!="*. 3 | (Hashability was added in version 2011.2.) Two objects are considered 4 | the same if the underlying OpenCL object is the same, as established 5 | by C pointer equality. 6 | 7 | .. |buf-iface| replace:: must implement the Python buffer interface. 8 | (e.g. by being an :class:`numpy.ndarray`) 9 | .. |explain-waitfor| replace:: *wait_for* 10 | may either be *None* or a list of :class:`pyopencl.Event` instances for 11 | whose completion this command waits before starting execution. 12 | .. |std-enqueue-blurb| replace:: Returns a new :class:`pyopencl.Event`. |explain-waitfor| 13 | 14 | .. |copy-depr| replace:: **Note:** This function is deprecated as of PyOpenCL 2011.1. 15 | Use :func:`~pyopencl.enqueue_copy` instead. 16 | 17 | .. |glsize| replace:: *global_size* and *local_size* are tuples of identical length, with 18 | between one and three entries. *global_size* specifies the overall size 19 | of the computational grid: one work item will be launched for every 20 | integer point in the grid. *local_size* specifies the workgroup size, 21 | which must evenly divide the *global_size* in a dimension-by-dimension 22 | manner. *None* may be passed for local_size, in which case the 23 | implementation will use an implementation-defined workgroup size. 24 | If *g_times_l* is *True*, the global size will be multiplied by the 25 | local size. (which makes the behavior more like Nvidia CUDA) In this case, 26 | *global_size* and *local_size* also do not have to have the same number 27 | of entries. 28 | 29 | .. 
|empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating 30 | how an empty NDRange is to be treated, where "empty" means that one or more 31 | entries of *global_size* or *local_size* are zero. OpenCL itself does not 32 | allow enqueueing kernels over empty NDRanges. Setting this flag to *True* 33 | enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``) 34 | to obtain the synchronization effects that would have resulted from 35 | the kernel enqueue. 36 | Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer. 37 | -------------------------------------------------------------------------------- /doc/tools.rst: -------------------------------------------------------------------------------- 1 | Built-in Utilities 2 | ================== 3 | 4 | .. automodule:: pyopencl.tools 5 | -------------------------------------------------------------------------------- /doc/types.rst: -------------------------------------------------------------------------------- 1 | OpenCL Type Mapping 2 | =================== 3 | 4 | .. module:: pyopencl.cltypes 5 | 6 | .. _type-mappings: 7 | 8 | Scalar Types 9 | ------------ 10 | 11 | For ease of use, the :mod:`pyopencl.cltypes` module provides convenient mapping 12 | from OpenCL type names to their equivalent :mod:`numpy` types. This saves you 13 | from referring back to the OpenCL spec to see that a ``cl_long`` is a 64-bit 14 | signed integer. Use the module as follows: 15 | 16 | .. doctest:: 17 | 18 | >>> import numpy as np 19 | >>> import pyopencl as cl 20 | >>> import pyopencl.cltypes 21 | >>> cl_uint = cl.cltypes.uint(42) # maps to numpy.uint32 22 | >>> cl_long = cl.cltypes.long(1235) # maps to numpy.int64 23 | >>> floats = np.empty((128,), dtype=cl.cltypes.float) # array of numpy.float32 24 | 25 | ..
note:: 26 | 27 | The OpenCL type ``bool`` does not have a corresponding :mod:`numpy` type 28 | defined here, because OpenCL does not specify the in-memory representation 29 | (or even the storage size) for this type. 30 | 31 | Vector Types 32 | ------------ 33 | 34 | The corresponding vector types are also made available in the same package, 35 | allowing you to easily create :mod:`numpy` arrays with the appropriate memory 36 | layout. 37 | 38 | .. doctest:: 39 | 40 | >>> import numpy as np 41 | >>> array_of_float16 = np.empty((128,), dtype=cl.cltypes.float16) # array of float16 42 | 43 | -------------------------------------------------------------------------------- /doc/upload-docs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | rsync --verbose --archive --delete _build/html/ doc-upload:doc/pyopencl 4 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | wiki-examples 2 | -------------------------------------------------------------------------------- /examples/demo-struct-reduce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pyopencl as cl 4 | 5 | 6 | def make_collector_dtype(device): 7 | dtype = np.dtype([ 8 | ("cur_min", np.int32), 9 | ("cur_max", np.int32), 10 | ("pad", np.int32), 11 | ]) 12 | 13 | name = "minmax_collector" 14 | from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct 15 | 16 | dtype, c_decl = match_dtype_to_c_struct(device, name, dtype) 17 | dtype = get_or_register_dtype(name, dtype) 18 | 19 | return dtype, c_decl 20 | 21 | 22 | ctx = cl.create_some_context() 23 | queue = cl.CommandQueue(ctx) 24 | 25 | mmc_dtype, mmc_c_decl = make_collector_dtype(ctx.devices[0]) 26 | 27 | preamble = mmc_c_decl + r"""//CL// 28 | 29 | minmax_collector mmc_neutral() 30 | { 31 
| // FIXME: needs infinity literal in real use, ok here 32 | minmax_collector result; 33 | result.cur_min = 1<<30; 34 | result.cur_max = -(1<<30); 35 | return result; 36 | } 37 | 38 | minmax_collector mmc_from_scalar(float x) 39 | { 40 | minmax_collector result; 41 | result.cur_min = x; 42 | result.cur_max = x; 43 | return result; 44 | } 45 | 46 | minmax_collector agg_mmc(minmax_collector a, minmax_collector b) 47 | { 48 | minmax_collector result = a; 49 | if (b.cur_min < result.cur_min) 50 | result.cur_min = b.cur_min; 51 | if (b.cur_max > result.cur_max) 52 | result.cur_max = b.cur_max; 53 | return result; 54 | } 55 | 56 | """ 57 | 58 | from pyopencl.clrandom import rand as clrand 59 | 60 | 61 | a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6) 62 | a = a_gpu.get() 63 | 64 | from pyopencl.reduction import ReductionKernel 65 | 66 | 67 | red = ReductionKernel(ctx, mmc_dtype, 68 | neutral="mmc_neutral()", 69 | reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", 70 | arguments="__global int *x", preamble=preamble) 71 | 72 | minmax = red(a_gpu).get() 73 | 74 | assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 75 | assert abs(minmax["cur_max"] - np.max(a)) < 1e-5 76 | -------------------------------------------------------------------------------- /examples/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | 5 | import pyopencl as cl 6 | 7 | 8 | rng = np.random.default_rng() 9 | a_np = rng.random(50000, dtype=np.float32) 10 | b_np = rng.random(50000, dtype=np.float32) 11 | 12 | ctx = cl.create_some_context() 13 | queue = cl.CommandQueue(ctx) 14 | 15 | mf = cl.mem_flags 16 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 17 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 18 | 19 | prg = cl.Program(ctx, """ 20 | __kernel void sum( 21 | __global const float *a_g, __global const float *b_g, __global float *res_g) 22 
| { 23 | int gid = get_global_id(0); 24 | res_g[gid] = a_g[gid] + b_g[gid]; 25 | } 26 | """).build() 27 | 28 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 29 | knl = prg.sum # Use this Kernel object for repeated calls 30 | knl(queue, a_np.shape, None, a_g, b_g, res_g) 31 | 32 | res_np = np.empty_like(a_np) 33 | cl.enqueue_copy(queue, res_np, res_g) 34 | 35 | # Check on CPU with Numpy: 36 | error_np = res_np - (a_np + b_np) 37 | print(f"Error:\n{error_np}") 38 | print(f"Norm: {np.linalg.norm(error_np):.16e}") 39 | assert np.allclose(res_np, a_np + b_np) 40 | -------------------------------------------------------------------------------- /examples/demo_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.linalg as la 3 | 4 | import pyopencl as cl 5 | import pyopencl.array as cl_array 6 | 7 | 8 | rng = np.random.default_rng() 9 | a = rng.random(50000, dtype=np.float32) 10 | b = rng.random(50000, dtype=np.float32) 11 | 12 | ctx = cl.create_some_context() 13 | queue = cl.CommandQueue(ctx) 14 | 15 | a_dev = cl_array.to_device(queue, a) 16 | b_dev = cl_array.to_device(queue, b) 17 | dest_dev = cl_array.empty_like(a_dev) 18 | 19 | prg = cl.Program(ctx, """ 20 | __kernel void sum(__global const float *a, 21 | __global const float *b, __global float *c) 22 | { 23 | int gid = get_global_id(0); 24 | c[gid] = a[gid] + b[gid]; 25 | } 26 | """).build() 27 | 28 | knl = prg.sum # Use this Kernel object for repeated calls 29 | knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) 30 | 31 | print(la.norm((dest_dev - (a_dev+b_dev)).get())) 32 | assert np.allclose(dest_dev.get(), (a_dev + b_dev).get()) 33 | -------------------------------------------------------------------------------- /examples/demo_array_svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pyopencl as cl 4 | import pyopencl.array as cl_array 5 | 
from pyopencl.tools import SVMAllocator, SVMPool 6 | 7 | 8 | n = 50000 9 | 10 | rng = np.random.default_rng() 11 | a = rng.random(n, dtype=np.float32) 12 | b = rng.random(n, dtype=np.float32) 13 | 14 | ctx = cl.create_some_context() 15 | queue = cl.CommandQueue(ctx) 16 | 17 | alloc = SVMAllocator(ctx, alignment=0, queue=queue) 18 | alloc = SVMPool(alloc) 19 | 20 | a_dev = cl_array.to_device(queue, a, allocator=alloc) 21 | b_dev = cl_array.to_device(queue, b, allocator=alloc) 22 | dest_dev = cl_array.empty_like(a_dev) 23 | 24 | prg = cl.Program(ctx, """ 25 | __kernel void sum(__global const float *a, 26 | __global const float *b, __global float *c) 27 | { 28 | int gid = get_global_id(0); 29 | c[gid] = a[gid] + b[gid]; 30 | } 31 | """).build() 32 | 33 | knl = prg.sum 34 | knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) 35 | 36 | print(np.linalg.norm((dest_dev - (a_dev + b_dev)).get())) 37 | assert np.allclose(dest_dev.get(), (a_dev + b_dev).get()) 38 | -------------------------------------------------------------------------------- /examples/demo_elementwise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pyopencl as cl 4 | import pyopencl.array 5 | from pyopencl.elementwise import ElementwiseKernel 6 | 7 | 8 | n = 10 9 | 10 | rng = np.random.default_rng() 11 | a_np = rng.random(n, dtype=np.float32) 12 | b_np = rng.random(n, dtype=np.float32) 13 | 14 | ctx = cl.create_some_context() 15 | queue = cl.CommandQueue(ctx) 16 | 17 | a_g = cl.array.to_device(queue, a_np) 18 | b_g = cl.array.to_device(queue, b_np) 19 | 20 | lin_comb = ElementwiseKernel(ctx, 21 | "float k1, float *a_g, float k2, float *b_g, float *res_g", 22 | "res_g[i] = k1 * a_g[i] + k2 * b_g[i]", 23 | "lin_comb") 24 | 25 | res_g = cl.array.empty_like(a_g) 26 | lin_comb(2, a_g, 3, b_g, res_g) 27 | 28 | # Check on GPU with PyOpenCL Array: 29 | print((res_g - (2 * a_g + 3 * b_g)).get()) 30 | 31 | # Check on CPU with 
Numpy: 32 | res_np = res_g.get() 33 | print(res_np - (2 * a_np + 3 * b_np)) 34 | print(np.linalg.norm(res_np - (2 * a_np + 3 * b_np))) 35 | -------------------------------------------------------------------------------- /examples/demo_elementwise_complex.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.linalg as la 3 | 4 | import pyopencl as cl 5 | import pyopencl.array as cl_array 6 | from pyopencl.elementwise import ElementwiseKernel 7 | 8 | 9 | ctx = cl.create_some_context() 10 | queue = cl.CommandQueue(ctx) 11 | 12 | n = 10 13 | 14 | rng = np.random.default_rng() 15 | a_gpu = cl_array.to_device(queue, 16 | rng.standard_normal(n, dtype=np.float32) 17 | + 1j*rng.standard_normal(n, dtype=np.float32)) 18 | b_gpu = cl_array.to_device(queue, 19 | rng.standard_normal(n, dtype=np.float32) 20 | + 1j*rng.standard_normal(n, dtype=np.float32)) 21 | 22 | complex_prod = ElementwiseKernel(ctx, 23 | "float a, " 24 | "cfloat_t *x, " 25 | "cfloat_t *y, " 26 | "cfloat_t *z", 27 | "z[i] = cfloat_rmul(a, cfloat_mul(x[i], y[i]))", 28 | "complex_prod", 29 | preamble="#include ") 30 | 31 | complex_add = ElementwiseKernel(ctx, 32 | "cfloat_t *x, " 33 | "cfloat_t *y, " 34 | "cfloat_t *z", 35 | "z[i] = cfloat_add(x[i], y[i])", 36 | "complex_add", 37 | preamble="#include ") 38 | 39 | real_part = ElementwiseKernel(ctx, 40 | "cfloat_t *x, float *z", 41 | "z[i] = cfloat_real(x[i])", 42 | "real_part", 43 | preamble="#include ") 44 | 45 | c_gpu = cl_array.empty_like(a_gpu) 46 | complex_prod(5, a_gpu, b_gpu, c_gpu) 47 | 48 | c_gpu_real = cl_array.empty(queue, len(a_gpu), dtype=np.float32) 49 | real_part(c_gpu, c_gpu_real) 50 | print(c_gpu.get().real - c_gpu_real.get()) 51 | 52 | print(la.norm(c_gpu.get() - (5*a_gpu.get()*b_gpu.get()))) 53 | assert la.norm(c_gpu.get() - (5*a_gpu.get()*b_gpu.get())) < 1e-5 54 | -------------------------------------------------------------------------------- /examples/demo_mandelbrot.py: 
-------------------------------------------------------------------------------- 1 | # I found this example for PyCuda here: 2 | # http://wiki.tiker.net/PyCuda/Examples/Mandelbrot 3 | # 4 | # An improved sequential/pure Python code was contributed 5 | # by CRVSADER//KY . 6 | # 7 | # I adapted it for PyOpenCL. Hopefully it is useful to someone. 8 | # July 2010, HolgerRapp@gmx.net 9 | # 10 | # Original readme below these lines. 11 | 12 | # Mandelbrot calculate using GPU, Serial numpy and faster numpy 13 | # Use to show the speed difference between CPU and GPU calculations 14 | # ian@ianozsvald.com March 2010 15 | 16 | # Based on vegaseat's TKinter/numpy example code from 2006 17 | # http://www.daniweb.com/code/snippet216851.html# 18 | # with minor changes to move to numpy from the obsolete Numeric 19 | 20 | import time 21 | 22 | import numpy as np 23 | from PIL import Image 24 | 25 | import pyopencl as cl 26 | 27 | 28 | # You can choose a calculation routine below (calc_fractal), uncomment 29 | # one of the three lines to test the three variations 30 | # Speed notes are listed in the same place 31 | 32 | # set width and height of window, more pixels take longer to calculate 33 | w = 2048 34 | h = 2048 35 | 36 | 37 | def calc_fractal_opencl(q, maxiter): 38 | ctx = cl.create_some_context() 39 | queue = cl.CommandQueue(ctx) 40 | 41 | output = np.empty(q.shape, dtype=np.uint16) 42 | 43 | mf = cl.mem_flags 44 | q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q) 45 | output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes) 46 | 47 | prg = cl.Program( 48 | ctx, 49 | """ 50 | #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 51 | __kernel void mandelbrot(__global float2 *q, 52 | __global ushort *output, ushort const maxiter) 53 | { 54 | int gid = get_global_id(0); 55 | float nreal, real = 0; 56 | float imag = 0; 57 | 58 | output[gid] = 0; 59 | 60 | for(int curiter = 0; curiter < maxiter; curiter++) { 61 | nreal = real*real - imag*imag + 
q[gid].x; 62 | imag = 2* real*imag + q[gid].y; 63 | real = nreal; 64 | 65 | if (real*real + imag*imag > 4.0f) { 66 | output[gid] = curiter; 67 | break; 68 | } 69 | } 70 | } 71 | """, 72 | ).build() 73 | 74 | prg.mandelbrot( 75 | queue, output.shape, None, q_opencl, output_opencl, np.uint16(maxiter) 76 | ) 77 | 78 | cl.enqueue_copy(queue, output, output_opencl).wait() 79 | 80 | return output 81 | 82 | 83 | def calc_fractal_serial(q, maxiter): 84 | # calculate z using pure python on a numpy array 85 | # note that, unlike the other two implementations, 86 | # the number of iterations per point is NOT constant 87 | z = np.zeros(q.shape, complex) 88 | output = np.resize( 89 | np.array( 90 | 0, 91 | ), 92 | q.shape, 93 | ) 94 | for i in range(len(q)): 95 | for iter in range(maxiter): 96 | z[i] = z[i] * z[i] + q[i] 97 | if abs(z[i]) > 2.0: 98 | output[i] = iter 99 | break 100 | return output 101 | 102 | 103 | def calc_fractal_numpy(q, maxiter): 104 | # calculate z using numpy, this is the original 105 | # routine from vegaseat's URL 106 | output = np.resize( 107 | np.array( 108 | 0, 109 | ), 110 | q.shape, 111 | ) 112 | z = np.zeros(q.shape, np.complex64) 113 | 114 | for it in range(maxiter): 115 | z = z * z + q 116 | done = np.greater(abs(z), 2.0) 117 | q = np.where(done, 0 + 0j, q) 118 | z = np.where(done, 0 + 0j, z) 119 | output = np.where(done, it, output) 120 | return output 121 | 122 | 123 | # choose your calculation routine here by uncommenting one of the options 124 | calc_fractal = calc_fractal_opencl 125 | # calc_fractal = calc_fractal_serial 126 | # calc_fractal = calc_fractal_numpy 127 | 128 | 129 | class Mandelbrot: 130 | def draw(self, x1, x2, y1, y2, maxiter=30): 131 | # draw the Mandelbrot set, from numpy example 132 | xx = np.arange(x1, x2, (x2 - x1) / w) 133 | yy = np.arange(y2, y1, (y1 - y2) / h) * 1j 134 | q = np.ravel(xx + yy[:, np.newaxis]).astype(np.complex64) 135 | 136 | start_main = time.time() 137 | output = calc_fractal(q, maxiter) 138 | 
end_main = time.time() 139 | 140 | secs = end_main - start_main 141 | print("Main took", secs) 142 | 143 | self.mandel = (output.reshape((h, w)) / float(output.max()) * 255.0).astype( 144 | np.uint8 145 | ) 146 | 147 | def create_image(self): 148 | """ " 149 | create the image from the draw() string 150 | """ 151 | # you can experiment with these x and y ranges 152 | self.draw(-2.13, 0.77, -1.3, 1.3) 153 | self.im = Image.fromarray(self.mandel) 154 | self.im.putpalette([i for rgb in ((j, 0, 0) for j in range(255)) 155 | for i in rgb]) 156 | 157 | def create_label(self): 158 | # put the image on a label widget 159 | self.image = ImageTk.PhotoImage(self.im) 160 | self.label = tk.Label(self.root, image=self.image) 161 | self.label.pack() 162 | 163 | def run_tk(self): 164 | self.root = tk.Tk() 165 | self.root.title("Mandelbrot Set") 166 | self.create_image() 167 | self.create_label() 168 | # start event loop 169 | self.root.mainloop() 170 | 171 | 172 | if __name__ == "__main__": 173 | test = Mandelbrot() 174 | try: 175 | import tkinter as tk 176 | except ModuleNotFoundError: 177 | test.create_image() 178 | else: 179 | from PIL import ImageTk 180 | try: 181 | test.run_tk() 182 | except tk.TclError: 183 | test.create_image() 184 | -------------------------------------------------------------------------------- /examples/demo_meta_codepy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.linalg as la 3 | 4 | from cgen import ( 5 | POD, 6 | Assign, 7 | Block, 8 | Const, 9 | FunctionBody, 10 | FunctionDeclaration, 11 | Initializer, 12 | Module, 13 | Pointer, 14 | Value, 15 | ) 16 | from cgen.opencl import CLGlobal, CLKernel, CLRequiredWorkGroupSize 17 | 18 | import pyopencl as cl 19 | 20 | 21 | local_size = 256 22 | thread_strides = 32 23 | macroblock_count = 33 24 | dtype = np.float32 25 | total_size = local_size*thread_strides*macroblock_count 26 | 27 | ctx = cl.create_some_context() 28 | queue = 
cl.CommandQueue(ctx) 29 | 30 | rng = np.random.default_rng() 31 | a = rng.standard_normal(total_size, dtype=dtype) 32 | b = rng.standard_normal(total_size, dtype=dtype) 33 | 34 | mf = cl.mem_flags 35 | a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) 36 | b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) 37 | c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) 38 | 39 | mod = Module([ 40 | FunctionBody( 41 | CLKernel(CLRequiredWorkGroupSize((local_size,), 42 | FunctionDeclaration( 43 | Value("void", "add"), 44 | arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name)))) 45 | for name in ["tgt", "op1", "op2"]]))), 46 | Block([ 47 | Initializer(POD(np.int32, "idx"), 48 | "get_local_id(0) + %d * get_group_id(0)" 49 | % (local_size*thread_strides)) 50 | ]+[ 51 | Assign( 52 | "tgt[idx+%d]" % (o*local_size), 53 | "op1[idx+%d] + op2[idx+%d]" % ( 54 | o*local_size, 55 | o*local_size)) 56 | for o in range(thread_strides)]))]) 57 | 58 | knl = cl.Program(ctx, str(mod)).build().add 59 | 60 | knl(queue, (local_size*macroblock_count,), (local_size,), 61 | c_buf, a_buf, b_buf) 62 | 63 | c = np.empty_like(a) 64 | cl.enqueue_copy(queue, c, c_buf).wait() 65 | 66 | assert la.norm(c-(a+b)) == 0 67 | -------------------------------------------------------------------------------- /examples/demo_meta_template.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.linalg as la 3 | from mako.template import Template 4 | 5 | import pyopencl as cl 6 | 7 | 8 | local_size = 256 9 | thread_strides = 32 10 | macroblock_count = 33 11 | dtype = np.float32 12 | total_size = local_size*thread_strides*macroblock_count 13 | 14 | ctx = cl.create_some_context() 15 | queue = cl.CommandQueue(ctx) 16 | 17 | rng = np.random.default_rng() 18 | a = rng.standard_normal(total_size, dtype=dtype) 19 | b = rng.standard_normal(total_size, dtype=dtype) 20 | 21 | mf = cl.mem_flags 22 | a_buf = cl.Buffer(ctx, mf.READ_ONLY 
| mf.COPY_HOST_PTR, hostbuf=a) 23 | b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) 24 | c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) 25 | 26 | tpl = Template(""" 27 | __kernel void add( 28 | __global ${ type_name } *tgt, 29 | __global const ${ type_name } *op1, 30 | __global const ${ type_name } *op2) 31 | { 32 | int idx = get_local_id(0) 33 | + ${ local_size } * ${ thread_strides } 34 | * get_group_id(0); 35 | 36 | % for i in range(thread_strides): 37 | <% offset = i*local_size %> 38 | tgt[idx + ${ offset }] = 39 | op1[idx + ${ offset }] 40 | + op2[idx + ${ offset } ]; 41 | % endfor 42 | }""") 43 | 44 | rendered_tpl = tpl.render(type_name="float", 45 | local_size=local_size, thread_strides=thread_strides) 46 | 47 | knl = cl.Program(ctx, str(rendered_tpl)).build().add 48 | 49 | knl(queue, (local_size*macroblock_count,), (local_size,), 50 | c_buf, a_buf, b_buf) 51 | 52 | c = np.empty_like(a) 53 | cl.enqueue_copy(queue, c, c_buf).wait() 54 | 55 | assert la.norm(c-(a+b)) == 0 56 | -------------------------------------------------------------------------------- /examples/dump-performance.py: -------------------------------------------------------------------------------- 1 | import pyopencl as cl 2 | import pyopencl.characterize.performance as perf 3 | 4 | 5 | def main(): 6 | ctx = cl.create_some_context() 7 | 8 | prof_overhead, latency = perf.get_profiling_overhead(ctx) 9 | print("command latency: %g s" % latency) 10 | print("profiling overhead: {:g} s -> {:.1f} %".format( 11 | prof_overhead, 100*prof_overhead/latency)) 12 | queue = cl.CommandQueue( 13 | ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) 14 | 15 | print("empty kernel: %g s" % perf.get_empty_kernel_time(queue)) 16 | print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9)) 17 | 18 | for tx_type in [ 19 | perf.HostToDeviceTransfer, 20 | perf.DeviceToHostTransfer, 21 | perf.DeviceToDeviceTransfer]: 22 | print("----------------------------------------") 23 | 
print(tx_type.__name__) 24 | print("----------------------------------------") 25 | 26 | print("latency: %g s" % perf.transfer_latency(queue, tx_type)) 27 | for i in range(6, 31, 2): 28 | bs = 1 << i 29 | try: 30 | result = "%g GB/s" % ( 31 | perf.transfer_bandwidth(queue, tx_type, bs)/1e9) 32 | except Exception as e: 33 | result = "exception: %s" % e.__class__.__name__ 34 | print("bandwidth @ %d bytes: %s" % (bs, result)) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /examples/dump-properties.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | 3 | import pyopencl as cl 4 | 5 | 6 | parser = OptionParser() 7 | parser.add_option("-s", "--short", action="store_true", 8 | help="don't print all device properties") 9 | 10 | (options, args) = parser.parse_args() 11 | 12 | 13 | def print_info(obj, info_cls): 14 | for info_name in sorted(dir(info_cls)): 15 | if not info_name.startswith("_") and info_name != "to_string": 16 | info = getattr(info_cls, info_name) 17 | try: 18 | info_value = obj.get_info(info) 19 | except Exception: 20 | info_value = "" 21 | 22 | if (info_cls == cl.device_info and info_name == "PARTITION_TYPES_EXT" 23 | and isinstance(info_value, list)): 24 | print("{}: {}".format(info_name, [ 25 | cl.device_partition_property_ext.to_string(v, 26 | "") 27 | for v in info_value])) 28 | else: 29 | try: 30 | print(f"{info_name}: {info_value}") 31 | except Exception: 32 | print("%s: " % info_name) 33 | 34 | 35 | for platform in cl.get_platforms(): 36 | print(75*"=") 37 | print(platform) 38 | print(75*"=") 39 | if not options.short: 40 | print_info(platform, cl.platform_info) 41 | 42 | for device in platform.get_devices(): 43 | if not options.short: 44 | print(75*"-") 45 | print(device) 46 | if not options.short: 47 | print(75*"-") 48 | print_info(device, cl.device_info) 49 | ctx = 
cl.Context([device]) 50 | for mf in [ 51 | cl.mem_flags.READ_ONLY, 52 | # cl.mem_flags.READ_WRITE, 53 | # cl.mem_flags.WRITE_ONLY 54 | ]: 55 | for itype in [ 56 | cl.mem_object_type.IMAGE2D, 57 | cl.mem_object_type.IMAGE3D 58 | ]: 59 | try: 60 | formats = cl.get_supported_image_formats(ctx, mf, itype) 61 | except Exception: 62 | formats = "" 63 | else: 64 | def str_chd_type(chdtype): 65 | result = cl.channel_type.to_string(chdtype, 66 | "") 67 | 68 | result = result.replace("_INT", "") 69 | result = result.replace("UNSIGNED", "U") 70 | result = result.replace("SIGNED", "S") 71 | result = result.replace("NORM", "N") 72 | result = result.replace("FLOAT", "F") 73 | return result 74 | 75 | formats = ", ".join( 76 | "{}-{}".format( 77 | cl.channel_order.to_string(iform.channel_order, 78 | ""), 79 | str_chd_type(iform.channel_data_type)) 80 | for iform in formats) 81 | 82 | print("{} {} FORMATS: {}\n".format( 83 | cl.mem_object_type.to_string(itype), 84 | cl.mem_flags.to_string(mf), 85 | formats)) 86 | del ctx 87 | -------------------------------------------------------------------------------- /examples/gl_interop_demo.py: -------------------------------------------------------------------------------- 1 | from OpenGL.GL import * 2 | from OpenGL.GLUT import * 3 | from OpenGL.raw.GL.VERSION.GL_1_5 import glBufferData as rawGlBufferData 4 | 5 | import pyopencl as cl 6 | 7 | 8 | n_vertices = 10000 9 | 10 | src = """ 11 | 12 | __kernel void generate_sin(__global float2* a) 13 | { 14 | int id = get_global_id(0); 15 | int n = get_global_size(0); 16 | float r = (float)id / (float)n; 17 | float x = r * 16.0f * 3.1415f; 18 | a[id].x = r * 2.0f - 1.0f; 19 | a[id].y = native_sin(x); 20 | } 21 | 22 | """ 23 | 24 | def initialize(): 25 | platform = cl.get_platforms()[0] 26 | 27 | import sys 28 | 29 | from pyopencl.tools import get_gl_sharing_context_properties 30 | if sys.platform == "darwin": 31 | ctx = cl.Context(properties=get_gl_sharing_context_properties(), 32 | devices=[]) 33 
| else: 34 | # Some OSs prefer clCreateContextFromType, some prefer 35 | # clCreateContext. Try both. 36 | try: 37 | ctx = cl.Context(properties=[ 38 | (cl.context_properties.PLATFORM, platform)] 39 | + get_gl_sharing_context_properties()) 40 | except: 41 | ctx = cl.Context(properties=[ 42 | (cl.context_properties.PLATFORM, platform)] 43 | + get_gl_sharing_context_properties(), 44 | devices = [platform.get_devices()[0]]) 45 | 46 | glClearColor(1, 1, 1, 1) 47 | glColor(0, 0, 1) 48 | vbo = glGenBuffers(1) 49 | glBindBuffer(GL_ARRAY_BUFFER, vbo) 50 | rawGlBufferData(GL_ARRAY_BUFFER, n_vertices * 2 * 4, None, GL_STATIC_DRAW) 51 | glEnableClientState(GL_VERTEX_ARRAY) 52 | glVertexPointer(2, GL_FLOAT, 0, None) 53 | coords_dev = cl.GLBuffer(ctx, cl.mem_flags.READ_WRITE, int(vbo)) 54 | prog = cl.Program(ctx, src).build() 55 | queue = cl.CommandQueue(ctx) 56 | cl.enqueue_acquire_gl_objects(queue, [coords_dev]) 57 | prog.generate_sin(queue, (n_vertices,), None, coords_dev) 58 | cl.enqueue_release_gl_objects(queue, [coords_dev]) 59 | queue.finish() 60 | glFlush() 61 | 62 | def display(): 63 | glClear(GL_COLOR_BUFFER_BIT) 64 | glDrawArrays(GL_LINE_STRIP, 0, n_vertices) 65 | glFlush() 66 | 67 | def reshape(w, h): 68 | glViewport(0, 0, w, h) 69 | glMatrixMode(GL_PROJECTION) 70 | glLoadIdentity() 71 | glMatrixMode(GL_MODELVIEW) 72 | 73 | if __name__ == '__main__': 74 | import sys 75 | glutInit(sys.argv) 76 | if len(sys.argv) > 1: 77 | n_vertices = int(sys.argv[1]) 78 | glutInitWindowSize(800, 160) 79 | glutInitWindowPosition(0, 0) 80 | glutCreateWindow('OpenCL/OpenGL Interop Tutorial: Sin Generator') 81 | glutDisplayFunc(display) 82 | glutReshapeFunc(reshape) 83 | initialize() 84 | glutMainLoop() 85 | -------------------------------------------------------------------------------- /examples/gl_particle_animation.py: -------------------------------------------------------------------------------- 1 | # Visualization of particles with gravity 2 | # Source: 
http://enja.org/2010/08/27/adventures-in-opencl-part-2-particles-with-opengl/ 3 | 4 | import sys 5 | 6 | import numpy as np 7 | from OpenGL import GL, GLU, GLUT 8 | from OpenGL.arrays import vbo 9 | from OpenGL.GL import ( 10 | GL_ARRAY_BUFFER, GL_BLEND, GL_COLOR_ARRAY, GL_COLOR_BUFFER_BIT, 11 | GL_DEPTH_BUFFER_BIT, GL_DYNAMIC_DRAW, GL_FLOAT, GL_MODELVIEW, 12 | GL_ONE_MINUS_SRC_ALPHA, GL_POINT_SMOOTH, GL_POINTS, GL_PROJECTION, GL_SRC_ALPHA, 13 | GL_VERTEX_ARRAY) 14 | from OpenGL.GLUT import GLUT_DEPTH, GLUT_DOUBLE, GLUT_RGBA 15 | 16 | import pyopencl as cl 17 | from pyopencl.tools import get_gl_sharing_context_properties 18 | 19 | 20 | mf = cl.mem_flags 21 | 22 | width = 800 23 | height = 600 24 | num_particles = 100000 25 | time_step = 0.005 26 | mouse_down = False 27 | mouse_old = {"x": 0.0, "y": 0.0} 28 | rotate = {"x": 0.0, "y": 0.0, "z": 0.0} 29 | translate = {"x": 0.0, "y": 0.0, "z": 0.0} 30 | initial_translate = {"x": 0.0, "y": 0.0, "z": -2.5} 31 | 32 | 33 | def glut_window(): 34 | GLUT.glutInit(sys.argv) 35 | GLUT.glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH) 36 | GLUT.glutInitWindowSize(width, height) 37 | GLUT.glutInitWindowPosition(0, 0) 38 | window = GLUT.glutCreateWindow("Particle Simulation") 39 | 40 | GLUT.glutDisplayFunc(on_display) # Called by GLUT every frame 41 | GLUT.glutKeyboardFunc(on_key) 42 | GLUT.glutMouseFunc(on_click) 43 | GLUT.glutMotionFunc(on_mouse_move) 44 | GLUT.glutTimerFunc(10, on_timer, 10) # Request a redraw every 10 ms 45 | 46 | GL.glViewport(0, 0, width, height) 47 | GL.glMatrixMode(GL_PROJECTION) 48 | GL.glLoadIdentity() 49 | GLU.gluPerspective(60.0, width / float(height), 0.1, 1000.0) 50 | 51 | return window 52 | 53 | 54 | def initial_buffers(num_particles): 55 | rng = np.random.default_rng() 56 | 57 | np_position = np.empty((num_particles, 4), dtype=np.float32) 58 | np_color = np.empty((num_particles, 4), dtype=np.float32) 59 | np_velocity = np.empty((num_particles, 4), dtype=np.float32) 60 | 61 | np_position[:, 0] =
np.sin( 62 | np.arange(0.0, num_particles) * 2.001 * np.pi / num_particles 63 | ) 64 | np_position[:, 0] *= rng.integers(num_particles) / 3.0 + 0.2 65 | np_position[:, 1] = np.cos( 66 | np.arange(0.0, num_particles) * 2.001 * np.pi / num_particles 67 | ) 68 | np_position[:, 1] *= rng.integers(num_particles) / 3.0 + 0.2 69 | np_position[:, 2] = 0.0 70 | np_position[:, 3] = 1.0 71 | 72 | np_color[:, :] = [1.0, 1.0, 1.0, 1.0] # White particles 73 | 74 | np_velocity[:, 0] = np_position[:, 0] * 2.0 75 | np_velocity[:, 1] = np_position[:, 1] * 2.0 76 | np_velocity[:, 2] = 3.0 77 | np_velocity[:, 3] = rng.integers(num_particles) 78 | 79 | gl_position = vbo.VBO( 80 | data=np_position, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER 81 | ) 82 | gl_position.bind() 83 | gl_color = vbo.VBO(data=np_color, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) 84 | gl_color.bind() 85 | 86 | return (np_position, np_velocity, gl_position, gl_color) 87 | 88 | 89 | def on_timer(t): 90 | GLUT.glutTimerFunc(t, on_timer, t) 91 | GLUT.glutPostRedisplay() 92 | 93 | 94 | def on_key(*args): 95 | if args[0] == "\033" or args[0] == "q": 96 | sys.exit() 97 | 98 | 99 | def on_click(button, state, x, y): 100 | mouse_old["x"] = x 101 | mouse_old["y"] = y 102 | 103 | 104 | def on_mouse_move(x, y): 105 | rotate["x"] += (y - mouse_old["y"]) * 0.2 106 | rotate["y"] += (x - mouse_old["x"]) * 0.2 107 | 108 | mouse_old["x"] = x 109 | mouse_old["y"] = y 110 | 111 | 112 | def on_display(): 113 | """Render the particles""" 114 | # Update or particle positions by calling the OpenCL kernel 115 | cl.enqueue_acquire_gl_objects(queue, [cl_gl_position, cl_gl_color]) 116 | kernelargs = ( 117 | cl_gl_position, 118 | cl_gl_color, 119 | cl_velocity, 120 | cl_start_position, 121 | cl_start_velocity, 122 | np.float32(time_step), 123 | ) 124 | program.particle_fountain(queue, (num_particles,), None, *(kernelargs)) 125 | cl.enqueue_release_gl_objects(queue, [cl_gl_position, cl_gl_color]) 126 | queue.finish() 127 | GL.glFlush() 
128 | 129 | GL.glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) 130 | GL.glMatrixMode(GL_MODELVIEW) 131 | GL.glLoadIdentity() 132 | 133 | # Handle mouse transformations 134 | GL.glTranslatef(initial_translate["x"], initial_translate["y"], initial_translate["z"]) 135 | GL.glRotatef(rotate["x"], 1, 0, 0) 136 | GL.glRotatef(rotate["y"], 0, 1, 0) # we switched around the axis so make this rotate_z 137 | GL.glTranslatef(translate["x"], translate["y"], translate["z"]) 138 | 139 | # Render the particles 140 | GL.glEnable(GL_POINT_SMOOTH) 141 | GL.glPointSize(2) 142 | GL.glEnable(GL_BLEND) 143 | GL.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 144 | 145 | # Set up the VBOs 146 | gl_color.bind() 147 | GL.glColorPointer(4, GL_FLOAT, 0, gl_color) 148 | gl_position.bind() 149 | GL.glVertexPointer(4, GL_FLOAT, 0, gl_position) 150 | GL.glEnableClientState(GL_VERTEX_ARRAY) 151 | GL.glEnableClientState(GL_COLOR_ARRAY) 152 | 153 | # Draw the VBOs 154 | GL.glDrawArrays(GL_POINTS, 0, num_particles) 155 | 156 | GL.glDisableClientState(GL_COLOR_ARRAY) 157 | GL.glDisableClientState(GL_VERTEX_ARRAY) 158 | 159 | GL.glDisable(GL_BLEND) 160 | 161 | GLUT.glutSwapBuffers() 162 | 163 | 164 | window = glut_window() 165 | 166 | (np_position, np_velocity, gl_position, gl_color) = initial_buffers(num_particles) 167 | 168 | platform = cl.get_platforms()[0] 169 | context = cl.Context( 170 | properties=[(cl.context_properties.PLATFORM, platform)] 171 | + get_gl_sharing_context_properties() 172 | ) 173 | queue = cl.CommandQueue(context) 174 | 175 | cl_velocity = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_velocity) 176 | cl_start_position = cl.Buffer( 177 | context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_position 178 | ) 179 | cl_start_velocity = cl.Buffer( 180 | context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_velocity 181 | ) 182 | 183 | cl_gl_position = cl.GLBuffer(context, mf.READ_WRITE, int(gl_position)) 184 | cl_gl_color = cl.GLBuffer(context, mf.READ_WRITE, int(gl_color)) 185 
| 186 | kernel = """__kernel void particle_fountain(__global float4* position, 187 | __global float4* color, 188 | __global float4* velocity, 189 | __global float4* start_position, 190 | __global float4* start_velocity, 191 | float time_step) 192 | { 193 | unsigned int i = get_global_id(0); 194 | float4 p = position[i]; 195 | float4 v = velocity[i]; 196 | float life = velocity[i].w; 197 | life -= time_step; 198 | if (life <= 0.f) 199 | { 200 | p = start_position[i]; 201 | v = start_velocity[i]; 202 | life = 1.0f; 203 | } 204 | 205 | v.z -= 9.8f*time_step; 206 | p.x += v.x*time_step; 207 | p.y += v.y*time_step; 208 | p.z += v.z*time_step; 209 | v.w = life; 210 | 211 | position[i] = p; 212 | velocity[i] = v; 213 | 214 | color[i].w = life; /* Fade points as life decreases */ 215 | }""" 216 | program = cl.Program(context, kernel).build() 217 | 218 | GLUT.glutMainLoop() 219 | -------------------------------------------------------------------------------- /examples/ipython-demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "cc7d0709", 7 | "metadata": { 8 | "collapsed": false, 9 | "jupyter": { 10 | "outputs_hidden": false 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import numpy as np\n", 16 | "\n", 17 | "import pyopencl as cl\n", 18 | "import pyopencl.array" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "8ac8d7bb", 24 | "metadata": {}, 25 | "source": [ 26 | "Load the PyOpenCL IPython extension:" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "7023ca2f", 33 | "metadata": { 34 | "collapsed": false, 35 | "jupyter": { 36 | "outputs_hidden": false 37 | } 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "%load_ext pyopencl.ipython_ext" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "9544b53c", 47 | "metadata": {}, 48 | "source": [ 49 | "Create an OpenCL 
context and a command queue:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "fac17999", 56 | "metadata": { 57 | "collapsed": false, 58 | "jupyter": { 59 | "outputs_hidden": false 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "ctx = cl.create_some_context(interactive=True)\n", 65 | "queue = cl.CommandQueue(ctx)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "a29daf04", 71 | "metadata": {}, 72 | "source": [ 73 | "-----\n", 74 | "\n", 75 | "Define an OpenCL kernel using the `%%cl_kernel` magic:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "65c7e6c9", 82 | "metadata": { 83 | "collapsed": false, 84 | "jupyter": { 85 | "outputs_hidden": false 86 | } 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "%%cl_kernel -o \"-cl-fast-relaxed-math\"\n", 91 | "\n", 92 | "__kernel void sum_vector(__global const float *a,\n", 93 | "__global const float *b, __global float *c)\n", 94 | "{\n", 95 | " int gid = get_global_id(0);\n", 96 | " c[gid] = a[gid] + b[gid];\n", 97 | "}" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "cfb57357", 103 | "metadata": {}, 104 | "source": [ 105 | "This looks for `cl_ctx` or `ctx` in the user namespace to find a PyOpenCL context.\n", 106 | "\n", 107 | "Kernel names are automatically injected into the user namespace, so we can just use `sum_vector` from Python below.\n", 108 | "\n", 109 | "Now create some data to work on:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "1d80ff38", 116 | "metadata": { 117 | "collapsed": false, 118 | "jupyter": { 119 | "outputs_hidden": false 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "n = 10000\n", 125 | "\n", 126 | "a = cl.array.empty(queue, n, dtype=np.float32)\n", 127 | "a.fill(15)\n", 128 | "\n", 129 | "rng = np.random.default_rng()\n", 130 | "b_host = rng.normal(size=n).astype(np.float32)\n", 131 | "b = 
# 3x3 median filter on a grayscale image, computed on an OpenCL device.
#
# The image is averaged down to one channel, uploaded, filtered by the
# ``medianFilter`` kernel (one work-item per pixel) and written back to disk.

import numpy as np
from imageio import imread, imsave

import pyopencl as cl


# Read in image
img = imread("noisyImage.jpg").astype(np.float32)
print(img.shape)

# Average the colour channels into one gray channel. An image that is already
# two-dimensional has no channel axis, so it is used as-is.
if img.ndim == 3:
    img = np.mean(img, axis=2)
print(img.shape)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags

# Kernel function
#
# The pixels are kept as ``float`` throughout: the host buffer is float32, and
# storing the samples in ``int`` variables would truncate every interior pixel
# while edge pixels are copied through unmodified.
src = """
void sort(float *a, float *b, float *c) {
   float swap;
   if(*a > *b) {
      swap = *a;
      *a = *b;
      *b = swap;
   }
   if(*a > *c) {
      swap = *a;
      *a = *c;
      *c = swap;
   }
   if(*b > *c) {
      swap = *b;
      *b = *c;
      *c = swap;
   }
}
__kernel void medianFilter(
    __global float *img, __global float *result, __global int *width, __global
    int *height)
{
    int w = *width;
    int h = *height;
    int posx = get_global_id(1);
    int posy = get_global_id(0);
    int i = w*posy + posx;
    // Keeping the edge pixels the same
    if( posx == 0 || posy == 0 || posx == w-1 || posy == h-1 )
    {
        result[i] = img[i];
    }
    else
    {
        float pixel00, pixel01, pixel02, pixel10, pixel11, pixel12, pixel20,
            pixel21, pixel22;
        pixel00 = img[i - 1 - w];
        pixel01 = img[i- w];
        pixel02 = img[i + 1 - w];
        pixel10 = img[i - 1];
        pixel11 = img[i];
        pixel12 = img[i + 1];
        pixel20 = img[i - 1 + w];
        pixel21 = img[i + w];
        pixel22 = img[i + 1 + w];
        //sort the rows
        sort( &(pixel00), &(pixel01), &(pixel02) );
        sort( &(pixel10), &(pixel11), &(pixel12) );
        sort( &(pixel20), &(pixel21), &(pixel22) );
        //sort the columns
        sort( &(pixel00), &(pixel10), &(pixel20) );
        sort( &(pixel01), &(pixel11), &(pixel21) );
        sort( &(pixel02), &(pixel12), &(pixel22) );
        //sort the diagonal
        sort( &(pixel00), &(pixel11), &(pixel22) );
        // median is the the middle value of the diagonal
        result[i] = pixel11;
    }
}
"""

# Kernel function instantiation
prg = cl.Program(ctx, src).build()
# Allocate memory for variables on the device
img_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img)
result_g = cl.Buffer(ctx, mf.WRITE_ONLY, img.nbytes)
width_g = cl.Buffer(
    ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[1])
)
height_g = cl.Buffer(
    ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[0])
)
# Call Kernel. Automatically takes care of block/grid distribution.
# The global size is (rows, columns), matching posy/posx in the kernel.
prg.medianFilter(queue, img.shape, None, img_g, result_g, width_g, height_g)
result = np.empty_like(img)
cl.enqueue_copy(queue, result, result_g)

# Save the filtered image. The result is a single-channel float array, so it
# is clamped to the 8-bit range and written as grayscale rather than being
# declared as "RGB".
imsave("medianFilter-OpenCL.jpg", np.clip(result, 0, 255).astype(np.uint8))
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | 5 | import pyopencl as cl 6 | from pyopencl.characterize import ( 7 | has_coarse_grain_buffer_svm, 8 | has_fine_grain_buffer_svm, 9 | has_fine_grain_system_svm, 10 | ) 11 | 12 | 13 | ctx = cl.create_some_context() 14 | queue = cl.CommandQueue(ctx) 15 | 16 | dev = queue.device 17 | 18 | print( 19 | f"Device '{dev.name}' on platform '{dev.platform.name} ({dev.platform.version})'" 20 | " has the following SVM features:\n" 21 | f" Coarse-grained buffer SVM: {has_coarse_grain_buffer_svm(dev)}\n" 22 | f" Fine-grained buffer SVM: {has_fine_grain_buffer_svm(dev)}\n" 23 | f" Fine-grained system SVM: {has_fine_grain_system_svm(dev)}" 24 | ) 25 | 26 | prg = cl.Program(ctx, """ 27 | __kernel void twice( 28 | __global float *a_g) 29 | { 30 | int gid = get_global_id(0); 31 | a_g[gid] = 2*a_g[gid]; 32 | } 33 | """).build() 34 | 35 | 36 | if has_coarse_grain_buffer_svm(dev): 37 | print("Testing coarse-grained buffer SVM...", end="") 38 | 39 | svm_ary = cl.SVM(cl.csvm_empty(ctx, 10, np.float32)) 40 | assert isinstance(svm_ary.mem, np.ndarray) 41 | 42 | with svm_ary.map_rw(queue) as ary: 43 | ary.fill(17) # use from host 44 | orig_ary = ary.copy() 45 | 46 | prg.twice(queue, svm_ary.mem.shape, None, svm_ary) 47 | queue.finish() 48 | 49 | with svm_ary.map_ro(queue) as ary: 50 | assert np.array_equal(orig_ary*2, ary) 51 | 52 | print(" done.") 53 | 54 | if has_fine_grain_buffer_svm(dev): 55 | print("Testing fine-grained buffer SVM...", end="") 56 | 57 | ary = cl.fsvm_empty(ctx, 10, np.float32) 58 | assert isinstance(ary.base, cl.SVMAllocation) 59 | 60 | ary.fill(17) 61 | orig_ary = ary.copy() 62 | 63 | prg.twice(queue, ary.shape, None, cl.SVM(ary)) 64 | queue.finish() 65 | 66 | assert np.array_equal(orig_ary*2, ary) 67 | 68 | print(" done.") 69 | 70 | if has_fine_grain_system_svm(dev): 71 | print("Testing fine-grained system SVM...", end="") 72 | 73 | 
# Transposition of a matrix
# originally for PyCUDA by Hendrik Riedmann

import numpy as np
import numpy.linalg as la

import pyopencl as cl


# Edge length of the square work-group used by the tiled kernels.
block_size = 16


class NaiveTranspose:
    """Transpose with one work-item per element and an explicit work-group size."""

    def __init__(self, ctx):
        """Build the kernel for the context *ctx*."""
        self.kernel = (
            cl.Program(
                ctx,
                """
        __kernel void transpose(
          __global float *a_t, __global float *a,
          unsigned a_width, unsigned a_height)
        {
          int read_idx = get_global_id(0) + get_global_id(1) * a_width;
          int write_idx = get_global_id(1) + get_global_id(0) * a_height;

          a_t[write_idx] = a[read_idx];
        }
        """,)
            .build()
            .transpose
        )

    def __call__(self, queue, tgt, src, shape):
        """Enqueue the transpose of *src* into *tgt* and return the event.

        *shape* is the NumPy shape ``(rows, columns)`` of the row-major source
        matrix. The kernel's ``a_width`` is the row length (columns) and
        ``a_height`` the number of rows. Both must be multiples of
        ``block_size``.
        """
        h, w = shape
        assert w % block_size == 0
        assert h % block_size == 0

        return self.kernel(
            queue,
            (w, h),
            (block_size, block_size),
            tgt,
            src,
            np.uint32(w),
            np.uint32(h),
        )


class SillyTranspose(NaiveTranspose):
    """Same kernel as :class:`NaiveTranspose`, but with no work-group size given."""

    def __call__(self, queue, tgt, src, shape):
        """Enqueue the transpose with a local size of ``None``; see the base class."""
        h, w = shape
        assert w % block_size == 0
        assert h % block_size == 0

        return self.kernel(
            queue, (w, h), None, tgt, src, np.uint32(w), np.uint32(h)
        )


class TransposeWithLocal:
    """Transpose that stages each ``block_size`` x ``block_size`` tile in local memory."""

    def __init__(self, ctx):
        """Build the tiled kernel for the context *ctx*."""
        self.kernel = (
            cl.Program(
                ctx,
                """
        #define BLOCK_SIZE %(block_size)d
        #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
        #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

        __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
        void transpose(
          __global float *a_t, __global float *a,
          unsigned a_width, unsigned a_height,
          __local float *a_local)
        {
          int base_idx_a   =
            get_group_id(0) * BLOCK_SIZE +
            get_group_id(1) * A_BLOCK_STRIDE;
          int base_idx_a_t =
            get_group_id(1) * BLOCK_SIZE +
            get_group_id(0) * A_T_BLOCK_STRIDE;

          int glob_idx_a   =
            base_idx_a + get_local_id(0) + a_width * get_local_id(1);
          int glob_idx_a_t =
            base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);

          a_local[get_local_id(1)*BLOCK_SIZE+get_local_id(0)] = a[glob_idx_a];

          barrier(CLK_LOCAL_MEM_FENCE);

          a_t[glob_idx_a_t] = a_local[get_local_id(0)*BLOCK_SIZE+get_local_id(1)];
        }
        """
                % {"block_size": block_size},
            )
            .build()
            .transpose
        )

    def __call__(self, queue, tgt, src, shape):
        """Enqueue the tiled transpose of *src* into *tgt* and return the event.

        *shape* is the NumPy shape ``(rows, columns)`` of the source matrix.
        """
        h, w = shape
        assert w % block_size == 0
        assert h % block_size == 0

        return self.kernel(
            queue,
            (w, h),
            (block_size, block_size),
            tgt,
            src,
            np.uint32(w),
            np.uint32(h),
            # 4 bytes per float. The kernel indexes the tile with a stride of
            # BLOCK_SIZE, so the extra "+ 1" column is allocated but unused.
            cl.LocalMemory(4 * block_size * (block_size + 1)),
        )


def transpose_using_cl(ctx, queue, cpu_src, cls):
    """Transpose the host array *cpu_src* on the device using strategy *cls*.

    *cls* is one of the transpose classes above. It is instantiated with *ctx*
    and called once. Returns a new host array of shape ``cpu_src.shape[::-1]``.
    """
    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cpu_src)
    a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=cpu_src.nbytes)
    try:
        cls(ctx)(queue, a_t_buf, a_buf, cpu_src.shape)

        rows, cols = cpu_src.shape
        result = np.empty((cols, rows), dtype=cpu_src.dtype)
        cl.enqueue_copy(queue, result, a_t_buf).wait()
    finally:
        # Release the device buffers even if the kernel launch or copy fails.
        a_buf.release()
        a_t_buf.release()

    return result


def check_transpose():
    """Verify every transpose strategy against ``numpy`` on square matrices."""
    ctx = cl.create_some_context()

    for dev in ctx.devices:
        assert dev.local_mem_size > 0

    queue = cl.CommandQueue(ctx)
    rng = np.random.default_rng()

    for cls in [NaiveTranspose, SillyTranspose, TransposeWithLocal]:
        print("checking", cls.__name__)

        for i in np.arange(10, 13, 0.125):
            size = int(((2 ** i) // 32) * 32)
            print(size)

            source = rng.random((size, size), dtype=np.float32)
            # Exercise the strategy under test, not always the naive one.
            result = transpose_using_cl(ctx, queue, source, cls)

            err = source.T - result
            err_norm = la.norm(err)

            assert err_norm == 0, (size, err_norm)


def benchmark_transpose():
    """Time each strategy with device profiling and optionally plot bandwidths.

    Prints the achieved memory bandwidth per method and size. If matplotlib
    can be imported, writes ``transpose-benchmark-<i>.pdf`` files.
    """
    ctx = cl.create_some_context()

    for dev in ctx.devices:
        assert dev.local_mem_size > 0

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
    )

    sizes = [int(((2 ** i) // 32) * 32) for i in np.arange(10, 13, 0.125)]

    mem_bandwidths = {}
    rng = np.random.default_rng()

    methods = [SillyTranspose, NaiveTranspose, TransposeWithLocal]
    for cls in methods:
        name = cls.__name__.replace("Transpose", "")

        mem_bandwidths[cls] = meth_mem_bws = []

        for size in sizes:
            source = rng.random((size, size), dtype=np.float32)

            mf = cl.mem_flags
            a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=source)
            a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=source.nbytes)
            method = cls(ctx)

            # Untimed warm-up launches.
            for _i in range(4):
                method(queue, a_t_buf, a_buf, source.shape)

            count = 12
            events = [
                method(queue, a_t_buf, a_buf, source.shape)
                for _i in range(count)
            ]

            events[-1].wait()
            # Profiling timestamps are in nanoseconds.
            time = sum(evt.profile.end - evt.profile.start for evt in events)

            # Every element is read once and written once.
            mem_bw = 2 * source.nbytes * count / (time * 1e-9)
            print("benchmarking", name, size, mem_bw / 1e9, "GB/s")
            meth_mem_bws.append(mem_bw)

            a_buf.release()
            a_t_buf.release()

    try:
        from matplotlib.pyplot import clf, grid, legend, plot, savefig, xlabel, ylabel
    except ModuleNotFoundError:
        # Plotting is optional; the printed numbers are the primary output.
        pass
    else:
        # Plot i shows the first i + 1 methods, so later plots add one curve each.
        for i in range(len(methods)):
            clf()
            for method in methods[:i + 1]:
                name = method.__name__.replace("Transpose", "")
                plot(sizes, np.array(mem_bandwidths[method]) / 1e9, "o-",
                     label=name)

            xlabel("Matrix width/height $N$")
            ylabel("Memory Bandwidth [GB/s]")
            legend(loc="best")
            grid()

            savefig("transpose-benchmark-%d.pdf" % i)


if __name__ == "__main__":
    check_transpose()
    benchmark_transpose()
21 | """ 22 | 23 | CLUDA_PREAMBLE = """ 24 | #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE); 25 | 26 | #define WITHIN_KERNEL /* empty */ 27 | #define KERNEL __kernel 28 | #define GLOBAL_MEM __global 29 | #define LOCAL_MEM __local 30 | #define LOCAL_MEM_ARG __local 31 | #define REQD_WG_SIZE(X,Y,Z) __attribute__((reqd_work_group_size(X, Y, Z))) 32 | 33 | #define LID_0 ((ptrdiff_t) get_local_id(0)) 34 | #define LID_1 ((ptrdiff_t) get_local_id(1)) 35 | #define LID_2 ((ptrdiff_t) get_local_id(2)) 36 | 37 | #define GID_0 ((ptrdiff_t) get_group_id(0)) 38 | #define GID_1 ((ptrdiff_t) get_group_id(1)) 39 | #define GID_2 ((ptrdiff_t) get_group_id(2)) 40 | 41 | #define LDIM_0 ((ptrdiff_t) get_local_size(0)) 42 | #define LDIM_1 ((ptrdiff_t) get_local_size(1)) 43 | #define LDIM_2 ((ptrdiff_t) get_local_size(2)) 44 | 45 | #define GDIM_0 ((ptrdiff_t) get_num_groups(0)) 46 | #define GDIM_1 ((ptrdiff_t) get_num_groups(1)) 47 | #define GDIM_2 ((ptrdiff_t) get_num_groups(2)) 48 | 49 | % if double_support: 50 | #if __OPENCL_C_VERSION__ < 120 51 | #pragma OPENCL EXTENSION cl_khr_fp64: enable 52 | #endif 53 | % endif 54 | """ 55 | -------------------------------------------------------------------------------- /pyopencl/_mymako.py: -------------------------------------------------------------------------------- 1 | try: 2 | import mako.template # noqa: F401 3 | except ImportError as err: 4 | raise ImportError( 5 | "Some of PyOpenCL's facilities require the Mako templating engine.\n" 6 | "You or a piece of software you have used has tried to call such a\n" 7 | "part of PyOpenCL, but there was a problem importing Mako.\n\n" 8 | "You may install mako now by typing one of:\n" 9 | "- easy_install Mako\n" 10 | "- pip install Mako\n" 11 | "- aptitude install python-mako\n" 12 | "\nor whatever else is appropriate for your system.") from err 13 | 14 | from mako import * # noqa: F403 15 | -------------------------------------------------------------------------------- 
/pyopencl/capture_call.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" 2 | 3 | __license__ = """ 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwargs):
    """Write a standalone Python script that replays one kernel invocation.

    The generated script embeds the kernel source and the captured argument
    data (zlib-compressed and base64-encoded), rebuilds the program, and calls
    the kernel with the recorded sizes and arguments.

    :arg kernel: object whose ``_source`` and ``function_name`` attributes are
        read; ``_scalar_arg_dtypes`` is used when present.
    :arg output_file: a file name (``str``), which is opened for writing, or
        an object with a ``write`` method.
    :arg queue: command queue used to copy :class:`pyopencl.Buffer` arguments
        back to the host so their contents can be embedded.
    :arg g_size: global size, emitted via ``repr``.
    :arg l_size: local size, emitted via ``repr``.
    :arg args: kernel arguments. Buffers, Python and numpy scalars, and
        anything accepted by :class:`memoryview` are supported.
    :arg kwargs: only ``g_times_l`` and ``global_offset`` are read.
    :raises RuntimeError: if the kernel source is unavailable or an argument
        cannot be captured.
    """
    try:
        source = kernel._source
    except AttributeError as err:
        raise RuntimeError("cannot capture call, kernel source not available") from err

    if source is None:
        raise RuntimeError("cannot capture call, kernel source not available")

    cg = PythonCodeGenerator()

    # Header of the generated script.
    cg("# generated by pyopencl.capture_call")
    cg("")
    cg("import numpy as np")
    cg("import pyopencl as cl")
    cg("from base64 import b64decode")
    cg("from zlib import decompress")
    cg("mf = cl.mem_flags")
    cg("")

    # Embed the kernel source as a raw triple-quoted string.
    cg('CODE = r"""//CL//')
    for line in source.split("\n"):
        cg(line)
    cg('"""')

    # {{{ invocation

    # (variable name, raw bytes) pairs emitted in the "data" section below.
    arg_data = []

    cg("")
    cg("")
    cg("def main():")
    with Indentation(cg):
        cg("ctx = cl.create_some_context()")
        cg("queue = cl.CommandQueue(ctx)")
        cg("")

        # Source-text expressions, one per kernel argument.
        kernel_args = []

        for i, arg in enumerate(args):
            if isinstance(arg, cl.Buffer):
                # Snapshot the device buffer so the script can recreate it.
                buf = bytearray(arg.size)
                cl.enqueue_copy(queue, buf, arg)
                arg_data.append(("arg%d_data" % i, buf))
                cg("arg%d = cl.Buffer(ctx, "
                        "mf.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,"
                        % i)
                cg(" hostbuf=decompress(b64decode(arg%d_data)))"
                        % i)
                kernel_args.append("arg%d" % i)
            elif isinstance(arg, (int, float)):
                kernel_args.append(repr(arg))
            elif isinstance(arg, np.integer):
                # numpy scalars are re-created with their exact type name.
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(int(arg))))
            elif isinstance(arg, np.floating):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(float(arg))))
            elif isinstance(arg, np.complexfloating):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(complex(arg))))
            else:
                # Last resort: anything exposing the buffer protocol is
                # passed to the kernel as raw bytes.
                try:
                    arg_buf = memoryview(arg)
                except Exception as err:
                    raise RuntimeError("cannot capture: "
                            "unsupported arg nr %d (0-based)" % i) from err

                arg_data.append(("arg%d_data" % i, arg_buf))
                kernel_args.append("decompress(b64decode(arg%d_data))" % i)

        cg("")

        g_times_l = kwargs.get("g_times_l", False)
        if g_times_l:
            # Pad both sizes with 1s to a common length, then record the
            # elementwise product as the global size.
            dim = max(len(g_size), len(l_size))
            l_size = l_size + (1,) * (dim-len(l_size))
            g_size = g_size + (1,) * (dim-len(g_size))
            g_size = tuple(
                gs*ls for gs, ls in zip(g_size, l_size))

        global_offset = kwargs.get("global_offset", None)
        if global_offset is not None:
            kernel_args.append("global_offset=%s" % repr(global_offset))

        cg("prg = cl.Program(ctx, CODE).build()")
        cg("knl = prg.%s" % kernel.function_name)
        if hasattr(kernel, "_scalar_arg_dtypes"):
            def strify_dtype(d):
                # Render a dtype (or None) as source text for the script.
                if d is None:
                    return "None"

                d = np.dtype(d)
                s = repr(d)
                if s.startswith("dtype"):
                    s = "np."+s

                return s

            cg("knl.set_scalar_arg_dtypes((%s,))"
                    % ", ".join(
                        strify_dtype(dt) for dt in kernel._scalar_arg_dtypes))

        cg("knl(queue, {}, {},".format(repr(g_size), repr(l_size)))
        cg(" %s)" % ", ".join(kernel_args))
        cg("")
        cg("queue.finish()")

    # }}}

    # {{{ data

    from base64 import b64encode
    from zlib import compress
    cg("")
    # Maximum number of base64 characters per emitted string literal.
    line_len = 70

    for name, val in arg_data:
        cg("%s = (" % name)
        with Indentation(cg):
            val = b64encode(compress(memoryview(val))).decode()
            i = 0
            # Adjacent string literals inside the parentheses concatenate.
            while i < len(val):
                cg(repr(val[i:i+line_len]))
                i += line_len

        cg(")")

    # }}}

    # {{{ file trailer

    cg("")
    cg('if __name__ == "__main__":')
    with Indentation(cg):
        cg("main()")
    cg("")

    cg("# vim: filetype=pyopencl")

    # }}}

    if isinstance(output_file, str):
        with open(output_file, "w") as outf:
            outf.write(cg.get())
    else:
        output_file.write(cg.get())
# {{{ timing helpers

class Timer:
    """Interface for the timers accepted by the measurement helpers.

    The base class does nothing. Subclasses override the hooks they need, and
    must override :meth:`get_elapsed` to return the elapsed time in seconds.
    """

    def __init__(self, queue):
        self.queue = queue

    def start(self):
        """Called once before the timed calls are issued."""
        pass

    def stop(self):
        """Called once after the timed calls have been issued."""
        pass

    def add_event(self, evt):
        """Receive the return value of each timed call."""
        pass

    def get_elapsed(self):
        """Return the elapsed time. The base implementation returns ``None``."""
        pass


class WallTimer(Timer):
    """Timer that measures host time between two ``queue.finish()`` calls."""

    def start(self):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which may jump when the system clock is adjusted.
        from time import perf_counter
        self.queue.finish()
        self.start_time = perf_counter()

    def stop(self):
        from time import perf_counter
        self.queue.finish()
        self.end_time = perf_counter()

    def get_elapsed(self):
        return self.end_time-self.start_time


def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
        warmup_rounds=3):
    """Return the average time of one call to *f*.

    *f* is called *warmup_rounds* times untimed, then in growing batches until
    one batch takes at least *desired_duration* according to the timer. The
    return value is that batch's elapsed time divided by its call count.

    :arg timer_factory: callable taking *queue* and returning a
        :class:`Timer`; defaults to :class:`WallTimer`.
    """

    if timer_factory is None:
        timer_factory = WallTimer

    count = 1

    while True:
        timer = timer_factory(queue)

        for _i in range(warmup_rounds):
            f()
        # Warm up only before the first batch.
        warmup_rounds = 0

        timer.start()
        for _i in range(count):
            timer.add_event(f())
        timer.stop()

        elapsed = timer.get_elapsed()
        if elapsed >= desired_duration:
            return elapsed/count

        if elapsed == 0:
            count *= 5
        else:
            # Aim for the desired duration, but grow by at least 2x and at
            # most 10x per round.
            new_count = int(desired_duration/elapsed)
            new_count = max(2*count, new_count)
            new_count = min(10*count, new_count)
            count = new_count

# }}}


# {{{ transfer measurements

class HostDeviceTransferBase:
    """Holds a host array and a device buffer of *block_size* bytes each."""

    def __init__(self, queue, block_size):
        self.queue = queue
        self.host_buf = np.empty(block_size, dtype=np.uint8)
        self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)


class HostToDeviceTransfer(HostDeviceTransferBase):
    """Copy from the host array to the device buffer."""

    def do(self):
        return cl.enqueue_copy(self.queue, self.dev_buf, self.host_buf)


class DeviceToHostTransfer(HostDeviceTransferBase):
    """Copy from the device buffer to the host array."""

    def do(self):
        return cl.enqueue_copy(self.queue, self.host_buf, self.dev_buf)


class DeviceToDeviceTransfer:
    """Copy between two device buffers of *block_size* bytes each."""

    def __init__(self, queue, block_size):
        self.queue = queue
        mf = cl.mem_flags
        self.dev_buf_1 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)
        self.dev_buf_2 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)

    def do(self):
        return cl.enqueue_copy(self.queue, self.dev_buf_2, self.dev_buf_1)


def transfer_latency(queue, transfer_type, timer_factory=None):
    """Return the average time of a one-byte transfer of *transfer_type*."""
    transfer = transfer_type(queue, 1)
    return _get_time(queue, transfer.do, timer_factory=timer_factory)


def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
    """Measures one-sided bandwidth.

    Returns *block_size* divided by the average time of one transfer.
    """

    transfer = transfer_type(queue, block_size)
    return block_size/_get_time(queue, transfer.do, timer_factory=timer_factory)

# }}}


def get_profiling_overhead(ctx, timer_factory=None):
    """Compare one-byte device copies with and without queue profiling.

    Returns ``(prof_time - no_prof_time, prof_time)``.
    """
    no_prof_queue = cl.CommandQueue(ctx)
    transfer = DeviceToDeviceTransfer(no_prof_queue, 1)
    no_prof_time = _get_time(no_prof_queue, transfer.do, timer_factory=timer_factory)

    prof_queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
    transfer = DeviceToDeviceTransfer(prof_queue, 1)
    prof_time = _get_time(prof_queue, transfer.do, timer_factory=timer_factory)

    return prof_time - no_prof_time, prof_time


def get_empty_kernel_time(queue, timer_factory=None):
    """Return the average time of launching an empty single-work-item kernel."""
    prg = cl.Program(queue.context, """
        __kernel void empty()
        { }
        """).build()

    knl = prg.empty

    def f():
        # Return the launch result so that Timer.add_event receives it, as it
        # does for the transfer measurements.
        return knl(queue, (1,), None)

    return _get_time(queue, f, timer_factory=timer_factory)


def _get_full_machine_kernel_rate(queue, src, args, name="benchmark",
        timer_factory=None):
    """Return the best observed work-item rate for kernel *name* in *src*.

    The global size starts at four times the device's compute-unit count and
    doubles while the rate improves by more than 5%, or, for up to three dips
    in a row, falls more than 10% below the best rate so far.
    """
    prg = cl.Program(queue.context, src).build()

    knl = getattr(prg, name)

    dev = queue.device
    global_size = 4 * dev.max_compute_units

    def f():
        # Return the launch result so that Timer.add_event receives it.
        return knl(queue, (global_size,), None, *args)

    best_rate = None
    num_dips = 0

    while True:
        elapsed = _get_time(queue, f, timer_factory=timer_factory)
        rate = global_size/elapsed

        # Always try at least one larger size after the first measurement.
        keep_trying = best_rate is None

        if best_rate is not None:
            if rate > 1.05*best_rate:  # big improvement
                keep_trying = True
                num_dips = 0

            if rate < 0.9*best_rate and num_dips < 3:  # big dip
                keep_trying = True
                num_dips += 1

        best_rate = rate if best_rate is None else max(best_rate, rate)

        if not keep_trying:
            return best_rate

        global_size *= 2


def get_add_rate(queue, type="float", timer_factory=None):
    """Return the measured additions per second for the OpenCL type *type*.

    The kernel performs 50 additions in each of 10 loop iterations per
    work-item, so the work-item rate is scaled by ``50*10``.
    """
    return 50*10*_get_full_machine_kernel_rate(queue, """
        typedef %(op_t)s op_t;
        __kernel void benchmark()
        {
            local op_t tgt[1024];
            op_t val = get_global_id(0);

            for (int i = 0; i < 10; ++i)
            {
                val += val; val += val; val += val; val += val; val += val;
                val += val; val += val; val += val; val += val; val += val;

                val += val; val += val; val += val; val += val; val += val;
                val += val; val += val; val += val; val += val; val += val;

                val += val; val += val; val += val; val += val; val += val;
                val += val; val += val; val += val; val += val; val += val;

                val += val; val += val; val += val; val += val; val += val;
                val += val; val += val; val += val; val += val; val += val;

                val += val; val += val; val += val; val += val; val += val;
                val += val; val += val; val += val; val += val; val += val;
            }
            tgt[get_local_id(0)] = val;
        }
        """ % {"op_t": type}, (), timer_factory=timer_factory)


# vim: foldmethod=marker:filetype=pyopencl
vim: foldmethod=marker:filetype=pyopencl 238 | -------------------------------------------------------------------------------- /pyopencl/cl/pyopencl-bessel-j-complex.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Evaluate Bessel J function J_v(z) and J_{v+1}(z) with v a nonnegative integer 3 | and z anywhere in the complex plane. 4 | 5 | Copyright (C) Vladimir Rokhlin 6 | Copyright (C) 2010-2012 Leslie Greengard and Zydrunas Gimbutas 7 | Copyright (C) 2015 Shidong Jiang, Andreas Kloeckner 8 | 9 | Manually translated from 10 | https://github.com/zgimbutas/fmmlib2d/blob/master/src/cdjseval2d.f 11 | 12 | Originally licensed under GPL, permission to license under MIT granted via email 13 | by Vladimir Rokhlin on May 25, 2015 and by Zydrunas Gimbutas on May 17, 2015. 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in 23 | all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 31 | THE SOFTWARE. 
/*
 * Compute J_v(z) and J_{v+1}(z), written to *j_v and *j_vp1.
 *
 * v is expected to be a nonnegative integer (see the commented-out assert).
 *
 * Two strategies are used:
 *  - a truncated power series (kmax terms) when |z| < small, or when v > 12
 *    and |z| < median;
 *  - otherwise an upward recurrence to pick a starting order n, followed by a
 *    downward recurrence that is normalized via the sum psi.
 *
 * If the upward recurrence passes order nmax, both outputs are set to NaN.
 */
void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
{
  int n;
  int nmax = 10000;

  int k;
  int kmax=8;

  // number of rescalings that were applied after j_v / j_vp1 were recorded
  int vscale, vp1scale;
  double vscaling, vp1scaling;

  // thresholds on |z| that select the power-series branch
  const double small = 2e-1;
  const double median = 1.0e0;

  // rescaling threshold (compared against squared magnitudes) and factor
  const double upbound = 1e40;
  const double upbound_inv = 1e-40;

  double dd;  // not referenced anywhere else in this function
  double k_factorial_inv, kv_factorial_inv, kvp1_factorial_inv;

  cdouble_t z_half, mz_half2, mz_half_2k, z_half_v, z_half_vp1;

  cdouble_t ima = cdouble_new(0, 1);
  cdouble_t neg_ima = cdouble_new(0, -1);

  cdouble_t zinv, ztmp;
  cdouble_t j_nm1, j_n, j_np1;

  cdouble_t psi, zsn, zmul, zmulinv;
  cdouble_t unscaled_j_n, unscaled_j_nm1, unscaled_j_np1;
  cdouble_t unscaled_j_v, unscaled_j_vp1;
  cdouble_t scaling;

  // assert( v >= 0 );

  // Disabled special case for z near zero. NOTE(review): `tiny` is not
  // declared in this function, so this would not compile if enabled as-is.
#if 0
  if (cdouble_abs(z) < tiny)
  {
    if (v == 0)
    {
      *j_v = cdouble_new(1, 0);
      *j_vp1 = cdouble_new(0, 0);
    } else
    {
      *j_v = cdouble_new(0, 0);
      *j_vp1 = cdouble_new(0, 0);
    }
    return;
  }
#endif

  // {{{ power series for (small z) or (large v and median z)
  if ( (cdouble_abs(z) < small) || ( (v>12) && (cdouble_abs(z) < median)))
  {
    z_half = cdouble_divider(z,2.0);

    // -(z/2)^2, the ratio between consecutive series terms
    mz_half2 = cdouble_neg(cdouble_mul(z_half, z_half));

    z_half_v = cdouble_powr(z_half, v);
    z_half_vp1 = cdouble_mul(z_half_v, z_half);


    // compute 1/v!
    kv_factorial_inv = 1.0;
    for ( k = 1; k <= v; k++)
    {
      kv_factorial_inv /= k;
    }

    // 1/(v+1)!
    kvp1_factorial_inv = kv_factorial_inv / (v+1);

    k_factorial_inv = 1.0;

    // compute the power series of bessel j function
    mz_half_2k = cdouble_new(1.0, 0);

    *j_v = cdouble_new(0, 0);
    *j_vp1 = cdouble_new(0, 0);

    for ( k = 0; k < kmax; k++ )
    {
      // term k: (-(z/2)^2)^k / (k! (k+v)!), and the same with v+1 for j_vp1
      *j_v = cdouble_add(
          *j_v,
          cdouble_mulr(mz_half_2k, kv_factorial_inv*k_factorial_inv));
      *j_vp1 = cdouble_add(*j_vp1,
          cdouble_mulr(mz_half_2k, kvp1_factorial_inv*k_factorial_inv));

      // advance the power and the inverse factorials to term k+1
      mz_half_2k = cdouble_mul(mz_half_2k, mz_half2);
      k_factorial_inv /= (k+1);
      kv_factorial_inv /= (k+v+1);
      kvp1_factorial_inv /= (k+v+2);
    }

    // apply the common prefactors (z/2)^v and (z/2)^(v+1)
    *j_v = cdouble_mul(*j_v, z_half_v );
    *j_vp1 = cdouble_mul(*j_vp1, z_half_vp1 );

    return;
  }

  // }}}

  // {{{ use recurrence for large z

  // Upward recurrence from order v with arbitrary starting values; it is
  // only used to find an order n at which the downward recursion starts.
  j_nm1 = cdouble_new(0, 0);
  j_n = cdouble_new(1, 0);

  n = v;

  zinv = cdouble_rdivide(1,z);

  while (true)
  {
    // j_{n+1} = (2n/z) j_n - j_{n-1}
    j_np1 = cdouble_sub(
        cdouble_mul(cdouble_rmul(2*n, zinv), j_n),
        j_nm1);

    n += 1;
    j_nm1 = j_n;
    j_n = j_np1;

    if (n > nmax)
    {
      // no suitable starting order found: report NaN for both outputs
      *j_v = cdouble_new(nan(0x8e55e1u), 0);
      *j_vp1 = cdouble_new(nan(0x8e55e1u), 0);
      return;
    }

    if (cdouble_abs_squared(j_n) > upbound)
      break;
  }

  // downward recursion, account for rescalings
  // Record the number of times of the missed rescalings
  // for j_v and j_vp1.

  unscaled_j_np1 = cdouble_new(0, 0);
  unscaled_j_n = cdouble_new(1, 0);

  // Use normalization condition http://dlmf.nist.gov/10.12#E5
  psi = cdouble_new(0, 0);

  // choose +i or -i depending on the sign of Im(z); the matching choice is
  // made again below when the exponential factor is formed
  if (cdouble_imag(z) <= 0)
    zmul = ima;
  else
    zmul = neg_ima;

  // zmul^n; powers of +-i repeat with period 4
  zsn = cdouble_powr(zmul, n%4);

  zmulinv = cdouble_rdivide(1, zmul);

  vscale = 0;
  vp1scale = 0;

  while (n > 0)
  {
    // j_{n-1} = (2n/z) j_n - j_{n+1}
    ztmp = cdouble_sub(
        cdouble_mul(cdouble_rmul(2*n, zinv), unscaled_j_n),
        unscaled_j_np1);

    unscaled_j_nm1 = ztmp;


    // accumulate zmul^n * j_n into the normalization sum
    psi = cdouble_add(psi, cdouble_mul(unscaled_j_n, zsn));
    zsn = cdouble_mul(zsn, zmulinv);

    n -= 1;
    unscaled_j_np1 = unscaled_j_n;
    unscaled_j_n = unscaled_j_nm1;

    if (cdouble_abs_squared(ztmp) > upbound)
    {
      // rescale the running values and the sum to avoid overflow
      unscaled_j_np1 = cdouble_rmul(upbound_inv, unscaled_j_np1);
      unscaled_j_n = cdouble_rmul(upbound_inv, unscaled_j_n);
      psi = cdouble_rmul(upbound_inv,psi);
      // values recorded at an earlier (higher) n missed this rescaling
      if (n < v) vscale++;
      if (n < v+1) vp1scale++;
    }

    // NOTE(review): n is decremented before these checks, so unscaled_j_vp1
    // is only assigned if the upward loop ended with n > v+1 -- confirm that
    // this always holds for the inputs reaching this branch.
    if (n == v)
      unscaled_j_v = unscaled_j_n;
    if (n == v+1)
      unscaled_j_vp1 = unscaled_j_n;

  }

  // the order-0 term enters the sum once, all higher orders twice
  psi = cdouble_add(cdouble_rmul(2, psi), unscaled_j_n);

  if ( cdouble_imag(z) <= 0 )
  {
    scaling = cdouble_divide( cdouble_exp( cdouble_mul(ima,z) ), psi);
  } else
  {
    scaling = cdouble_divide( cdouble_exp( cdouble_mul(neg_ima,z) ), psi);
  }
  // apply the rescalings that happened after each value was recorded
  vscaling = pow(upbound_inv, (double) vscale);
  vp1scaling = pow(upbound_inv, (double) vp1scale);

  *j_v = cdouble_mul(unscaled_j_v, cdouble_mulr(scaling, vscaling));
  *j_vp1 = cdouble_mul(unscaled_j_vp1, cdouble_mulr(scaling,vp1scaling));

  // }}}
}

// vim: fdm=marker
-------------------------------------------------------------------------------- 1 | // Pieced together from Boost C++ and Cephes by 2 | // Andreas Kloeckner (C) 2012 3 | // 4 | // Pieces from: 5 | // 6 | // Copyright (c) 2006 Xiaogang Zhang, John Maddock 7 | // Use, modification and distribution are subject to the 8 | // Boost Software License, Version 1.0. (See 9 | // http://www.boost.org/LICENSE_1_0.txt) 10 | // 11 | // Cephes Math Library Release 2.8: June, 2000 12 | // Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier 13 | // What you see here may be used freely, but it comes with no support or 14 | // guarantee. 15 | 16 | #pragma once 17 | 18 | typedef double special_func_scalar_type; 19 | 20 | // {{{ cephes_polevl 21 | 22 | /* 23 | * DESCRIPTION: 24 | * 25 | * Evaluates polynomial of degree N: 26 | * 27 | * 2 N 28 | * y = C + C x + C x +...+ C x 29 | * 0 1 2 N 30 | * 31 | * Coefficients are stored in reverse order: 32 | * 33 | * coef[0] = C , ..., coef[N] = C . 34 | * N 0 35 | * 36 | * The function p1evl() assumes that coef[N] = 1.0 and is 37 | * omitted from the array. Its calling arguments are 38 | * otherwise the same as polevl(). 
39 | * 40 | */ 41 | 42 | special_func_scalar_type cephes_polevl(special_func_scalar_type x, __constant const special_func_scalar_type *coef, int N) 43 | { 44 | special_func_scalar_type ans; 45 | int i; 46 | __constant const special_func_scalar_type *p; 47 | 48 | p = coef; 49 | ans = *p++; 50 | i = N; 51 | 52 | do 53 | ans = ans * x + *p++; 54 | while( --i ); 55 | 56 | return( ans ); 57 | } 58 | 59 | // }}} 60 | 61 | // {{{ cephes_p1evl 62 | 63 | special_func_scalar_type cephes_p1evl( special_func_scalar_type x, __constant const special_func_scalar_type *coef, int N ) 64 | { 65 | special_func_scalar_type ans; 66 | __constant const special_func_scalar_type *p; 67 | int i; 68 | 69 | p = coef; 70 | ans = x + *p++; 71 | i = N-1; 72 | 73 | do 74 | ans = ans * x + *p++; 75 | while( --i ); 76 | 77 | return( ans ); 78 | } 79 | 80 | // }}} 81 | 82 | // {{{ boost_evaluate_rational 83 | 84 | special_func_scalar_type boost_evaluate_rational_backend(__constant const special_func_scalar_type* num, __constant const special_func_scalar_type* denom, special_func_scalar_type z, int count) 85 | { 86 | special_func_scalar_type s1, s2; 87 | if(z <= 1) 88 | { 89 | s1 = num[count-1]; 90 | s2 = denom[count-1]; 91 | for(int i = (int)count - 2; i >= 0; --i) 92 | { 93 | s1 *= z; 94 | s2 *= z; 95 | s1 += num[i]; 96 | s2 += denom[i]; 97 | } 98 | } 99 | else 100 | { 101 | z = 1 / z; 102 | s1 = num[0]; 103 | s2 = denom[0]; 104 | for(unsigned i = 1; i < count; ++i) 105 | { 106 | s1 *= z; 107 | s2 *= z; 108 | s1 += num[i]; 109 | s2 += denom[i]; 110 | } 111 | } 112 | return s1 / s2; 113 | } 114 | 115 | #define boost_evaluate_rational(num, denom, z) \ 116 | boost_evaluate_rational_backend(num, denom, z, sizeof(num)/sizeof(special_func_scalar_type)) 117 | 118 | // }}} 119 | 120 | // vim: fdm=marker 121 | -------------------------------------------------------------------------------- /pyopencl/cl/pyopencl-random123/openclfeatures.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010-2011, D. E. Shaw Research. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions, and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions, and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | * Neither the name of D. E. Shaw Research nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | #ifndef __openclfeatures_dot_hpp 33 | #define __openclfeatures_dot_hpp 34 | 35 | #ifndef R123_STATIC_INLINE 36 | #define R123_STATIC_INLINE inline 37 | #endif 38 | 39 | #ifndef R123_FORCE_INLINE 40 | #define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) 41 | #endif 42 | 43 | #ifndef R123_CUDA_DEVICE 44 | #define R123_CUDA_DEVICE 45 | #endif 46 | 47 | #ifndef R123_ASSERT 48 | #define R123_ASSERT(x) 49 | #endif 50 | 51 | #ifndef R123_BUILTIN_EXPECT 52 | #define R123_BUILTIN_EXPECT(expr,likely) expr 53 | #endif 54 | 55 | #ifndef R123_USE_GNU_UINT128 56 | #define R123_USE_GNU_UINT128 0 57 | #endif 58 | 59 | #ifndef R123_USE_MULHILO64_ASM 60 | #define R123_USE_MULHILO64_ASM 0 61 | #endif 62 | 63 | #ifndef R123_USE_MULHILO64_MSVC_INTRIN 64 | #define R123_USE_MULHILO64_MSVC_INTRIN 0 65 | #endif 66 | 67 | #ifndef R123_USE_MULHILO64_CUDA_INTRIN 68 | #define R123_USE_MULHILO64_CUDA_INTRIN 0 69 | #endif 70 | 71 | #ifndef R123_USE_MULHILO64_OPENCL_INTRIN 72 | #ifdef PYOPENCL_USING_OCLGRIND 73 | #define R123_USE_MULHILO64_OPENCL_INTRIN 0 74 | #else 75 | #define R123_USE_MULHILO64_OPENCL_INTRIN 1 76 | #endif 77 | #endif 78 | 79 | #ifndef R123_USE_AES_NI 80 | #define R123_USE_AES_NI 0 81 | #endif 82 | 83 | // XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of 84 | // ulong to mul_hi. And gets lots of complaints from stdint.h 85 | // on some machines. 86 | // But these typedefs mean we cannot include stdint.h with 87 | // these headers? Do we need R123_64T, R123_32T, R123_8T? 
88 | typedef ulong uint64_t; 89 | typedef uint uint32_t; 90 | typedef uchar uint8_t; 91 | #define UINT64_C(x) ((ulong)(x##UL)) 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /pyopencl/cltypes.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (C) 2016 Jonathan Mackenzie" 2 | 3 | __license__ = """ 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 18 | THE SOFTWARE. 19 | """ 20 | 21 | import warnings 22 | 23 | import numpy as np 24 | 25 | from pyopencl.tools import get_or_register_dtype 26 | 27 | 28 | if __file__.endswith("array.py"): 29 | warnings.warn( 30 | "pyopencl.array.vec is deprecated. 
def _create_vector_types():
    """Create the OpenCL vector dtypes and install them as module globals.

    For every scalar base type and every vector width in (2, 3, 4, 8, 16) a
    structured numpy dtype named ``<base><width>`` (e.g. ``float4``) is built,
    registered through ``get_or_register_dtype`` and stored in this module's
    globals together with the helpers ``make_<name>``, ``filled_<name>``,
    ``zeros_<name>`` and ``ones_<name>``.

    Returns a pair of dicts: ``(scalar dtype, width) -> vector dtype`` and
    ``vector dtype -> (scalar dtype, width)``.
    """
    module_ns = globals()

    scalar_types = [
        (scalar_name, module_ns[scalar_name])
        for scalar_name in ["char", "uchar", "short", "ushort", "int",
                            "uint", "long", "ulong", "float", "double"]]

    def set_global(key, val):
        module_ns[key] = val

    def create_array(dtype, count, padded_count, *args, **kwargs):
        # Fewer positional values than components: missing ones become 0,
        # which is deprecated in favor of the explicit zeros_xxx helpers.
        if len(args) < count:
            from warnings import warn
            warn("default values for make_xxx are deprecated;"
                    " instead specify all parameters or use"
                    " cltypes.zeros_xxx",
                    DeprecationWarning, stacklevel=4)

        padded_args = tuple(args) + (0,) * (padded_count - len(args))
        array = eval("array(padded_args, dtype=dtype)",
                {"array": np.array,
                    "padded_args": padded_args,
                    "dtype": dtype})
        # Keyword arguments assign individual fields by name.
        for key, val in kwargs.items():
            array[key] = val
        return array

    vec_types = {}
    vec_type_to_scalar_and_count = {}

    component_titles = ["x", "y", "z", "w"]

    for base_name, base_type in scalar_types:
        for count in (2, 3, 4, 8, 16):
            name = "%s%d" % (base_name, count)

            # 3-component vectors are stored with a fourth, padding field.
            padded_count = 4 if count == 3 else count

            names = ["s%d" % i for i in range(count)]
            names += ["padding%d" % i for i in range(padded_count - count)]

            # Only the first (up to four) fields get x/y/z/w titles.
            titles = component_titles[:count]
            titles += [None] * (len(names) - len(titles))

            try:
                dtype = np.dtype({
                    "names": names,
                    "formats": [base_type] * padded_count,
                    "titles": titles})
            except NotImplementedError:
                # Fallbacks: list-of-fields spelling, with and then without
                # titles.
                try:
                    dtype = np.dtype([((field, title), base_type)
                                      for field, title in zip(names, titles)])
                except TypeError:
                    dtype = np.dtype([(field, base_type) for field in names])

            get_or_register_dtype(name, dtype)

            set_global(name, dtype)

            # The helpers are built from source text so that each one carries
            # its own name/count; filled_/zeros_/ones_ look up their target
            # through the module globals when called.
            maker = eval(
                "lambda *args, **kwargs: create_array(dtype, %i, %i, "
                "*args, **kwargs)" % (count, padded_count),
                {"create_array": create_array, "dtype": dtype})
            set_global("make_" + name, maker)

            filler = eval(
                "lambda val: make_%s(*[val]*%i)" % (name, count))
            set_global("filled_" + name, filler)

            set_global("zeros_" + name, eval("lambda: filled_%s(0)" % (name)))
            set_global("ones_" + name, eval("lambda: filled_%s(1)" % (name)))

            scalar_dtype = np.dtype(base_type)
            vec_types[scalar_dtype, count] = dtype
            vec_type_to_scalar_and_count[dtype] = scalar_dtype, count

    return vec_types, vec_type_to_scalar_and_count
| prg = cl.Program(ctx, kernel).build(options=options.split()) 28 | 29 | for knl in prg.all_kernels(): 30 | self.shell.user_ns[knl.function_name] = knl 31 | 32 | @cell_magic 33 | def cl_kernel(self, line, cell): 34 | kernel = cell 35 | 36 | opts, _args = self.parse_options(line, "o:") 37 | build_options = opts.get("o", "") 38 | 39 | self._run_kernel(kernel, build_options) 40 | 41 | def _load_kernel_and_options(self, line): 42 | opts, args = self.parse_options(line, "o:f:") 43 | 44 | build_options = opts.get("o") 45 | kernel = self.shell.find_user_code(opts.get("f") or args) 46 | 47 | return kernel, build_options 48 | 49 | @line_magic 50 | def cl_kernel_from_file(self, line): 51 | kernel, build_options = self._load_kernel_and_options(line) 52 | self._run_kernel(kernel, build_options) 53 | 54 | @line_magic 55 | def cl_load_edit_kernel(self, line): 56 | kernel, build_options = self._load_kernel_and_options(line) 57 | header = "%%cl_kernel" 58 | 59 | if build_options: 60 | header = f'{header} -o "{build_options}"' 61 | 62 | content = f"{header}\n\n{kernel}" 63 | 64 | self.shell.set_next_input(content) 65 | 66 | 67 | def load_ipython_extension(ip): 68 | ip.register_magics(PyOpenCLMagics) 69 | -------------------------------------------------------------------------------- /pyopencl/version.py: -------------------------------------------------------------------------------- 1 | import re 2 | from importlib import metadata 3 | 4 | 5 | VERSION_TEXT = metadata.version("pyopencl") 6 | _match = re.match(r"^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT) 7 | assert _match is not None 8 | VERSION_STATUS = _match.group(2) 9 | VERSION = tuple(int(nr) for nr in _match.group(1).split(".")) 10 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "scikit_build_core.build" 3 | requires = [ 4 | "scikit-build-core >=0.9.3", 5 | 
"nanobind >=1.9.2", 6 | # https://numpy.org/doc/stable/dev/depending_on_numpy.html#build-time-dependency 7 | # Just depending on numpy will automatically expose the oldest supported ABI. 8 | # - Retrieved 2024-06-24, AK 9 | "numpy", 10 | ] 11 | 12 | [project] 13 | name = "pyopencl" 14 | version = "2025.1" 15 | description = "Python wrapper for OpenCL" 16 | readme = "README.rst" 17 | license = "MIT" 18 | authors = [ 19 | { name = "Andreas Kloeckner", email = "inform@tiker.net" }, 20 | ] 21 | requires-python = "~=3.8" 22 | classifiers = [ 23 | "Development Status :: 5 - Production/Stable", 24 | "Environment :: Console", 25 | "Intended Audience :: Developers", 26 | "Intended Audience :: Other Audience", 27 | "Intended Audience :: Science/Research", 28 | "Natural Language :: English", 29 | "Programming Language :: C++", 30 | "Programming Language :: Python", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "Topic :: Scientific/Engineering", 33 | "Topic :: Scientific/Engineering :: Mathematics", 34 | "Topic :: Scientific/Engineering :: Physics", 35 | ] 36 | dependencies = [ 37 | "importlib-resources; python_version<'3.9'", 38 | "numpy", 39 | "platformdirs>=2.2", 40 | "pytools>=2024.1.5", 41 | ] 42 | 43 | [project.optional-dependencies] 44 | oclgrind = [ 45 | "oclgrind-binary-distribution>=18.3", 46 | ] 47 | pocl = [ 48 | "pocl-binary-distribution>=1.2", 49 | ] 50 | test = [ 51 | "ruff", 52 | "mako", 53 | "mypy", 54 | "pylint", 55 | "pytest>=7", 56 | ] 57 | 58 | [project.urls] 59 | Documentation = "https://documen.tician.de/pyopencl" 60 | Homepage = "https://mathema.tician.de/software/pyopencl" 61 | Repository = "https://github.com/inducer/pyopencl" 62 | 63 | [tool.scikit-build] 64 | sdist.exclude = [ 65 | ".mypy_cache", 66 | ".ci", 67 | ".github", 68 | ".conda-ci-build-configure.sh", 69 | "doc/upload-docs.sh", 70 | ".editorconfig", 71 | "TODOs", 72 | "run-*.sh", 73 | ] 74 | 75 | [tool.inducer-ci-support] 76 | disable-editable-pip-install = true 77 | 78 | 
[tool.ruff.lint] 79 | preview = true 80 | extend-select = [ 81 | "B", # flake8-bugbear 82 | "C", # flake8-comprehensions 83 | "E", # pycodestyle 84 | "F", # pyflakes 85 | "G", # flake8-logging-format 86 | "I", # flake8-isort 87 | "N", # pep8-naming 88 | "NPY", # numpy 89 | "Q", # flake8-quotes 90 | "RUF", # ruff 91 | "UP", # pyupgrade 92 | "W", # pycodestyle 93 | ] 94 | extend-ignore = [ 95 | "E226", # missing whitespace around arithmetic operator 96 | "E241", # multiple spaces after comma 97 | "E402", # module level import not at the top of file 98 | "C90", # McCabe complexity 99 | "UP031", # use f-strings instead of % 100 | "UP032", # use f-strings instead of .format 101 | ] 102 | exclude = [ 103 | "examples/gl_interop_demo.py", 104 | "examples/gl_particle_animation.py", 105 | "pyopencl/compyte/**/*.py", 106 | ] 107 | 108 | [tool.ruff.lint.per-file-ignores] 109 | "examples/pi-monte-carlo.py" = ["N", "B", "F841"] 110 | "examples/black-hole-accretion.py" = ["N", "E501", "B"] 111 | "examples/n-body.py" = ["N", "E501"] 112 | "pyopencl/__init__.py" = ["I001"] 113 | "contrib/fortran-to-opencl/translate.py" = ["N802", "N815", "B"] 114 | 115 | [tool.ruff.lint.flake8-quotes] 116 | inline-quotes = "double" 117 | docstring-quotes = "double" 118 | multiline-quotes = "double" 119 | 120 | [tool.ruff.lint.isort] 121 | known-first-party = ["pytools", "pymbolic", "cgen"] 122 | known-local-folder = ["pyopencl"] 123 | lines-after-imports = 2 124 | combine-as-imports = true 125 | 126 | [tool.pytest.ini_options] 127 | markers = [ 128 | "bitonic: tests involving bitonic sort" 129 | ] 130 | 131 | [tool.mypy] 132 | warn_unused_ignores = true 133 | exclude = ["pyopencl/compyte"] 134 | 135 | [[tool.mypy.overrides]] 136 | module = [ 137 | "IPython.*", 138 | "OpenGL.*", 139 | "mako.*", 140 | "matplotlib.*", 141 | "pyfmmlib.*", 142 | "pyopencl._cl.*", 143 | "pytest.*", 144 | "scipy.*", 145 | ] 146 | ignore_missing_imports = true 147 | 148 | [[tool.mypy.overrides]] 149 | module = 
["pyopencl.compyte.*"] 150 | follow_imports = "skip" 151 | 152 | [tool.cibuildwheel] 153 | test-command = "pytest {project}/test" 154 | test-extras = [ 155 | "test", 156 | ] 157 | environment-pass = [ 158 | "CL_INC_DIR", 159 | "CL_LIB_DIR", 160 | ] 161 | test-skip = [ 162 | "*-macosx_*:arm64", 163 | "*-macosx_arm64", 164 | ] 165 | 166 | [tool.cibuildwheel.linux] 167 | skip = [ 168 | "pp*", 169 | "cp36-*", 170 | "cp37-*", 171 | "*_i686", 172 | ] 173 | test-command = "" 174 | before-all = [ 175 | "yum install -y git openssl-devel ruby", 176 | "bash {package}/scripts/build-ocl.sh", 177 | ] 178 | before-build = [ 179 | "pip install numpy -Csetup-args=-Dallow-noblas=true", 180 | ] 181 | repair-wheel-command = "auditwheel repair -w {dest_dir} --lib-sdir=/.libs {wheel}" 182 | 183 | [[tool.cibuildwheel.overrides]] 184 | select = "*-musllinux*" 185 | before-all = [ 186 | "apk add ruby git openssl-dev libtool", 187 | "bash {package}/scripts/build-ocl.sh", 188 | ] 189 | repair-wheel-command = "auditwheel repair -w {dest_dir} --lib-sdir=/.libs {wheel}" 190 | 191 | [tool.cibuildwheel.macos] 192 | skip = [ 193 | "pp*", 194 | "cp36-*", 195 | "cp37-*", 196 | ] 197 | before-all = "bash {package}/scripts/build-ocl-macos.sh" 198 | test-command = "pytest {project}/test/test_array.py" # same limitation as conda-forge 199 | archs = "x86_64 arm64" 200 | 201 | # https://github.com/conda-forge/pyopencl-feedstock/blob/6f3c5de59b18c9518abba3cb94f6ae92964553f8/recipe/meta.yaml#L62-L63 202 | 203 | [tool.cibuildwheel.macos.environment] 204 | # Needed for full C++17 support 205 | MACOSX_DEPLOYMENT_TARGET = "10.14" 206 | 207 | [tool.cibuildwheel.windows] 208 | skip = [ 209 | "*-win32", 210 | "pp*", 211 | "cp36-*", 212 | "cp37-*", 213 | ] 214 | test-command = "" 215 | before-all = "bash {package}/scripts/build-ocl-windows.sh" 216 | 217 | [tool.typos.default] 218 | extend-ignore-re = [ 219 | "(?Rm)^.*(#|//)\\s*spellchecker:\\s*disable-line$" 220 | ] 221 | 222 | [tool.typos.default.extend-words] 223 
| # for ND Range 224 | ND = "ND" 225 | nd = "nd" 226 | 227 | # level-of-detail 228 | LOD = "LOD" 229 | 230 | # short for 'series' 231 | "ser" = "ser" 232 | 233 | # like the numpy function 234 | "arange" = "arange" 235 | 236 | [tool.typos.files] 237 | extend-exclude = [ 238 | # No thanks, hex IDs in JSON should not be spellchecked. 239 | "examples/*.ipynb", 240 | # Copied from upstream 241 | "pyopencl/cl/pyopencl-random123/*", 242 | # This one has comments in French 243 | "examples/black-hole-accretion.py" 244 | ] 245 | 246 | [tool.basedpyright] 247 | reportImplicitStringConcatenation = "none" 248 | reportUnnecessaryIsInstance = "none" 249 | reportUnusedCallResult = "none" 250 | reportExplicitAny = "none" 251 | reportUnreachable = "hint" 252 | 253 | # This reports even cycles that are qualified by 'if TYPE_CHECKING'. Not what 254 | # we care about at this moment. 255 | # https://github.com/microsoft/pyright/issues/746 256 | reportImportCycles = "none" 257 | pythonVersion = "3.10" 258 | pythonPlatform = "All" 259 | 260 | [[tool.basedpyright.executionEnvironments]] 261 | root = "test" 262 | reportUnknownArgumentType = "hint" 263 | reportPrivateUsage = "none" 264 | 265 | -------------------------------------------------------------------------------- /run-mypy.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | python -m mypy pyopencl test 4 | -------------------------------------------------------------------------------- /run-pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit -o nounset 4 | 5 | ci_support="https://gitlab.tiker.net/inducer/ci-support/raw/main" 6 | 7 | if [[ ! -f .pylintrc.yml ]]; then 8 | curl -o .pylintrc.yml "${ci_support}/.pylintrc-default.yml" 9 | fi 10 | 11 | 12 | if [[ ! 
#!/usr/bin/env bash
# Build the Khronos OpenCL headers and ICD loader from source and append the
# loader's license text to the repository LICENSE file.

# Directory containing this script, independent of the caller's working dir.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

set -o xtrace

git clone --branch v2022.01.04 https://github.com/KhronosGroup/OpenCL-ICD-Loader
git clone --branch v2022.01.04 https://github.com/KhronosGroup/OpenCL-Headers



cmake -D CMAKE_INSTALL_PREFIX=./OpenCL-Headers/install -S ./OpenCL-Headers -B ./OpenCL-Headers/build
cmake --build ./OpenCL-Headers/build --target install

# Arguments built from ${PWD} are quoted so that a working directory whose
# path contains spaces is not split into several arguments.
cmake -D "CMAKE_PREFIX_PATH=${PWD}/OpenCL-Headers/install" -D "OPENCL_ICD_LOADER_HEADERS_DIR=${PWD}/OpenCL-Headers/install/include" -D CMAKE_INSTALL_PREFIX=./OpenCL-ICD-Loader/install -S ./OpenCL-ICD-Loader -B ./OpenCL-ICD-Loader/build
cmake --build ./OpenCL-ICD-Loader/build --target install --config Release

# Same reason for quoting the redirection targets.
echo "PyOpenCL wheel includes Khronos Group OpenCL-ICD-Loader which is licensed as below" >> "${SCRIPT_DIR}/../LICENSE"
cat ./OpenCL-ICD-Loader/LICENSE >> "${SCRIPT_DIR}/../LICENSE"
#!/usr/bin/env bash
# Build and install ocl-icd (with two patches applied) under /usr and append
# its license text to the repository LICENSE file.

# Directory containing this script; resolved before the cd calls below.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

set -e -x

mkdir -p ~/deps
cd ~/deps

git clone --branch v2.3.1 https://github.com/OCL-dev/ocl-icd
cd ocl-icd
curl -L -O https://raw.githubusercontent.com/conda-forge/ocl-icd-feedstock/e2c03e3ddb1ff86630ccf80dc7b87a81640025ea/recipe/install-headers.patch
git apply install-headers.patch
curl -L -O https://github.com/isuruf/ocl-icd/commit/307f2267100a2d1383f0c4a77344b127c0857588.patch
git apply 307f2267100a2d1383f0c4a77344b127c0857588.patch
autoreconf -i
chmod +x configure
./configure --prefix=/usr
make -j4
make install

# Bundle license files
# The targets are quoted so a script path containing spaces is not word-split.
echo "PyOpenCL wheel includes ocl-icd which is licensed as below" >> "${SCRIPT_DIR}/../LICENSE"
cat ~/deps/ocl-icd/COPYING >> "${SCRIPT_DIR}/../LICENSE"
26 | 27 | 28 | #include "bitlog.hpp" 29 | 30 | 31 | const char pyopencl::log_table_8[] = 32 | { 33 | 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 34 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 35 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 36 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 37 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 38 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 39 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 40 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 41 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 42 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 43 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 44 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 45 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 46 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 47 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 48 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 49 | }; 50 | 51 | 52 | -------------------------------------------------------------------------------- /src/bitlog.hpp: -------------------------------------------------------------------------------- 1 | // Base-2 logarithm bithack. 2 | // 3 | // Copyright (C) 2009 Andreas Kloeckner 4 | // Copyright (C) Sean Eron Anderson (in the public domain) 5 | // 6 | // Permission is hereby granted, free of charge, to any person 7 | // obtaining a copy of this software and associated documentation 8 | // files (the "Software"), to deal in the Software without 9 | // restriction, including without limitation the rights to use, 10 | // copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | // copies of the Software, and to permit persons to whom the 12 | // Software is furnished to do so, subject to the following 13 | // conditions: 14 | // 15 | // The above copyright notice and this permission notice shall be 16 | // included in all copies or substantial portions of the Software. 
namespace pyopencl
{
  /* from http://graphics.stanford.edu/~seander/bithacks.html */

  // 256-entry lookup table; the definition lives in bitlog.cpp.
  extern const char log_table_8[];

  // Index of the highest set bit of a 16-bit value, found by looking up
  // whichever byte holds it. Returns 0 for v == 0.
  inline unsigned bitlog2_16(uint16_t v)
  {
    const unsigned long high_byte = v >> 8;
    if (high_byte)
      return 8+log_table_8[high_byte];
    return log_table_8[v];
  }

  // 32-bit variant: delegate to the 16-bit routine on the upper half if it
  // is non-zero, otherwise on the lower half.
  inline unsigned bitlog2_32(uint32_t v)
  {
    const uint16_t upper_half = v >> 16;
    return upper_half ? 16+bitlog2_16(upper_half) : bitlog2_16(v);
  }

#if defined(UINT64_MAX)
  // 64-bit variant, available when the platform provides uint64_t.
  inline unsigned bitlog2(uint64_t v)
  {
    const uint32_t upper_half = v >> 32;
    return upper_half ? 32+bitlog2_32(upper_half) : bitlog2_32(v);
  }
#else
  // Fallback on unsigned long; the upper 32 bits are only inspected when
  // unsigned long is wider than 32 bits.
  inline unsigned bitlog2(unsigned long v)
  {
#if (ULONG_MAX != 4294967295)
    const uint32_t upper_half = v >> 32;
    if (upper_half)
      return 32+bitlog2_32(upper_half);
#endif
    return bitlog2_32(v);
  }
#endif
}
#define _EXT_H

/* NOTE(review): the header names are missing from the two #include
 * directives below (apparently lost in extraction) -- restore them from the
 * original file. */
#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
#include
#else
#include
#endif

/* These two defines were introduced in the 1.2 headers
 * on 2012-11-30, so earlier versions don't have them
 * (e.g. Debian wheezy)
 */

#ifndef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
#endif

/*
 * Extensions
 *
 * Unlike the block above, the definitions in this section are not wrapped
 * in #ifndef guards (the AMD topology type further down is the exception).
 */

/* cl_khr_icd */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
#define CL_PLATFORM_NOT_FOUND_KHR -1001


/* cl_khr_fp64 */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032

/* cl_khr_fp16 */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033

/* cl_khr_terminate_context */
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F

/* cl_nv_device_attribute_query */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV 0x4007
#define CL_DEVICE_PCI_BUS_ID_NV 0x4008
#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009
#define CL_DEVICE_PCI_DOMAIN_ID_NV 0x400A

/* cl_ext_atomic_counters_{32,64} */
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032

/* cl_amd_device_attribute_query */
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049
#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A
#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B
#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030
#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031
#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033
#define CL_DEVICE_PCIE_ID_AMD 0x4034

/* The topology union is only declared when the macro below is not already
 * defined; the macro doubles as the "type already declared" marker. */
#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1

typedef union
{
    struct { cl_uint type; cl_uint data[5]; } raw;
    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
} cl_device_topology_amd;
#endif

/* cl_amd_offline_devices */
#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F

/* cl_ext_device_fission */
#define cl_ext_device_fission 1

typedef cl_ulong cl_device_partition_property_ext;

#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 /* cl_intel_device_partition_by_names */
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053

#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058

#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100

/* cl_intel_advanced_motion_estimation */
#define CL_DEVICE_ME_VERSION_INTEL 0x407E

/* cl_qcom_ext_host_ptr */
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1

/* cl_khr_spir */
#define CL_DEVICE_SPIR_VERSIONS 0x40E0

/* cl_altera_device_temperature */
#define CL_DEVICE_CORE_TEMPERATURE_ALTERA 0x40F3

/* cl_intel_simultaneous_sharing */
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105

#endif

--------------------------------------------------------------------------------
/src/pyopencl_ext.h:
--------------------------------------------------------------------------------
#ifndef _PYOPENCL_EXT_H
#define _PYOPENCL_EXT_H

/* Either use the extension definitions shipped in clinfo_ext.h, or include
 * the system headers and fill in individual definitions only where they are
 * missing (every fallback below is #ifndef-guarded). */
#ifdef PYOPENCL_USE_SHIPPED_EXT

#include "clinfo_ext.h"

#else

/* NOTE(review): the header names are missing from the three #include
 * directives below (apparently lost in extraction) -- restore them from the
 * original file. */
#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))

#include

#else

#include
#include

#endif

#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1

typedef union
{
    struct { cl_uint type; cl_uint data[5]; } raw;
    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
} cl_device_topology_amd;
#endif

#ifndef CL_DEVICE_P2P_DEVICES_AMD
#define CL_DEVICE_P2P_DEVICES_AMD 0x4089

/* Function-pointer type for the AMD peer-to-peer buffer copy entry point. */
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clEnqueueCopyBufferP2PAMD_fn)(cl_command_queue /*command_queue*/,
                                             cl_mem /*src_buffer*/,
                                             cl_mem /*dst_buffer*/,
                                             size_t /*src_offset*/,
                                             size_t /*dst_offset*/,
                                             size_t /*cb*/,
                                             cl_uint /*num_events_in_wait_list*/,
                                             const cl_event* /*event_wait_list*/,
                                             cl_event* /*event*/);
#endif

/* {{{ these NV defines are often missing from the system headers */

#ifndef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#endif
#ifndef CL_DEVICE_INTEGRATED_MEMORY_NV
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif

#ifndef CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV
#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV 0x4007
#endif

#ifndef CL_DEVICE_PCI_BUS_ID_NV
#define CL_DEVICE_PCI_BUS_ID_NV 0x4008
#endif

#ifndef CL_DEVICE_PCI_SLOT_ID_NV
#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009
#endif

#ifndef CL_DEVICE_PCI_DOMAIN_ID_NV
#define CL_DEVICE_PCI_DOMAIN_ID_NV 0x400A
#endif

/* }}} */

#endif

#endif

/* vim: foldmethod=marker */

--------------------------------------------------------------------------------
/src/tools.hpp:
--------------------------------------------------------------------------------
// Various odds and ends
//
// Copyright (C) 2009 Andreas Kloeckner
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
16 | // 17 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | // OTHER DEALINGS IN THE SOFTWARE. 25 | 26 | 27 | #ifndef _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP 28 | #define _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP 29 | 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | 36 | 37 | 38 | namespace pyopencl 39 | { 40 | inline 41 | npy_intp size_from_dims(int ndim, const npy_intp *dims) 42 | { 43 | if (ndim != 0) 44 | return std::accumulate(dims, dims+ndim, 1, std::multiplies()); 45 | else 46 | return 1; 47 | } 48 | 49 | 50 | 51 | 52 | inline void run_python_gc() 53 | { 54 | namespace py = nanobind; 55 | 56 | py::module_::import_("gc").attr("collect")(); 57 | } 58 | 59 | 60 | // https://stackoverflow.com/a/28139075 61 | template 62 | struct reversion_wrapper { T& iterable; }; 63 | 64 | template 65 | auto begin (reversion_wrapper w) { return w.iterable.rbegin(); } 66 | 67 | template 68 | auto end (reversion_wrapper w) { return w.iterable.rend(); } 69 | 70 | template 71 | reversion_wrapper reverse (T&& iterable) { return { iterable }; } 72 | 73 | 74 | // https://stackoverflow.com/a/44175911 75 | class noncopyable { 76 | public: 77 | noncopyable() = default; 78 | ~noncopyable() = default; 79 | 80 | private: 81 | noncopyable(const noncopyable&) = delete; 82 | noncopyable& operator=(const noncopyable&) = delete; 83 | }; 84 | } 85 | 86 | 87 | 88 | 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /src/wrap_cl.cpp: 
-------------------------------------------------------------------------------- 1 | // PyOpenCL-flavored C++ wrapper of the CL API 2 | // 3 | // Copyright (C) 2009 Andreas Kloeckner 4 | // 5 | // Permission is hereby granted, free of charge, to any person 6 | // obtaining a copy of this software and associated documentation 7 | // files (the "Software"), to deal in the Software without 8 | // restriction, including without limitation the rights to use, 9 | // copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | // copies of the Software, and to permit persons to whom the 11 | // Software is furnished to do so, subject to the following 12 | // conditions: 13 | // 14 | // The above copyright notice and this permission notice shall be 15 | // included in all copies or substantial portions of the Software. 16 | // 17 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | // OTHER DEALINGS IN THE SOFTWARE. 


// Must be defined before the NumPy headers are pulled in (here via
// wrap_cl.hpp). NOTE(review): presumably the other translation units of the
// extension share this symbol name for NumPy's C-API table -- confirm.
#define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API

#include "wrap_cl.hpp"
// NOTE(review): the header name is missing from the #include directive below
// (apparently lost in extraction) -- restore it from the original file.
#include




using namespace pyopencl;




// Registration entry points implemented in the other wrapper source files.
extern void pyopencl_expose_constants(py::module_ &m);
extern void pyopencl_expose_part_1(py::module_ &m);
extern void pyopencl_expose_part_2(py::module_ &m);
extern void pyopencl_expose_mempool(py::module_ &m);

// Wraps NumPy's import_array1 macro in a function with a bool result.
// NOTE(review): import_array1(false) presumably makes this function return
// false when importing the NumPy C-API fails -- confirm against the NumPy
// documentation.
static bool import_numpy_helper()
{
  import_array1(false);
  return true;
}

// Entry point of the `_cl` extension module.
NB_MODULE(_cl, m)
{
  // Reference-count hooks for intrusively counted objects: both callbacks
  // take the GIL before touching the Python reference count.
  py::intrusive_init(
    [](PyObject *o) noexcept {
      py::gil_scoped_acquire guard;
      Py_INCREF(o);
    },
    [](PyObject *o) noexcept {
      py::gil_scoped_acquire guard;
      Py_DECREF(o);
    });

  // Surface a failed NumPy import as the pending Python exception.
  if (!import_numpy_helper())
    throw py::python_error();

  pyopencl_expose_constants(m);
  pyopencl_expose_part_1(m);
  pyopencl_expose_part_2(m);
  pyopencl_expose_mempool(m);

#ifdef NDEBUG
  // See https://github.com/inducer/pyopencl/issues/758 for context.
  // Leak warnings are switched off only in builds that define NDEBUG.
  py::set_leak_warnings(false);
#endif
}

// vim: foldmethod=marker

--------------------------------------------------------------------------------
/src/wrap_helpers.hpp:
--------------------------------------------------------------------------------
// Wrapper-helping odds and ends
//
// Copyright (C) 2009 Andreas Kloeckner
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
25 | 26 | 27 | #ifndef PYCUDA_WRAP_HELPERS_HEADER_SEEN 28 | #define PYCUDA_WRAP_HELPERS_HEADER_SEEN 29 | 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | 38 | namespace py = nanobind; 39 | 40 | 41 | #define ENUM_VALUE(NAME) \ 42 | value(#NAME, NAME) 43 | 44 | // {{{ DEF_SIMPLE_XXX 45 | 46 | #define DEF_SIMPLE_METHOD(NAME) \ 47 | def(#NAME, &cls::NAME) 48 | 49 | #define DEF_SIMPLE_STATIC_METHOD(NAME) \ 50 | def_static(#NAME, &cls::NAME) 51 | 52 | #define DEF_SIMPLE_METHOD_WITH_ARGS(NAME, ARGS) \ 53 | def(#NAME, &cls::NAME, boost::python::args ARGS) 54 | 55 | #define DEF_SIMPLE_FUNCTION(NAME) \ 56 | m.def(#NAME, &NAME) 57 | 58 | #define DEF_SIMPLE_FUNCTION_WITH_ARGS(NAME, ARGS) \ 59 | m.def(#NAME, &NAME, py::args ARGS) 60 | 61 | #define DEF_SIMPLE_RO_MEMBER(NAME) \ 62 | def_readonly(#NAME, &cls::m_##NAME) 63 | 64 | #define DEF_SIMPLE_RW_MEMBER(NAME) \ 65 | def_readwrite(#NAME, &cls::m_##NAME) 66 | 67 | // }}} 68 | 69 | // {{{ COPY_PY_XXX 70 | 71 | #define COPY_PY_LIST(TYPE, NAME) \ 72 | { \ 73 | for (auto it: py_##NAME) \ 74 | NAME.push_back(py::cast(it)); \ 75 | } 76 | 77 | #define COPY_PY_ARRAY(FUNC_NAME, TYPE, NAME, COUNTER) \ 78 | { \ 79 | COUNTER = 0; \ 80 | for (auto it: py_##NAME) \ 81 | { \ 82 | if (COUNTER == NAME.size()) \ 83 | throw error(FUNC_NAME, \ 84 | CL_INVALID_VALUE, "too many entries in " #NAME " argument"); \ 85 | NAME[COUNTER++] = py::cast(it); \ 86 | } \ 87 | } 88 | 89 | #define COPY_PY_COORD_TRIPLE(NAME) \ 90 | size_t NAME[3] = {0, 0, 0}; \ 91 | { \ 92 | py::sequence py_seq_##NAME = py::cast(py_##NAME); \ 93 | size_t my_len = len(py_seq_##NAME); \ 94 | if (my_len > 3) \ 95 | throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \ 96 | for (size_t i = 0; i < my_len; ++i) \ 97 | NAME[i] = py::cast(py_seq_##NAME[i]); \ 98 | } 99 | 100 | #define COPY_PY_PITCH_TUPLE(NAME) \ 101 | size_t NAME[2] = {0, 0}; \ 102 | if (py_##NAME.ptr() != Py_None) \ 103 | { \ 104 | py::sequence py_seq_##NAME = 
py::cast(py_##NAME); \ 105 | size_t my_len = len(py_seq_##NAME); \ 106 | if (my_len > 2) \ 107 | throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \ 108 | for (size_t i = 0; i < my_len; ++i) \ 109 | NAME[i] = py::cast(py_seq_##NAME[i]); \ 110 | } 111 | 112 | #define COPY_PY_REGION_TRIPLE(NAME) \ 113 | size_t NAME[3] = {1, 1, 1}; \ 114 | { \ 115 | py::sequence py_seq_##NAME = py::cast(py_##NAME); \ 116 | size_t my_len = len(py_seq_##NAME); \ 117 | if (my_len > 3) \ 118 | throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \ 119 | for (size_t i = 0; i < my_len; ++i) \ 120 | NAME[i] = py::cast(py_seq_##NAME[i]); \ 121 | } 122 | 123 | // }}} 124 | 125 | #define PYOPENCL_PARSE_NUMPY_ARRAY_SPEC \ 126 | PyArray_Descr *tp_descr; \ 127 | if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED) \ 128 | throw py::python_error(); \ 129 | \ 130 | std::vector shape; \ 131 | try \ 132 | { \ 133 | shape.push_back(py::cast(py_shape)); \ 134 | } \ 135 | catch (py::cast_error &) \ 136 | { \ 137 | COPY_PY_LIST(npy_intp, shape); \ 138 | } \ 139 | \ 140 | NPY_ORDER order = NPY_CORDER; \ 141 | PyArray_OrderConverter(py_order.ptr(), &order); \ 142 | \ 143 | int ary_flags = 0; \ 144 | if (order == NPY_FORTRANORDER) \ 145 | ary_flags |= NPY_ARRAY_FARRAY; \ 146 | else if (order == NPY_CORDER) \ 147 | ary_flags |= NPY_ARRAY_CARRAY; \ 148 | else \ 149 | throw std::runtime_error("unrecognized order specifier"); \ 150 | \ 151 | std::vector strides; \ 152 | if (py_strides.ptr() != Py_None) \ 153 | { \ 154 | COPY_PY_LIST(npy_intp, strides); \ 155 | } 156 | 157 | #define PYOPENCL_RETURN_VECTOR(ITEMTYPE, NAME) \ 158 | { \ 159 | py::list pyopencl_result; \ 160 | for (ITEMTYPE item: NAME) \ 161 | pyopencl_result.append(item); \ 162 | return pyopencl_result; \ 163 | } 164 | 165 | namespace 166 | { 167 | template 168 | inline py::object handle_from_new_ptr(T *ptr) 169 | { 170 | return py::cast(ptr, py::rv_policy::take_ownership); 171 | } 172 | 
173 | template 174 | inline T *from_int_ptr(intptr_t obj_ref, bool retain) 175 | { 176 | ClType clobj = (ClType) obj_ref; 177 | return new T(clobj, retain); 178 | } 179 | 180 | template 181 | inline intptr_t to_int_ptr(T const &obj) 182 | { 183 | return (intptr_t) obj.data(); 184 | } 185 | } 186 | 187 | #define PYOPENCL_EXPOSE_TO_FROM_INT_PTR(CL_TYPENAME) \ 188 | .def_static("from_int_ptr", from_int_ptr, \ 189 | py::arg("int_ptr_value"), \ 190 | py::arg("retain")=true, \ 191 | "(static method) Return a new Python object referencing the C-level " \ 192 | ":c:type:`" #CL_TYPENAME "` object at the location pointed to " \ 193 | "by *int_ptr_value*. The relevant ``clRetain*`` function " \ 194 | "will be called if *retain* is True." \ 195 | "If the previous owner of the object will *not* release the reference, " \ 196 | "*retain* should be set to *False*, to effectively transfer ownership to " \ 197 | ":mod:`pyopencl`." \ 198 | "\n\n.. versionadded:: 2013.2\n" \ 199 | "\n\n.. versionchanged:: 2016.1\n\n *retain* added.") \ 200 | .def_prop_ro("int_ptr", to_int_ptr, \ 201 | "Return an integer corresponding to the pointer value " \ 202 | "of the underlying :c:type:`" #CL_TYPENAME "`. " \ 203 | "Use :meth:`from_int_ptr` to turn back into a Python object." \ 204 | "\n\n.. 
versionadded:: 2013.2\n") \ 205 | 206 | #define PYOPENCL_EXPOSE_EQUALITY_TESTS \ 207 | /* this relies on nanobind overload resolution going in order of registration */ \ 208 | .def("__eq__", [](cls const &self, cls const &other) { return self == other; }) \ 209 | .def("__eq__", [](cls const &self, py::object obj) { return false; }, py::arg("obj").none()) 210 | 211 | 212 | #endif 213 | 214 | // vim: foldmethod=marker 215 | -------------------------------------------------------------------------------- /test/add-vectors-32.spv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inducer/pyopencl/b8b8d4d852e8a26356861ffda578874dc064e54c/test/add-vectors-32.spv -------------------------------------------------------------------------------- /test/add-vectors-64.spv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inducer/pyopencl/b8b8d4d852e8a26356861ffda578874dc064e54c/test/add-vectors-64.spv -------------------------------------------------------------------------------- /test/empty-header.h: -------------------------------------------------------------------------------- 1 | /* what did you expect? 
*/ 2 | -------------------------------------------------------------------------------- /test/test_arrays_in_structs.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (C) 2020 Sotiris Niarchos" 2 | 3 | __license__ = """ 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
"""

import numpy as np

import pyopencl as cl
import pyopencl.cltypes as cltypes
import pyopencl.tools as cl_tools
from pyopencl import mem_flags
from pyopencl.tools import (
    pytest_generate_tests_for_pyopencl as pytest_generate_tests,  # noqa: F401
)


def test_struct_with_array_fields(ctx_factory):
    """Sum every field of a struct with array members on the device.

    The per-struct sums computed by the kernel are compared with the same
    sums computed on the host with NumPy.
    """
    #
    # typedef struct {
    #     uint x[2];
    #     int y;
    #     uint z[3][4];
    # } my_struct;
    #
    cl_ctx = ctx_factory()
    device = cl_ctx.devices[0]
    queue = cl.CommandQueue(cl_ctx)

    my_struct = np.dtype([
        ("x", cltypes.uint, 2),
        ("y", cltypes.int),
        ("z", cltypes.uint, (3, 4))
    ])
    # Returns the dtype to use on the host together with the C declaration
    # that is pasted in front of the kernel source below.
    my_struct, cdecl = cl_tools.match_dtype_to_c_struct(
        device, "my_struct", my_struct
    )

    # a random buffer of 4 structs
    my_struct_arr = np.array([
        ([81, 24], -57, [[15, 28, 45, 7], [71, 95, 65, 84], [2, 11, 59, 9]]),
        ([5, 20], 47, [[15, 53, 7, 59], [73, 22, 27, 86], [59, 6, 39, 49]]),
        ([11, 99], -32, [[73, 83, 4, 65], [19, 21, 22, 27], [1, 55, 6, 64]]),
        ([57, 38], -54, [[74, 90, 38, 67], [77, 30, 99, 18], [91, 3, 63, 67]])
    ], dtype=my_struct)

    # Host-side reference: x[0] is the "x" array, x[1] the scalar "y" and
    # x[2] the "z" matrix of each record.
    expected_res = []
    for x in my_struct_arr:
        expected_res.append(int(np.sum(x[0]) + x[1] + np.sum(x[2])))
    expected_res = np.array(expected_res, dtype=cltypes.int)

    kernel_src = """%s
    // this kernel sums every number contained in each struct
    __kernel void array_structs(__global my_struct *structs, __global int *res) {
        int i = get_global_id(0);
        my_struct s = structs[i];
        res[i] = s.x[0] + s.x[1] + s.y;
        for (int r = 0; r < 3; r++)
            for (int c = 0; c < 4; c++)
                res[i] += s.z[r][c];
    }""" % cdecl

    mem_flags1 = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
    mem_flags2 = mem_flags.WRITE_ONLY

    my_struct_buf = cl.Buffer(cl_ctx, mem_flags1, hostbuf=my_struct_arr)
    res_buf = cl.Buffer(cl_ctx, mem_flags2, size=expected_res.nbytes)

    program = cl.Program(cl_ctx, kernel_src).build()
    kernel = program.array_structs
    # One work item per struct in the buffer.
    kernel(queue, (4,), None, my_struct_buf, res_buf)

    res = np.empty_like(expected_res)
    cl.enqueue_copy(queue, res, res_buf)

    assert (res == expected_res).all()


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        # Run the statement given on the command line instead of the suite.
        exec(sys.argv[1])
    else:
        from pytest import main
        main([__file__])

--------------------------------------------------------------------------------
/test/test_clrandom.py:
--------------------------------------------------------------------------------
__copyright__ = "Copyright (C) 2018 Matt Wala"

__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
21 | """ 22 | 23 | import numpy as np 24 | import pytest 25 | 26 | import pyopencl as cl 27 | import pyopencl.clrandom as clrandom 28 | import pyopencl.cltypes as cltypes 29 | from pyopencl.characterize import has_double_support 30 | from pyopencl.tools import ( 31 | pytest_generate_tests_for_pyopencl as pytest_generate_tests, # noqa: F401 32 | ) 33 | 34 | 35 | try: 36 | import faulthandler 37 | except ImportError: 38 | pass 39 | else: 40 | faulthandler.enable() 41 | 42 | 43 | @pytest.mark.parametrize("rng_class", [ 44 | clrandom.PhiloxGenerator, 45 | clrandom.ThreefryGenerator]) 46 | @pytest.mark.parametrize("dtype", [ 47 | np.int32, 48 | np.int64, 49 | np.float32, 50 | np.float64, 51 | cltypes.float2, # type: ignore[attr-defined] 52 | cltypes.float3, # type: ignore[attr-defined] 53 | cltypes.float4, # type: ignore[attr-defined] 54 | ]) 55 | def test_clrandom_dtypes(ctx_factory, rng_class, dtype): 56 | cl_ctx = ctx_factory() 57 | if dtype == np.float64 and not has_double_support(cl_ctx.devices[0]): 58 | pytest.skip("double precision not supported on this device") 59 | rng = rng_class(cl_ctx) 60 | 61 | size = 10 62 | 63 | with cl.CommandQueue(cl_ctx) as queue: 64 | rng.uniform(queue, size, dtype) 65 | 66 | if dtype not in (np.int32, np.int64): 67 | rng.normal(queue, size, dtype) 68 | 69 | 70 | if __name__ == "__main__": 71 | import sys 72 | if len(sys.argv) > 1: 73 | exec(sys.argv[1]) 74 | else: 75 | from pytest import main 76 | main([__file__]) 77 | --------------------------------------------------------------------------------