├── .gitignore ├── LICENSE ├── README.md └── experiments └── local_pyopencl ├── demo_float.cl ├── demo_float4.cl ├── ex.py ├── ex2.py └── ex3.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 shazz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # coral-dev-opencl 2 | 3 | My experiments on using OpenCL on the Google Coral Dev board GPU associated to ML models on the TPU 4 | 5 | ### Prerequisites 6 | 7 | - Linux computer (referred to below as "host") with python3 and pip3 8 | - USB-A to USB-micro-B cable (to connect your PC to the board's serial port) 9 | - USB-A to USB-C cable (to connect your PC to the board's data port) 10 | - 2-3A (5V) USB Type-C power supply 11 | - Ethernet cable or Wi-Fi connection 12 | 13 | ### Coral Setup 14 | 15 | #### Prepare the host: 16 | 17 | - Install a serial terminal 18 | ```` 19 | sudo apt-get install screen 20 | ```` 21 | 22 | - Get `fastboot` from the Android tools from `https://developer.android.com/studio/releases/platform-tools#downloads 23 | ```` 24 | mkdir -p ~/.local/bin 25 | sudo mv ~/Downloads/platform-tools/fastboot ~/.local/bin/ 26 | ```` 27 | 28 | - Get MDT (Mendel Dev Tools) 29 | ```` 30 | pip3 install --user mendel-development-tool 31 | ```` 32 | 33 | - Sometimes needed for the serial console (not on my rpi) 34 | ```` 35 | sudo sh -c "echo 'SUBSYSTEM==\"usb\", ATTR{idVendor}==\"0525\", MODE=\"0664\", \ 36 | GROUP=\"plugdev\", TAG+=\"uaccess\"' >> /etc/udev/rules.d/65-edgetpu-board.rules" 37 | 38 | sudo udevadm control --reload-rules && sudo udevadm trigger 39 | ```` 40 | 41 | #### Connect thru USB OTG 42 | 43 | ```` 44 | mdt devices 45 | mdt shell 46 | ```` 47 | 48 | #### Download and install Mendel distribution 49 | 50 | ```` 51 | cd ~/Downloads 52 | 53 | curl -O 
https://mendel-linux.org/images/enterprise/eagle/enterprise-eagle-20200724205123.zip 54 | 55 | unzip enterprise-eagle-20200724205123.zip \ 56 | && cd enterprise-eagle-20200724205123 57 | 58 | bash flash.sh 59 | ```` 60 | 61 | #### Setup Wifi connection 62 | 63 | ```` 64 | nmtui 65 | nmcli connection show 66 | ```` 67 | 68 | #### Set SD Card as a drive 69 | 70 | - To create a file swap REQUIRED to compile pyopencl 71 | ```` 72 | sudo umount /dev/mmcblk1 73 | sudo mkdir /mnt/SD 74 | sudo mkfs -t ext4 /dev/mmcblk1 75 | sudo mount -t ext4 /dev/mmcblk1 /mnt/SD 76 | 77 | sudo apt-get install exuberant-ctags 78 | sudo apt-get install dphys-swapfile 79 | 80 | sudo dphys-swapfile setup 81 | sudo chmod 0600 /mnt/SD/swap 82 | sudo dphys-swapfile swapon 83 | watch -n1 free 84 | ```` 85 | 86 | - To create a storage drive (non-journalized) 87 | ```` 88 | sudo umount /dev/mmcblk1 89 | sudo mkdir /mnt/SD 90 | sudo fdisk -l 91 | sudo mkfs -t ext2 /dev/mmcblk1 92 | sudo mount -t ext2 /dev/mmcblk1 /mnt/SD 93 | sudo nano /etc/fstab 94 | ```` 95 | 96 | - Add: `/dev/mmcblk1 /mnt/SD ext2 defaults 0 3` 97 | 98 | #### Set SSH keys 99 | 100 | I did not find a way to revert Mendel sshd configuration to accept user/passwords and not only public keys certificates. 
So, once `mdt shell` works and you are still connected through the USB data port, type the following from your **host**:
https://dl.google.com/coral/python/tflite_runtime-2.1.0.post1-cp37-cp37m-linux_aarch64.whl 162 | ```` 163 | 164 | ##### Demo Streaming 165 | 166 | `edgetpu_demo --stream` 167 | 168 | ##### Demo classification 169 | 170 | - Try tflite demo: 171 | ```` 172 | sudo apt-get install git 173 | mkdir coral && cd coral 174 | git clone https://github.com/google-coral/tflite.git 175 | cd tflite/python/examples/classification 176 | bash install_requirements.sh 177 | python3 classify_image.py --model models/mobilenet_v2_1.0_224_inat_bird_quant_edgetpu.tflite --labels models/inat_bird_labels.txt --input images/parrot.jpg 178 | ```` 179 | 180 | - Try EdgeTPU Python API: 181 | 182 | ```` 183 | sudo apt-get install edgetpu-examples 184 | cd /usr/share/edgetpu/examples/ 185 | 186 | python3 classify_image.py --model models/mobilenet_v2_1.0_224_inat_bird_quant_edgetpu.tflite --label models/inat_bird_labels.txt --image images/parrot.jpg 187 | python3 object_detection.py --model models/ssd_mobilenet_v2_coco_quant_postprocess_edgetpu.tflite --label models/coco_labels.txt --input images/grace_hopper.bmp --output ${HOME}/object_detection_results.jpg 188 | python3 object_detection.py --model models/mobilenet_ssd_v2_face_quant_postprocess_edgetpu.tflite --input images/grace_hopper.bmp --output ${HOME}/face_detection_results.jpg 189 | python3 examples/semantic_segmentation.py --model models/deeplabv3_mnv2_pascal_quant_edgetpu.tflite --input models/bird.bmp --keep_aspect_ratio --output ${HOME}/segmentation_result.jpg 190 | ```` 191 | 192 | ### Setup OpenCL 193 | 194 | ```` 195 | sudo apt-get install clinfo opencl-c-headers opencl-clhpp-headers 196 | sudo apt-get install ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev 197 | clinfo 198 | ```` 199 | 200 | - Patch the opencl headers (not the best way but I did not find another) by adding at the beginning: 201 | ```` 202 | sudo nano /usr/include/CL/cl_version.h 203 | ```` 204 | 205 | ```` 206 | /* patch! 
*/ 207 | #define CL_TARGET_OPENCL_VERSION 120 208 | ```` 209 | 210 | - Set the SD card swap! 211 | ```` 212 | sudo dphys-swapfile swapon 213 | ```` 214 | 215 | - Install the libraries 216 | ```` 217 | sudo apt-get install python3-dev libatlas-base-dev 218 | pip3 install rpyc numpy==1.19.0 mako pybind11 219 | pip3 install pyopencl 220 | sudo dphys-swapfile swapoff 221 | ```` 222 | 223 | - If doesnt work: 224 | ```` 225 | git clone https://github.com/inducer/pyopencl.git 226 | cd pyopencl 227 | python configure.py --cl-pretend-version=1.2 228 | make 229 | #sudo make install 230 | rm -Rf build 231 | pip3 install . 232 | ```` 233 | 234 | ### Cluster performance 235 | 236 | #### Network 237 | 238 | Check the bandwidth between each node: 239 | 240 | - Create a iperf server on one node, `opencl1` for example 241 | ``` 242 | sudo apt-get install iperf 243 | iperf -s 244 | ``` 245 | 246 | - Create an iperf client on each other node: 247 | ``` 248 | sudo apt-get install iperf 249 | iperf -c opencl1 250 | ``` 251 | 252 | On my cluster, as expected I got arounf 94.4Mb/s to a raspberry pi 3B and 932Mb/s to the Coral Dev board (from a laptop of course) 253 | 254 | #### OpenCL 255 | 256 | Reminder, the openCL Coral implementation is for the Vivante GC7000Lite GPU, **NOT FOR THE TPU**. 
257 | Notes: 258 | - GC7000Lite local memory is only 32KB, so a scratchpad 259 | 260 | clpeak: 261 | ```` 262 | Platform: Vivante OpenCL Platform 263 | Device: Vivante OpenCL Device GC7000L.6214.0000 264 | Driver version : OpenCL 1.2 V6.4.2.256507 (Linux ARM64) 265 | Compute units : 1 266 | Clock frequency : 800 MHz 267 | 268 | Global memory bandwidth (GBPS) 269 | float : 3.38 270 | float2 : 4.88 271 | float4 : 4.98 272 | float8 : 4.71 273 | float16 : 3.57 274 | 275 | Single-precision compute (GFLOPS) 276 | float : 4.65 277 | float2 : 10.18 278 | float4 : 21.27 279 | float8 : 22.21 280 | float16 : 24.86 281 | 282 | Half-precision compute (GFLOPS) 283 | half : 4.65 284 | half2 : 10.18 285 | half4 : 21.27 286 | half8 : 22.20 287 | half16 : 24.85 288 | 289 | No double precision support! Skipped 290 | 291 | Integer compute (GIOPS) 292 | int : 5.67 293 | int2 : 5.99 294 | int4 : 6.37 295 | int8 : 6.34 296 | int16 : 6.32 297 | 298 | Integer compute Fast 24bit (GIOPS) 299 | int : 5.67 300 | int2 : 5.99 301 | int4 : 6.37 302 | int8 : 6.34 303 | int16 : 6.32 304 | 305 | Transfer bandwidth (GBPS) 306 | enqueueWriteBuffer : 1.97 307 | enqueueReadBuffer : 0.11 308 | enqueueWriteBuffer non-blocking : 2.05 309 | enqueueReadBuffer non-blocking : 0.12 310 | enqueueMapBuffer(for read) : 99.55 311 | memcpy from mapped ptr : 0.12 312 | enqueueUnmap(after write) : 103.91 313 | memcpy to mapped ptr : 2.03 314 | 315 | Kernel launch latency : 206.17 us 316 | ```` 317 | 318 | To compare with the Raspberry Pi 3 Videocore IV performance: 319 | 320 | ```` 321 | Platform: OpenCL for the Raspberry Pi VideoCore IV GPU 322 | Device: VideoCore IV GPU 323 | Driver version : 0.4.9999 (Linux ARM) 324 | Compute units : 1 325 | Clock frequency : 300 MHz 326 | 327 | Global memory bandwidth (GBPS) 328 | clCreateBuffer (-5) 329 | Tests skipped 330 | 331 | Single-precision compute (GFLOPS) 332 | float : 0.60 333 | float2 : 1.13 334 | float4 : 2.00 335 | float8 : 3.31 336 | float16 : 4.60 337 | 338 | No 
half precision support! Skipped 339 | 340 | No double precision support! Skipped 341 | 342 | Integer compute (GIOPS) 343 | int : 0.16 344 | int2 : 0.30 345 | int4 : 0.60 346 | int8 : 0.77 347 | int16 : 1.25 348 | 349 | Integer compute Fast 24bit (GIOPS) 350 | int : 0.57 351 | int2 : 1.02 352 | int4 : 1.73 353 | int8 : 2.51 354 | int16 : 3.27 355 | 356 | Transfer bandwidth (GBPS) 357 | enqueueWriteBuffer : 1.22 358 | enqueueReadBuffer : 0.25 359 | enqueueWriteBuffer non-blocking : 1.22 360 | enqueueReadBuffer non-blocking : 0.25 361 | enqueueMapBuffer(for read) : 1838.60 362 | memcpy from mapped ptr : 0.24 363 | enqueueUnmap(after write) : 2191.31 364 | memcpy to mapped ptr : 1.22 365 | 366 | Kernel launch latency : 30.27 us 367 | ```` 368 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/demo_float.cl: -------------------------------------------------------------------------------- 1 | __kernel void sum(__global float* a_g, __global const float* b_g, __global float* res_g) 2 | { 3 | int gid = get_global_id(0); 4 | res_g[gid] = a_g[gid] + b_g[gid]; 5 | } 6 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/demo_float4.cl: -------------------------------------------------------------------------------- 1 | __kernel void sum(__global float4* a_g, __global const float4* b_g, __global float4* res_g) 2 | { 3 | int gid = get_global_id(0); 4 | res_g[gid] = a_g[gid] + b_g[gid]; 5 | } 6 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/ex.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import numpy as np 3 | import pyopencl as cl 4 | 5 | SIZE = 1000000 6 | 7 | a_np = np.random.rand(SIZE).astype(np.float32) 8 | b_np = np.random.rand(SIZE).astype(np.float32) 9 | 10 | ctx = cl.create_some_context() 11 
| queue = cl.CommandQueue(ctx) 12 | 13 | mf = cl.mem_flags 14 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 15 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 16 | 17 | prg = cl.Program(ctx, """ 18 | __kernel void sum( 19 | __global const float *a_g, __global const float *b_g, __global float *res_g) 20 | { 21 | int gid = get_global_id(0); 22 | res_g[gid] = a_g[gid] + b_g[gid]; 23 | } 24 | """).build() 25 | 26 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 27 | prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) 28 | 29 | res_np = np.empty_like(a_np) 30 | cl.enqueue_copy(queue, res_np, res_g) 31 | 32 | # Check on CPU with Numpy: 33 | print(a_np) 34 | print(b_np) 35 | print(res_np) 36 | print(a_np + b_np) 37 | print(res_np - (a_np + b_np)) 38 | print(np.linalg.norm(res_np - (a_np + b_np))) 39 | assert np.allclose(res_np, a_np + b_np) 40 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/ex2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyopencl as cl 3 | import time 4 | 5 | vector_size = 10000000 6 | vector_type = np.float32 7 | 8 | print("Generate {} random numbers of type {}".format(vector_size, str(vector_type))) 9 | a_np = np.random.rand(vector_size).astype(vector_type) 10 | b_np = np.random.rand(vector_size).astype(vector_type) 11 | res_np = np.empty_like(a_np).astype(vector_type) 12 | print("Buffer types: {}, {}, {} of size: {} bytes".format(a_np.dtype, b_np.dtype, res_np.dtype, a_np.nbytes)) 13 | 14 | print("---------------------------------------------------------------------------") 15 | 16 | # get platforms for each cluster 17 | print("Available platforms on the Clusters:") 18 | 19 | platforms = cl.get_platforms() 20 | 21 | for index, platform in enumerate(platforms): 22 | print("{}\tPlatform: {}".format(index, platform)) 23 | print("\tName: {}".format(platform.name)) 24 | 
print("\tProfile: {}".format(platform.profile)) 25 | print("\tVendor: {}".format(platform.vendor)) 26 | print("\tVersion: {}".format(platform.version)) 27 | 28 | print("---------------------------------------------------------------------------") 29 | 30 | # create openCL context on platform rpi1, first device 31 | print("Getting node for platform") 32 | 33 | device_nb = 0 34 | print("Create OpenCL context on device {}".format(device_nb)) 35 | ctx = cl.Context(dev_type=cl.device_type.ALL, properties=[(cl.context_properties.PLATFORM, platforms[device_nb])]) 36 | device = ctx.devices[0] 37 | float_vector_size = device.preferred_vector_width_float 38 | 39 | print("Device {} properties:".format(device)) 40 | print("\tPrefered float vector size: {}".format(float_vector_size)) 41 | print("\tVersion: {}".format(device.version)) 42 | print("\tVendor: {}".format(device.vendor_id)) 43 | print("\tProfile: {}".format(device.profile)) 44 | print("\topencl_c_version: {}".format(device.opencl_c_version)) 45 | print("\tmax_compute_units: {}".format(device.max_compute_units)) 46 | print("\tmax_clock_frequency: {}".format(device.max_clock_frequency)) 47 | print("\tlocal_mem_size: {}".format(device.local_mem_size)) 48 | print("\tglobal_mem_size: {}".format(device.global_mem_size)) 49 | print("\textensions: {}".format(device.extensions)) 50 | 51 | print("Create OpenCL queue") 52 | queue = cl.CommandQueue(ctx) 53 | 54 | print("Copy data to device buffers") 55 | mf = cl.mem_flags 56 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 57 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 58 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 59 | 60 | print("Reading kernel file: demo_float.cl") 61 | with open("demo_float.cl", "r") as f_kernel: 62 | kernel = f_kernel.read() 63 | 64 | print("Compiling kernel") 65 | prg = cl.Program(ctx, kernel).build() 66 | 67 | print("Executing computation") 68 | t0 = time.perf_counter_ns() 69 | #prg.sum(queue, 
a_np.shape, None, a_g, b_g, res_g) 70 | knl = prg.sum 71 | knl.set_args(a_g, b_g, res_g) 72 | local_work_size = None 73 | #local_work_size = (10,) 74 | 75 | t1 = time.perf_counter_ns() 76 | ev = cl.enqueue_nd_range_kernel(queue=queue, kernel=knl, global_work_size=(vector_size,), local_work_size=local_work_size) 77 | t0_enqueue = time.perf_counter_ns() 78 | ev.wait() 79 | t2 = time.perf_counter_ns() 80 | 81 | t3 = time.perf_counter_ns() 82 | cl.enqueue_copy(queue, res_np, res_g) 83 | t4 = time.perf_counter_ns() 84 | 85 | # Check on CPU with Numpy: 86 | print("Computing on the host using numpy") 87 | t5 = time.perf_counter_ns() 88 | res_local = a_np + b_np 89 | t6 = time.perf_counter_ns() 90 | print("Local type:", res_local.dtype) 91 | 92 | print("---------------------------------------------------------------------------") 93 | print("Comparing results") 94 | print("Difference : {}".format(res_np - res_local)) 95 | print("A : {}".format(a_np)) 96 | print("B : {}".format(b_np)) 97 | print("Result OpenCL: {}".format(res_np)) 98 | print("Result Numpy : {}".format(res_local)) 99 | 100 | print("Checking the norm between both: {}".format(np.linalg.norm(res_np - res_local))) 101 | print("Checking results are mostly the same: ", np.allclose(res_np, res_local)) 102 | 103 | print("---------------------------------------------------------------------------") 104 | print("Time to compute using opencl: {} ms".format((t4-t0)/1000000)) 105 | print("Time to compute using numpy: {} ms".format((t6-t5)/1000000)) 106 | 107 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/ex3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyopencl as cl 3 | import time 4 | 5 | # generate data on the cluster host 6 | 7 | # OpenCL float16 is a vector of 16 float32, NOT a numpy float16 (half) 8 | vector_size = 1000000 9 | vector_type = np.float32 10 | 11 | print("Generate {} 
random numbers of type {}".format(vector_size, str(vector_type))) 12 | a_np = np.random.rand(vector_size).astype(vector_type) 13 | b_np = np.random.rand(vector_size).astype(vector_type) 14 | res_np = np.empty_like(a_np).astype(vector_type) 15 | print("Buffer types: {}, {}, {} of size: {} bytes".format(a_np.dtype, b_np.dtype, res_np.dtype, a_np.nbytes)) 16 | 17 | print("---------------------------------------------------------------------------") 18 | 19 | # get platforms for each cluster 20 | print("Available platforms on the Clusters:") 21 | 22 | platforms = cl.get_platforms() 23 | 24 | for index, platform in enumerate(platforms): 25 | print("{}\tPlatform: {}".format(index, platform)) 26 | print("\tName: {}".format(platform.name)) 27 | print("\tProfile: {}".format(platform.profile)) 28 | print("\tVendor: {}".format(platform.vendor)) 29 | print("\tVersion: {}".format(platform.version)) 30 | 31 | print("---------------------------------------------------------------------------") 32 | 33 | # create openCL context on platform rpi1, first device 34 | print("Getting node for platform") 35 | 36 | device_nb = 0 37 | print("Create OpenCL context on device {}".format(device_nb)) 38 | ctx = cl.Context(dev_type=cl.device_type.ALL, properties=[(cl.context_properties.PLATFORM, platforms[device_nb])]) 39 | device = ctx.devices[0] 40 | float_vector_size = device.preferred_vector_width_float 41 | 42 | print("Device {} properties:".format(device)) 43 | print("\tPrefered float vector size: {}".format(float_vector_size)) 44 | print("\tVersion: {}".format(device.version)) 45 | print("\tVendor: {}".format(device.vendor_id)) 46 | print("\tProfile: {}".format(device.profile)) 47 | print("\topencl_c_version: {}".format(device.opencl_c_version)) 48 | print("\tmax_compute_units: {}".format(device.max_compute_units)) 49 | print("\tmax_clock_frequency: {}".format(device.max_clock_frequency)) 50 | print("\tlocal_mem_size: {}".format(device.local_mem_size)) 51 | print("\tglobal_mem_size: 
{}".format(device.global_mem_size)) 52 | print("\textensions: {}".format(device.extensions)) 53 | 54 | print("Create OpenCL queue") 55 | queue = cl.CommandQueue(ctx) 56 | 57 | print("Copy data to device buffers") 58 | mf = cl.mem_flags 59 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 60 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 61 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 62 | 63 | print("Reading kernel file: demo_float{}.cl".format(float_vector_size)) 64 | with open("demo_float{}.cl".format(float_vector_size), "r") as f_kernel: 65 | kernel = f_kernel.read() 66 | 67 | print("Compiling kernel") 68 | prg = cl.Program(ctx, kernel).build() 69 | 70 | print("Executing computation") 71 | #prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) 72 | knl = prg.sum 73 | knl.set_args(a_g, b_g, res_g) 74 | local_work_size = None 75 | #local_work_size = (10,) 76 | 77 | t0 = time.perf_counter_ns() 78 | ev = cl.enqueue_nd_range_kernel(queue=queue, kernel=knl, global_work_size=(vector_size//float_vector_size,), local_work_size=local_work_size) 79 | t0_enqueue = time.perf_counter_ns() 80 | ev.wait() 81 | t1 = time.perf_counter_ns() 82 | 83 | print("Transferring result to host") 84 | t2 = time.perf_counter_ns() 85 | cl.enqueue_copy(queue, res_np, res_g) 86 | t3 = time.perf_counter_ns() 87 | 88 | # Check on CPU with Numpy: 89 | print("Computing on the host using numpy") 90 | t4 = time.perf_counter_ns() 91 | res_local = a_np + b_np 92 | t5 = time.perf_counter_ns() 93 | print("Local type:", res_local.dtype) 94 | 95 | print("---------------------------------------------------------------------------") 96 | print("Comparing results") 97 | print("Difference : {}".format(res_np - res_local)) 98 | print("A : {}".format(a_np)) 99 | print("B : {}".format(b_np)) 100 | print("Result OpenCL: {}".format(res_np)) 101 | print("Result Numpy : {}".format(res_local)) 102 | 103 | print("Checking the norm between both: 
{}".format(np.linalg.norm(res_np - res_local))) 104 | print("Checking results are mostly the same: ", np.allclose(res_np, res_local)) 105 | 106 | print("---------------------------------------------------------------------------") 107 | print("Time to compute using numpy: {} ms".format((t5-t4)/1000000)) 108 | 109 | --------------------------------------------------------------------------------