├── .gitignore ├── LICENSE ├── README.md └── experiments └── local_pyopencl ├── demo_float.cl ├── demo_float4.cl ├── ex.py ├── ex2.py └── ex3.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 shazz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # coral-dev-opencl 2 | 3 | My experiments on using OpenCL on the Google Coral Dev board GPU associated to ML models on the TPU 4 | 5 | ### Prerequisites 6 | 7 | - Linux computer (referred to below as "host") with python3 and pip3 8 | - USB-A to USB-micro-B cable (to connect your PC to the board's serial port) 9 | - USB-A to USB-C cable (to connect your PC to the board's data port) 10 | - 2-3A (5V) USB Type-C power supply 11 | - Ethernet cable or Wi-Fi connection 12 | 13 | ### Coral Setup 14 | 15 | #### Prepare the host: 16 | 17 | - Install a serial terminal 18 | ```` 19 | sudo apt-get install screen 20 | ```` 21 | 22 | - Get `fastboot` from the Android tools from `https://developer.android.com/studio/releases/platform-tools#downloads 23 | ```` 24 | mkdir -p ~/.local/bin 25 | sudo mv ~/Downloads/platform-tools/fastboot ~/.local/bin/ 26 | ```` 27 | 28 | - Get MDT (Mendel Dev Tools) 29 | ```` 30 | pip3 install --user mendel-development-tool 31 | ```` 32 | 33 | - Sometimes needed for the serial console (not on my rpi) 34 | ```` 35 | sudo sh -c "echo 'SUBSYSTEM==\"usb\", ATTR{idVendor}==\"0525\", MODE=\"0664\", \ 36 | GROUP=\"plugdev\", TAG+=\"uaccess\"' >> /etc/udev/rules.d/65-edgetpu-board.rules" 37 | 38 | sudo udevadm control --reload-rules && sudo udevadm trigger 39 | ```` 40 | 41 | #### Connect thru USB OTG 42 | 43 | ```` 44 | mdt devices 45 | mdt shell 46 | ```` 47 | 48 | #### Download and install Mendel distribution 49 | 50 | ```` 51 | cd ~/Downloads 52 | 53 | curl -O 
https://mendel-linux.org/images/enterprise/eagle/enterprise-eagle-20200724205123.zip 54 | 55 | unzip enterprise-eagle-20200724205123.zip \ 56 | && cd enterprise-eagle-20200724205123 57 | 58 | bash flash.sh 59 | ```` 60 | 61 | #### Setup Wifi connection 62 | 63 | ```` 64 | nmtui 65 | nmcli connection show 66 | ```` 67 | 68 | #### Set SD Card as a drive 69 | 70 | - To create a file swap REQUIRED to compile pyopencl 71 | ```` 72 | sudo umount /dev/mmcblk1 73 | sudo mkdir /mnt/SD 74 | sudo mkfs -t ext4 /dev/mmcblk1 75 | sudo mount -t ext4 /dev/mmcblk1 /mnt/SD 76 | 77 | sudo apt-get install exuberant-ctags 78 | sudo apt-get install dphys-swapfile 79 | 80 | sudo dphys-swapfile setup 81 | sudo chmod 0600 /mnt/SD/swap 82 | sudo dphys-swapfile swapon 83 | watch -n1 free 84 | ```` 85 | 86 | - To create a storage drive (non-journalized) 87 | ```` 88 | sudo umount /dev/mmcblk1 89 | sudo mkdir /mnt/SD 90 | sudo fdisk -l 91 | sudo mkfs -t ext2 /dev/mmcblk1 92 | sudo mount -t ext2 /dev/mmcblk1 /mnt/SD 93 | sudo nano /etc/fstab 94 | ```` 95 | 96 | - Add: `/dev/mmcblk1 /mnt/SD ext2 defaults 0 3` 97 | 98 | #### Set SSH keys 99 | 100 | I did not find a way to revert Mendel sshd configuration to accept user/passwords and not only public keys certificates. 
So, once `mdt shell` works and you are still connected through the USB data port, type the following from your **host**:
https://dl.google.com/coral/python/tflite_runtime-2.1.0.post1-cp37-cp37m-linux_aarch64.whl 162 | ```` 163 | 164 | ##### Demo Streaming 165 | 166 | `edgetpu_demo --stream` 167 | 168 | ##### Demo classification 169 | 170 | - Try tflite demo: 171 | ```` 172 | sudo apt-get install git 173 | mkdir coral && cd coral 174 | git clone https://github.com/google-coral/tflite.git 175 | cd tflite/python/examples/classification 176 | bash install_requirements.sh 177 | python3 classify_image.py --model models/mobilenet_v2_1.0_224_inat_bird_quant_edgetpu.tflite --labels models/inat_bird_labels.txt --input images/parrot.jpg 178 | ```` 179 | 180 | - Try EdgeTPU Python API: 181 | 182 | ```` 183 | sudo apt-get install edgetpu-examples 184 | cd /usr/share/edgetpu/examples/ 185 | 186 | python3 classify_image.py --model models/mobilenet_v2_1.0_224_inat_bird_quant_edgetpu.tflite --label models/inat_bird_labels.txt --image images/parrot.jpg 187 | python3 object_detection.py --model models/ssd_mobilenet_v2_coco_quant_postprocess_edgetpu.tflite --label models/coco_labels.txt --input images/grace_hopper.bmp --output ${HOME}/object_detection_results.jpg 188 | python3 object_detection.py --model models/mobilenet_ssd_v2_face_quant_postprocess_edgetpu.tflite --input images/grace_hopper.bmp --output ${HOME}/face_detection_results.jpg 189 | python3 examples/semantic_segmentation.py --model models/deeplabv3_mnv2_pascal_quant_edgetpu.tflite --input models/bird.bmp --keep_aspect_ratio --output ${HOME}/segmentation_result.jpg 190 | ```` 191 | 192 | ### Setup OpenCL 193 | 194 | ```` 195 | sudo apt-get install clinfo opencl-c-headers opencl-clhpp-headers 196 | sudo apt-get install ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev 197 | clinfo 198 | ```` 199 | 200 | - Patch the opencl headers (not the best way but I did not find another) by adding at the beginning: 201 | ```` 202 | sudo nano /usr/include/CL/cl_version.h 203 | ```` 204 | 205 | ```` 206 | /* patch! 
*/ 207 | #define CL_TARGET_OPENCL_VERSION 120 208 | ```` 209 | 210 | - Set the SD card swap! 211 | ```` 212 | sudo dphys-swapfile swapon 213 | ```` 214 | 215 | - Install the libraries 216 | ```` 217 | sudo apt-get install python3-dev libatlas-base-dev 218 | pip3 install rpyc numpy==1.19.0 mako pybind11 219 | pip3 install pyopencl 220 | sudo dphys-swapfile swapoff 221 | ```` 222 | 223 | - If doesnt work: 224 | ```` 225 | git clone https://github.com/inducer/pyopencl.git 226 | cd pyopencl 227 | python configure.py --cl-pretend-version=1.2 228 | make 229 | #sudo make install 230 | rm -Rf build 231 | pip3 install . 232 | ```` 233 | 234 | ### Cluster performance 235 | 236 | #### Network 237 | 238 | Check the bandwidth between each node: 239 | 240 | - Create a iperf server on one node, `opencl1` for example 241 | ``` 242 | sudo apt-get install iperf 243 | iperf -s 244 | ``` 245 | 246 | - Create an iperf client on each other node: 247 | ``` 248 | sudo apt-get install iperf 249 | iperf -c opencl1 250 | ``` 251 | 252 | On my cluster, as expected I got arounf 94.4Mb/s to a raspberry pi 3B and 932Mb/s to the Coral Dev board (from a laptop of course) 253 | 254 | #### OpenCL 255 | 256 | Reminder, the openCL Coral implementation is for the Vivante GC7000Lite GPU, **NOT FOR THE TPU**. 
257 | Notes: 258 | - GC7000Lite local memory is only 32KB, so a scratchpad 259 | 260 | clpeak: 261 | ```` 262 | Platform: Vivante OpenCL Platform 263 | Device: Vivante OpenCL Device GC7000L.6214.0000 264 | Driver version : OpenCL 1.2 V6.4.2.256507 (Linux ARM64) 265 | Compute units : 1 266 | Clock frequency : 800 MHz 267 | 268 | Global memory bandwidth (GBPS) 269 | float : 3.38 270 | float2 : 4.88 271 | float4 : 4.98 272 | float8 : 4.71 273 | float16 : 3.57 274 | 275 | Single-precision compute (GFLOPS) 276 | float : 4.65 277 | float2 : 10.18 278 | float4 : 21.27 279 | float8 : 22.21 280 | float16 : 24.86 281 | 282 | Half-precision compute (GFLOPS) 283 | half : 4.65 284 | half2 : 10.18 285 | half4 : 21.27 286 | half8 : 22.20 287 | half16 : 24.85 288 | 289 | No double precision support! Skipped 290 | 291 | Integer compute (GIOPS) 292 | int : 5.67 293 | int2 : 5.99 294 | int4 : 6.37 295 | int8 : 6.34 296 | int16 : 6.32 297 | 298 | Integer compute Fast 24bit (GIOPS) 299 | int : 5.67 300 | int2 : 5.99 301 | int4 : 6.37 302 | int8 : 6.34 303 | int16 : 6.32 304 | 305 | Transfer bandwidth (GBPS) 306 | enqueueWriteBuffer : 1.97 307 | enqueueReadBuffer : 0.11 308 | enqueueWriteBuffer non-blocking : 2.05 309 | enqueueReadBuffer non-blocking : 0.12 310 | enqueueMapBuffer(for read) : 99.55 311 | memcpy from mapped ptr : 0.12 312 | enqueueUnmap(after write) : 103.91 313 | memcpy to mapped ptr : 2.03 314 | 315 | Kernel launch latency : 206.17 us 316 | ```` 317 | 318 | To compare with the Raspberry Pi 3 Videocore IV performance: 319 | 320 | ```` 321 | Platform: OpenCL for the Raspberry Pi VideoCore IV GPU 322 | Device: VideoCore IV GPU 323 | Driver version : 0.4.9999 (Linux ARM) 324 | Compute units : 1 325 | Clock frequency : 300 MHz 326 | 327 | Global memory bandwidth (GBPS) 328 | clCreateBuffer (-5) 329 | Tests skipped 330 | 331 | Single-precision compute (GFLOPS) 332 | float : 0.60 333 | float2 : 1.13 334 | float4 : 2.00 335 | float8 : 3.31 336 | float16 : 4.60 337 | 338 | No 
half precision support! Skipped 339 | 340 | No double precision support! Skipped 341 | 342 | Integer compute (GIOPS) 343 | int : 0.16 344 | int2 : 0.30 345 | int4 : 0.60 346 | int8 : 0.77 347 | int16 : 1.25 348 | 349 | Integer compute Fast 24bit (GIOPS) 350 | int : 0.57 351 | int2 : 1.02 352 | int4 : 1.73 353 | int8 : 2.51 354 | int16 : 3.27 355 | 356 | Transfer bandwidth (GBPS) 357 | enqueueWriteBuffer : 1.22 358 | enqueueReadBuffer : 0.25 359 | enqueueWriteBuffer non-blocking : 1.22 360 | enqueueReadBuffer non-blocking : 0.25 361 | enqueueMapBuffer(for read) : 1838.60 362 | memcpy from mapped ptr : 0.24 363 | enqueueUnmap(after write) : 2191.31 364 | memcpy to mapped ptr : 1.22 365 | 366 | Kernel launch latency : 30.27 us 367 | ```` 368 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/demo_float.cl: -------------------------------------------------------------------------------- 1 | __kernel void sum(__global float* a_g, __global const float* b_g, __global float* res_g) 2 | { 3 | int gid = get_global_id(0); 4 | res_g[gid] = a_g[gid] + b_g[gid]; 5 | } 6 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/demo_float4.cl: -------------------------------------------------------------------------------- 1 | __kernel void sum(__global float4* a_g, __global const float4* b_g, __global float4* res_g) 2 | { 3 | int gid = get_global_id(0); 4 | res_g[gid] = a_g[gid] + b_g[gid]; 5 | } 6 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/ex.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import numpy as np 3 | import pyopencl as cl 4 | 5 | SIZE = 1000000 6 | 7 | a_np = np.random.rand(SIZE).astype(np.float32) 8 | b_np = np.random.rand(SIZE).astype(np.float32) 9 | 10 | ctx = cl.create_some_context() 11 
| queue = cl.CommandQueue(ctx) 12 | 13 | mf = cl.mem_flags 14 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 15 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 16 | 17 | prg = cl.Program(ctx, """ 18 | __kernel void sum( 19 | __global const float *a_g, __global const float *b_g, __global float *res_g) 20 | { 21 | int gid = get_global_id(0); 22 | res_g[gid] = a_g[gid] + b_g[gid]; 23 | } 24 | """).build() 25 | 26 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 27 | prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) 28 | 29 | res_np = np.empty_like(a_np) 30 | cl.enqueue_copy(queue, res_np, res_g) 31 | 32 | # Check on CPU with Numpy: 33 | print(a_np) 34 | print(b_np) 35 | print(res_np) 36 | print(a_np + b_np) 37 | print(res_np - (a_np + b_np)) 38 | print(np.linalg.norm(res_np - (a_np + b_np))) 39 | assert np.allclose(res_np, a_np + b_np) 40 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/ex2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyopencl as cl 3 | import time 4 | 5 | vector_size = 10000000 6 | vector_type = np.float32 7 | 8 | print("Generate {} random numbers of type {}".format(vector_size, str(vector_type))) 9 | a_np = np.random.rand(vector_size).astype(vector_type) 10 | b_np = np.random.rand(vector_size).astype(vector_type) 11 | res_np = np.empty_like(a_np).astype(vector_type) 12 | print("Buffer types: {}, {}, {} of size: {} bytes".format(a_np.dtype, b_np.dtype, res_np.dtype, a_np.nbytes)) 13 | 14 | print("---------------------------------------------------------------------------") 15 | 16 | # get platforms for each cluster 17 | print("Available platforms on the Clusters:") 18 | 19 | platforms = cl.get_platforms() 20 | 21 | for index, platform in enumerate(platforms): 22 | print("{}\tPlatform: {}".format(index, platform)) 23 | print("\tName: {}".format(platform.name)) 24 | 
print("\tProfile: {}".format(platform.profile)) 25 | print("\tVendor: {}".format(platform.vendor)) 26 | print("\tVersion: {}".format(platform.version)) 27 | 28 | print("---------------------------------------------------------------------------") 29 | 30 | # create openCL context on platform rpi1, first device 31 | print("Getting node for platform") 32 | 33 | device_nb = 0 34 | print("Create OpenCL context on device {}".format(device_nb)) 35 | ctx = cl.Context(dev_type=cl.device_type.ALL, properties=[(cl.context_properties.PLATFORM, platforms[device_nb])]) 36 | device = ctx.devices[0] 37 | float_vector_size = device.preferred_vector_width_float 38 | 39 | print("Device {} properties:".format(device)) 40 | print("\tPrefered float vector size: {}".format(float_vector_size)) 41 | print("\tVersion: {}".format(device.version)) 42 | print("\tVendor: {}".format(device.vendor_id)) 43 | print("\tProfile: {}".format(device.profile)) 44 | print("\topencl_c_version: {}".format(device.opencl_c_version)) 45 | print("\tmax_compute_units: {}".format(device.max_compute_units)) 46 | print("\tmax_clock_frequency: {}".format(device.max_clock_frequency)) 47 | print("\tlocal_mem_size: {}".format(device.local_mem_size)) 48 | print("\tglobal_mem_size: {}".format(device.global_mem_size)) 49 | print("\textensions: {}".format(device.extensions)) 50 | 51 | print("Create OpenCL queue") 52 | queue = cl.CommandQueue(ctx) 53 | 54 | print("Copy data to device buffers") 55 | mf = cl.mem_flags 56 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 57 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 58 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 59 | 60 | print("Reading kernel file: demo_float.cl") 61 | with open("demo_float.cl", "r") as f_kernel: 62 | kernel = f_kernel.read() 63 | 64 | print("Compiling kernel") 65 | prg = cl.Program(ctx, kernel).build() 66 | 67 | print("Executing computation") 68 | t0 = time.perf_counter_ns() 69 | #prg.sum(queue, 
a_np.shape, None, a_g, b_g, res_g) 70 | knl = prg.sum 71 | knl.set_args(a_g, b_g, res_g) 72 | local_work_size = None 73 | #local_work_size = (10,) 74 | 75 | t1 = time.perf_counter_ns() 76 | ev = cl.enqueue_nd_range_kernel(queue=queue, kernel=knl, global_work_size=(vector_size,), local_work_size=local_work_size) 77 | t0_enqueue = time.perf_counter_ns() 78 | ev.wait() 79 | t2 = time.perf_counter_ns() 80 | 81 | t3 = time.perf_counter_ns() 82 | cl.enqueue_copy(queue, res_np, res_g) 83 | t4 = time.perf_counter_ns() 84 | 85 | # Check on CPU with Numpy: 86 | print("Computing on the host using numpy") 87 | t5 = time.perf_counter_ns() 88 | res_local = a_np + b_np 89 | t6 = time.perf_counter_ns() 90 | print("Local type:", res_local.dtype) 91 | 92 | print("---------------------------------------------------------------------------") 93 | print("Comparing results") 94 | print("Difference : {}".format(res_np - res_local)) 95 | print("A : {}".format(a_np)) 96 | print("B : {}".format(b_np)) 97 | print("Result OpenCL: {}".format(res_np)) 98 | print("Result Numpy : {}".format(res_local)) 99 | 100 | print("Checking the norm between both: {}".format(np.linalg.norm(res_np - res_local))) 101 | print("Checking results are mostly the same: ", np.allclose(res_np, res_local)) 102 | 103 | print("---------------------------------------------------------------------------") 104 | print("Time to compute using opencl: {} ms".format((t4-t0)/1000000)) 105 | print("Time to compute using numpy: {} ms".format((t6-t5)/1000000)) 106 | 107 | -------------------------------------------------------------------------------- /experiments/local_pyopencl/ex3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyopencl as cl 3 | import time 4 | 5 | # generate data on the cluster host 6 | 7 | # OpenCL float16 is a vector of 16 float32, NOT a numpy float16 (half) 8 | vector_size = 1000000 9 | vector_type = np.float32 10 | 11 | print("Generate {} 
random numbers of type {}".format(vector_size, str(vector_type))) 12 | a_np = np.random.rand(vector_size).astype(vector_type) 13 | b_np = np.random.rand(vector_size).astype(vector_type) 14 | res_np = np.empty_like(a_np).astype(vector_type) 15 | print("Buffer types: {}, {}, {} of size: {} bytes".format(a_np.dtype, b_np.dtype, res_np.dtype, a_np.nbytes)) 16 | 17 | print("---------------------------------------------------------------------------") 18 | 19 | # get platforms for each cluster 20 | print("Available platforms on the Clusters:") 21 | 22 | platforms = cl.get_platforms() 23 | 24 | for index, platform in enumerate(platforms): 25 | print("{}\tPlatform: {}".format(index, platform)) 26 | print("\tName: {}".format(platform.name)) 27 | print("\tProfile: {}".format(platform.profile)) 28 | print("\tVendor: {}".format(platform.vendor)) 29 | print("\tVersion: {}".format(platform.version)) 30 | 31 | print("---------------------------------------------------------------------------") 32 | 33 | # create openCL context on platform rpi1, first device 34 | print("Getting node for platform") 35 | 36 | device_nb = 0 37 | print("Create OpenCL context on device {}".format(device_nb)) 38 | ctx = cl.Context(dev_type=cl.device_type.ALL, properties=[(cl.context_properties.PLATFORM, platforms[device_nb])]) 39 | device = ctx.devices[0] 40 | float_vector_size = device.preferred_vector_width_float 41 | 42 | print("Device {} properties:".format(device)) 43 | print("\tPrefered float vector size: {}".format(float_vector_size)) 44 | print("\tVersion: {}".format(device.version)) 45 | print("\tVendor: {}".format(device.vendor_id)) 46 | print("\tProfile: {}".format(device.profile)) 47 | print("\topencl_c_version: {}".format(device.opencl_c_version)) 48 | print("\tmax_compute_units: {}".format(device.max_compute_units)) 49 | print("\tmax_clock_frequency: {}".format(device.max_clock_frequency)) 50 | print("\tlocal_mem_size: {}".format(device.local_mem_size)) 51 | print("\tglobal_mem_size: 
{}".format(device.global_mem_size)) 52 | print("\textensions: {}".format(device.extensions)) 53 | 54 | print("Create OpenCL queue") 55 | queue = cl.CommandQueue(ctx) 56 | 57 | print("Copy data to device buffers") 58 | mf = cl.mem_flags 59 | a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) 60 | b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) 61 | res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) 62 | 63 | print("Reading kernel file: demo_float{}.cl".format(float_vector_size)) 64 | with open("demo_float{}.cl".format(float_vector_size), "r") as f_kernel: 65 | kernel = f_kernel.read() 66 | 67 | print("Compiling kernel") 68 | prg = cl.Program(ctx, kernel).build() 69 | 70 | print("Executing computation") 71 | #prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) 72 | knl = prg.sum 73 | knl.set_args(a_g, b_g, res_g) 74 | local_work_size = None 75 | #local_work_size = (10,) 76 | 77 | t0 = time.perf_counter_ns() 78 | ev = cl.enqueue_nd_range_kernel(queue=queue, kernel=knl, global_work_size=(vector_size//float_vector_size,), local_work_size=local_work_size) 79 | t0_enqueue = time.perf_counter_ns() 80 | ev.wait() 81 | t1 = time.perf_counter_ns() 82 | 83 | print("Transferring result to host") 84 | t2 = time.perf_counter_ns() 85 | cl.enqueue_copy(queue, res_np, res_g) 86 | t3 = time.perf_counter_ns() 87 | 88 | # Check on CPU with Numpy: 89 | print("Computing on the host using numpy") 90 | t4 = time.perf_counter_ns() 91 | res_local = a_np + b_np 92 | t5 = time.perf_counter_ns() 93 | print("Local type:", res_local.dtype) 94 | 95 | print("---------------------------------------------------------------------------") 96 | print("Comparing results") 97 | print("Difference : {}".format(res_np - res_local)) 98 | print("A : {}".format(a_np)) 99 | print("B : {}".format(b_np)) 100 | print("Result OpenCL: {}".format(res_np)) 101 | print("Result Numpy : {}".format(res_local)) 102 | 103 | print("Checking the norm between both: 
{}".format(np.linalg.norm(res_np - res_local))) 104 | print("Checking results are mostly the same: ", np.allclose(res_np, res_local)) 105 | 106 | print("---------------------------------------------------------------------------") 107 | print("Time to compute using numpy: {} ms".format((t5-t4)/1000000)) 108 | 109 | --------------------------------------------------------------------------------