├── .gitattributes ├── Dockerfile ├── README.md ├── app.py ├── facewrapper ├── dict │ ├── data1.bin │ ├── data2.bin │ └── data3.bin ├── facewrapper.py └── libs │ ├── libimutils.so │ ├── libimutils.so_for_ubuntu22 │ └── libttvfaceengine7.so ├── gradio ├── demo.py └── examples │ ├── 1.jpg │ ├── 2.jpg │ ├── 3.jpg │ └── 4.jpg ├── openvino ├── cache.json ├── libgna.so ├── libgna.so.2 ├── libgna.so.3.0.0.1455 ├── libopenvino.so ├── libopenvino_auto_batch_plugin.so ├── libopenvino_auto_plugin.so ├── libopenvino_c.so ├── libopenvino_gapi_preproc.so ├── libopenvino_hetero_plugin.so ├── libopenvino_intel_cpu_plugin.so ├── libopenvino_intel_gna_plugin.so ├── libopenvino_intel_hddl_plugin.so ├── libopenvino_intel_myriad_plugin.so ├── libopenvino_ir_frontend.so ├── libopenvino_onnx_frontend.so ├── libopenvino_paddle_frontend.so ├── libopenvino_tensorflow_fe.so ├── pcie-ma2x8x.mvcmd ├── plugins.xml ├── usb-ma2x8x.mvcmd └── vpu_custom_kernels │ ├── binarization.bin │ ├── binarization.cl │ ├── binary_convolution.bin │ ├── binary_convolution.cl │ ├── binary_convolution1x1.bin │ ├── binary_convolution1x1.cl │ ├── binary_convolution3x3.bin │ ├── binary_convolution3x3.cl │ ├── convolution1x1_chw.bin │ ├── convolution1x1_chw.cl │ ├── convolution1x1_hwc.bin │ ├── convolution1x1_hwc.cl │ ├── convolution3x3.bin │ ├── convolution3x3.cl │ ├── correlate.bin │ ├── correlate.cl │ ├── ctc.bin │ ├── ctc.cl │ ├── customLayerBindings.xml │ ├── cvtf32f16.bin │ ├── cvtf32f16.cl │ ├── cvtu8f16.bin │ ├── cvtu8f16.cl │ ├── detectron_prior_grid_gen.bin │ ├── detectron_prior_grid_gen.cl │ ├── fakequantize.bin │ ├── fakequantize.cl │ ├── grn.bin │ ├── grn.cl │ ├── mvn_reduction.bin │ ├── mvn_reduction.cl │ ├── mvn_scale.bin │ ├── mvn_scale.cl │ ├── region_chw.bin │ ├── region_chw.cl │ ├── region_hwc.bin │ ├── region_hwc.cl │ ├── reorg_chw.bin │ ├── reorg_chw.cl │ ├── reorg_hwc.bin │ ├── reorg_hwc.cl │ ├── reorg_hwc_naive.bin │ ├── reorg_hwc_naive.cl │ ├── resample_AA.bin │ ├── resample_AA.cl │ ├── 
resample_noAA.bin │ ├── resample_noAA.cl │ ├── shuffle_channels.bin │ ├── shuffle_channels.cl │ ├── st.bin │ └── st.cl ├── requirements.txt └── run.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tar filter=lfs diff=lfs merge=lfs -text 29 | *.tflite filter=lfs diff=lfs merge=lfs -text 30 | *.tgz filter=lfs diff=lfs merge=lfs -text 31 | *.wasm filter=lfs diff=lfs merge=lfs -text 32 | *.xz filter=lfs diff=lfs merge=lfs -text 33 | *.zip filter=lfs diff=lfs merge=lfs -text 34 | *.zst filter=lfs diff=lfs merge=lfs -text 35 | *tfevents* filter=lfs diff=lfs merge=lfs -text 36 | facewrapper/libs/libttvfaceengine7.so filter=lfs diff=lfs 
merge=lfs -text 37 | openvino/libgna.so filter=lfs diff=lfs merge=lfs -text 38 | openvino/libgna.so.2 filter=lfs diff=lfs merge=lfs -text 39 | openvino/libgna.so.3.0.0.1455 filter=lfs diff=lfs merge=lfs -text 40 | openvino/libopenvino_gapi_preproc.so filter=lfs diff=lfs merge=lfs -text 41 | openvino/libopenvino_intel_cpu_plugin.so filter=lfs diff=lfs merge=lfs -text 42 | openvino/libopenvino_intel_gna_plugin.so filter=lfs diff=lfs merge=lfs -text 43 | openvino/libopenvino_intel_hddl_plugin.so filter=lfs diff=lfs merge=lfs -text 44 | openvino/libopenvino_intel_myriad_plugin.so filter=lfs diff=lfs merge=lfs -text 45 | openvino/libopenvino_onnx_frontend.so filter=lfs diff=lfs merge=lfs -text 46 | openvino/libopenvino_tensorflow_fe.so filter=lfs diff=lfs merge=lfs -text 47 | openvino/libopenvino.so filter=lfs diff=lfs merge=lfs -text 48 | openvino/pcie-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text 49 | openvino/usb-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text 50 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone 3 | RUN apt-get update -y 4 | RUN apt-get install -y python3 python3-pip python3-opencv 5 | RUN apt-get install -y libcurl4-openssl-dev libssl-dev 6 | RUN mkdir -p /home/FaceOnLive_v7 7 | RUN mkdir -p /home/FaceOnLive_v7/facewrapper 8 | WORKDIR /home/FaceOnLive_v7 9 | COPY ./facewrapper ./facewrapper 10 | COPY ./facewrapper/libs/libimutils.so /usr/lib 11 | COPY ./gradio ./gradio 12 | COPY ./openvino /usr/lib 13 | COPY ./app.py ./app.py 14 | COPY ./run.sh . 
15 | COPY ./requirements.txt ./requirements.txt 16 | RUN pip3 install -r requirements.txt 17 | RUN chmod a+x run.sh 18 | CMD ["./run.sh"] 19 | EXPOSE 9000 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Slack 3 | · 4 | Website 5 | · 6 | Portfolio 7 | · 8 | Hugging Face 9 | · 10 | Free APIs 11 | · 12 | OpenKYC 13 | · 14 | Face Attendance 15 | · 16 | Contact 17 |
18 |

Face Liveness Detection SDK For Linux

19 |

Fully Offline, On-Premise Face Liveness Detection SDK for Linux

20 | 21 |
22 | Documentation at https://docs.faceonlive.com 23 |
24 | 25 | ## :tada: Try It Yourself on our [Portfolio Website](https://portfolio.faceonlive.com/#server_sdks/server/liv) 26 | 27 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/FaceOnLive/Face-Liveness-Detection-SDK) 28 | 29 | 30 | https://user-images.githubusercontent.com/91896009/187945910-4ca6d27c-d058-4749-a834-44914a5a957c.mp4 31 | 32 | 33 | ## :clap: Supporters 34 | [![Stargazers repo roster for @faceonlive/Face-Liveness-Detection-SDK-Linux](https://reporoster.com/stars/faceonlive/Face-Liveness-Detection-SDK-Linux)](https://github.com/faceonlive/Face-Liveness-Detection-SDK-Linux/stargazers) 35 | [![Forkers repo roster for @faceonlive/Face-Liveness-Detection-SDK-Linux](https://reporoster.com/forks/faceonlive/Face-Liveness-Detection-SDK-Linux)](https://github.com/faceonlive/Face-Liveness-Detection-SDK-Linux/network/members) 36 |

Animated footer bars

37 | 38 | ## 🏃 How to run 39 | ### 1. Download and install dependencies 40 | To begin, follow these steps to download and install the necessary dependencies: 41 | ``` 42 | git clone https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux 43 | cd Face-Liveness-Detection-SDK-Linux 44 | chmod +x ./install_dependency.sh 45 | sudo ./install_dependency.sh 46 | ``` 47 | ### 2. Execute the Python Flask application 48 | Next, run the Python Flask application by executing the following command: 49 | ``` 50 | python3 app.py 51 | ``` 52 | ### 3. Activate the SDK 53 | #### - Online License 54 | If you have an online license, please update the license key provided by us in the following file: 55 | https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/6e702fa01aeabbfb395d82c637a66dc18a93f2fb/app.py#L23-L23 56 | #### - Offline License 57 | If you have an offline license, please share your machine's HWID (Hardware ID) with us to receive the license.txt file. Update the HWID in the following file: 58 | https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/6e702fa01aeabbfb395d82c637a66dc18a93f2fb/app.py#L24-L24 59 | ``` 60 | online init failed: 6 61 | hwid: IXwjedMe8M5cZX/GwU3NEOqJRcqLwldq27HSLyFiejbGDB9XVgytA1RgJukV3mWWTNo84NwTMYU= 62 | ``` 63 | ### 4. Using Docker 64 | - Build the Docker image: 65 | ``` 66 | sudo docker build --pull --rm -f Dockerfile -t faceonlive_v7:latest . 67 | ``` 68 | - Run Docker with online license: 69 | ``` 70 | sudo docker run --network host faceonlive_v7 71 | ``` 72 | - Run Docker with offline license: 73 | ``` 74 | sudo docker run -v "$(pwd)/license.txt":/home/FaceOnLive_v7/license.txt --network host faceonlive_v7 75 | ``` 76 | ### 5. 
"""Flask REST service exposing the FaceOnLive face liveness engine.

Endpoints:
    POST /api/liveness         multipart upload, image in the ``image`` file field
    POST /api/liveness_base64  JSON body ``{"image": "<base64 encoded image>"}``

The native engine is initialised once at import time: first with the online
license key from the ``LICENSE_KEY`` environment variable and, if that is
missing or rejected, with the offline ``license.txt`` file.
"""
import sys
sys.path.append('.')

from flask import Flask, request, jsonify
from time import gmtime, strftime
import os
import base64
import json
import cv2
import numpy as np

from facewrapper.facewrapper import ttv_version
from facewrapper.facewrapper import ttv_get_hwid
from facewrapper.facewrapper import ttv_init
from facewrapper.facewrapper import ttv_init_offline
from facewrapper.facewrapper import ttv_detect_face

app = Flask(__name__)

app.config['SITE'] = "http://0.0.0.0:8000/"
app.config['DEBUG'] = False

licenseKey = os.environ.get("LICENSE_KEY")
licensePath = "license.txt"
modelFolder = os.path.abspath(os.path.dirname(__file__)) + '/facewrapper/dict'

version = ttv_version()
print("version: ", version.decode('utf-8'))

if licenseKey is None:
    # Without this guard `licenseKey.encode` raises AttributeError and the
    # offline fallback below is never reached.
    print("LICENSE_KEY is not set; skipping online init")
    ret = None
else:
    ret = ttv_init(modelFolder.encode('utf-8'), licenseKey.encode('utf-8'))
    if ret != 0:
        print(f"online init failed: {ret}")

if ret != 0:
    # The HWID is printed so it can be sent to the vendor for an offline license.
    hwid = ttv_get_hwid()
    print("hwid: ", hwid.decode('utf-8'))

    ret = ttv_init_offline(modelFolder.encode('utf-8'), licensePath.encode('utf-8'))
    if ret != 0:
        print(f"offline init failed: {ret}")
        sys.exit(-1)
    else:
        print("offline init ok")
else:
    print("online init ok")


def _decode_image(raw):
    """Decode encoded image bytes into a BGR array; return None on failure."""
    if not raw:
        # cv2.imdecode raises on an empty buffer instead of returning None.
        return None
    return cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), cv2.IMREAD_COLOR)


def _error_response(message, status_code=400):
    """Build a JSON error response for requests that carry no usable image."""
    response = jsonify({"status": "error", "data": {"result": message}})
    response.status_code = status_code
    response.headers["Content-Type"] = "application/json; charset=utf-8"
    return response


def _classify(ret, faceRect, livenessScore, width, height):
    """Map the engine return code and outputs to the API ``result`` string."""
    if ret == -1:
        return "license error!"
    if ret == -2:
        return "init error!"
    if ret == 0:
        return "no face detected!"
    if ret > 1:
        return "multiple face detected!"
    # faceRect is used as (left, top, right, bottom): the bottom edge is
    # faceRect[3] and must be checked against the image height (the previous
    # code compared faceRect[2] against the height).
    if faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= width or faceRect[3] >= height:
        # NOTE(review): the misspelling is kept byte-for-byte because API
        # clients may match on this string.
        return "faace is in boundary!"
    if livenessScore[0] > 0.5:
        return "genuine"
    return "spoof"


def _liveness_response(image):
    """Run the engine on a decoded image and build the JSON response."""
    faceRect = np.zeros([4], dtype=np.int32)
    livenessScore = np.zeros([1], dtype=np.double)
    angles = np.zeros([3], dtype=np.double)
    # The engine fills faceRect, livenessScore and angles in place.
    ret = ttv_detect_face(image, image.shape[1], image.shape[0], faceRect, livenessScore, angles)
    result = _classify(ret, faceRect, livenessScore, image.shape[1], image.shape[0])

    status = "ok"
    response = jsonify({
        "status": status,
        "data": {
            "result": result,
            "face_rect": {
                "x": int(faceRect[0]),
                "y": int(faceRect[1]),
                "w": int(faceRect[2] - faceRect[0] + 1),
                "h": int(faceRect[3] - faceRect[1] + 1),
            },
            "liveness_score": float(livenessScore[0]),
            "angles": {
                "yaw": float(angles[0]),
                "roll": float(angles[1]),
                "pitch": float(angles[2]),
            },
        },
    })

    response.status_code = 200
    response.headers["Content-Type"] = "application/json; charset=utf-8"
    return response


@app.route('/api/liveness', methods=['POST'])
def check_liveness():
    """Check liveness of the image uploaded in the multipart ``image`` field."""
    file = request.files.get('image')
    if file is None:
        return _error_response("missing 'image' file field")

    image = _decode_image(file.read())
    if image is None:
        return _error_response("could not decode image")

    return _liveness_response(image)


@app.route('/api/liveness_base64', methods=['POST'])
def check_liveness_base64():
    """Check liveness of the base64 image in the JSON body's ``image`` key."""
    content = request.get_json(silent=True)
    if not isinstance(content, dict) or 'image' not in content:
        return _error_response("missing 'image' field in JSON body")

    try:
        raw = base64.b64decode(content['image'])
    except (ValueError, TypeError):
        # binascii.Error (invalid base64) is a ValueError subclass.
        return _error_response("invalid base64 image data")

    image = _decode_image(raw)
    if image is None:
        return _error_response("could not decode image")

    return _liveness_response(image)


if __name__ == '__main__':
    port = int(os.environ.get("PORT", 8000))
    app.run(host='0.0.0.0', port=port)
"""ctypes bindings for the native face liveness engine (libttvfaceengine7.so).

Loads the shared library that sits next to this module under ``libs/`` and
exposes its entry points as module-level callables with declared argument
and return types.
"""
import ctypes, ctypes.util
from ctypes import *
from numpy.ctypeslib import ndpointer
import sys
import os
# NOTE(review): sys.path only affects Python module lookup, not shared-library
# resolution; presumably meant to help locate the OpenVINO runtime - confirm.
sys.path.append('/opt/intel/openvino_2022/runtime/lib/intel64')

lib_path = os.path.abspath(os.path.dirname(__file__)) + '/libs/libttvfaceengine7.so'
liveness_engine = cdll.LoadLibrary(lib_path)

# Array argument types shared by the bindings below.
_UBYTE_ARRAY = ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS')
_INT32_ARRAY = ndpointer(ctypes.c_int32, flags='C_CONTIGUOUS')
_DOUBLE_ARRAY = ndpointer(ctypes.c_double, flags='C_CONTIGUOUS')


def _bind(symbol, argtypes, restype):
    """Look up *symbol* in the engine library and declare its C signature."""
    func = getattr(liveness_engine, symbol)
    func.argtypes = argtypes
    func.restype = restype
    return func


# No arguments; returns a C string.
ttv_version = _bind('ttv_version', [], ctypes.c_char_p)

# No arguments; returns a C string.
ttv_get_hwid = _bind('ttv_get_hwid', [], ctypes.c_char_p)

# Two C-string arguments; returns an int32 status code.
ttv_init = _bind('ttv_init', [ctypes.c_char_p, ctypes.c_char_p], ctypes.c_int32)

# Two C-string arguments; returns an int32 status code.
ttv_init_offline = _bind('ttv_init_offline', [ctypes.c_char_p, ctypes.c_char_p], ctypes.c_int32)

# (ubyte array, int32, int32, int32 array, double array, double array) -> int32.
ttv_detect_face = _bind(
    'ttv_detect_face',
    [_UBYTE_ARRAY, ctypes.c_int32, ctypes.c_int32, _INT32_ARRAY, _DOUBLE_ARRAY, _DOUBLE_ARRAY],
    ctypes.c_int32,
)
"""Gradio front end that posts an image to the local liveness REST endpoint."""
import gradio as gr
import requests
import json


def face_liveness(frame):
    """Send the image at path *frame* to the liveness API and return its JSON.

    Args:
        frame: Filesystem path of the image selected in the UI, or None when
            no image has been provided.

    Returns:
        The decoded JSON response of the API, or ``['', None]`` when *frame*
        is None.
    """
    url = "http://127.0.0.1:8000/api/liveness"
    if frame is None:
        return ['', None]

    # Open the upload in a context manager so the handle is closed after the
    # request; the previous code leaked one open file per click.
    with open(frame, 'rb') as image_file:
        r = requests.post(url=url, files={'image': image_file})
    return r.json()


with gr.Blocks() as demo:
    gr.Markdown(
        """
    # Face Liveness Detection
    Get your own Face Liveness Detection Server by duplicating this space.
    Contact us at contact@faceonlive.com for issues and support.
    """
    )
    with gr.Row():
        with gr.Column(scale=5):
            image_input = gr.Image(type='filepath')
            gr.Examples(['gradio/examples/1.jpg', 'gradio/examples/2.jpg', 'gradio/examples/3.jpg', 'gradio/examples/4.jpg'],
                        inputs=image_input)
            face_liveness_button = gr.Button("Check Liveness")
        with gr.Column(scale=5):
            liveness_result_output = gr.JSON()

    face_liveness_button.click(face_liveness, inputs=image_input, outputs=liveness_result_output)

demo.launch(server_name="0.0.0.0", server_port=7860)
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e 3 | size 3120536 4 | -------------------------------------------------------------------------------- /openvino/libgna.so.2: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e 3 | size 3120536 4 | -------------------------------------------------------------------------------- /openvino/libgna.so.3.0.0.1455: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e 3 | size 3120536 4 | -------------------------------------------------------------------------------- /openvino/libopenvino.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46 3 | size 13495416 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_auto_batch_plugin.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_auto_batch_plugin.so -------------------------------------------------------------------------------- /openvino/libopenvino_auto_plugin.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_auto_plugin.so 
-------------------------------------------------------------------------------- /openvino/libopenvino_c.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_c.so -------------------------------------------------------------------------------- /openvino/libopenvino_gapi_preproc.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0 3 | size 1312920 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_hetero_plugin.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_hetero_plugin.so -------------------------------------------------------------------------------- /openvino/libopenvino_intel_cpu_plugin.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18 3 | size 33299880 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_intel_gna_plugin.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab 3 | size 4067016 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_intel_hddl_plugin.so: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b 3 | size 5894680 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_intel_myriad_plugin.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de 3 | size 6120168 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_ir_frontend.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_ir_frontend.so -------------------------------------------------------------------------------- /openvino/libopenvino_onnx_frontend.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441 3 | size 3781640 4 | -------------------------------------------------------------------------------- /openvino/libopenvino_paddle_frontend.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_paddle_frontend.so -------------------------------------------------------------------------------- /openvino/libopenvino_tensorflow_fe.so: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9 3 | size 2723864 4 | -------------------------------------------------------------------------------- 
/openvino/pcie-ma2x8x.mvcmd: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d 3 | size 2099248 4 | -------------------------------------------------------------------------------- /openvino/plugins.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /openvino/usb-ma2x8x.mvcmd: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e 3 | size 2298632 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binarization.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac 3 | size 19632 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binarization.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __kernel void binarization( 9 | const __global half *__restrict src_data, 10 | const __global half *__restrict input_low_high, 11 | const __global half *__restrict dst_data, 12 | int switch_out, 13 | int input_low_high_size, 14 | int W, 15 | int H) 16 | { 17 | __local half local_src[15 * 1024]; 18 | 
__local half local_dst[15 * 1024]; 19 | 20 | event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0); 21 | wait_group_events(1, &e1); 22 | 23 | int c = get_global_id(2); 24 | int C = get_global_size(2); 25 | 26 | half dst_low = switch_out ? 1.h : -1.h; 27 | half dst_high = switch_out ? -1.h : 1.h; 28 | 29 | half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c]; 30 | 31 | for (int h = 0; h < H; h++) { 32 | 33 | __local const half *__restrict addr_src = local_src + h * W; 34 | __local half *__restrict addr_dst = local_dst + h * W; 35 | 36 | #if 1 37 | for (int w = 0; w < W / 8; w++) { 38 | 39 | half8 h_src_val8 = (*((__local half8 *)addr_src + w)); 40 | 41 | short8 cond1; 42 | cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh); 43 | cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh); 44 | cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh); 45 | cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh); 46 | cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh); 47 | cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh); 48 | cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh); 49 | cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh); 50 | 51 | cond1 = ~(cond1 - (short8)1); 52 | 53 | short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high); 54 | 55 | *((__local half8 *)addr_dst + w) = as_half8(res); 56 | } 57 | #endif 58 | for (int w = W & (~0x7); w < W; w++) { 59 | addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? 
dst_low : dst_high; 60 | } 61 | } 62 | 63 | barrier(CLK_LOCAL_MEM_FENCE); 64 | 65 | event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0); 66 | wait_group_events(1, &e2); 67 | } 68 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binary_convolution.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a 3 | size 10764 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binary_convolution.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | 7 | int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } 8 | 9 | __kernel void binary_convolution( 10 | const __global half *restrict src_data, 11 | const __global uchar *restrict weights_data, 12 | __global half *restrict dst_data, 13 | float pad_value, 14 | 15 | int IW, 16 | int IH, 17 | int IC, 18 | 19 | int DW, 20 | int DH, 21 | 22 | int GC, 23 | 24 | int KW, 25 | int KH, 26 | 27 | int PW, 28 | int PH, 29 | 30 | int SW, 31 | int SH) 32 | { 33 | int ipad_value = ((pad_value > 0.f) ? 
1 : 0); 34 | int c = get_global_id(2); 35 | int y = get_global_id(1); 36 | int x = get_global_id(0); 37 | 38 | int OC = get_global_size(2); 39 | int OH = get_global_size(1); 40 | int OW = get_global_size(0); 41 | 42 | int KD = 1; 43 | int SD = 0; 44 | int DD = 0; 45 | int PD = 0; 46 | int ID = 1; 47 | int OD = 1; 48 | 49 | int nbits = 8; 50 | 51 | int g = c % GC; 52 | int oc = c / GC; 53 | int oh = y; 54 | int ow = x; 55 | 56 | for (int od = 0; od < OD; od++) { 57 | int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow; 58 | 59 | int res = 0; 60 | 61 | for (int ic = 0; ic < IC / GC; ic++) { 62 | for (int kd = 0; kd < KD; kd++) { 63 | for (int kh = 0; kh < KH; kh++) { 64 | for (int kw = 0; kw < KW; kw++) { 65 | int widx = g * OC / GC * IC / GC * KD * KH * KW 66 | + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW 67 | + kh * KW + kw; 68 | 69 | int w = extract_weights(weights_data[widx / nbits], (widx % nbits)); 70 | 71 | int s; 72 | 73 | int iw = ow * SW - PW + kw * DW; 74 | int ih = oh * SH - PH + kh * DH; 75 | int id = od * SD - PD + kd * DD; 76 | 77 | if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0 78 | || id >= (int)ID) { 79 | s = ipad_value; 80 | } else { 81 | int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW 82 | + ih * IW + iw; 83 | 84 | s = ((src_data[iidx] > 0.f) ? 
1 : 0); 85 | } 86 | 87 | res += s ^ w; 88 | } 89 | } 90 | } 91 | } 92 | 93 | dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binary_convolution1x1.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945 3 | size 76316 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binary_convolution1x1.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } 9 | 10 | __kernel void binary_convolution( 11 | const __global half *restrict src_data, 12 | const __global uchar *restrict weights_data, 13 | __global half *restrict dst_data, 14 | float pad_value, 15 | 16 | int IW, 17 | int IH, 18 | int IC, 19 | 20 | int DW, 21 | int DH, 22 | 23 | int GC, 24 | 25 | int KW, 26 | int KH, 27 | 28 | int PW, 29 | int PH, 30 | 31 | int SW, 32 | int SH, 33 | 34 | int OW) 35 | { 36 | __local half src_local[32 * 1024]; 37 | __local half dst_local[2 * 1024]; 38 | 39 | const int oh = get_group_id(0); 40 | const int oc = get_group_id(1); 41 | const int OH = get_global_size(0); 42 | const int OC = get_global_size(1); 43 | 44 | const int gc = oc / (OC / GC); 45 | 46 | if (oh * SH >= 0 && oh * SH <= IH - 1) { 47 | const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW; 48 | 49 | event_t e1 = async_work_group_copy_2D2D( 50 | src_local, // dst 51 | src, // src 52 | IW, // num_elements_per_line, 53 | IC / 
GC, // num_lines, 54 | IH * IW - IW, // src_line_stride, 55 | 0, // dst_line_stride, 56 | 0); 57 | wait_group_events(1, &e1); 58 | } 59 | 60 | half pad_value_half = convert_half(pad_value); 61 | 62 | //padding row 63 | if (oh * SH > IH - 1) { 64 | __local half *dst = src_local; 65 | for (int c = 0; c < IC / GC; c++) { 66 | #pragma unroll 8 67 | for (int j = 0; j < IW; j++) { 68 | dst[j] = pad_value_half; 69 | } 70 | dst += IW; 71 | } 72 | } 73 | 74 | int OWS = SW * OW; 75 | ushort8 in; 76 | 77 | for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) { 78 | ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; 79 | for (int ic = 0; ic < IC / GC; ++ic) { 80 | __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8); 81 | int weight_pos = oc * IC / GC + ic; 82 | ushort w = 83 | extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8)); 84 | 85 | if ((ows8 * 8) <= IW - 1) { 86 | in = *((__local ushort8 *)(src)); 87 | } 88 | 89 | //padding column 90 | if (ows8 * 8 + 7 > IW - 1) { 91 | int boundary = (IW - 1) - ows8 * 8 + 1; 92 | boundary = boundary < 0 ? 0 : boundary; 93 | for (int offset = boundary; offset < 8; offset++) { 94 | *((half *)(&in) + offset) = pad_value_half; 95 | } 96 | } 97 | 98 | ushort8 w8 = (ushort8)(w); 99 | 100 | ushort8 cond = 101 | (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); 102 | 103 | val += (cond ^ w8); 104 | } 105 | 106 | ushort8 val_shift = val << 1; 107 | int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? 
(ows8 * 8 + 7) / SW : OW - 1; 108 | for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) { 109 | *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8)); 110 | } 111 | } 112 | 113 | barrier(CLK_LOCAL_MEM_FENCE); 114 | 115 | event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0); 116 | wait_group_events(1, &e2); 117 | } 118 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binary_convolution3x3.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb 3 | size 104136 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/binary_convolution3x3.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } 9 | 10 | __kernel void binary_convolution( 11 | const __global half *restrict src_data, 12 | const __global uchar *restrict weights_data, 13 | const __global half *restrict dst_data, 14 | float pad_value, 15 | 16 | int IW, 17 | int IH, 18 | int IC, 19 | 20 | int DW, 21 | int DH, 22 | 23 | int GC, 24 | 25 | int KW, 26 | int KH, 27 | 28 | int PW, 29 | int PH, 30 | 31 | int SW, 32 | int SH, 33 | 34 | int OW) 35 | { 36 | __local half src_local[32 * 1024]; 37 | __local half dst_local[2 * 1024]; 38 | 39 | const int oh = get_group_id(0); 40 | const int oc = get_group_id(1); 41 | const int OH = get_global_size(0); 42 | const int OC = get_global_size(1); 43 | 44 | const int gc = oc / (OC / GC); 45 | 46 | if (oh * SH 
- 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows 47 | { 48 | event_t e = async_work_group_copy_3D3D( 49 | src_local, // dst 50 | src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src 51 | IW, // num_elements_per_line 52 | 3, // num_lines 53 | DH * IW - IW, // src_line_stride 54 | 0, // dst_line_stride 55 | IC / GC, // num planes 56 | IH * IW - 3 * DH * IW, // src plane stride 57 | 0, // dst plane stride 58 | 0); 59 | wait_group_events(1, &e); 60 | } else { 61 | int ih = oh * SH - 1; 62 | if (ih >= 0 && ih <= IH - 1) //dma for first row 63 | { 64 | event_t e = async_work_group_copy_2D2D( 65 | src_local, // dst 66 | src_data + (gc * IC / GC) * IW * IH + ih * IW, // src 67 | IW, // num_elements_per_line, 68 | IC / GC, // num_lines, 69 | IH * IW - IW, // src_line_stride, 70 | 2 * IW, // dst_line_stride, 71 | 0); 72 | 73 | wait_group_events(1, &e); 74 | } 75 | ih = oh * SH - 1 + DH; 76 | if (ih >= 0 && ih <= IH - 1) //dma for second row 77 | { 78 | event_t e = async_work_group_copy_2D2D( 79 | src_local + IW, // dst 80 | src_data + (gc * IC / GC) * IW * IH + ih * IW, // src 81 | IW, // num_elements_per_line, 82 | IC / GC, // num_lines, 83 | IH * IW - IW, // src_line_stride, 84 | 2 * IW, // dst_line_stride, 85 | 0); 86 | wait_group_events(1, &e); 87 | } 88 | ih = oh * SH - 1 + 2 * DH; 89 | if (ih >= 0 && ih <= IH - 1) //dma for third row 90 | { 91 | event_t e = async_work_group_copy_2D2D( 92 | src_local + 2 * IW, // dst 93 | src_data + (gc * IC / GC) * IW * IH + ih * IW, // src 94 | IW, // num_elements_per_line, 95 | IC / GC, // num_lines, 96 | IH * IW - IW, // src_line_stride, 97 | 2 * IW, // dst_line_stride, 98 | 0); 99 | wait_group_events(1, &e); 100 | } 101 | } 102 | 103 | half pad_value_half = convert_half(pad_value); 104 | 105 | //padding row 106 | if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) { 107 | __local half *dst = src_local; 108 | for (int c = 0; c < IC / GC; c++) { 109 | #pragma unroll 8 110 | for (int j = 0; j < IW; j++) { 111 | 
dst[j] = pad_value_half; 112 | } 113 | dst += 3 * IW; 114 | } 115 | } 116 | if (oh * SH + DH - 1 > IH - 1) { 117 | __local half *dst = src_local + IW; 118 | for (int c = 0; c < IC / GC; c++) { 119 | #pragma unroll 8 120 | for (int j = 0; j < IW; j++) { 121 | dst[j] = pad_value_half; 122 | } 123 | dst += 3 * IW; 124 | } 125 | } 126 | if (oh * SH + DH + DH - 1 > IH - 1) { 127 | __local half *dst = src_local + 2 * IW; 128 | for (int c = 0; c < IC / GC; c++) { 129 | #pragma unroll 8 130 | for (int j = 0; j < IW; j++) { 131 | dst[j] = pad_value_half; 132 | } 133 | dst += 3 * IW; 134 | } 135 | } 136 | 137 | int OWS = SW * OW; 138 | 139 | ushort8 in00; 140 | ushort8 in01; 141 | ushort8 in02; 142 | ushort8 in10; 143 | ushort8 in11; 144 | ushort8 in12; 145 | ushort8 in20; 146 | ushort8 in21; 147 | ushort8 in22; 148 | 149 | for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) { 150 | ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; 151 | for (int ic = 0; ic < IC / GC; ++ic) { 152 | __local half *src = 153 | (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8); 154 | int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3; 155 | ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8)); 156 | ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8)); 157 | ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8)); 158 | ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8)); 159 | ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8)); 160 | ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8)); 161 | ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8)); 162 | ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8)); 163 | ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], 
((weight_pos + 8) % 8)); 164 | 165 | if ((ows8 * 8) - 1 <= IW - 1) { 166 | in00 = *((__local ushort8 *)(src - IW - DW)); 167 | in01 = *((__local ushort8 *)(src - IW)); 168 | in02 = *((__local ushort8 *)(src - IW + DW)); 169 | 170 | in10 = *((__local ushort8 *)(src - DW)); 171 | in11 = *((__local ushort8 *)(src)); 172 | in12 = *((__local ushort8 *)(src + DW)); 173 | 174 | in20 = *((__local ushort8 *)(src + IW - DW)); 175 | in21 = *((__local ushort8 *)(src + IW)); 176 | in22 = *((__local ushort8 *)(src + IW + DW)); 177 | } 178 | 179 | //padding column 180 | if (ows8 * 8 - 1 < 0) { 181 | int boundary = 1 - ows8 * 8; 182 | boundary = boundary > 8 ? 8 : boundary; 183 | for (int offset = 0; offset < boundary; offset++) { 184 | *((half *)(&in00) + offset) = pad_value_half; 185 | *((half *)(&in10) + offset) = pad_value_half; 186 | *((half *)(&in20) + offset) = pad_value_half; 187 | } 188 | } 189 | if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) { 190 | int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1; 191 | boundary = boundary < 0 ? 0 : boundary; 192 | for (int offset = boundary; offset < 8; offset++) { 193 | *((half *)(&in02) + offset) = pad_value_half; 194 | *((half *)(&in12) + offset) = pad_value_half; 195 | *((half *)(&in22) + offset) = pad_value_half; 196 | } 197 | } 198 | if ((ows8 * 8 + 7) + DW - 1 > IW - 1) { 199 | int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1; 200 | boundary = boundary < 0 ? 0 : boundary; 201 | for (int offset = boundary; offset < 8; offset++) { 202 | *((half *)(&in01) + offset) = pad_value_half; 203 | *((half *)(&in11) + offset) = pad_value_half; 204 | *((half *)(&in21) + offset) = pad_value_half; 205 | } 206 | } 207 | if ((ows8 * 8 + 7) - 1 > IW - 1) { 208 | int boundary = (IW - 1 + 1) - ows8 * 8 + 1; 209 | boundary = boundary < 0 ? 
0 : boundary; 210 | for (int offset = boundary; offset < 8; offset++) { 211 | *((half *)(&in00) + offset) = pad_value_half; 212 | *((half *)(&in10) + offset) = pad_value_half; 213 | *((half *)(&in20) + offset) = pad_value_half; 214 | } 215 | } 216 | 217 | ushort8 w00 = (ushort8)(w0); 218 | ushort8 w01 = (ushort8)(w1); 219 | ushort8 w02 = (ushort8)(w2); 220 | ushort8 w10 = (ushort8)(w3); 221 | ushort8 w11 = (ushort8)(w4); 222 | ushort8 w12 = (ushort8)(w5); 223 | ushort8 w20 = (ushort8)(w6); 224 | ushort8 w21 = (ushort8)(w7); 225 | ushort8 w22 = (ushort8)(w8); 226 | 227 | ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ? 228 | (ushort8)(1) : 229 | (ushort8)(0); 230 | ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ? 231 | (ushort8)(1) : 232 | (ushort8)(0); 233 | ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ? 234 | (ushort8)(1) : 235 | (ushort8)(0); 236 | ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ? 237 | (ushort8)(1) : 238 | (ushort8)(0); 239 | ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ? 240 | (ushort8)(1) : 241 | (ushort8)(0); 242 | ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ? 243 | (ushort8)(1) : 244 | (ushort8)(0); 245 | ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ? 246 | (ushort8)(1) : 247 | (ushort8)(0); 248 | ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ? 249 | (ushort8)(1) : 250 | (ushort8)(0); 251 | ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ? 
252 | (ushort8)(1) : 253 | (ushort8)(0); 254 | 255 | val += (cond0 ^ w00); 256 | val += (cond1 ^ w01); 257 | val += (cond2 ^ w02); 258 | val += (cond3 ^ w10); 259 | val += (cond4 ^ w11); 260 | val += (cond5 ^ w12); 261 | val += (cond6 ^ w20); 262 | val += (cond7 ^ w21); 263 | val += (cond8 ^ w22); 264 | } 265 | 266 | ushort8 val_shift = val << 1; 267 | int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1; 268 | for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) { 269 | *(dst_local + ow) = 270 | (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8)); 271 | } 272 | } 273 | 274 | barrier(CLK_LOCAL_MEM_FENCE); 275 | 276 | event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0); 277 | wait_group_events(1, &e2); 278 | } 279 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/convolution1x1_chw.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f 3 | size 9220 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/convolution1x1_chw.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __kernel void Convolution1x1_NCHW( 9 | const __global half *in, 10 | const __global half *out, 11 | const __global half *w, 12 | int IW, 13 | int IH, 14 | int IC, 15 | int OW, 16 | int OH, 17 | int OC) 18 | { 19 | __local half in_local[8 * 1024]; 20 | __local half out_local[8 * 1024]; 21 | 22 | event_t e1 = async_work_group_copy_2D2D( 23 | in_local, // dst 24 | in + 
get_group_id(0) * IW, // src 25 | IW, // num_elements_per_line, 26 | IC, // num_lines, 27 | IW * IH - IW, // src_line_stride, 28 | 0, // dst_line_stride, 29 | 0); 30 | wait_group_events(1, &e1); 31 | 32 | int oh = get_global_id(0); 33 | int oc = get_global_id(1); 34 | 35 | int stride; 36 | int write_output = 0; 37 | __global half *src; 38 | 39 | __global half8 *w8 = (__global half8 *)(&w[oc * IC]); 40 | __global half *w1 = (__global half *)(&w[oc * IC]); 41 | 42 | for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) { 43 | uint iw = ow; 44 | uint ih = oh; 45 | 46 | half8 val8_0 = 0.0f; 47 | 48 | __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]); 49 | __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]); 50 | __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]); 51 | __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]); 52 | __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]); 53 | __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]); 54 | __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]); 55 | __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]); 56 | 57 | for (uint ic = 0; ic < IC / 8; ic++) { 58 | val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0); 59 | val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1); 60 | val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2); 61 | val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3); 62 | val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4); 63 | val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5); 64 | val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6); 65 | val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7); 66 | } 67 | 68 | for (uint ic = (IC & (~0x7)); ic < IC; ++ic) { 69 | val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]); 70 | } 71 | *((__local half8 *)&out_local[ow + 0]) = (val8_0); 72 | } 73 | 74 | uint iw = (OW & (~0x7)); 75 | uint ih = oh; 76 | 77 | half8 val8_0 = 0.0f; 78 | 79 | __local half8 *in8_0 = (__local half8 
*)(&in_local[iw + 0 * IW]); 80 | __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]); 81 | __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]); 82 | __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]); 83 | __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]); 84 | __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]); 85 | __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]); 86 | __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]); 87 | 88 | for (uint ic = 0; ic < IC / 8; ic++) { 89 | val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0); 90 | val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1); 91 | val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2); 92 | val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3); 93 | val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4); 94 | val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5); 95 | val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6); 96 | val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7); 97 | } 98 | 99 | for (uint ic = (IC & (~0x7)); ic < IC; ++ic) { 100 | val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]); 101 | } 102 | for (uint ow = (OW & (~0x7)); ow < OW; ow++) { 103 | out_local[ow + 0] = (val8_0[ow % 8]); 104 | } 105 | 106 | barrier(CLK_LOCAL_MEM_FENCE); 107 | 108 | event_t e2 = async_work_group_copy( 109 | out + get_group_id(1) * OW * OH + get_group_id(0) * OW, 110 | out_local, 111 | OW, 112 | 0); 113 | wait_group_events(1, &e2); 114 | } 115 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/convolution1x1_hwc.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c 3 | size 4304 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/convolution1x1_hwc.cl: 
-------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __kernel void Convolution1x1_NHWC( 9 | const __global half *in, 10 | const __global half *out, 11 | const __global half *w, 12 | int IW, 13 | int IH, 14 | int IC, 15 | int OW, 16 | int OH, 17 | int OC) 18 | { 19 | 20 | __local half in_local[8 * 1024]; 21 | __local half out_local[8 * 1024]; 22 | 23 | const int sizeAct = IW * IC; 24 | 25 | event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0); 26 | wait_group_events(1, &e1); 27 | 28 | int oh = get_global_id(0); 29 | int oc = get_global_id(1); 30 | 31 | int stride; 32 | int write_output = 0; 33 | __global half *src; 34 | 35 | __global half8 *w8 = (__global half8 *)(&w[oc * IC]); 36 | __global half *w1 = (__global half *)(&w[oc * IC]); 37 | 38 | for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) { 39 | uint iw = ow; 40 | uint ih = oh; 41 | 42 | half8 val8_0 = 0.0f; 43 | half8 val8_1 = 0.0f; 44 | half8 val8_2 = 0.0f; 45 | half8 val8_3 = 0.0f; 46 | half8 val8_4 = 0.0f; 47 | half8 val8_5 = 0.0f; 48 | half8 val8_6 = 0.0f; 49 | half8 val8_7 = 0.0f; 50 | 51 | __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]); 52 | __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]); 53 | __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]); 54 | __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]); 55 | __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]); 56 | __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]); 57 | __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]); 58 | __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]); 59 | 60 | for (uint ic = 0; ic < IC / 8; ++ic) { 61 | val8_0 += 
(in8_0[ic]) * (w8[ic]); 62 | val8_1 += (in8_1[ic]) * (w8[ic]); 63 | val8_2 += (in8_2[ic]) * (w8[ic]); 64 | val8_3 += (in8_3[ic]) * (w8[ic]); 65 | val8_4 += (in8_4[ic]) * (w8[ic]); 66 | val8_5 += (in8_5[ic]) * (w8[ic]); 67 | val8_6 += (in8_6[ic]) * (w8[ic]); 68 | val8_7 += (in8_7[ic]) * (w8[ic]); 69 | } 70 | 71 | half val_0 = 0.0f; 72 | half val_1 = 0.0f; 73 | half val_2 = 0.0f; 74 | half val_3 = 0.0f; 75 | half val_4 = 0.0f; 76 | half val_5 = 0.0f; 77 | half val_6 = 0.0f; 78 | half val_7 = 0.0f; 79 | for (uint ic = IC & (~0x7); ic < IC; ++ic) { 80 | val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic)); 81 | val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic)); 82 | val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic)); 83 | val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic)); 84 | val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic)); 85 | val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic)); 86 | val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic)); 87 | val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic)); 88 | } 89 | out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0; 90 | out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1; 91 | out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2; 92 | out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3; 93 | out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4; 94 | out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5; 95 | out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6; 96 | out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7; 97 | } 98 | for (uint ow = (OW & (~0x7)); ow < OW; ow++) { 99 | 100 | uint iw = ow; 101 | uint ih = oh; 102 | 103 | half8 val8 = 0.0f; 104 | 105 | __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]); 106 | 107 | for (uint ic = 0; ic < 
IC / 8; ++ic) { 108 | val8 += (in8[ic]) * (w8[ic]); 109 | } 110 | 111 | half val = 0.0f; 112 | for (uint ic = (IC & (~0x7)); ic < IC; ++ic) { 113 | val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic)); 114 | } 115 | out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val; 116 | } 117 | 118 | barrier(CLK_LOCAL_MEM_FENCE); 119 | 120 | event_t e2 = async_work_group_copy( 121 | out + get_group_id(1) * OW * OH + get_group_id(0) * OW, 122 | out_local, 123 | OW, 124 | 0); 125 | wait_group_events(1, &e2); 126 | } 127 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/convolution3x3.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:021bb40840ff35506972e6f6a7dea1b5f40a8db0927aaa9a6c116b152e386851 3 | size 5748 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/convolution3x3.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __kernel void Convolution3x3( 9 | const __global half *in_param, 10 | const __global half *out, 11 | const __global half *w, 12 | int IW, 13 | int IH, 14 | int IC, 15 | int OW, 16 | int OH, 17 | int OC, 18 | int KX, 19 | int KY, 20 | int stride_x, 21 | int stride_y, 22 | int pad_x, 23 | int pad_y, 24 | int dilation_x, 25 | int dilation_y) 26 | { 27 | __local half in_local[8 * 1024]; 28 | __local half out_local[8 * 1024]; 29 | __local half w_local[8 * 1024]; 30 | 31 | const int sizePlane = IW * IH; 32 | event_t e1 = async_work_group_copy_2D2D( 33 | in_local, // dst 34 | in_param + get_group_id(0) * stride_y * IW, // src 35 | 3 * IW, // num_elements_per_line, 36 
| IC, // num_lines, 37 | IW * IH - 3 * IW, // src_line_stride, 38 | 0, // dst_line_stride, 39 | 0); 40 | wait_group_events(1, &e1); 41 | 42 | const int sizeWeight = IC * 3 * 3; 43 | e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0); 44 | wait_group_events(1, &e1); 45 | 46 | int oh = get_global_id(0); 47 | int oc = get_global_id(1); 48 | 49 | __local half *in = (__local half *)in_local + 1; 50 | 51 | int stride; 52 | int write_output = 0; 53 | __local half *src; 54 | 55 | if ((stride_x == 1) && (stride_y == 1)) { 56 | stride = OW / 8; 57 | write_output = 1; 58 | } 59 | if ((stride_x == 2) && (stride_y == 2)) { 60 | stride = OW / 4; 61 | write_output = 2; 62 | } 63 | 64 | for (int ow = 0; ow < stride; ow++) { 65 | float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; 66 | for (int ic = 0; ic < IC; ++ic) { 67 | src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow); 68 | __local half *k = (__local half *)(w_local + ic * 3 * 3); 69 | 70 | half8 aux_in00 = *((__local half8 *)src - 1); 71 | half8 aux_in01 = *((__local half8 *)src + 0); 72 | half8 aux_in02 = *((__local half8 *)src + 1); 73 | half8 aux_in10 = *((__local half8 *)(src + IW) - 1); 74 | half8 aux_in11 = *((__local half8 *)(src + IW) + 0); 75 | half8 aux_in12 = *((__local half8 *)(src + IW) + 1); 76 | half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1); 77 | half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0); 78 | half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1); 79 | 80 | short8 in00 = *((short8 *)&aux_in00); 81 | short8 in01 = *((short8 *)&aux_in01); 82 | short8 in02 = *((short8 *)&aux_in02); 83 | short8 in10 = *((short8 *)&aux_in10); 84 | short8 in11 = *((short8 *)&aux_in11); 85 | short8 in12 = *((short8 *)&aux_in12); 86 | short8 in20 = *((short8 *)&aux_in20); 87 | short8 in21 = *((short8 *)&aux_in21); 88 | short8 in22 = *((short8 *)&aux_in22); 89 | 90 | short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14); 91 | 
short8 aux_aux01 = in01; 92 | short8 aux_aux02 = __builtin_shave_cmu_alignvec_rri_short8(in01, in02, 2); 93 | short8 aux_aux10 = __builtin_shave_cmu_alignvec_rri_short8(in10, in11, 14); 94 | short8 aux_aux11 = in11; 95 | short8 aux_aux12 = __builtin_shave_cmu_alignvec_rri_short8(in11, in12, 2); 96 | short8 aux_aux20 = __builtin_shave_cmu_alignvec_rri_short8(in20, in21, 14); 97 | short8 aux_aux21 = in21; 98 | short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2); 99 | 100 | half8 aux00 = *((half8 *)&aux_aux00); 101 | half8 aux01 = *((half8 *)&aux_aux01); 102 | half8 aux02 = *((half8 *)&aux_aux02); 103 | half8 aux10 = *((half8 *)&aux_aux10); 104 | half8 aux11 = *((half8 *)&aux_aux11); 105 | half8 aux12 = *((half8 *)&aux_aux12); 106 | half8 aux20 = *((half8 *)&aux_aux20); 107 | half8 aux21 = *((half8 *)&aux_aux21); 108 | half8 aux22 = *((half8 *)&aux_aux22); 109 | 110 | half8 w00 = (half8)(*(k + 0)); 111 | half8 w01 = (half8)(*(k + 1)); 112 | half8 w02 = (half8)(*(k + 2)); 113 | half8 w10 = (half8)(*(k + 3)); 114 | half8 w11 = (half8)(*(k + 4)); 115 | half8 w12 = (half8)(*(k + 5)); 116 | half8 w20 = (half8)(*(k + 6)); 117 | half8 w21 = (half8)(*(k + 7)); 118 | half8 w22 = (half8)(*(k + 8)); 119 | 120 | val += convert_float8(aux00) * convert_float8(w00); 121 | val += convert_float8(aux01) * convert_float8(w01); 122 | val += convert_float8(aux02) * convert_float8(w02); 123 | val += convert_float8(aux10) * convert_float8(w10); 124 | val += convert_float8(aux11) * convert_float8(w11); 125 | val += convert_float8(aux12) * convert_float8(w12); 126 | val += convert_float8(aux20) * convert_float8(w20); 127 | val += convert_float8(aux21) * convert_float8(w21); 128 | val += convert_float8(aux22) * convert_float8(w22); 129 | } 130 | if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246); 131 | if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val); 132 | } 133 | 134 | for (int ow = OW & ~(0x7); ow < 
OW; ow++) { 135 | float val = 0.0f; 136 | for (int ic = 0; ic < IC; ++ic) { 137 | for (int ky = 0; ky < 3; ++ky) { 138 | for (int kx = 0; kx < 3; ++kx) { 139 | int iw = ow * stride_x - pad_x + kx * dilation_x; 140 | int ih = oh * stride_y - pad_y + ky * dilation_y; 141 | 142 | val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw]) 143 | * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]); 144 | } 145 | } 146 | } 147 | out_local[ow] = convert_half(val); 148 | } 149 | 150 | barrier(CLK_LOCAL_MEM_FENCE); 151 | 152 | event_t e2 = async_work_group_copy( 153 | out + get_group_id(1) * OW * OH + get_group_id(0) * OW, 154 | out_local, 155 | OW, 156 | 0); 157 | wait_group_events(1, &e2); 158 | } 159 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/correlate.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e2b24b1b5bfd1786128682ee814230653b4b63aad5b472feec9c6f4a4c833e2f 3 | size 14336 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/correlate.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | 7 | #define MAX_OPENCL_BUFF_SIZE 64 * 1024 8 | 9 | #define USE_DMA 1 10 | 11 | #if defined(USE_DMA) 12 | void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride) 13 | { 14 | item_dma_event_t copyEvent = 15 | WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0); 16 | WaitWorkItemDmaEvents(1, &copyEvent); 17 | } 18 | 19 | void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride) 20 | { 21 | item_dma_event_t copyEvent = 22 | 
WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0); 23 | WaitWorkItemDmaEvents(1, &copyEvent); 24 | } 25 | #endif 26 | 27 | void memzero(void *ptr, size_t num) 28 | { 29 | float4 *line0_ = (float4 *)ptr; 30 | #pragma unroll 16 31 | for (int i = 0; i < num / 16; i++) { 32 | line0_[i] = (float4){0.f, 0.f, 0.f, 0.f}; 33 | } 34 | uchar *ptr_ = (uchar *)ptr; 35 | for (int i = num / 16 * 16; i < num; i++) { 36 | ptr_[i] = 0; 37 | } 38 | } 39 | 40 | void __attribute__((noinline)) crosscorrh( 41 | __private const half *restrict line0, 42 | __private const half *restrict line1, 43 | __private half *restrict dline, 44 | int topwidth, 45 | int max_displacement, 46 | int neighborhood_grid_radius, 47 | int kernel_size, 48 | int padding, 49 | int bottomwidth, 50 | int stride1, 51 | int stride2, 52 | int max_channels, 53 | int cur_subchannels) 54 | { 55 | if (max_channels == 64) { 56 | for (int i = 0; i < kernel_size; i++) { 57 | int x1 = max_displacement - padding + i; 58 | int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1; 59 | x1 += offset1 * stride1; 60 | 61 | for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) { 62 | int x2 = x1 - neighborhood_grid_radius * stride2; 63 | int offset2 = x2 >= 0 ? 
0 : (-x2 + stride2 - 1) / stride2; 64 | x2 += offset2 * stride2; 65 | 66 | for (int top_channel_x = offset2 - neighborhood_grid_radius; 67 | top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth; 68 | top_channel_x++, x2 += stride2) { 69 | half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; 70 | 71 | half8 *src0 = (half8 *)(line0 + x1 * max_channels); 72 | half8 *src1 = (half8 *)(line1 + x2 * max_channels); 73 | 74 | #pragma unroll 8 75 | for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]); 76 | 77 | half sum = __builtin_shave_sau_sumx_f16_r(sum4); 78 | dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum); 79 | } 80 | } 81 | } 82 | } else { 83 | int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1; 84 | 85 | for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) { 86 | for (int i = 0; i < kernel_size; i++) { 87 | int x1 = blockIdx_x * stride1 + max_displacement + i - padding; 88 | 89 | if ((x1 >= 0) && (x1 < bottomwidth)) { 90 | int o_min = -neighborhood_grid_radius * stride2; 91 | int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2; 92 | if ((o_min) < (-x1)) { 93 | o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2; 94 | } 95 | if ((o_max) >= (bottomwidth + stride2 - x1)) { 96 | o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2; 97 | } 98 | 99 | int o = o_min; 100 | for (; o <= o_max - 4 * stride2; o += 4 * stride2) { 101 | half8 *bottom0 = (half8 *)(line0 + x1 * max_channels); 102 | half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels); 103 | half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels); 104 | half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels); 105 | half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels); 106 | 107 | int c = 0; 108 | 109 | half8 sum40 = 0; 110 | half8 sum41 = 0; 111 | half8 sum42 = 0; 112 | half8 sum43 = 0; 113 | 
114 | for (; c <= cur_subchannels / 8 - 4; c += 4) { 115 | sum40 += bottom0[c + 0] * bottom1_0[c + 0]; 116 | sum40 += bottom0[c + 1] * bottom1_0[c + 1]; 117 | sum40 += bottom0[c + 2] * bottom1_0[c + 2]; 118 | sum40 += bottom0[c + 3] * bottom1_0[c + 3]; 119 | 120 | sum41 += bottom0[c + 0] * bottom1_1[c + 0]; 121 | sum41 += bottom0[c + 1] * bottom1_1[c + 1]; 122 | sum41 += bottom0[c + 2] * bottom1_1[c + 2]; 123 | sum41 += bottom0[c + 3] * bottom1_1[c + 3]; 124 | 125 | sum42 += bottom0[c + 0] * bottom1_2[c + 0]; 126 | sum42 += bottom0[c + 1] * bottom1_2[c + 1]; 127 | sum42 += bottom0[c + 2] * bottom1_2[c + 2]; 128 | sum42 += bottom0[c + 3] * bottom1_2[c + 3]; 129 | 130 | sum43 += bottom0[c + 0] * bottom1_3[c + 0]; 131 | sum43 += bottom0[c + 1] * bottom1_3[c + 1]; 132 | sum43 += bottom0[c + 2] * bottom1_3[c + 2]; 133 | sum43 += bottom0[c + 3] * bottom1_3[c + 3]; 134 | } 135 | 136 | for (; c < cur_subchannels / 8; c++) { 137 | sum40 += bottom0[c] * bottom1_0[c]; 138 | sum41 += bottom0[c] * bottom1_1[c]; 139 | sum42 += bottom0[c] * bottom1_2[c]; 140 | sum43 += bottom0[c] * bottom1_3[c]; 141 | } 142 | 143 | half sum0 = __builtin_shave_sau_sumx_f16_r(sum40); 144 | half sum1 = __builtin_shave_sau_sumx_f16_r(sum41); 145 | half sum2 = __builtin_shave_sau_sumx_f16_r(sum42); 146 | half sum3 = __builtin_shave_sau_sumx_f16_r(sum43); 147 | 148 | for (c = c * 8; c < cur_subchannels; c++) { 149 | sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c]; 150 | sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c]; 151 | sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c]; 152 | sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c]; 153 | } 154 | 155 | dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] += 156 | sum0; 157 | dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] 
+= 158 | sum1; 159 | dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] += 160 | sum2; 161 | dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] += 162 | sum3; 163 | } 164 | 165 | for (; o < o_max; o += 1 * stride2) { 166 | half8 *bottom0 = (half8 *)(line0 + x1 * max_channels); 167 | half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels); 168 | 169 | int c = 0; 170 | 171 | half8 sum4 = 0; 172 | for (; c <= cur_subchannels / 8 - 4; c += 4) { 173 | sum4 += bottom0[c + 0] * bottom1[c + 0]; 174 | sum4 += bottom0[c + 1] * bottom1[c + 1]; 175 | sum4 += bottom0[c + 2] * bottom1[c + 2]; 176 | sum4 += bottom0[c + 3] * bottom1[c + 3]; 177 | } 178 | for (; c < cur_subchannels / 8; c++) { 179 | sum4 += bottom0[c] * bottom1[c]; 180 | } 181 | 182 | half sum = __builtin_shave_sau_sumx_f16_r(sum4); 183 | 184 | for (c = c * 8; c < cur_subchannels; c++) { 185 | sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c]; 186 | } 187 | 188 | dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum; 189 | } 190 | } 191 | } 192 | } 193 | } 194 | } 195 | 196 | __kernel void correlate2_half( 197 | __global const half *restrict bottom0, 198 | __global const half *restrict bottom1, 199 | __global half *restrict top, 200 | int topwidth, 201 | int topheight, 202 | int bottomwidth, 203 | int bottomheight, 204 | int bottomchannels, 205 | int max_displacement, 206 | int padding, 207 | int neighborhood_grid_radius, 208 | int neighborhood_grid_width, 209 | int kernel_size, 210 | int stride1, 211 | int stride2) 212 | { 213 | int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth); 214 | if (max_channels > 64) max_channels = 64; 215 | int subchannels_count = (bottomchannels + max_channels - 1) / max_channels; 216 | int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count; 217 | if 
(subchannels < max_channels) subchannels = max_channels; 218 | 219 | const int sumelems = kernel_size * kernel_size * bottomchannels; 220 | 221 | __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)]; 222 | 223 | __private half *line0 = cmx; 224 | __private half *line1 = line0 + bottomwidth * subchannels; 225 | __private half *dline = line1 + bottomwidth * subchannels; 226 | 227 | int blockIdx_y = get_global_id(0); 228 | 229 | #if defined(USE_DMA) 230 | __private half *dmabuf = dline + topwidth * neighborhood_grid_width; 231 | #endif 232 | 233 | int y1 = blockIdx_y * stride1 + max_displacement; 234 | 235 | for (int j = 0; j < kernel_size; j++) { 236 | for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) { 237 | // configure channel batching 238 | int startchannel = bottomchannel; 239 | int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels; 240 | int deltachannels = endchannel - startchannel; 241 | 242 | // load line form blob 0 with repackaging 243 | if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) { 244 | #if defined(USE_DMA) 245 | __global const half *curr = 246 | bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth; 247 | dmacpyLineSrcStrideStart( 248 | curr, 249 | dmabuf, 250 | bottomwidth * deltachannels * sizeof(half), 251 | bottomwidth * sizeof(half), 252 | bottomwidth * bottomheight * sizeof(half)); 253 | 254 | for (int ch = 0; ch < deltachannels; ch++) { 255 | for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) { 256 | half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x]; 257 | line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0]; 258 | line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1]; 259 | line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2]; 260 | line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3]; 261 | 262 | line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4]; 
263 | line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5]; 264 | line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6]; 265 | line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7]; 266 | } 267 | 268 | for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) { 269 | line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth]; 270 | } 271 | } 272 | 273 | if (deltachannels < subchannels) 274 | for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) 275 | memzero( 276 | line0 + blockIdx_x * max_channels + deltachannels, 277 | (subchannels - deltachannels) * sizeof(half)); 278 | #else 279 | for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) { 280 | for (int ch = 0; ch < deltachannels; ch++) 281 | line0[blockIdx_x * max_channels + ch] = bottom0 282 | [(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth 283 | + blockIdx_x]; 284 | 285 | if (deltachannels < subchannels) 286 | memzero( 287 | line0 + blockIdx_x * max_channels + deltachannels, 288 | (subchannels - deltachannels) * sizeof(half)); 289 | } 290 | #endif 291 | } else 292 | memzero(line0, max_channels * bottomwidth * sizeof(half)); 293 | 294 | for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) { 295 | int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2; 296 | 297 | if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) { 298 | #if defined(USE_DMA) 299 | __global const half *curr = 300 | bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth; 301 | dmacpyLineSrcStrideStart( 302 | curr, 303 | dmabuf, 304 | bottomwidth * deltachannels * sizeof(half), 305 | bottomwidth * sizeof(half), 306 | bottomwidth * bottomheight * sizeof(half)); 307 | 308 | for (int ch = 0; ch < deltachannels; ch++) { 309 | for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) { 310 | half8 val = ((half8 *)(dmabuf + ch * 
bottomwidth))[blockIdx_x]; 311 | line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0]; 312 | line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1]; 313 | line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2]; 314 | line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3]; 315 | 316 | line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4]; 317 | line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5]; 318 | line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6]; 319 | line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7]; 320 | } 321 | 322 | for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) { 323 | line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth]; 324 | } 325 | } 326 | #else 327 | for (int ch = 0; ch < deltachannels; ch++) { 328 | for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) { 329 | half8 val = (( 330 | __global half8 331 | *)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth)) 332 | [blockIdx_x]; 333 | line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0]; 334 | line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1]; 335 | line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2]; 336 | line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3]; 337 | 338 | line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4]; 339 | line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5]; 340 | line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6]; 341 | line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7]; 342 | } 343 | for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) { 344 | half val = 345 | (bottom1 + (ch + startchannel) * bottomheight * bottomwidth 346 | + (y2 + j - padding) * bottomwidth)[blockIdx_x]; 347 | line1[(blockIdx_x)*max_channels + ch] = val; 348 | } 349 | } 350 | #endif 351 | for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) { 352 | if (deltachannels < 
subchannels) 353 | memzero( 354 | line1 + blockIdx_x * max_channels + deltachannels, 355 | (subchannels - deltachannels) * sizeof(half)); 356 | } 357 | } else 358 | memzero(line1, max_channels * bottomwidth * sizeof(half)); 359 | 360 | if (j == 0 && startchannel == 0) { 361 | memzero(dline, neighborhood_grid_width * topwidth * sizeof(half)); 362 | } else { 363 | #if defined(USE_DMA) 364 | dmacpyLineSrcStrideStart( 365 | top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth, 366 | dline, 367 | topwidth * neighborhood_grid_width * sizeof(half), 368 | topwidth * sizeof(half), 369 | topwidth * topheight * sizeof(half)); 370 | #else 371 | for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) { 372 | for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) { 373 | half8 val = (( 374 | __global half8 375 | *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth))) 376 | [blockIdx_x]; 377 | ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val; 378 | } 379 | for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) { 380 | dline[top_channel_x * topwidth + blockIdx_x] = 381 | top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth 382 | + blockIdx_y * topwidth + blockIdx_x]; 383 | } 384 | } 385 | #endif 386 | } 387 | 388 | if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0 389 | && y2 + j - padding < bottomheight) { 390 | crosscorrh( 391 | line0, 392 | line1, 393 | dline, 394 | topwidth, 395 | max_displacement, 396 | neighborhood_grid_radius, 397 | kernel_size, 398 | padding, 399 | bottomwidth, 400 | stride1, 401 | stride2, 402 | max_channels, 403 | subchannels); 404 | } 405 | 406 | if (j == kernel_size - 1 && endchannel == bottomchannels) { 407 | half8 scale = (half8){ 408 | (half)sumelems, 409 | (half)sumelems, 410 | (half)sumelems, 411 | 
(half)sumelems, 412 | (half)sumelems, 413 | (half)sumelems, 414 | (half)sumelems, 415 | (half)sumelems}; 416 | for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) { 417 | for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) { 418 | ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = 419 | ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale; 420 | } 421 | for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) { 422 | dline[top_channel_x * topwidth + blockIdx_x] = 423 | dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems; 424 | } 425 | } 426 | } 427 | 428 | #if defined(USE_DMA) 429 | dmacpyLineDstStrideStart( 430 | dline, 431 | top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth, 432 | topwidth * neighborhood_grid_width * sizeof(half), 433 | topwidth * sizeof(half), 434 | topwidth * topheight * sizeof(half)); 435 | #else 436 | for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) { 437 | for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) { 438 | ((__global half8 439 | *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth))) 440 | [blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] 441 | + (half8){0, 0, 0, 0, 0, 0, 0, 0}; 442 | } 443 | for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) { 444 | top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth 445 | + blockIdx_y * topwidth + blockIdx_x] = 446 | dline[top_channel_x * topwidth + blockIdx_x] + (half)0; 447 | } 448 | } 449 | #endif 450 | } 451 | } 452 | } 453 | } 454 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/ctc.bin: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1
oid sha256:292de0fbb8dc6ead6970576d1b9a26a323fc9febfceb92c3af6b84496d523def
size 10196

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/ctc.cl:
--------------------------------------------------------------------------------
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Linear search: returns a pointer to the first element in [begin, end) that
// compares equal to `value`, or `end` when there is none.
// The return type is const-qualified because the result aliases the
// const-qualified input range.
__global const half *find(__global const half *begin, __global const half *end, half value)
{
    while (begin != end) {
        if (*begin == value) {
            return begin;
        }
        ++begin;
    }
    return end;
}

// Greedy CTC decoding.
//
// With T = channels (time steps), B = height (batches) and C = width
// (classes): for every batch and time step the class with the highest
// probability is selected; it is emitted unless it is the last class
// (index C - 1) or repeats the class chosen at the previous time step.
// Unused output slots keep the fill value -1.
//
// The sequence length of a batch is the index of the first 0 found in
// `sequence_indicators` starting from its second element, or T if none.
//
// NOTE(review): the local buffers are fixed at 88 * 1 * 77 and 88 * 1
// elements, so the kernel presumably expects T <= 88, B == 1, C <= 77 --
// confirm against the layer binding before using other shapes.
__kernel void CTCDecoder(
    __global half *restrict probabilities,
    __global half *restrict sequence_indicators,
    __global half *restrict output,
    int width,
    int height,
    int channels)
{
    __local half local_src[88 * 1 * 77];
    __local half local_dst[88 * 1];

    event_t e1 = async_work_group_copy_2D2D(
        local_src,             // dst
        probabilities,         // src
        width,                 // num_elements_per_line,
        height * channels,     // num_lines,
        width * (height - 1),  // src_line_stride,
        width * (height - 1),  // dst_line_stride,
        0);

    wait_group_events(1, &e1);

    const int T = channels;  // Time
    const int B = height;    // Batches
    const int C = width;     // Chars

#pragma unroll 4
    for (int i = 0; i < B * T; i++) {
        local_dst[i] = -1.h;
    }

    for (int b = 0; b < B; ++b) {
        // Write position inside this batch's output row. It restarts at 0 for
        // every batch because it is added to the row base `b * T` below;
        // carrying it over from the previous batch would push the writes past
        // the end of the row.
        int output_index = 0;

        __global const half *restrict seq_ind = sequence_indicators + b * T;
        const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
        const int time = min(seq_len, T);

        int prev_class_idx = -1;

#pragma unroll 4
        for (int t = 0; t < time; ++t) {
            __local const half *restrict probs = local_src + b * C + t * C * B;

            // Arg-max over the classes of this time step.
            int max_class_idx = 0;
            half max_prob = probs[0];
            for (int c = 1; c < C; ++c) {
                const half prob = probs[c];
                if (prob > max_prob) {
                    max_class_idx = c;
                    max_prob = prob;
                }
            }

            // Drop the last class (C - 1) and immediate repeats.
            if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
                local_dst[b * T + output_index] = (half)max_class_idx;
                output_index++;
            }

            prev_class_idx = max_class_idx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        output,     // dst
        local_dst,  // src
        channels,   // num_elements_per_line,
        height,     // num_lines,
        0,          // src_line_stride,
        0,          // dst_line_stride,
        0);

    wait_group_events(1, &e2);
}

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/customLayerBindings.xml:
--------------------------------------------------------------------------------
-->
| 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtf32f16.bin:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:703ef56f84299e76d36b3ba5a632ae3d5e3ecd54761dcfe0006ca69ddce4bc6d
size 2664

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtf32f16.cl:
--------------------------------------------------------------------------------
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Element-wise float -> half conversion with an affine transform:
//     outImage[i] = half(inImage[i] * scale + bais)
// One work-item per element; `i` is the row-major linearisation of the 3-D
// global id. The parameter spelling `bais` is kept as-is because kernel
// arguments may be bound by name outside this file.
__kernel void cvtf32f16(const __global float* restrict inImage,
                        __global half* restrict outImage,
                        float scale,
                        float bais)
{
    const size_t size_x = get_global_size(0);
    const size_t size_y = get_global_size(1);

    const size_t gid_x = get_global_id(0);
    const size_t gid_y = get_global_id(1);
    const size_t gid_z = get_global_id(2);

    const int idx = (int)(gid_x + gid_y * size_x + gid_z * size_x * size_y);

    const float transformed = inImage[idx] * scale + bais;
    outImage[idx] = convert_half(transformed);
}

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtu8f16.bin:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:46c943e08f37cedac77f727f55835637d4878edcc20aaa24f16ed5888d13bd43
size 4588

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtu8f16.cl:
--------------------------------------------------------------------------------
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Element-wise uchar -> half conversion with an affine transform:
//     dst[i] = half(src[i]) * half(scale) + half(bias)
// Each work-group stages its 3-D tile of `src` in local memory, every
// work-item converts one element, and the tile is copied back out to `dst`.
__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
{
    __local uchar local_src[8 * 1024];
    __local half local_dst[8 * 1024];

    const size_t local_x = get_local_size(0);
    const size_t local_y = get_local_size(1);
    const size_t local_z = get_local_size(2);
    const size_t global_x = get_global_size(0);
    const size_t global_y = get_global_size(1);

    // Strides that skip from the end of one tile line/plane to the start of
    // the next one in the full-size global buffers.
    const size_t global_line_stride = global_x - local_x;
    const size_t global_plane_stride = global_x * (global_y - local_y);

    // Stage the input tile: strided in global memory, packed in local memory.
    event_t load_done = async_work_group_copy_3D3D(
        local_src,                                                   // dst
        src + get_group_id(0) * local_x + get_group_id(1) * local_y * global_x
            + get_group_id(2) * local_z * global_x * global_y,       // src
        local_x,                                                     // num_elements_per_line
        local_x * local_y / (local_x),                               // num_lines
        global_line_stride,                                          // src_line_stride
        0,                                                           // dst_line_stride
        local_z,                                                     // num planes
        global_plane_stride,                                         // src plane stride
        0,                                                           // dst plane stride
        0);
    wait_group_events(1, &load_done);

    // Row-major position of this work-item inside the packed local tile.
    size_t idx = get_local_id(0) + get_local_id(1) * local_x + get_local_id(2) * local_x * local_y;

    local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Write the converted tile back: packed in local memory, strided in
    // global memory.
    event_t store_done = async_work_group_copy_3D3D(
        dst + get_group_id(0) * local_x + get_group_id(1) * local_y * global_x
            + get_group_id(2) * local_z * global_x * global_y,       // dst
        local_dst,                                                   // src
        local_x,                                                     // num_elements_per_line
        local_y,                                                     // num_lines
        0,                                                           // src_line_stride
        global_line_stride,                                          // dst_line_stride
        local_z,                                                     // num_planes
        0,                                                           // src_plane_stride
        global_plane_stride,                                         // dst_plane_stride
        0);
    wait_group_events(1, &store_done);
}

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid
sha256:4c80d556d23f1c959fa10c00ff1cd9c3ae10aba607b37c7a0620d903fc7cedd8
size 6972

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl:
--------------------------------------------------------------------------------
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Generates a grid of prior boxes: for every grid cell handled by this
// work-group and every prior, the four prior values are shifted by the cell
// centre, i.e. by stride_w * (x + 0.5) for components 0 and 2 and by
// stride_h * (y + 0.5) for components 1 and 3.
//
// Work-group (gx, gy) covers grid row `gy` and the columns starting at
// gx * local_size(0), clipped to `grid_w`. `input_feature_map` and
// `input_rois` are not read by the kernel body.
//
// NOTE(review): priors are read with a fixed pitch of 4 while results are
// written with a pitch of `num_anchors_per_prior`; the two agree only when
// num_anchors_per_prior == 4 -- confirm with the caller.
__kernel void experimental_detectron_prior_grid_generator(
    __global const half *restrict input_priors,
    __global const half *restrict input_feature_map,
    __global const half *restrict input_rois,
    __global half *restrict output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
    int num_anchors_per_prior)
{
    __local half local_input_priors[8 * 1024];
    __local half local_output[8 * 1024];

    // Stage all priors in local memory.
    event_t priors_loaded =
        async_work_group_copy(local_input_priors, input_priors, num_anchors_per_prior * num_priors, 0);
    wait_group_events(1, &priors_loaded);

    // Column range of this work-group, clipped to the grid width.
    int first_col = get_group_id(0) * get_local_size(0);
    int last_col = min(first_col + get_local_size(0), (unsigned)grid_w);
    int tile_width = last_col - first_col;

    int row = get_group_id(1);

    for (int col = 0; col < tile_width; ++col) {
#pragma unroll 4
        for (int prior = 0; prior < num_priors; ++prior) {
            __local const half *prior_box = local_input_priors + 4 * prior;
            __local half *shifted_box = local_output + (col * num_priors + prior) * num_anchors_per_prior;

            shifted_box[0] = prior_box[0] + convert_half(stride_w) * (convert_half(first_col + col) + 0.5);
            shifted_box[1] = prior_box[1] + convert_half(stride_h) * (convert_half(row) + 0.5);
            shifted_box[2] = prior_box[2] + convert_half(stride_w) * (convert_half(first_col + col) + 0.5);
            shifted_box[3] = prior_box[3] + convert_half(stride_h) * (convert_half(row) + 0.5);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Copy this work-group's single output line back to global memory.
    const int line_elements = tile_width * num_anchors_per_prior * num_priors;
    const int line_stride = (grid_w - tile_width) * num_anchors_per_prior * num_priors;

    event_t output_stored = async_work_group_copy_2D2D(
        output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
            + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
                * num_priors,  // dst
        local_output,          // src
        line_elements,         // num_elements_per_line
        1,                     // num_lines
        line_stride,           // src_line_stride
        line_stride,           // dst_line_stride
        0);
    wait_group_events(1, &output_stored);
}

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/fakequantize.bin:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:d17659bbf12a172849085003a055bfb4b91d3bb5bdc7f820395820eaa90b46ef
size 15688

--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/fakequantize.cl:
--------------------------------------------------------------------------------
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void quantize(
    __global const half *restrict src_data,
    __global const half *restrict input_low,
    __global const half *restrict input_high,
    __global const half *restrict output_low,
    __global const half *restrict output_high,
    __global half *restrict dst_data,
    int levels,
    int input_low_size,
    int input_high_size,
int output_low_size, 19 | int output_high_size, 20 | int W, 21 | int H) 22 | { 23 | __local half local_src[15 * 1024]; 24 | __local half local_dst[15 * 1024]; 25 | 26 | event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0); 27 | wait_group_events(1, &e1); 28 | 29 | int c = get_group_id(2); 30 | 31 | half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]); 32 | half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]); 33 | half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]); 34 | half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]); 35 | 36 | half const1 = (half)( 37 | !(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow))); 38 | half const2 = 39 | (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1)); 40 | 41 | __local const half *restrict src = local_src + W * get_local_id(1); 42 | __local half *restrict dst = local_dst + W * get_local_id(1); 43 | 44 | for (int w = 0; w < W / 8; w++) { 45 | half8 val = *((__local half8 *)src + w); 46 | half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h; 47 | 48 | aux = (half8){ 49 | (half)(short)(aux.s0), 50 | (half)(short)(aux.s1), 51 | (half)(short)(aux.s2), 52 | (half)(short)(aux.s3), 53 | (half)(short)(aux.s4), 54 | (half)(short)(aux.s5), 55 | (half)(short)(aux.s6), 56 | (half)(short)(aux.s7)}; 57 | 58 | aux = aux * (half8)const2 + (half8)h_olow; 59 | 60 | short8 a; 61 | short8 b; 62 | a.s0 = (val.s0 <= h_ilow); 63 | a.s1 = (val.s1 <= h_ilow); 64 | a.s2 = (val.s2 <= h_ilow); 65 | a.s3 = (val.s3 <= h_ilow); 66 | a.s4 = (val.s4 <= h_ilow); 67 | a.s5 = (val.s5 <= h_ilow); 68 | a.s6 = (val.s6 <= h_ilow); 69 | a.s7 = (val.s7 <= h_ilow); 70 | 71 | b.s0 = (val.s0 > h_ihigh); 72 | b.s1 = (val.s1 > h_ihigh); 73 | b.s2 = (val.s2 > h_ihigh); 74 | b.s3 = (val.s3 > h_ihigh); 75 | b.s4 = (val.s4 > h_ihigh); 76 | b.s5 = (val.s5 > 
h_ihigh); 77 | b.s6 = (val.s6 > h_ihigh); 78 | b.s7 = (val.s7 > h_ihigh); 79 | 80 | a = ~(a - (short8)1); 81 | b = ~(b - (short8)1); 82 | 83 | short8 c1 = (~a & b); 84 | short8 c2 = (~a & ~b); 85 | 86 | short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux)); 87 | 88 | *((__local half8 *)dst + w) = as_half8(res); 89 | } 90 | 91 | for (int w = W & (~0x7); w < W; w++) { 92 | half val = src[w]; 93 | short a = val <= h_ilow; 94 | a = ~(a - 1); 95 | short b = val > h_ihigh; 96 | b = ~(b - 1); 97 | 98 | short c1 = (~a & b); 99 | short c2 = (~a & ~b); 100 | 101 | short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh)) 102 | | (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow))); 103 | 104 | dst[w] = as_half(res); 105 | } 106 | 107 | barrier(CLK_LOCAL_MEM_FENCE); 108 | 109 | event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0); 110 | wait_group_events(1, &e2); 111 | } 112 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/grn.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6e3dbe5173ca93f39fecaf29f820e1704bcb485affc1a09554e4c86f8de46214 3 | size 7972 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/grn.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | // grn: multiplies each of the C channel values at a work-item's position by rsqrt(bias + 1e-9 + sum over channels of value^2). 8 | __kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias) 9 | { 10 | __local half src[8 * 1024]; 11 | __local half dst[8 * 1024]; 12 | 13 | const size_t index = get_group_id(0) * 
get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0); 14 | 15 | event_t e1 = async_work_group_copy_3D3D( 16 | src, // dst 17 | src_data + index, // src 18 | get_local_size(0), // num_elements_per_line, 19 | get_local_size(1), // num_lines, 20 | get_global_size(0) - get_local_size(0), // src_line_stride, 21 | 0, // dst_line_stride, 22 | C, // num_planes, 23 | get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride 24 | 0, // dst_plane_stride 25 | 0); 26 | wait_group_events(1, &e1); 27 | 28 | float variance = bias + 1e-9f; // accumulator: bias plus a small epsilon, then the per-channel squares are added below 29 | 30 | #pragma unroll 8 31 | for (int c = 0; c < C; c++) { 32 | float val = (float)src[c * get_local_size(1) * get_local_size(0) 33 | + get_local_id(1) * get_local_size(0) 34 | + get_local_id(0)]; 35 | variance += val * val; 36 | } 37 | 38 | half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f); // rsqrt(v / 16) * 0.25 equals rsqrt(v); presumably pre-divided so the value cast to half stays within fp16 range 39 | 40 | #pragma unroll 8 41 | for (int c = 0; c < C; c++) { 42 | dst[c * get_local_size(1) * get_local_size(0) 43 | + get_local_id(1) * get_local_size(0) 44 | + get_local_id(0)] = 45 | src[c * get_local_size(1) * get_local_size(0) 46 | + get_local_id(1) * get_local_size(0) + get_local_id(0)] * hvariance; 47 | } 48 | 49 | barrier(CLK_LOCAL_MEM_FENCE); 50 | 51 | event_t e2 = async_work_group_copy_3D3D( 52 | dst_data + index, // dst 53 | dst, // src 54 | get_local_size(0), // num_elements_per_line, 55 | get_local_size(1), // num_lines, 56 | 0, // src_line_stride, 57 | get_global_size(0) - get_local_size(0), // dst_line_stride, 58 | C, // num_planes, 59 | 0, // src_plane_stride 60 | get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride 61 | 0); 62 | wait_group_events(1, &e2); 63 | } 64 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/mvn_reduction.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:581206fc5c0e0d429094bc7076d8772dc6ba69199a3ee75d269f13dc2f0d7ac8 3 | size 7840 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/mvn_reduction.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | // Set to 1 only if output is zeroed before kernel execution 9 | #define USE_ATOMICS 0 10 | 11 | void atomic_add_global(volatile __global float *source, const float operand) 12 | { 13 | union { 14 | unsigned int intVal; 15 | float floatVal; 16 | } newVal; 17 | union { 18 | unsigned int intVal; 19 | float floatVal; 20 | } prevVal; 21 | 22 | do { 23 | prevVal.floatVal = *source; 24 | newVal.floatVal = prevVal.floatVal + operand; 25 | } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); 26 | } 27 | 28 | __kernel void reduction_mean( 29 | __global const half *restrict src, 30 | __global float *restrict mean, 31 | __global float *restrict variance, 32 | int W, 33 | int H, 34 | int across_channels) 35 | { 36 | __local half src_line[4 * 1024]; 37 | event_t e; 38 | 39 | e = async_work_group_copy_2D2D( 40 | src_line, // dst 41 | src + get_group_id(1) * get_local_size(1) * W 42 | + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src 43 | W * get_local_size(1), // num_elements_per_line, 44 | get_local_size(2), // num_lines, 45 | W * (get_global_size(1) - get_local_size(1)), // src_line_stride, 46 | 0, // dst_line_stride, 47 | 0); 48 | 49 | wait_group_events(1, &e); 50 | 51 | int h = get_global_id(1); 52 | int c = get_global_id(2); 53 | 54 | const int MAX_LOCAL_SIZE = 8; 55 | 56 | __local float mbuf[MAX_LOCAL_SIZE]; 57 | __local float vbuf[MAX_LOCAL_SIZE]; 58 | 59 | 
mbuf[get_local_id(1)] = 0; 60 | vbuf[get_local_id(1)] = 0; 61 | 62 | if (h < H) { 63 | float sum = 0.f; 64 | float sum2 = 0.f; 65 | 66 | float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; 67 | float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; 68 | 69 | const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W)); 70 | 71 | #pragma unroll 16 72 | for (size_t w = 0; w < W / 8; w++) { 73 | half8 sh = lsrc[w]; 74 | float8 valf = convert_float8(sh); 75 | 76 | sum4 += valf; 77 | sum24 += valf * valf; 78 | } 79 | 80 | for (size_t w = W / 8 * 8; w < W; w++) { 81 | float val = (float)src_line[get_local_id(1) * W + w]; 82 | sum += val; 83 | sum2 += val * val; 84 | } 85 | 86 | mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; 87 | vbuf[get_local_id(1)] = 88 | sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2; 89 | } 90 | 91 | barrier(CLK_LOCAL_MEM_FENCE); 92 | 93 | if (get_local_id(1) == 0) { 94 | float res = 0; 95 | float res2 = 0; 96 | 97 | for (int i = 0; i < get_local_size(1); i++) { 98 | res += mbuf[i]; 99 | res2 += vbuf[i]; 100 | } 101 | 102 | // requires memory reset before layer execution 103 | #if USE_ATOMICS 104 | int idx = (across_channels == 0) ? 
c : 0; 105 | 106 | atomic_add_global(mean + idx, res); 107 | atomic_add_global(variance + idx, res2); 108 | #else 109 | int idx = c * get_num_groups(1) + get_group_id(1); 110 | 111 | mean[idx] = res; 112 | variance[idx] = res2; 113 | #endif 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/mvn_scale.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:527d82ec9e71bbbfaf86c1e3f1b2beea02875cf719a45592b1f3d5e244e5c15c 3 | size 3564 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/mvn_scale.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | // Set to 1 only if output is zeroed before kernel execution 9 | #define USE_ATOMICS 0 10 | // mvn_scale: subtracts the mean from one W-element row and, when normalize_variance != 0, divides by the standard deviation; mean_part / power_mean are expected to hold partial sums of x and x^2. 11 | __attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale( 12 | const __global half *restrict src, 13 | __global float *restrict mean_part, 14 | __global float *restrict power_mean, 15 | __global half *restrict dst, 16 | int W, 17 | int H1, 18 | int across_channels, 19 | int normalize_variance, 20 | int nparts) 21 | { 22 | __local half src_line[4 * 1024]; 23 | __local half dst_line[4 * 1024]; 24 | 25 | int c = get_group_id(2); 26 | int C = get_global_size(2); 27 | 28 | int h = get_group_id(1); 29 | int H = get_global_size(1); 30 | 31 | event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0); // stage row h of channel c (W elements) in local memory 32 | wait_group_events(1, &e1); 33 | 34 | int idx = (across_channels == 0) ? nparts * c : 0; 35 | float scale = (across_channels == 0) ? 
H * W : H * W * C; 36 | 37 | #if USE_ATOMICS 38 | float mean = mean_part[idx]; 39 | float variance = power_mean[idx]; 40 | #else 41 | 42 | int total = (across_channels == 0) ? nparts : nparts * C; // number of partial sums to accumulate 43 | float mean = 0.f; 44 | float variance = 0.f; 45 | 46 | for (int i = 0; i < total; i++) { 47 | mean += mean_part[idx + i]; 48 | variance += power_mean[idx + i]; 49 | } 50 | #endif 51 | 52 | mean = mean / scale; 53 | variance = variance / scale; 54 | variance = variance - mean * mean; // E[x^2] - (E[x])^2 55 | variance = native_sqrt(fmax(variance, 0.f)) + 1e-9f; // clamp first: rounding can push the difference slightly below zero, and the square root of a negative value is NaN 56 | 57 | half hmean = mean; 58 | half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance); // reciprocal standard deviation, or 1 when variance normalization is off 59 | 60 | for (size_t w = 0; w < W; w++) { 61 | dst_line[w] = (src_line[w] - hmean) * hvariance; 62 | } 63 | 64 | barrier(CLK_LOCAL_MEM_FENCE); 65 | 66 | event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0); 67 | wait_group_events(1, &e2); 68 | } 69 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/region_chw.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5c02ada5f9718e59c1c908799a77ab383e6ca333d46c9577608bdb9c3bf15388 3 | size 22828 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/region_chw.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0)) 9 | 10 | #define ALLOW_EARLY_RETURN 1 11 | 12 | static void inline logistic_activate(__local const half *restrict src, __local half *restrict dst, int offset) 13 | { 14 | half val = src[offset]; 15 | val = 1.0h / (1.0h + 
exp2(val * -log_2_e)); 16 | dst[offset] = val; 17 | } 18 | 19 | __kernel void region_chw( 20 | __global const half *restrict src_data, 21 | __global half *restrict dst_data, 22 | int W, 23 | int H, 24 | int classes, 25 | int coords, 26 | int num, 27 | int maskSize, 28 | int doSoftmax) 29 | { 30 | __local half local_src[13 * 13 * (4 + 1 + 80)]; 31 | __local half local_dst[13 * 13 * (4 + 1 + 80)]; 32 | 33 | const int box_sz = W * H * (classes + coords + 1); 34 | event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(1) * box_sz, box_sz, 0); 35 | wait_group_events(1, &e1); 36 | 37 | const int pixel_pos = get_local_id(0); 38 | const int stride = W * H; 39 | 40 | #if ALLOW_EARLY_RETURN 41 | if (pixel_pos < W * H) 42 | #endif 43 | { 44 | __local const half *restrict src = local_src + pixel_pos; 45 | __local half *restrict dst = local_dst + pixel_pos; 46 | 47 | logistic_activate(src, dst, 0 * stride); 48 | logistic_activate(src, dst, 1 * stride); 49 | 50 | //copy plane 2 and 3 51 | dst[2 * stride] = src[2 * stride]; 52 | dst[3 * stride] = src[3 * stride]; 53 | 54 | logistic_activate(src, dst, 4 * stride); 55 | 56 | src += (coords + 1) * stride; 57 | dst += (coords + 1) * stride; 58 | 59 | if (doSoftmax) { 60 | half max_val = src[0]; 61 | #pragma unroll 4 62 | for (int c = 1; c < classes; c++) { 63 | max_val = max(max_val, src[c * stride]); 64 | } 65 | 66 | half expSum = 0.0h; 67 | #pragma unroll 4 68 | for (int c = 0; c < classes; c++) { 69 | const half e = src[c * stride] - max_val; 70 | const half tmp = exp2(e * log_2_e); 71 | dst[c * stride] = tmp; 72 | expSum += tmp; 73 | } 74 | 75 | const half recip = 1.h / expSum; 76 | int c = 0; 77 | for (; c < (classes & ~0x3); c += 4) { 78 | const half t0 = dst[(c + 0) * stride]; 79 | const half t1 = dst[(c + 1) * stride]; 80 | const half t2 = dst[(c + 2) * stride]; 81 | const half t3 = dst[(c + 3) * stride]; 82 | 83 | const half e0 = t0 * recip; 84 | const half e1 = t1 * recip; 85 | const half e2 = t2 * recip; 86 
| const half e3 = t3 * recip; 87 | 88 | dst[(c + 0) * stride] = e0; 89 | dst[(c + 1) * stride] = e1; 90 | dst[(c + 2) * stride] = e2; 91 | dst[(c + 3) * stride] = e3; 92 | } 93 | for (; c < classes; c++) { 94 | dst[c * stride] *= recip; 95 | } 96 | } else { 97 | #pragma unroll 4 98 | for (int c = 0; c < classes; c++) { 99 | logistic_activate(src, dst, c * stride); 100 | } 101 | } 102 | } 103 | 104 | barrier(CLK_LOCAL_MEM_FENCE); 105 | 106 | event_t e2 = async_work_group_copy(dst_data + get_group_id(1) * box_sz, local_dst, box_sz, 0); 107 | wait_group_events(1, &e2); 108 | } 109 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/region_hwc.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:88f7bb144d85d08e9f7879e43ed6f8722bb2f93534e5becd8c7ff2a220cdd9f3 3 | size 81896 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/region_hwc.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0)) 9 | 10 | #define ALLOW_EARLY_RETURN 1 11 | 12 | static void inline logistic_activate_hwc( 13 | __local const half *restrict src, 14 | __local half *restrict dst, 15 | int offset, 16 | int stride) 17 | { 18 | half val = src[offset]; 19 | val = 1.0h / (1.0h + exp2(val * -log_2_e)); 20 | dst[offset * stride] = val; 21 | } 22 | 23 | __kernel void region_hwc( 24 | __global const half *restrict src, 25 | __global half *restrict dst, 26 | int W, 27 | int H, 28 | int classes, 29 | int coords, 30 | int num, 31 | int maskSize, 32 | int doSoftmax) 33 | { 
34 | __local half local_src[13 * 13 * (4 + 1 + 80)]; 35 | __local half local_dst[13 * 13 * (4 + 1 + 80)]; 36 | 37 | const int pixel_pos = get_local_id(0); 38 | 39 | const int local_C = classes + coords + 1; 40 | const int c = get_group_id(1) * local_C; 41 | const int h = get_group_id(0); 42 | 43 | num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize; 44 | const int C = local_C * num; 45 | 46 | event_t e1 = async_work_group_copy_2D2D( 47 | local_src, // dst 48 | src + h * W * C + c, // src 49 | local_C, // num_elements_per_line, 50 | H * W, // num_lines, 51 | C - local_C, // src_line_stride, 52 | 0, // dst_line_stride, 53 | 0); 54 | 55 | wait_group_events(1, &e1); 56 | 57 | #if ALLOW_EARLY_RETURN 58 | if (pixel_pos < W * H) 59 | #endif 60 | { 61 | const int w = pixel_pos % W; 62 | const int h = pixel_pos / W; 63 | 64 | __local const half *restrict src = local_src + h * W * local_C + w * local_C; 65 | __local half *restrict dst = local_dst + h * W + w; 66 | 67 | const int stride = H * W; 68 | logistic_activate_hwc(src, dst, 0, stride); 69 | logistic_activate_hwc(src, dst, 1, stride); 70 | 71 | //copy plane 2 and 3 72 | dst[2 * stride] = src[2]; 73 | dst[3 * stride] = src[3]; 74 | 75 | logistic_activate_hwc(src, dst, 4, stride); 76 | 77 | src += coords + 1; 78 | dst += (coords + 1) * stride; 79 | 80 | if (doSoftmax) { 81 | half max_val = src[0]; 82 | #pragma unroll 4 83 | for (int c = 1; c < classes; c++) { 84 | max_val = max(max_val, src[c]); 85 | } 86 | 87 | half expSum = 0.0h; 88 | #pragma unroll 4 89 | for (int c = 0; c < classes; c++) { 90 | const half e = src[c] - max_val; 91 | const half tmp = exp2(e * log_2_e); 92 | dst[c * stride] = tmp; 93 | expSum += tmp; 94 | } 95 | 96 | const half invExpSum = 1.0h / expSum; 97 | #pragma unroll 4 98 | for (int c = 0; c < classes; c++) { 99 | dst[c * stride] *= invExpSum; 100 | } 101 | } else { 102 | #pragma unroll 4 103 | for (int c = 0; c < classes; c++) { 104 | logistic_activate_hwc(src, dst, c, stride); 105 | } 
106 | } 107 | } 108 | 109 | barrier(CLK_LOCAL_MEM_FENCE); 110 | 111 | const int box_sz = W * H * (classes + coords + 1); 112 | event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0); 113 | wait_group_events(1, &e2); 114 | } 115 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/reorg_chw.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:16758c3a629f5e397b7b51d417686fd745603c119e1f5d9985b05f4f3ef7efc7 3 | size 12208 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/reorg_chw.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __kernel void reorg_chw( 9 | __global const half *restrict src, 10 | __global half *restrict dst, 11 | int W, 12 | int H, 13 | int C, 14 | int stride) 15 | { 16 | __local half local_src[8 * 1024]; 17 | __local half local_dst[8 * 1024]; 18 | 19 | event_t e1 = async_work_group_copy_2D2D( 20 | local_src, // dst 21 | src + get_group_id(1) * W * stride 22 | + get_group_id(0) * W * stride * stride, // src 23 | W * stride, // num_elements_per_line, 24 | get_local_size(0), // num_lines, 25 | W * stride * (stride * get_num_groups(0) - 1), // src_line_stride, 26 | 0, // dst_line_stride, 27 | 0); 28 | wait_group_events(1, &e1); 29 | 30 | const int c = get_local_id(0); 31 | const int stride_x = get_local_id(1); 32 | 33 | const int srcIdx = stride_x + c * W * stride; 34 | const int dstIdx = stride_x * W * get_local_size(0) + c * W; 35 | 36 | int x = 0; 37 | for (; x <= W - 8; x += 8) { 38 | half8 data = (half8){ 39 | local_src[srcIdx + (x + 0) * 
stride], 40 | local_src[srcIdx + (x + 1) * stride], 41 | local_src[srcIdx + (x + 2) * stride], 42 | local_src[srcIdx + (x + 3) * stride], 43 | local_src[srcIdx + (x + 4) * stride], 44 | local_src[srcIdx + (x + 5) * stride], 45 | local_src[srcIdx + (x + 6) * stride], 46 | local_src[srcIdx + (x + 7) * stride]}; 47 | 48 | *((__local half8 *)(&local_dst[dstIdx + x])) = data; 49 | } 50 | 51 | for (; x < W; x++) { 52 | local_dst[dstIdx + x] = local_src[srcIdx + x * stride]; 53 | } 54 | 55 | barrier(CLK_LOCAL_MEM_FENCE); 56 | 57 | event_t e2 = async_work_group_copy_2D2D( 58 | dst + get_group_id(0) * W 59 | + get_group_id(1) * W * stride * get_global_size(0), // dst 60 | local_dst, // src 61 | W, // num_elements_per_line 62 | get_local_size(0) * stride, // num_lines 63 | 0, // src_line_stride 64 | W * (get_num_groups(0) - 1), // dst_line_stride 65 | 0); 66 | wait_group_events(1, &e2); 67 | } 68 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/reorg_hwc.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d57494f39baecc4011f87ab5241c1b0a19a07a7bbd14e20b135a94a0d7ecb3c1 3 | size 42144 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/reorg_hwc.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | __kernel void reorg_hwc( 9 | __global half const *restrict src, 10 | __global half *restrict dst, 11 | int W, 12 | int H, 13 | int C, 14 | int stride) 15 | { 16 | __local half local_src[8 * 1024]; 17 | __local half local_dst[8 * 1024]; 18 | 19 | event_t e1 = async_work_group_copy_2D2D( 20 | local_src, // dst 21 
| src + get_group_id(0) * stride + get_group_id(1) * C, // src 22 | stride, // num_elements_per_line 23 | H * W / stride, // num_lines 24 | (C - 1) * stride, // src_line_stride 25 | 0, // dst_line_stride 26 | 0); 27 | wait_group_events(1, &e1); 28 | 29 | const int stride_y = get_local_id(1); 30 | const int blocks = get_local_size(0); 31 | const int b = get_local_id(0); 32 | 33 | const int OC = stride * stride; 34 | const int OH = H / stride; 35 | const int OW = W / stride; 36 | const int IC = stride; 37 | const int IH = H; 38 | const int IW = W / stride; 39 | 40 | for (int block_h = 0; block_h < stride; block_h++) { 41 | const int src_line = b * stride * stride + stride_y * stride + block_h; 42 | const int c = src_line / IH; 43 | const int h = src_line % IH; 44 | 45 | const int dst_line = b * stride + stride_y * blocks * stride + block_h; 46 | const int oc = dst_line / OH; 47 | const int oh = dst_line % OH; 48 | 49 | for (int w = 0; w < W / stride; w++) { 50 | local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c]; 51 | } 52 | } 53 | 54 | barrier(CLK_LOCAL_MEM_FENCE); 55 | 56 | event_t e2 = async_work_group_copy_2D2D( 57 | dst + get_group_id(1) * C + get_group_id(0) * stride, // dst 58 | local_dst, // src 59 | stride, // num_elements_per_line 60 | W * H / stride, // num_lines 61 | 0, // src_line_stride 62 | C * stride - stride, // dst_line_stride 63 | 0); 64 | wait_group_events(1, &e2); 65 | } 66 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/reorg_hwc_naive.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:14d3b747218694b644afe03e205955c1a5be042b2d7e62d261973a4ea1b8aaa8 3 | size 13396 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/reorg_hwc_naive.cl: 
-------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | 7 | __kernel void reorg_hwc_naive( 8 | __global half const *restrict src, 9 | __global half *restrict dst, 10 | int W, 11 | int H, 12 | int C, 13 | int stride) 14 | { 15 | const int out_c = C / (stride * stride); 16 | const int oc = C * (stride * stride); 17 | const int oh = H / stride; 18 | const int ow = W / stride; 19 | 20 | const int c = get_global_id(0); 21 | 22 | for (int h = 0; h < H; ++h) { 23 | int in_index = W * (h + H * c) + (0); 24 | int new_z = in_index / (oh * ow); 25 | int new_y = (in_index % (oh * ow)) / ow; 26 | int new_x = (in_index % (oh * ow)) % ow; 27 | int new_index = new_z + new_x * oc + new_y * oc * ow; 28 | 29 | in_index++; 30 | 31 | int c2 = c % out_c; 32 | int offset = c / out_c; 33 | int w2 = 0 * stride + offset % stride; 34 | int h2 = h * stride + offset / stride; 35 | int out_index = w2 + W * stride * (h2 + H * stride * c2); 36 | 37 | #pragma unroll 2 38 | for (int i = 0; i < W; ++i, out_index += stride, in_index++) { 39 | // repacking coordinates 40 | int k0 = out_index / (H * W); 41 | int j0 = (out_index % (H * W)) / W; 42 | int i0 = (out_index % (H * W)) % W; 43 | int out_index_repack = k0 + C * i0 + C * W * j0; 44 | 45 | dst[new_index] = src[out_index_repack]; 46 | 47 | int new_z = in_index / (oh * ow); 48 | int new_y = (in_index % (oh * ow)) / ow; 49 | int new_x = (in_index % (oh * ow)) % ow; 50 | new_index = new_z + new_x * oc + new_y * oc * ow; 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/resample_AA.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:98ca582aaa70d3e7cc339aba88150eb12839ab67a0fac9c563f8c8ea37e705e2 3 | size 67860 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/resample_AA.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | #define USE_OPTIMIZED_ROUND 9 | 10 | #ifdef USE_OPTIMIZED_ROUND 11 | #define ROUND(x) ((int)((x) + 0.5f)) 12 | #else 13 | #define ROUND(x) (int)(round(x)) 14 | #endif 15 | 16 | inline int out_to_in(float ox, float f) 17 | { 18 | #ifdef USE_OPTIMIZED_ROUND 19 | return (int)((ox + 0.5f) / f); 20 | #else 21 | return ROUND((ox + 0.5f) / f - 0.5f); 22 | #endif 23 | } 24 | 25 | static inline float triangleCoeff(float x) { return 1.0f - fabs(x); } 26 | 27 | static inline float4 triangleCoeff4(float4 x) { return 1.0f - fabs(x); } 28 | 29 | __kernel void resample_with_antialias( 30 | __global const half *restrict src, 31 | __global half *restrict dst, 32 | int iw, 33 | int ih, 34 | float factor, 35 | int ow, 36 | int oh, 37 | int channels) 38 | { 39 | __local half local_src[20 * 1024]; 40 | __local half local_dst[8 * 1024]; 41 | 42 | const int r = (factor > 1.0f) ? 
2 : ceil(1.0f / factor); 43 | const int oy_first = get_group_id(1) * get_local_size(1); 44 | const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1; 45 | const int iy_first = max(out_to_in(oy_first, factor) - r, 0); 46 | const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1); 47 | const int iy_size = iy_last - iy_first + 1; 48 | 49 | event_t e1 = async_work_group_copy_2D2D( 50 | local_src, // dst 51 | src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src 52 | iy_size * iw, // num_elements_per_line, 53 | get_local_size(2), // num_lines, 54 | (ih - iy_size) * iw, // src_line_stride, 55 | 0, // dst_line_stride, 56 | 0); 57 | wait_group_events(1, &e1); 58 | 59 | const int oy = get_global_id(1); 60 | const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first; 61 | const int iy = ROUND(iy_f); 62 | 63 | __local half const *restrict start_src = 64 | local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2); 65 | __local half *restrict start_dst = 66 | local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2); 67 | 68 | for (int ox = 0; ox < ow; ox++) { 69 | const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f; 70 | const int ix_i = ROUND(ix_f); 71 | 72 | float4 v_sum = 0.f; 73 | float4 v_wsum = 0.f; 74 | for (int y = 0; y < iy_size; y++) { 75 | float dy = iy_f - y; 76 | int x = max(ix_i - r, 0); 77 | int end_x = min(ix_i + r, iw - 1); 78 | 79 | float4 dx; 80 | for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i; 81 | 82 | for (; x < end_x - 3; x += 4, dx -= 4) { 83 | float4 w = 84 | factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy); 85 | float4 src_vec = { 86 | start_src[y * iw + x + 0], 87 | start_src[y * iw + x + 1], 88 | start_src[y * iw + x + 2], 89 | start_src[y * iw + x + 3]}; 90 | 91 | v_sum += w * src_vec; 92 | v_wsum += w; 93 | } 94 | 95 | for (; x <= end_x; x++) { 96 | float dx = ix_f - x; 97 | float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor 
* dy); 98 | 99 | v_sum[0] += w * start_src[y * iw + x]; 100 | v_wsum[0] += w; 101 | } 102 | } 103 | 104 | v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3]; 105 | v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3]; 106 | 107 | start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]); 108 | } 109 | 110 | barrier(CLK_LOCAL_MEM_FENCE); 111 | 112 | event_t e2 = async_work_group_copy_2D2D( 113 | dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow 114 | + get_group_id(1) * get_local_size(1) * ow, // dst 115 | local_dst, // src 116 | get_local_size(1) * ow, // num_elements_per_line, 117 | get_local_size(2), // num_lines, 118 | 0, // src_line_stride, 119 | (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride, 120 | 0); 121 | wait_group_events(1, &e2); 122 | } 123 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/resample_noAA.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9c67917cd959fe4add69b139f44ef07e36c4ca37bc2b8a47b2bdfe48e8a3f559 3 | size 68828 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/resample_noAA.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | 8 | #define USE_OPTIMIZED_ROUND 9 | 10 | #ifdef USE_OPTIMIZED_ROUND 11 | #define ROUND(x) ((int)((x) + 0.5f)) 12 | #else 13 | #define ROUND(x) (int)(round(x)) 14 | #endif 15 | 16 | inline int out_to_in(float ox, float f) { return (int)((ox + 0.5f) * f); } 17 | 18 | void interpolationCHW_nn(__local half *psrc, __local half *pdst, int OW, int IW, int C, float rw, float 
rh) 19 | { 20 | float alpha = rh / 2.0f - 0.5f; 21 | /* NOTE(review): alpha is derived from rh but offsets width coordinates; the only call visible in this file passes the same value for rw and rh. */ /* Main loop: eight output columns per iteration; fwN is the scaled source position of column w * 8 + N. */ 22 | for (int w = 0; w < OW / 8; w++) { 23 | float fw0 = rw * (w * 8 + 0) + alpha; 24 | float fw1 = rw * (w * 8 + 1) + alpha; 25 | float fw2 = rw * (w * 8 + 2) + alpha; 26 | float fw3 = rw * (w * 8 + 3) + alpha; 27 | 28 | float fw4 = rw * (w * 8 + 4) + alpha; 29 | float fw5 = rw * (w * 8 + 5) + alpha; 30 | float fw6 = rw * (w * 8 + 6) + alpha; 31 | float fw7 = rw * (w * 8 + 7) + alpha; 32 | /* Source columns: ROUND each position and clamp it to IW - 1. */ 33 | int iw0 = min((int)ROUND(fw0), IW - 1); 34 | int iw1 = min((int)ROUND(fw1), IW - 1); 35 | int iw2 = min((int)ROUND(fw2), IW - 1); 36 | int iw3 = min((int)ROUND(fw3), IW - 1); 37 | 38 | int iw4 = min((int)ROUND(fw4), IW - 1); 39 | int iw5 = min((int)ROUND(fw5), IW - 1); 40 | int iw6 = min((int)ROUND(fw6), IW - 1); 41 | int iw7 = min((int)ROUND(fw7), IW - 1); 42 | /* For each of the C rows, gather the eight selected source elements and store them with a single half8 write. */ 43 | for (int c = 0; c < C; c++) { 44 | half8 val = { 45 | *((__local half *)(psrc + c * IW + iw0)), 46 | *((__local half *)(psrc + c * IW + iw1)), 47 | *((__local half *)(psrc + c * IW + iw2)), 48 | *((__local half *)(psrc + c * IW + iw3)), 49 | 50 | *((__local half *)(psrc + c * IW + iw4)), 51 | *((__local half *)(psrc + c * IW + iw5)), 52 | *((__local half *)(psrc + c * IW + iw6)), 53 | *((__local half *)(psrc + c * IW + iw7)), 54 | }; 55 | *((__local half8 *)(pdst + c * OW + w * 8)) = val; 56 | } 57 | } 58 | /* Tail: the remaining OW % 8 columns, one element at a time. */ 59 | for (int w = OW / 8 * 8; w < OW; w++) { 60 | float fw = rw * w + alpha; 61 | int iw0 = min((int)ROUND(fw), IW - 1); 62 | 63 | for (int c = 0; c < C; c++) { 64 | *((__local half *)(pdst + c * OW + w)) = *((__local half *)(psrc + c * IW + iw0)); 65 | } 66 | } 67 | } 68 | /* Kernel resample_nearest: copies a band of input rows for every channel into local_src, resamples it with interpolationCHW_nn into local_dst, and copies local_dst out to dst. The oh parameter is not referenced in the body. */ 69 | kernel void resample_nearest( 70 | __global const half *restrict src, 71 | __global half *restrict dst, 72 | int iw, 73 | int ih, 74 | float factor, 75 | int ow, 76 | int oh, 77 | int channels) 78 | { 79 | __local half local_src[14 * 1024]; 80 | __local half local_dst[14 * 1024]; 81 | /* Output rows covered by this work-group in dimension 1, mapped to input rows through out_to_in with scale 1 / factor. */ 82 | const int oy_first = get_group_id(1) * get_local_size(1); 83 | const int oy_last = 
(get_group_id(1) + 1) * get_local_size(1) - 1; 84 | const int iy_first = out_to_in(oy_first, 1.0 / factor); 85 | const int iy_last = out_to_in(oy_last, 1.0 / factor); 86 | /* Number of input rows in the band [iy_first, iy_last]. */ 87 | const int iy_size = iy_last - iy_first + 1; 88 | /* Load: for each channel, iy_size * iw elements starting at input row iy_first, read from the block at offset get_group_id(2) * channels * ih * iw and packed contiguously in local_src. */ 89 | event_t e1 = async_work_group_copy_2D2D( 90 | local_src, // dst 91 | src + get_group_id(2) * channels * ih * iw + iy_first * iw, // src 92 | iy_size * iw, // num_elements_per_line, 93 | channels, // num_lines, 94 | ih * iw - iy_size * iw, // src_line_stride, 95 | 0, // dst_line_stride, 96 | 0); 97 | 98 | wait_group_events(1, &e1); 99 | /* NOTE(review): interpolationCHW_nn addresses channel c at local_src + c * iw and local_dst + c * ow, while the copies use iy_size * iw and get_local_size(1) * ow elements per channel; these agree only when iy_size == 1 and get_local_size(1) == 1 - confirm the launch configuration. */ 100 | interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor); 101 | /* Store: get_local_size(1) * ow elements per channel, written to this group's rows of dst. */ 102 | event_t e2 = async_work_group_copy_2D2D( 103 | dst + get_group_id(2) * channels * get_global_size(1) * ow + get_group_id(1) * get_local_size(1) * ow, // dst 104 | local_dst, // src 105 | get_local_size(1) * ow, // size_t num_elements_per_line, 106 | channels, // size_t num_lines, 107 | 0, // size_t src_line_stride, 108 | get_global_size(1) * ow - get_local_size(1) * ow, // size_t dst_line_stride, 109 | 0); 110 | 111 | wait_group_events(1, &e2); 112 | } 113 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/shuffle_channels.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:230aa8e01a387beb4de512de1f3599867cc74dc36578359d78f5c856af9428cd 3 | size 10740 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/shuffle_channels.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | /* Kernel ShuffleChannel: work-item c = get_global_id(0) (ignored when c >= C) copies one H * W plane; with cy = c % G and cx = c / G it reads plane cy * (C / G) + cx of src_data and writes plane cx * G + cy of dst_data. */ 7 | __kernel void ShuffleChannel( 8 | __global const half *restrict src_data, 9 | __global half *restrict dst_data, 10 | int C, 11 | 
int H, 12 | int W, 13 | int G) 14 | { 15 | int c = get_global_id(0); 16 | if (c >= C) return; 17 | int CX = C / G; 18 | int CY = G; 19 | int cy = c % G; 20 | int cx = c / G; 21 | /* Bulk of the plane is copied as half8 vectors (W * H / 8 of them). */ 22 | __global const half8 *src_line = 23 | ((__global const half8 *)(src_data + cy * CX * H * W + cx * H * W)); 24 | __global half8 *dst_line = ((__global half8 *)(dst_data + cx * CY * H * W + cy * H * W)); 25 | 26 | for (int i = 0; i < W * H / 8; i++) { 27 | dst_line[i] = src_line[i]; 28 | } 29 | /* Scalar copy of the remaining (W * H) % 8 elements. */ 30 | for (int i = W * H / 8 * 8; i < W * H; i++) { 31 | dst_data[cx * CY * H * W + cy * H * W + i] = src_data[cy * CX * H * W + cx * H * W + i]; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/st.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0f4e46dc0a701ea9d32ed113d8be30448306fea92560e617bed9605e24d1d6fb 3 | size 20376 4 | -------------------------------------------------------------------------------- /openvino/vpu_custom_kernels/st.cl: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2018-2022 Intel Corporation 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable 7 | /* MAX_WIDTH: number of columns handled per tile; ocl_st also passes it as the spacing (step) between the four tap sub-arrays in ind / weight. */ 8 | #define MAX_WIDTH 512 9 | /* calcInd: for row y and columns x0 .. x0 + length - 1, maps each position through the six coefficients theta[0..5] and stores, for the four neighbouring taps (TL, BL, TR, BR, in that order, `step` elements apart), a flat index ix * W + iy into ind and an interpolation weight into weight. Taps outside [0, H) x [0, W) get index 0 and weight 0. */ 10 | __attribute__((noinline)) void calcInd( 11 | __global const half *restrict theta, 12 | __local half *restrict weight, 13 | __local int *restrict ind, 14 | int y, 15 | int H, 16 | int x0, 17 | int length, 18 | int step, 19 | int W) 20 | { 21 | float a = (float)y * 1.0f / H * 2 - 1; 22 | /* a is the row coordinate y rescaled as y / H * 2 - 1; x counts the columns processed so far. */ 23 | int x = 0; 24 | /* va broadcasts a; vxy holds the absolute column numbers x0 + 0 .. x0 + 7 of the current group of eight. */ 25 | float8 va = (float8){a, a, a, a, a, a, a, a}; 26 | float8 vxy = (float8){x0 + 0, x0 + 1, x0 + 2, x0 + 3, x0 + 4, x0 + 5, x0 + 6, x0 + 7}; 27 | /* Vector path: eight columns per iteration; va1 is the column rescaled as col / W * 2 - 1. */ 28 | for (; x <= length - 8; x += 8, vxy += 8) { 29 | float8 va1 = vxy * 1.0f / W * 2 - 1.f; 30 | 
31 | float8 vx = (va * theta[0] + va1 * theta[1] + theta[2] + 1.f) / 2.f * H; 32 | float8 vy = (va * theta[3] + va1 * theta[4] + theta[5] + 1.f) / 2.f * W; 33 | /* Despite the names, vx / ix is the coordinate scaled by H and multiplied by W in the flat index (the row), vy / iy the one scaled by W (the column). Integer parts: truncate, then subtract 1 in lanes where the value is negative (a true vector comparison has all bits set, so `& 1` yields 1). */ 34 | const int8 ix = convert_int8(vx) - ((vx < 0) & 1); 35 | const int8 iy = convert_int8(vy) - ((vy < 0) & 1); 36 | /* ax / ay: offsets from the integer position; bx / by: their complements. */ 37 | float8 ax = vx - convert_float8(ix); 38 | float8 ay = vy - convert_float8(iy); 39 | float8 bx = 1.f - ax; 40 | float8 by = 1.f - ay; 41 | /* Range tests: the unions reinterpret the signed indices as unsigned so a single `< H` / `< W` comparison also rejects negative values. b01 / b45 test rows ix / ix + 1, b23 / b67 test columns iy / iy + 1. */ 42 | union { 43 | int8 d; 44 | uint8 i; 45 | } check_x; 46 | 47 | check_x.d = ix; 48 | int8 b01 = check_x.i < (uint8)H; 49 | 50 | check_x.d = ix + 1; 51 | int8 b45 = check_x.i < (uint8)H; 52 | 53 | union { 54 | int8 d; 55 | uint8 i; 56 | } check_y; 57 | 58 | check_y.d = iy; 59 | int8 b23 = check_y.i < (uint8)W; 60 | 61 | check_y.d = iy + 1; 62 | int8 b67 = check_y.i < (uint8)W; 63 | /* Per-tap validity masks: row test AND column test. */ 64 | int8 b0123 = b01 & b23; 65 | int8 b0167 = b01 & b67; 66 | int8 b4523 = b45 & b23; 67 | int8 b4567 = b45 & b67; 68 | /* Flat tap indices; multiplying by (mask & 1) forces invalid taps to index 0. */ 69 | int8 TL_id = ((ix + 0) * W + (iy + 0)) * (b0123 & 1); 70 | int8 BL_id = ((ix + 1) * W + (iy + 0)) * (b4523 & 1); 71 | int8 TR_id = ((ix + 0) * W + (iy + 1)) * (b0167 & 1); 72 | int8 BR_id = ((ix + 1) * W + (iy + 1)) * (b4567 & 1); 73 | /* Tap weights; each union exposes the float bits as int8 so they can be ANDed with the validity mask below. */ 74 | union { 75 | float8 f; 76 | int8 i; 77 | } w0; 78 | w0.f = bx * by; 79 | union { 80 | float8 f; 81 | int8 i; 82 | } w1; 83 | w1.f = ax * by; 84 | union { 85 | float8 f; 86 | int8 i; 87 | } w2; 88 | w2.f = bx * ay; 89 | union { 90 | float8 f; 91 | int8 i; 92 | } w3; 93 | w3.f = ax * ay; 94 | /* Clear the weight bits of invalid taps (mask lanes are all ones or all zeros). */ 95 | w0.i = w0.i & b0123; 96 | w1.i = w1.i & b4523; 97 | w2.i = w2.i & b0167; 98 | w3.i = w3.i & b4567; 99 | /* Store weights (converted to half) and indices as four sub-arrays spaced `step` elements apart. */ 100 | *((__local half8 *)(weight + x + 0 * step)) = convert_half8(w0.f); 101 | *((__local half8 *)(weight + x + 1 * step)) = convert_half8(w1.f); 102 | *((__local half8 *)(weight + x + 2 * step)) = convert_half8(w2.f); 103 | *((__local half8 *)(weight + x + 3 * step)) = convert_half8(w3.f); 104 | 105 | *((__local int8 *)(ind + x + 0 * step)) = TL_id; 106 | *((__local int8 *)(ind + x + 1 * step)) = BL_id; 107 | *((__local 
int8 *)(ind + x + 2 * step)) = TR_id; 108 | *((__local int8 *)(ind + x + 3 * step)) = BR_id; 109 | } 110 | /* Scalar tail for the remaining columns: same mapping, with the range tests written as explicit comparisons. */ 111 | for (; x < length; x++) { 112 | float a1 = (float)(x0 + x) * 1.0f / W * 2 - 1; 113 | 114 | float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1) / 2 * H; 115 | float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1) / 2 * W; 116 | 117 | const int ix = (int)(fx) - (fx < 0); 118 | const int iy = (int)(fy) - (fy < 0); 119 | 120 | float ax = fx - ix; 121 | float ay = fy - iy; 122 | float bx = 1 - ax; 123 | float by = 1 - ay; 124 | /* b01 / b45: rows ix / ix + 1 lie in [0, H); b23 / b67: columns iy / iy + 1 lie in [0, W). */ 125 | int b0 = ix >= 0; 126 | int b4 = ix >= -1; 127 | int b1 = ix < H; 128 | int b5 = ix < H - 1; 129 | 130 | int b2 = iy >= 0; 131 | int b6 = iy >= -1; 132 | int b3 = iy < W; 133 | int b7 = iy < W - 1; 134 | 135 | int b01 = b0 & b1; 136 | int b23 = b2 & b3; 137 | int b45 = b4 & b5; 138 | int b67 = b6 & b7; 139 | 140 | int b0123 = b01 & b23; 141 | int b0167 = b01 & b67; 142 | int b4523 = b45 & b23; 143 | int b4567 = b45 & b67; 144 | /* The 0 / 1 validity flags zero both the index and the weight of taps that fall outside the plane. */ 145 | int TL_id = ((ix + 0) * W + (iy + 0)) * b0123; 146 | int BL_id = ((ix + 1) * W + (iy + 0)) * b4523; 147 | int TR_id = ((ix + 0) * W + (iy + 1)) * b0167; 148 | int BR_id = ((ix + 1) * W + (iy + 1)) * b4567; 149 | 150 | half w0 = bx * by * b0123; 151 | half w1 = ax * by * b4523; 152 | half w2 = bx * ay * b0167; 153 | half w3 = ax * ay * b4567; 154 | 155 | weight[x + 0 * step] = w0; 156 | weight[x + 1 * step] = w1; 157 | weight[x + 2 * step] = w2; 158 | weight[x + 3 * step] = w3; 159 | 160 | ind[x + 0 * step] = TL_id; 161 | ind[x + 1 * step] = BL_id; 162 | ind[x + 2 * step] = TR_id; 163 | ind[x + 3 * step] = BR_id; 164 | } 165 | } 166 | /* apply: for x in [0, src_stride) writes dst[x] = w0 * src[TL] + w1 * src[BL] + w2 * src[TR] + w3 * src[BR], reading the tap indices from ind and the weights from weight at x + k * step (k = 0..3 for TL, BL, TR, BR). */ 167 | __attribute__((noinline)) void apply( 168 | __global half const *restrict src, 169 | __local half const *restrict weight, 170 | __local int const *restrict ind, 171 | __local half *restrict dst, 172 | int src_stride, 173 | int step) 174 | { 175 | int x = 0; 176 | for (; x <= src_stride - 8; x += 8) { 177 | int8 TL_id = *((__local int8 *)(ind + x + 0 * 
step)); 178 | int8 BL_id = *((__local int8 *)(ind + x + 1 * step)); 179 | int8 TR_id = *((__local int8 *)(ind + x + 2 * step)); 180 | int8 BR_id = *((__local int8 *)(ind + x + 3 * step)); 181 | /* Matching weights for the four taps. */ 182 | half8 w00 = *((__local half8 *)(weight + x + 0 * step)); 183 | half8 w01 = *((__local half8 *)(weight + x + 1 * step)); 184 | half8 w02 = *((__local half8 *)(weight + x + 2 * step)); 185 | half8 w03 = *((__local half8 *)(weight + x + 3 * step)); 186 | /* Gather eight source values per tap, one scalar load per lane. */ 187 | half8 TL = (half8){ 188 | src[TL_id[0]], src[TL_id[1]], 189 | src[TL_id[2]], src[TL_id[3]], 190 | src[TL_id[4]], src[TL_id[5]], 191 | src[TL_id[6]], src[TL_id[7]]}; 192 | half8 TR = (half8){ 193 | src[TR_id[0]], src[TR_id[1]], 194 | src[TR_id[2]], src[TR_id[3]], 195 | src[TR_id[4]], src[TR_id[5]], 196 | src[TR_id[6]], src[TR_id[7]]}; 197 | half8 BL = (half8){ 198 | src[BL_id[0]], src[BL_id[1]], 199 | src[BL_id[2]], src[BL_id[3]], 200 | src[BL_id[4]], src[BL_id[5]], 201 | src[BL_id[6]], src[BL_id[7]]}; 202 | half8 BR = (half8){ 203 | src[BR_id[0]], src[BR_id[1]], 204 | src[BR_id[2]], src[BR_id[3]], 205 | src[BR_id[4]], src[BR_id[5]], 206 | src[BR_id[6]], src[BR_id[7]]}; 207 | /* Weighted sum of the four taps. */ 208 | half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR; 209 | 210 | *((__local half8 *)(dst + x)) = res; 211 | } 212 | /* Scalar tail for the remaining src_stride % 8 elements. */ 213 | for (; x < src_stride; x++) { 214 | int TL_id = ind[x + 0 * step]; 215 | int BL_id = ind[x + 1 * step]; 216 | int TR_id = ind[x + 2 * step]; 217 | int BR_id = ind[x + 3 * step]; 218 | 219 | half w00 = weight[x + 0 * step]; 220 | half w01 = weight[x + 1 * step]; 221 | half w02 = weight[x + 2 * step]; 222 | half w03 = weight[x + 3 * step]; 223 | 224 | half TL = src[TL_id]; 225 | half TR = src[TR_id]; 226 | half BL = src[BL_id]; 227 | half BR = src[BR_id]; 228 | 229 | half res = w00 * TL + w01 * BL + w02 * TR + w03 * BR; 230 | 231 | dst[x] = res; 232 | } 233 | } 234 | /* Kernel ocl_st: for tile get_group_id(0) of up to MAX_WIDTH columns and row y = get_global_id(1), computes tap indices and weights from theta with calcInd, applies them to each of the C planes of src_data with apply, and copies the rows staged in local_dst to dst_data. H is taken from get_global_size(1). */ 235 | __kernel void ocl_st( 236 | __global half const *const restrict src_data, 237 | __global half const *const restrict theta, 238 | __global half *const 
restrict dst_data, 239 | int C, 240 | int W) 241 | { 242 | __local int ind[4 * MAX_WIDTH] __attribute__((aligned(16))); 243 | __local half weight[4 * MAX_WIDTH] __attribute__((aligned(16))); 244 | __local half local_dst[4 * 1024]; 245 | /* NOTE(review): local_dst holds 4 * 1024 halfs and is indexed up to C * get_local_size(1) * src_stride; no bound check is visible here. */ 246 | int w = get_group_id(0); 247 | 248 | int y = get_global_id(1); 249 | int H = get_global_size(1); 250 | /* Column range [x0, x1) of this tile; src_stride is its width. */ 251 | const int x0 = w * MAX_WIDTH; 252 | const int x1 = min(x0 + MAX_WIDTH, W); 253 | const int src_stride = x1 - x0; 254 | /* NOTE(review): ind / weight are shared __local arrays filled from this work-item's y; verify the behaviour when get_local_size(1) > 1. */ 255 | calcInd(theta, weight, ind, y, H, x0, src_stride, MAX_WIDTH, W); 256 | /* Each plane c is interpolated into local_dst at offset (c * get_local_size(1) + get_local_id(1)) * src_stride. */ 257 | for (int c = 0; c < C; c++) { 258 | __global half const *restrict src = src_data + c * H * W; 259 | __local half *restrict dst = local_dst + c * get_local_size(1) * src_stride + get_local_id(1) * src_stride; 260 | 261 | apply(src, weight, ind, dst, src_stride, MAX_WIDTH); 262 | } 263 | 264 | barrier(CLK_LOCAL_MEM_FENCE); 265 | /* 3D copy to dst_data: src_stride elements per line, get_local_size(1) lines per plane, C planes. */ 266 | event_t e = async_work_group_copy_3D3D( 267 | dst_data + get_group_id(1) * get_local_size(1) * W + x0, // dst 268 | local_dst, // src 269 | src_stride, // num_elements_per_line 270 | get_local_size(1), // num_lines 271 | 0, // src_line_stride 272 | W - src_stride, // dst_line_stride 273 | C, // num planes 274 | 0, // src plane stride 275 | W * (get_global_size(1) - get_local_size(1)), // dst plane stride 276 | 0); 277 | wait_group_events(1, &e); 278 | } 279 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask-cors 3 | gradio 4 | opencv-python 5 | numpy==1.20.3 -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exec python3 app.py & 4 | exec python3 gradio/demo.py --------------------------------------------------------------------------------